MATH 253 Topic 7 Exercises

ISL 8.4.1

Figures 8.1 and 8.2 provide a model. The question is whether your two plots are consistent with one another. Note that the dividing line at each node will be perpendicular to the axis of the variable used for splitting.

ISL 8.4.2

ISL 8.4.3

# Classification error rate of a two-class node: one minus the share of the
# majority class. `p_1` is the proportion of class 1 (vectorised).
classification_error <- function(p_1) {
  majority_share <- pmax(p_1, 1 - p_1)
  1 - majority_share
}
# Gini index of a two-class node. The general definition is
# sum_k p_k * (1 - p_k); with two classes (p_1 and 1 - p_1) both terms are
# equal, which gives 2 * p_1 * (1 - p_1). `p_1` is the proportion of class 1
# (vectorised).
gini <- function(p_1) {
  2 * p_1 * (1 - p_1)
}
# Cross-entropy (deviance) of a two-class node:
#   -(p_1 * log(p_1) + (1 - p_1) * log(1 - p_1))
# `p_1` is the proportion of class 1 (vectorised). At p_1 = 0 or p_1 = 1 the
# naive formula evaluates 0 * log(0) = NaN, so each term uses its limiting
# value 0 there and a pure node gets entropy 0.
cross_entropy <- function(p_1) {
  p_log_p <- function(p) ifelse(p > 0, p * log(p), 0)
  -(p_log_p(p_1) + p_log_p(1 - p_1))
}
# Plot the three node-impurity measures against the class-1 proportion.
# `length.out` is spelled out in full rather than relying on partial argument
# matching of `length`.
x <- seq(0, 1, length.out = 1000)
# ylim leaves room for the cross-entropy peak of log(2) (about 0.69) at x = 0.5.
plot(x, classification_error(x), type = "l", ylim = c(0, 0.75),
     xlab = expression(hat(p)[m1]), ylab = "Impurity measure")
lines(x, gini(x), col = "blue")
lines(x, cross_entropy(x), col = "green")
# Identify the curves; the bottom centre of the panel is free of lines.
legend("bottom",
       legend = c("Classification error", "Gini index", "Cross-entropy"),
       col = c("black", "blue", "green"), lty = 1, bty = "n")

ISL 8.4.4

Figure 8.11 from ISLR

ISL 8.4.5

The bootstrapped estimates of the probability of “red” are:

# Ten bootstrap estimates of the probability that the class is red.
probs <- c(
  0.1, 0.15, 0.2, 0.2, 0.55,
  0.6, 0.6, 0.65, 0.7, 0.75
)
# Majority vote: each estimate votes red when it exceeds 0.5; the value below
# is the fraction of red votes.
mean(probs > 0.5) # more than half of the votes, so red

## [1] 0.6

# Average probability: classify on the mean of the ten estimates instead.
mean(probs) # below 0.5, so green

## [1] 0.45

ISL 8.4.12

A prediction of the number of bicyclists who will use a rail trail, using the mosaicData::RailTrail data.

data(RailTrail, package = "mosaicData")
# Seed the RNG before drawing the split so that the training/testing
# partition (and every test-set MSE computed from it) is reproducible.
set.seed(253)
# Row indices of the training half; integer division keeps `size` a whole
# number even when the number of rows is odd.
train_inds <- sample(nrow(RailTrail), size = nrow(RailTrail) %/% 2)

For each of the methods, the workflow is similar:

1. Build the model using the training subset.
2. Evaluate the model on the testing subset.
3. Calculate the mean square prediction error for the testing subset.

Bagging

library(randomForest)

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

set.seed(1101) # for reproducibility when debugging
# Bagging is a random forest that considers every predictor at each split,
# so mtry is the column count minus one (the response, volume).
bag_train <- randomForest(
  volume ~ .,
  data = RailTrail[train_inds, ],
  mtry = ncol(RailTrail) - 1,
  importance = TRUE
)
# Predict on the held-out rows (those not in train_inds).
preds <- predict(bag_train, newdata = RailTrail[-train_inds, ])
# Test-set mean squared prediction error.
mse_bag <- mean((preds - RailTrail[["volume"]][-train_inds])^2)
mse_bag

## [1] 14664.87

Random Forests

# Random forest: consider roughly one third of the predictors at each split.
# The predictor count is ncol(RailTrail) - 1 because one column (volume) is
# the response; the result is floored to a whole number and kept at least 1.
n_predictors <- ncol(RailTrail) - 1
rf_train <- randomForest(
  volume ~ .,
  data = RailTrail[train_inds, ],
  mtry = max(floor(n_predictors / 3), 1),
  importance = TRUE
)
# Predict on the held-out rows and compute the test-set mean squared error.
preds <- predict(rf_train, newdata = RailTrail[-train_inds, ])
mse_rf <- mean((RailTrail$volume[-train_inds] - preds)^2)
mse_rf

## [1] 13093.42

Boosting

library(gbm)

## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1

# Gradient boosting with squared-error loss: 5000 trees, each with
# interaction depth 2, fitted on the training rows.
boost_train <- gbm(
  volume ~ .,
  data = RailTrail[train_inds, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 2
)
# Predict on the held-out rows using all 5000 trees.
preds <- predict(
  boost_train,
  newdata = RailTrail[-train_inds, ],
  n.trees = 5000
)
# Test-set mean squared prediction error.
mse_boost <- mean((preds - RailTrail[["volume"]][-train_inds])^2)
mse_boost

## [1] 14181.85

Linear Models

# Ordinary least squares on all predictors, fitted on the training rows.
lm_train <- lm(volume ~ ., data = RailTrail[train_inds, ])
# Predict on the held-out rows. The warning echoed below reports a
# rank-deficient fit, i.e. the design matrix is not of full column rank.
# NOTE(review): presumably some RailTrail columns are redundant - confirm
# which ones before interpreting coefficients.
preds <- predict(lm_train, newdata = RailTrail[-train_inds, ])

## Warning in predict.lm(lm_train, newdata = RailTrail[-train_inds, ]):
## prediction from a rank-deficient fit may be misleading

# Test-set mean squared prediction error.
mse_lm <- mean((preds - RailTrail[["volume"]][-train_inds])^2)
mse_lm

## [1] 14715.86