machine_learning

Random Forests

Extending the bagging technique

Similar trees - tree correlation

Six decision trees based on different bootstrap samples.

Tree correlation

Random forests reduce this tree correlation by injecting more randomness into the tree-growing process. They achieve this in two ways:

1) Bootstrap: each tree is grown on a bootstrap sample of the training data, as in bagging.

2) Split-variable randomization: at each split, only m of the p predictors (mtry) are considered as split candidates; common defaults are m = p/3 for regression and m = sqrt(p) for classification. A toy sketch of both steps follows.
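A toy sketch of the two randomization steps (illustration only, with made-up n and p; not library internals):

# toy sketch of the two sources of randomness in a random forest
set.seed(1)
n <- 100; p <- 12
boot_idx   <- sample(n, size = n, replace = TRUE)   # 1) bootstrap sample of the rows
m          <- floor(p / 3)                          # default m = p/3 for regression
split_vars <- sample(seq_len(p), size = m)          # 2) m candidate variables at one split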

Basic algorithm

The basic algorithm for a regression random forest can be summarized as follows:

1.  Given a training data set
2.  Select number of trees to build (ntrees)
3.  for i = 1 to ntrees do
4.  |  Generate a bootstrap sample of the original data
5.  |  Grow a regression tree to the bootstrapped data
6.  |  for each split do
7.  |  | Select m variables at random from all p variables
8.  |  | Pick the best variable/split-point among the m
9.  |  | Split the node into two child nodes
10. |  end
11. |  Use typical tree model stopping criteria to determine when the tree is complete (do not prune)
12. end

The algorithm trains each tree on a random bootstrap sample and, at each split, considers only a random subset of the predictors; the forest prediction for a new observation is the average of the individual tree predictions, as sketched below.
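A minimal sketch on toy data (the toy data set and object names are assumptions, not part of the slides) showing that the forest prediction is the average of the individual tree predictions:

# sketch: a regression forest averages the predictions of its trees
library(randomForest)
set.seed(1)
toy <- data.frame(y = rnorm(200), x1 = rnorm(200), x2 = rnorm(200))
rf  <- randomForest(y ~ ., data = toy, ntree = 50)
p   <- predict(rf, toy[1:3, ], predict.all = TRUE)
rowMeans(p$individual)   # average over the 50 per-tree predictions ...
p$aggregate              # ... equals the forest prediction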

Characteristics

Out-of-bag (OOB) error: each tree is trained on a bootstrap sample containing roughly two thirds of the observations; the remaining (out-of-bag) observations provide a built-in estimate of the prediction error.

Out-of-bag error vs. validation error: the OOB error is usually a good, essentially free approximation of the error on an independent validation set, as the sketch below illustrates.
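A minimal sketch on toy data (data and object names are made up for illustration; ranger is introduced on the following slides) comparing the OOB RMSE with the RMSE on a held-out validation set:

# sketch: OOB error vs. error on a held-out validation set (toy data)
library(ranger)
set.seed(42)
dat <- data.frame(y = rnorm(500), x1 = rnorm(500), x2 = rnorm(500))
idx <- sample(nrow(dat), 350)                       # 350 training / 150 validation rows
rf  <- ranger(y ~ ., data = dat[idx, ], num.trees = 500)
sqrt(rf$prediction.error)                           # OOB RMSE (prediction.error is the OOB MSE)
pred <- predict(rf, dat[-idx, ])$predictions
sqrt(mean((pred - dat$y[-idx])^2))                  # validation RMSE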


Preparation - random forests

library(rsample)      # data splitting 
library(randomForest) # basic implementation
library(ranger)       # a faster implementation of randomForest
# an aggregator package for performing many 
# machine learning models
library(caret)        

The Ames housing data

set.seed(123)
# use the processed Ames data (clean column names, no missing values)
ames_data  <- AmesHousing::make_ames()
ames_split <- rsample::initial_split(ames_data, prop = .7)
ames_train <- rsample::training(ames_split)
ames_test  <- rsample::testing(ames_split)
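A quick sanity check of the 70/30 split:

# check the resulting split sizes
dim(ames_train)
dim(ames_test)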

Basic implementation

randomForest::randomForest

set.seed(123)
# default RF model
(m1 <- randomForest(formula = Sale_Price ~ .,data=ames_train))
## 
## Call:
##  randomForest(formula = Sale_Price ~ ., data = ames_train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 26
## 
##           Mean of squared residuals: 639516350
##                     % Var explained: 89.7

Plotting the model

plot(m1, main = "OOB error (MSE)")

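The plotted curve is the OOB MSE as a function of the number of trees. It can also be read directly from the fitted object (m1$mse stores the OOB MSE after each additional tree):

# number of trees with the lowest OOB MSE, and the corresponding RMSE
which.min(m1$mse)
sqrt(m1$mse[which.min(m1$mse)])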

Random forests - out-of-the-box algorithm

Even without any tuning, the default random forest performs well: the model above already explains roughly 90% of the variance in sale price.

Tuning Random forests

Tuning parameters (I)

number of trees (ntree / num.trees): grow enough trees for the OOB error to stabilize; additional trees do not cause overfitting, they only cost computation time

number of variables tried at each split (mtry): the key tuning parameter; common defaults are p/3 for regression and sqrt(p) for classification

Tuning parameters (II)

number of samples used to train each tree (sampsize / sample.fraction): the default bootstrap sample contains about 63% unique observations; lower fractions reduce tree correlation and training time

Tuning parameters (III)

minimum number of samples within the terminal nodes (nodesize / min.node.size): controls tree depth; smaller values give deeper, more complex trees

maximum number of terminal nodes (maxnodes): caps the size of the individual trees and thus training time; see the ranger() sketch after this list
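In ranger, these tuning parameters map to function arguments as sketched below (the values shown are placeholders, not recommendations; randomForest uses ntree, mtry, sampsize, nodesize and maxnodes instead):

# sketch: where the tuning parameters appear in a ranger() call
ranger(
  formula         = Sale_Price ~ .,
  data            = ames_train,
  num.trees       = 500,     # number of trees
  mtry            = 26,      # number of variables tried at each split
  sample.fraction = .632,    # fraction of samples used per tree
  min.node.size   = 5,       # minimum number of samples in terminal nodes
  max.depth       = NULL     # tree-size limit (ranger limits depth; randomForest caps terminal nodes via maxnodes)
)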

Initial tuning with randomForest

features <- setdiff(names(ames_train), "Sale_Price")
set.seed(123)
m2<-tuneRF(x= ames_train[,features],
  y= ames_train$Sale_Price,ntreeTry   = 500,
  mtryStart  = 5,stepFactor = 2,
  improve    = 0.01,trace=FALSE)
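tuneRF returns (and by default plots) the OOB error for each mtry value tried. A small sketch for inspecting the result, assuming the default doBest = FALSE so that m2 is a two-column matrix of mtry values and OOB errors:

# inspect the tuneRF result: OOB error per mtry value tried
m2
m2[which.min(m2[, 2]), ]   # mtry value with the lowest OOB error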

Full grid search with ranger

A full grid search requires fitting many random forests, so the speed of the implementation matters.

Assessing the speed

randomForest speed

system.time(
  ames_randomForest <- randomForest(
    formula = Sale_Price ~ ., 
    data    = ames_train, 
    ntree   = 500,
    mtry    = floor(length(features) / 3)
  )
)
##    user  system elapsed 
##  145.47    0.09  152.48 

ranger speed

system.time(
  ames_ranger <- ranger(formula=Sale_Price ~ ., 
    data      = ames_train,num.trees = 500,
    mtry      = floor(length(features) / 3))
)
##    user  system elapsed 
##    9.13    0.03    3.23
# hyperparameter grid search
hyper_grid <- expand.grid(
  mtry        = seq(20, 30, by = 2),
  node_size   = seq(3, 9, by = 2),
  sample_size = c(.55, .632, .70, .80),
  OOB_RMSE    = 0
)
nrow(hyper_grid) # total number of combinations
## [1] 96

Loop - hyperparameter combination (I)

for(i in 1:nrow(hyper_grid)) {
  model <- ranger(formula= Sale_Price ~ .,data= ames_train, 
    num.trees       = 500,mtry= hyper_grid$mtry[i],
    min.node.size   = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sample_size[i],
    seed            = 123)
    # add OOB error to grid
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

The results - small differences in RMSE

hyper_grid %>% dplyr::arrange(OOB_RMSE) %>% head(10)
##    mtry node_size sample_size OOB_RMSE
## 1    26         3         0.8 25404.60
## 2    28         3         0.8 25405.92
## 3    28         5         0.8 25459.46
## 4    26         5         0.8 25493.80
## 5    30         3         0.8 25528.26
## 6    22         3         0.7 25552.73
## 7    26         9         0.8 25554.31
## 8    28         7         0.8 25578.45
## 9    20         3         0.8 25581.23
## 10   24         3         0.8 25590.73

Hyperparameter grid search - categorical variables

# one-hot encode our categorical variables
(one_hot <- dummyVars(~ ., ames_train, fullRank = FALSE))
## Dummy Variable Object
## 
## Formula: ~.
## 81 variables, 46 factors
## Variables and levels will be separated by '.'
## A less than full rank encoding is used

Make a data frame from the dummy variable object

ames_train_hot<-predict(one_hot,ames_train)%>%as.data.frame()

One-hot encoding and the hyperparameter grid

# make ranger compatible names
names(ames_train_hot) <- make.names(names(ames_train_hot), 
                                    allow_ = FALSE)
# same grid as above but with increased mtry values
hyper_grid_2 <- expand.grid(
  mtry        = seq(50, 200, by = 25),
  node_size   = seq(3, 9, by = 2),
  sample_size = c(.55, .632, .70, .80),
  OOB_RMSE    = 0
)
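The search loop itself is not shown on the slide; a sketch re-using the loop from before, now applied to ames_train_hot (note that make.names() with allow_ = FALSE turns Sale_Price into Sale.Price):

# same grid-search loop as before, applied to the one-hot encoded data
for(i in 1:nrow(hyper_grid_2)) {
  model <- ranger(
    formula         = Sale.Price ~ .,   # renamed by make.names(allow_ = FALSE)
    data            = ames_train_hot,
    num.trees       = 500,
    mtry            = hyper_grid_2$mtry[i],
    min.node.size   = hyper_grid_2$node_size[i],
    sample.fraction = hyper_grid_2$sample_size[i],
    seed            = 123
  )
  hyper_grid_2$OOB_RMSE[i] <- sqrt(model$prediction.error)
}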

The best model

The best random forest model found so far keeps the categorical variables in their original (non-encoded) form and uses mtry = 24, a minimum node size of 5 observations and a sample fraction of 80% (the settings used in the fit below).

How to proceed: refit this best model repeatedly (100 times below) to obtain a distribution of the OOB RMSE, and request variable importance scores.

Random forests with ranger


OOB_RMSE <- vector(mode = "numeric", length = 100)
for(i in seq_along(OOB_RMSE)) {
  optimal_ranger <- ranger(formula= Sale_Price ~ ., 
    data            = ames_train, 
    num.trees       = 500,
    mtry            = 24,
    min.node.size   = 5,
    sample.fraction = .8,
    importance      = 'impurity')
  OOB_RMSE[i] <- sqrt(optimal_ranger$prediction.error)
}

Variable importance / node impurity

With importance = 'impurity', each variable's importance is the total decrease in node impurity (for regression, the reduction in the sum of squared errors) from splits on that variable, accumulated over all trees.


Plot the variable importance

varimp_ranger <- optimal_ranger$variable.importance
# plot the 25 most important variables (largest importance values)
lattice::barchart(tail(sort(varimp_ranger), 25), col = "royalblue")

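The raw importance scores can also be inspected directly, for example the ten most important variables:

# ten most important variables by impurity-based importance
head(sort(varimp_ranger, decreasing = TRUE), 10)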

A histogram of OOB RMSE

hist(OOB_RMSE, breaks = 20,col="royalblue")


Predicting

# randomForest
pred_randomForest <- predict(ames_randomForest, ames_test)
head(pred_randomForest)
##        1        2        3        4        5        6 
## 113543.1 185556.4 259258.1 190943.9 179071.0 480952.3
# ranger
pred_ranger <- predict(ames_ranger, ames_test)
head(pred_ranger$predictions)
## [1] 129258.1 186520.7 265628.2 197745.5 175517.6 392691.7
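As a quick check (a sketch, not part of the original output), the test-set RMSE of both models can be compared with the OOB estimates:

# test-set RMSE for both models
sqrt(mean((pred_randomForest - ames_test$Sale_Price)^2))
sqrt(mean((pred_ranger$predictions - ames_test$Sale_Price)^2))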

Summary - random forests

Advantages & Disadvantages

Advantages - random forests

Disadvantages - random forests

These slides are mainly based on