
Random Forests

Extending the bagging technique

Similar trees - tree correlation

Six decision trees based on different bootstrap samples.

Tree correlation

Random forests reduce the correlation between the trees in two ways:

1) Bootstrap:

2) Split-variable randomization:

Basic algorithm

The basic algorithm for a regression random forest can be generalized:

1.  Given training data set
2.  Select number of trees to build (ntrees)
3.  for i = 1 to ntrees do
4.  |  Generate a bootstrap sample of the original data
5.  |  Grow a regression tree to the bootstrapped data
6.  |  for each split do
7.  |  | Select m variables at random from all p variables
8.  |  | Pick the best variable/split-point among the m
9.  |  | Split the node into two child nodes
10. |  end
11. |  Use tree model stopping criteria to determine when the tree is complete
12. end

The algorithm randomly selects a bootstrap sample to train and predictors to use at each split.


out-of-bag error

Out-of-bag error vs. validation error


Preparation - random forests

library(rsample)      # data splitting 
library(randomForest) # basic implementation
library(ranger)       # a faster implementation of randomForest
# an aggregator package for performing many 
# machine learning models
library(caret)

The Ames housing data

# Load the processed Ames housing data and split it 70/30 into training and
# test sets. `make_ames()` returns the cleaned data set whose response column
# is named `Sale_Price`, the name used by the model formulas in this file;
# `ames_raw` keeps the raw column names (`SalePrice`) instead.
# The seed makes the random split reproducible.
set.seed(123)
ames_data  <- AmesHousing::make_ames()
ames_split <- rsample::initial_split(ames_data, prop = 0.7)
ames_train <- rsample::training(ames_split)
ames_test  <- rsample::testing(ames_split)

Basic implementation


# Random forest with the package defaults: Sale_Price is modelled on all
# remaining columns of the training data. The surrounding parentheses make
# the assignment print the fitted object.
(m1 <- randomForest(
  formula = Sale_Price ~ .,
  data    = ames_train
))
## Call:
##  randomForest(formula = Sale_Price ~ ., data = ames_train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 26
##           Mean of squared residuals: 639516350
##                     % Var explained: 89.7

Plotting the model

# Draw the error curve of the default model under the title "Error rate".
plot(x = m1, main = "Error rate")


Random forests - out-of-the-box algorithm

Tuning Random forests

Tuning parameters (I)

number of trees

number of variables

Tuning parameters (II)

Number of samples

Tuning parameters (III)

minimum number of samples within the terminal nodes:

maximum number of terminal nodes

Initial tuning with randomForest

# Predictor names: every column of the training data except the response.
features <- setdiff(names(ames_train), "Sale_Price")

# Tune mtry with tuneRF(): start at mtry = 5, scale it by a factor of 2 per
# step, and keep searching while the OOB error improves by at least 1%.
# Each candidate is evaluated with 500 trees; progress output is suppressed.
m2 <- tuneRF(
  x          = ames_train[, features],
  y          = ames_train$Sale_Price,
  ntreeTry   = 500,
  mtryStart  = 5,
  stepFactor = 2,
  improve    = 0.01,
  trace      = FALSE
)

Full grid search with ranger

Assessing the speed

randomForest speed

  # Time a 500-tree randomForest fit that tries one third of the predictors
  # at each split. The assignment inside system.time() is evaluated in the
  # calling environment, so `ames_randomForest` remains available afterwards.
  system.time(
    ames_randomForest <- randomForest(
      formula = Sale_Price ~ .,
      data    = ames_train,
      ntree   = 500,
      mtry    = floor(length(features) / 3)
    )
  )
#       User      System    elapsed 
#     145.47        0.09      152.48 

ranger speed

  # Time the equivalent ranger fit (500 trees, one third of the predictors
  # per split) so that it can be compared with the randomForest timing.
  # The assignment inside system.time() is evaluated in the calling
  # environment, so `ames_ranger` remains available afterwards.
  system.time(
    ames_ranger <- ranger(
      formula   = Sale_Price ~ .,
      data      = ames_train,
      num.trees = 500,
      mtry      = floor(length(features) / 3)
    )
  )
##    user  system elapsed 
##    9.13    0.03    3.23
# hyperparameter grid search
# One row per combination of mtry, minimum node size and sampling fraction.
# OOB_RMSE starts at 0 and is a placeholder to be filled in per row.
# The sampling-fraction column is spelled `sampe_size` (sic); the name is
# kept because other code in this file reads the column under that name.
hyper_grid <- expand.grid(
  mtry       = seq(20, 30, by = 2),
  node_size  = seq(3, 9, by = 2),
  sampe_size = c(.55, .632, .70, .80),
  OOB_RMSE   = 0
)
nrow(hyper_grid) # total number of combinations
## [1] 96

Loop - hyperparameter combination (I)

# Fit one ranger model per row of hyper_grid and store the square root of
# its reported prediction error in that row's OOB_RMSE column. The fixed
# seed makes every fit use the same random number stream.
# seq_len() is used instead of 1:nrow() so an empty grid runs zero iterations.
for (i in seq_len(nrow(hyper_grid))) {
  model <- ranger(
    formula         = Sale_Price ~ .,
    data            = ames_train,
    num.trees       = 500,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sampe_size[i],
    seed            = 123
  )
  # add OOB error to grid
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

The results - small difference between RMSE

# Show the ten parameter combinations with the lowest OOB RMSE.
head(dplyr::arrange(hyper_grid, OOB_RMSE), n = 10)
##    mtry node_size sampe_size OOB_RMSE
## 1    26         3        0.8 25404.60
## 2    28         3        0.8 25405.92
## 3    28         5        0.8 25459.46
## 4    26         5        0.8 25493.80
## 5    30         3        0.8 25528.26
## 6    22         3        0.7 25552.73
## 7    26         9        0.8 25554.31
## 8    28         7        0.8 25578.45
## 9    20         3        0.8 25581.23
## 10   24         3        0.8 25590.73

Hyperparameter grid search - categorical variables

# Build a dummy-variable (one-hot) encoder for all columns of the training
# data. fullRank = FALSE keeps one indicator column per factor level.
# The surrounding parentheses print the resulting object.
(one_hot <- dummyVars(
  formula  = ~ .,
  data     = ames_train,
  fullRank = FALSE
))
## Dummy Variable Object
## Formula: ~.
## 81 variables, 46 factors
## Variables and levels will be separated by '.'
## A less than full rank encoding is used

Make a dataframe of dummy variable object


Hot encoding and hypergrid

# Create the one-hot encoded training data: apply the dummy-variable object
# to the training set and convert the result to a data frame. Without this
# step `ames_train_hot` does not exist when its names are rewritten below.
ames_train_hot <- as.data.frame(predict(one_hot, ames_train))

# make ranger compatible names
# allow_ = FALSE also replaces underscores, so the names contain only
# letters, digits and dots.
names(ames_train_hot) <- make.names(names(ames_train_hot),
                                    allow_ = FALSE)
# --> same as above but with increased mtry values
# One row per combination of mtry, minimum node size and sampling fraction.
# OOB_RMSE starts at 0 as a placeholder. The `sampe_size` spelling (sic) is
# kept to match the first hyperparameter grid.
hyper_grid_2 <- expand.grid(
  mtry       = seq(50, 200, by = 25),
  node_size  = seq(3, 9, by = 2),
  sampe_size = c(.55, .632, .70, .80),
  OOB_RMSE   = 0
)

The best model

The best random forest model:

How to proceed

Random forests with ranger

# Refit the chosen configuration 100 times and collect the square root of
# each fit's reported prediction error. No seed is passed, so the fits
# differ from run to run. After the loop, `optimal_ranger` holds the last
# fit, including impurity-based variable importance.
OOB_RMSE <- vector(mode = "numeric", length = 100)
for (i in seq_along(OOB_RMSE)) {
  optimal_ranger <- ranger(
    formula         = Sale_Price ~ .,
    data            = ames_train,
    num.trees       = 500,
    mtry            = 24,
    min.node.size   = 5,
    sample.fraction = .8,
    importance      = 'impurity'
  )
  OOB_RMSE[i] <- sqrt(optimal_ranger$prediction.error)
}

Variable importance / node impurity

Plot the variable importance

# Pull the variable-importance element out of the last fitted ranger model.
varimp_ranger <- optimal_ranger[["variable.importance"]]


A histogram of OOB RMSE

# Histogram of the collected OOB RMSE values, with 20 suggested breaks.
hist(x = OOB_RMSE, breaks = 20, col = "royalblue")



# Predict on the held-out test set with both fitted forests and print the
# first few predictions from each, as shown in the output below.
# randomForest
pred_randomForest <- predict(ames_randomForest, ames_test)
head(pred_randomForest)
##        1        2        3        4        5        6 
## 113543.1 185556.4 259258.1 190943.9 179071.0 480952.3
# ranger
# predict() on a ranger fit returns an object; the numeric predictions are
# stored in its `predictions` element.
pred_ranger <- predict(ames_ranger, ames_test)
head(pred_ranger$predictions)
## [1] 129258.1 186520.7 265628.2 197745.5 175517.6 392691.7

Summary - random forests

Advantages & Disadvantages

Advantages - random forests

Disadvantages - random forests

These slides are mainly based on