When combined, these trees produce a powerful "committee" that is often hard to beat with other algorithms.
1.) Fit a decision tree to the data: $F_1(x)=y$
2.) Fit the next decision tree to the residuals of the previous one: $h_1(x)=y-F_1(x)$
3.) Add this new tree to our model: $F_2(x)=F_1(x)+h_1(x)$
4.) Fit the next decision tree to the residuals of $F_2$: $h_2(x)=y-F_2(x)$
5.) Add the new tree to the model: $F_3(x)=F_2(x)+h_2(x)$
Continue this process until some mechanism (e.g. cross validation) tells us to stop; a small R sketch of this loop follows.
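To make the loop concrete, here is a minimal sketch of boosting shallow regression trees on the residuals of the current ensemble. It assumes the rpart package is available (it is not part of the package list used later) and uses the built-in mtcars data purely for illustration; gbm automates this loop and exposes the learning rate as shrinkage.
library(rpart) # assumed available; used only for this illustration
boost_sketch <- function(x, y, n_trees = 100, shrinkage = 0.1) {
  pred  <- rep(mean(y), length(y))      # F_0(x): start from the mean
  trees <- vector("list", n_trees)
  df    <- as.data.frame(x)             # predictors only
  for (m in seq_len(n_trees)) {
    df$res <- y - pred                  # residuals of the current ensemble
    # h_m(x): a stump fit to the residuals
    trees[[m]] <- rpart(res ~ ., data = df,
                        control = rpart.control(maxdepth = 1, cp = 0))
    # F_m(x) = F_{m-1}(x) + shrinkage * h_m(x)
    pred <- pred + shrinkage * predict(trees[[m]], df)
  }
  list(init = mean(y), trees = trees, shrinkage = shrinkage)
}
# toy usage on a built-in data set
fit_demo <- boost_sketch(mtcars[, -1], mtcars$mpg, n_trees = 50)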
A good strategy to beat your friend to the bottom of the hill is to always take the step with the steepest downward slope; this is the idea behind gradient descent, which gradient boosting applies to minimizing a loss function.
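As a toy illustration of that idea (not from the original text), the snippet below minimizes f(x) = x^2 by repeatedly stepping in the direction of steepest descent.
f_grad <- function(x) 2 * x           # gradient of f(x) = x^2
x <- 10                               # arbitrary starting point
learning_rate <- 0.1
for (step in 1:50) {
  x <- x - learning_rate * f_grad(x)  # step downhill along the steepest slope
}
x                                     # now very close to the minimum at 0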
library(rsample) # data splitting
library(gbm) # basic implementation
library(xgboost) # a faster implementation of gbm
library(caret) # an aggregator package for many machine learning models
library(pdp) # model visualization
library(ggplot2) # model visualization
library(lime) # model visualization
library(dplyr) # data wrangling and the %>% pipe used below
ames_data <- AmesHousing::make_ames()
set.seed(123)
ames_split <- initial_split(ames_data,prop=.7)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
The most popular implementations of GBM in R are:
- gbm: the original R implementation of GBMs
- xgboost: a fast and efficient gradient boosting framework (C++ backend)
- h2o: a powerful Java-based interface that provides parallel distributed algorithms and efficient productionalization
gbm
The gbm R package is an implementation of extensions to Freund and Schapire's AdaBoost algorithm and Friedman's gradient boosting machine. Its two training functions are gbm::gbm and gbm::gbm.fit. gbm::gbm uses the formula interface to specify the model, whereas gbm::gbm.fit requires separate x and y matrices (more efficient with many variables); a sketch of the x/y interface follows the model summary below.
The defaults in gbm include a learning rate (shrinkage) of 0.001 and 100 trees, which is rarely sufficient. The default tree depth (interaction.depth) is 1, which means we are ensembling a bunch of stumps. We set cv.folds to perform a 5-fold cross validation. The distribution argument depends on the response (e.g. bernoulli for a binomial response); gaussian is the default value.
set.seed(123)
gbm.fit <- gbm(formula = Sale_Price ~ .,distribution="gaussian",
data = ames_train,n.trees = 10000,interaction.depth = 1,
shrinkage = 0.001,cv.folds = 5,n.cores = NULL,verbose = FALSE)
print(gbm.fit) # print results
## gbm(formula = Sale_Price ~ ., distribution = "gaussian", data = ames_train,
## n.trees = 10000, interaction.depth = 1, shrinkage = 0.001,
## cv.folds = 5, verbose = FALSE, n.cores = NULL)
## A gradient boosted model with gaussian loss function.
## 10000 iterations were performed.
## The best cross-validation iteration was 10000.
## There were 80 predictors of which 45 had non-zero influence.
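A hedged sketch of fitting a comparable model through the gbm.fit (x/y) interface described earlier; note that gbm.fit does not perform cross validation itself, and the handling of factor columns should be checked against ?gbm.fit.
# same predictors and response, passed as separate x and y objects
x_train <- as.data.frame(ames_train[, setdiff(names(ames_train), "Sale_Price")])
y_train <- ames_train$Sale_Price
gbm.fit_xy <- gbm.fit(x = x_train, y = y_train,
                      distribution = "gaussian",
                      n.trees = 10000, interaction.depth = 1,
                      shrinkage = 0.001, verbose = FALSE)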
Take some time to dig around in the gbm.fit object to get comfortable with its components. The minimum CV error is stored in cv.error (an MSE), so its square root gives the CV RMSE:
sqrt(min(gbm.fit$cv.error))
## [1] 29133.33
gbm.perf(gbm.fit, method = "cv") # plot the CV error curve and return the optimal number of trees
## [1] 10000
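The value returned by gbm.perf can be stored and passed to predict so that predictions use the CV-optimal number of trees; a small sketch:
best_iter <- gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)
head(predict(gbm.fit, ames_test, n.trees = best_iter))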
set.seed(123)
gbm.fit2 <- gbm(formula = Sale_Price ~ .,
distribution = "gaussian",data = ames_train,
n.trees = 5000,interaction.depth = 3,shrinkage = 0.1,
cv.folds = 5,n.cores = NULL,verbose = FALSE)
# find index for n trees with minimum CV error
min_MSE <- which.min(gbm.fit2$cv.error)
# get MSE and compute RMSE
sqrt(gbm.fit2$cv.error[min_MSE])
## [1] 23112.1
gbm.perf(gbm.fit2, method = "cv")
## [1] 1260
n.minobsinnode is the minimum number of observations allowed in the trees' terminal nodes; we vary it, along with shrinkage, interaction.depth, and bag.fraction, in the hyperparameter grid below.
hyper_grid <- expand.grid(
shrinkage = c(.01, .1, .3),
interaction.depth = c(1, 3, 5),
n.minobsinnode = c(5, 10, 15),
bag.fraction = c(.65, .8, 1),
optimal_trees = 0,# a place to dump results
min_RMSE = 0
)
# total number of combinations
nrow(hyper_grid)
## [1] 81
We loop through the grid below. Note that we use train.fraction, which takes the first XX% of the data as the training set, so it is important to randomize the rows first in case there is any logical ordering (e.g. rows ordered by neighborhood).
random_index <- sample(1:nrow(ames_train), nrow(ames_train))
random_ames_train <- ames_train[random_index, ]
for(i in 1:nrow(hyper_grid)) {
set.seed(123)
gbm.tune <- gbm(
formula = Sale_Price ~ .,distribution = "gaussian",
data = random_ames_train,n.trees = 5000,
interaction.depth = hyper_grid$interaction.depth[i],
shrinkage = hyper_grid$shrinkage[i],
n.minobsinnode = hyper_grid$n.minobsinnode[i],
bag.fraction = hyper_grid$bag.fraction[i],
train.fraction = .75,n.cores = NULL,verbose = FALSE
)
# add min training error and trees to grid
hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error)
hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}
hyper_grid %>%
dplyr::arrange(min_RMSE) %>%
head(10)
Looking at the top results: none of the top models used a stump (interaction.depth = 1); there are likely some important interactions that the deeper trees are able to capture. Also, bag.fraction < 1 seems to help; there may be some local minima in our loss function gradient. We therefore zoom in with a modified grid.
# modify hyperparameter grid
hyper_grid <- expand.grid(
shrinkage = c(.01, .05, .1),
interaction.depth = c(3, 5, 7),
n.minobsinnode = c(5, 7, 10),
bag.fraction = c(.65, .8, 1),
optimal_trees = 0,# a place to dump results
min_RMSE = 0# a place to dump results
)
# total number of combinations
nrow(hyper_grid)
## [1] 81
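The search loop from before can be re-run unchanged over this refined grid; a sketch (identical in structure to the earlier loop, and just as time-consuming):
for(i in 1:nrow(hyper_grid)) {
  set.seed(123)
  gbm.tune <- gbm(
    formula = Sale_Price ~ ., distribution = "gaussian",
    data = random_ames_train, n.trees = 5000,
    interaction.depth = hyper_grid$interaction.depth[i],
    shrinkage = hyper_grid$shrinkage[i],
    n.minobsinnode = hyper_grid$n.minobsinnode[i],
    bag.fraction = hyper_grid$bag.fraction[i],
    train.fraction = .75, n.cores = NULL, verbose = FALSE
  )
  # add min validation error and trees to grid
  hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}
hyper_grid %>%
  dplyr::arrange(min_RMSE) %>%
  head(10)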
set.seed(123)
# train GBM model
gbm.fit.final <- gbm(formula = Sale_Price ~ .,
distribution = "gaussian",data = ames_train,
n.trees = 483,interaction.depth = 5,
shrinkage = 0.1,n.minobsinnode = 5,
bag.fraction = .65,train.fraction = 1,
n.cores = NULL, # will use all cores by default
verbose = FALSE)
cBars allows you to adjust the number of variables to show (in order of influence).
summary(gbm.fit.final, cBars = 10,
  method = relative.influence, # also can use permutation.test.gbm
  las = 2)
## var rel.inf
## Overall_Qual Overall_Qual 4.048861e+01
## Gr_Liv_Area Gr_Liv_Area 1.402976e+01
## Neighborhood Neighborhood 1.320178e+01
## Total_Bsmt_SF Total_Bsmt_SF 5.227608e+00
## Garage_Cars Garage_Cars 3.296480e+00
## First_Flr_SF First_Flr_SF 2.927688e+00
## Garage_Area Garage_Area 2.779194e+00
## Exter_Qual Exter_Qual 1.618451e+00
## Second_Flr_SF Second_Flr_SF 1.517907e+00
## Lot_Area Lot_Area 1.296325e+00
## Bsmt_Qual Bsmt_Qual 1.106896e+00
## MS_SubClass MS_SubClass 9.788950e-01
## Bsmt_Unf_SF Bsmt_Unf_SF 8.313946e-01
## Year_Remod_Add Year_Remod_Add 7.381023e-01
## BsmtFin_Type_1 BsmtFin_Type_1 6.967389e-01
## Overall_Cond Overall_Cond 6.939359e-01
## Kitchen_Qual Kitchen_Qual 6.649051e-01
## Garage_Type Garage_Type 5.727906e-01
## Fireplace_Qu Fireplace_Qu 5.281111e-01
## Bsmt_Exposure Bsmt_Exposure 4.601763e-01
## Sale_Type Sale_Type 4.063181e-01
## Exterior_1st Exterior_1st 3.889487e-01
## Open_Porch_SF Open_Porch_SF 3.797533e-01
## Sale_Condition Sale_Condition 3.771138e-01
## Exterior_2nd Exterior_2nd 3.482380e-01
## Year_Built Year_Built 3.289944e-01
## Full_Bath Full_Bath 3.070894e-01
## Screen_Porch Screen_Porch 2.907584e-01
## Fireplaces Fireplaces 2.906831e-01
## Bsmt_Full_Bath Bsmt_Full_Bath 2.890666e-01
## Wood_Deck_SF Wood_Deck_SF 2.448412e-01
## Longitude Longitude 2.249308e-01
## Central_Air Central_Air 2.232338e-01
## Condition_1 Condition_1 1.867403e-01
## Latitude Latitude 1.650208e-01
## Lot_Frontage Lot_Frontage 1.488962e-01
## Heating_QC Heating_QC 1.456177e-01
## Mo_Sold Mo_Sold 1.406729e-01
## TotRms_AbvGrd TotRms_AbvGrd 1.162828e-01
## Functional Functional 1.118959e-01
## Mas_Vnr_Area Mas_Vnr_Area 1.005664e-01
## Paved_Drive Paved_Drive 9.346526e-02
## Land_Contour Land_Contour 9.026977e-02
## Enclosed_Porch Enclosed_Porch 7.512751e-02
## Garage_Cond Garage_Cond 7.143649e-02
## Lot_Config Lot_Config 6.355982e-02
## Year_Sold Year_Sold 6.198938e-02
## Bedroom_AbvGr Bedroom_AbvGr 6.146463e-02
## Mas_Vnr_Type Mas_Vnr_Type 5.820946e-02
## Roof_Style Roof_Style 5.682478e-02
## Roof_Matl Roof_Matl 4.948087e-02
## Alley Alley 4.279173e-02
## BsmtFin_Type_2 BsmtFin_Type_2 3.482743e-02
## Low_Qual_Fin_SF Low_Qual_Fin_SF 3.427532e-02
## Garage_Qual Garage_Qual 3.136598e-02
## Foundation Foundation 2.803887e-02
## Bsmt_Cond Bsmt_Cond 2.768063e-02
## Condition_2 Condition_2 2.751263e-02
## Bsmt_Half_Bath Bsmt_Half_Bath 2.650291e-02
## MS_Zoning MS_Zoning 2.346874e-02
## Pool_QC Pool_QC 2.211997e-02
## Half_Bath Half_Bath 2.148830e-02
## Three_season_porch Three_season_porch 1.833322e-02
## BsmtFin_SF_2 BsmtFin_SF_2 1.774457e-02
## Garage_Finish Garage_Finish 1.407325e-02
## Fence Fence 1.188101e-02
## House_Style House_Style 1.105981e-02
## Exter_Cond Exter_Cond 1.061516e-02
## Misc_Val Misc_Val 7.996187e-03
## Street Street 7.460435e-03
## Land_Slope Land_Slope 6.847183e-03
## Lot_Shape Lot_Shape 6.681489e-03
## Electrical Electrical 5.498046e-03
## BsmtFin_SF_1 BsmtFin_SF_1 4.149129e-03
## Pool_Area Pool_Area 3.622225e-03
## Misc_Feature Misc_Feature 7.232842e-04
## Utilities Utilities 0.000000e+00
## Bldg_Type Bldg_Type 0.000000e+00
## Heating Heating 0.000000e+00
## Kitchen_AbvGr Kitchen_AbvGr 0.000000e+00
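As noted in the comment above, a permutation-based importance measure is also available; a minimal sketch:
# permutation-based variable importance; can be slow on large models
summary(gbm.fit.final, cBars = 10, method = permutation.test.gbm, las = 2)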
The partial dependence plot below displays the average change in predicted sales price as we vary Gr_Liv_Area while holding all other variables constant.
gbm.fit.final %>% partial(pred.var = "Gr_Liv_Area",
n.trees = gbm.fit.final$n.trees,
grid.resolution = 100) %>%
autoplot(rug = TRUE, train = ames_train) +
scale_y_continuous(labels = scales::dollar)
The individual conditional expectation (ICE) curves below show how the prediction for each individual observation changes as Gr_Liv_Area increases; the centered version makes it easier to compare the curves.
ice1 <- gbm.fit.final %>%
partial(
pred.var = "Gr_Liv_Area",
n.trees = gbm.fit.final$n.trees,
grid.resolution = 100,
ice = TRUE
) %>%
autoplot(rug = TRUE, train = ames_train, alpha = .1) +
ggtitle("Non-centered") +
scale_y_continuous(labels = scales::dollar)
ice2 <- gbm.fit.final %>%
partial(
pred.var = "Gr_Liv_Area",
n.trees = gbm.fit.final$n.trees,
grid.resolution = 100,
ice = TRUE
) %>%
autoplot(rug = TRUE, train = ames_train, alpha = .1,
center = TRUE) + ggtitle("Centered") +
scale_y_continuous(labels = scales::dollar)
gridExtra::grid.arrange(ice1, ice2, nrow = 1)
To use the lime package on a gbm model, we need to define the model type and prediction method.
model_type.gbm <- function(x, ...) {
return("regression")
}
predict_model.gbm <- function(x, newdata, ...) {
pred <- predict(x, newdata, n.trees = x$n.trees)
return(as.data.frame(pred))
}
# get a few observations to perform local interpretation on
local_obs <- ames_test[1:2, ]
# apply LIME
explainer <- lime(ames_train, gbm.fit.final)
explanation <- explain(local_obs, explainer, n_features = 5)
plot_features(explanation)
Finally, we predict on the test set. With gbm we must supply the number of trees to use (see ?predict.gbm for details).
# predict values for test data
pred <- predict(gbm.fit.final, n.trees = gbm.fit.final$n.trees,
ames_test)
# results
caret::RMSE(pred, ames_test$Sale_Price)
## [1] 21533.65
xgboost
The xgboost R package provides an R API to "Extreme Gradient Boosting", an efficient implementation of the gradient boosting framework (approximately 10x faster than gbm). The package has been quite popular and successful in Kaggle data mining competitions. For a detailed parameter-tuning guide see https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/.
XGBoost only works with matrices that contain all numeric variables; consequently, we need to one-hot encode our data. There are different ways to do this in R (e.g. Matrix::sparse.model.matrix, caret::dummyVars), but here we will use the vtreat package. vtreat is a robust package for data prep that helps eliminate problems caused by missing values and by novel categorical levels that appear in future data sets but were not present in training. However, vtreat is not very intuitive. Below we use vtreat to one-hot encode the training and testing data sets.
# variable names
features <- setdiff(names(ames_train), "Sale_Price")
# Create the treatment plan from the training data
treatplan <- vtreat::designTreatmentsZ(ames_train, features,
verbose = FALSE)
# Get the "clean" variable names from the scoreFrame
new_vars <- treatplan %>%
magrittr::use_series(scoreFrame) %>%
dplyr::filter(code %in% c("clean", "lev")) %>%
magrittr::use_series(varName)
# Prepare the training data
features_train <- vtreat::prepare(treatplan, ames_train,
varRestriction = new_vars) %>% as.matrix()
response_train <- ames_train$Sale_Price
features_test <- vtreat::prepare(treatplan, ames_test,
varRestriction = new_vars) %>% as.matrix()
response_test <- ames_test$Sale_Price
dim(features_train)
## [1] 2051 343
dim(features_test)
## [1] 879 343
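As an aside, a similar encoding could be produced with Matrix::sparse.model.matrix mentioned earlier; a minimal sketch (note that, unlike vtreat, this does not protect against factor levels that appear only in future data):
# alternative: sparse model matrix without an intercept column
sparse_train <- Matrix::sparse.model.matrix(Sale_Price ~ . - 1, data = ames_train)
dim(sparse_train)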
xgboost - training functions
xgboost provides several training functions; xgboost() itself is a simple wrapper around xgb.train. To train an XGBoost model we typically want to use xgb.cv, which incorporates cross-validation. The following trains a basic 5-fold cross-validated XGBoost model with 1,000 trees. There are many parameters available in xgb.cv, but the ones used in this tutorial take the following default values:
- learning rate (eta): 0.3
- tree depth (max_depth): 6
- minimum node size (min_child_weight): 1
- percent of training data to sample for each tree (subsample, analogous to gbm's bag.fraction): 100%
set.seed(123)
xgb.fit1 <- xgb.cv(
data = features_train,
label = response_train,
nrounds = 1000, nfold = 5,
objective = "reg:linear", # for regression models
verbose = 0 # silent,
)
The xgb.fit1 object contains lots of good information. In particular, xgb.fit1$evaluation_log lets us identify the minimum RMSE and the optimal number of trees for both the training error and the cross-validated error.
xgb.fit1$evaluation_log %>%
dplyr::summarise(
ntrees.train=which(train_rmse_mean==min(train_rmse_mean))[1],
rmse.train= min(train_rmse_mean),
ntrees.test=which(test_rmse_mean==min(test_rmse_mean))[1],
rmse.test = min(test_rmse_mean)
)
## ntrees.train rmse.train ntrees.test rmse.test
## 1 924 0.0483002 60 27337.79
ggplot(xgb.fit1$evaluation_log) +
geom_line(aes(iter, train_rmse_mean), color = "red") +
geom_line(aes(iter, test_rmse_mean), color = "blue")
A nice feature provided by xgb.cv is early stopping: training stops if the cross-validated error does not improve for a given number of consecutive trees.
set.seed(123)
xgb.fit2 <- xgb.cv(data = features_train,
label = response_train,
nrounds = 1000, nfold = 5,
objective = "reg:linear", # for regression models
verbose = 0, # silent,
# stop if no improvement for 10 consecutive trees
early_stopping_rounds = 10)
ggplot(xgb.fit2$evaluation_log) +
geom_line(aes(iter, train_rmse_mean), color = "red") +
geom_line(aes(iter, test_rmse_mean), color = "blue")
To tune the XGBoost model we pass parameters as a list object to the params argument. The most common parameters include:
- eta: controls the learning rate
- max_depth: tree depth
- min_child_weight: minimum number of observations required in each terminal node
- subsample: percent of training data to sample for each tree
- colsample_bytree: percent of columns to sample from for each tree
For example:
params <- list(
eta = .1,
max_depth = 5,
min_child_weight = 2,
subsample = .8,
colsample_bytree = .9
)
To perform a larger search we create a hyperparameter grid, following the same procedure we used with gbm.
# create hyperparameter grid
hyper_grid <- expand.grid(
eta = c(.01, .05, .1, .3),
max_depth = c(1, 3, 5, 7),
min_child_weight = c(1, 3, 5, 7),
subsample = c(.65, .8, 1),
colsample_bytree = c(.8, .9, 1),
optimal_trees = 0,# a place to dump results
min_RMSE = 0# a place to dump results
)
nrow(hyper_grid)
## [1] 576
set.seed(123)
xgb.fit3 <- xgb.cv(
params = params,
data = features_train,
label = response_train,
nrounds = 1000,
nfold = 5,
objective = "reg:linear", # for regression models
verbose = 0, # silent,
# stop if no improvement for 10 consecutive trees
early_stopping_rounds = 10
)
xgb.fit3$evaluation_log %>%
dplyr::summarise(
ntrees.train=which(train_rmse_mean==min(train_rmse_mean))[1],
rmse.train= min(train_rmse_mean),
ntrees.test= which(test_rmse_mean==min(test_rmse_mean))[1],
rmse.test= min(test_rmse_mean)
)
## ntrees.train rmse.train ntrees.test rmse.test
## 1 211 5222.229 201 24411.64
We now loop through the grid, training an XGBoost model for each hyperparameter combination and dumping the results into the hyper_grid data frame.
for(i in 1:nrow(hyper_grid)) {
params <- list(# create parameter list
eta = hyper_grid$eta[i],max_depth = hyper_grid$max_depth[i],
min_child_weight = hyper_grid$min_child_weight[i],
subsample = hyper_grid$subsample[i],
colsample_bytree = hyper_grid$colsample_bytree[i])
set.seed(123)
xgb.tune <- xgb.cv(params = params,data = features_train,
label = response_train,nrounds=5000,nfold=5,objective = "reg:linear",
#stop if no improvement for 10 consecutive trees
verbose = 0,early_stopping_rounds = 10 )
# add min training error and trees to grid
hyper_grid$optimal_trees[i]<-which.min(
xgb.tune$evaluation_log$test_rmse_mean)
hyper_grid$min_RMSE[i] <- min(
xgb.tune$evaluation_log$test_rmse_mean)
}
hyper_grid %>%
dplyr::arrange(min_RMSE) %>%
head(10)
## eta max_depth min_child_weight subsample colsample_bytree
## 1 0.01 1 1 0.65 0.8
## 2 0.05 1 1 0.65 0.8
## 3 0.10 1 1 0.65 0.8
## 4 0.30 1 1 0.65 0.8
## 5 0.01 3 1 0.65 0.8
## 6 0.05 3 1 0.65 0.8
## 7 0.10 3 1 0.65 0.8
## 8 0.30 3 1 0.65 0.8
## 9 0.01 5 1 0.65 0.8
## 10 0.05 5 1 0.65 0.8
## optimal_trees min_RMSE
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
Once the optimal hyperparameters are identified, we fit the final model with xgb.train or xgboost.
# parameter list
params <- list(
eta = 0.01,
max_depth = 5,
min_child_weight = 5,
subsample = 0.65,
colsample_bytree = 1
)
xgb.fit.final <- xgboost(
params = params,
data = features_train,
label = response_train,
nrounds = 1576,
objective = "reg:linear",
verbose = 0
)
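Equivalently, the same final model can be fit with xgb.train, which expects the data wrapped in an xgb.DMatrix; a hedged sketch:
# xgb.train requires an xgb.DMatrix rather than a plain matrix plus label
dtrain <- xgb.DMatrix(data = features_train, label = response_train)
xgb.fit.final.alt <- xgb.train(
  params = c(params, list(objective = "reg:linear")),
  data = dtrain,
  nrounds = 1576,
  verbose = 0
)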
xgboost - input types
xgboost takes several types of input data, among them xgb.DMatrix, its own class (recommended). Information can be extracted from an xgb.DMatrix object with getinfo.
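A minimal sketch of constructing an xgb.DMatrix from our encoded training data and reading the stored response back with getinfo:
dtrain <- xgb.DMatrix(data = features_train, label = response_train)
head(getinfo(dtrain, "label")) # retrieve the stored response values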
xgboost provides built-in variable importance plotting. First, create the importance matrix with xgb.importance and then feed this matrix into xgb.plot.importance. There are three variable importance measures: Gain, Cover, and Frequency.
importance_matrix <- xgb.importance(model = xgb.fit.final)
xgb.plot.importance(importance_matrix, top_n = 10, measure = "Gain")
# one-hot encode the local observations to be assessed.
local_obs_onehot <- vtreat::prepare(treatplan, local_obs,
varRestriction = new_vars)
# apply LIME
explainer <- lime(data.frame(features_train), xgb.fit.final)
explanation <- explain(local_obs_onehot, explainer,
n_features = 5)
plot_features(explanation)
Lastly, we predict on the test data; unlike with gbm, we do not need to supply the number of trees. Our test set RMSE is within a few hundred dollars of that produced by our gbm model.
# predict values for test data
pred <- predict(xgb.fit.final, features_test)
# results
caret::RMSE(pred, response_test)
## [1] 21319.3