# nolint start
library(mlexperiments)
library(mllrnrs)

See https://github.com/kapsner/mllrnrs/blob/main/R/learner_xgboost.R for implementation details.

library(mlbench)
data("PimaIndiansDiabetes2")
dataset <- PimaIndiansDiabetes2 |>
  data.table::as.data.table() |>
  na.omit()

feature_cols <- colnames(dataset)[1:8]
target_col <- "diabetes"

seed <- 123
if (isTRUE(as.logical(Sys.getenv("_R_CHECK_LIMIT_CORES_")))) {
  # on cran
  ncores <- 2L
} else {
  ncores <- ifelse(
    test = parallel::detectCores() > 4,
    yes = 4L,
    no = ifelse(
      test = parallel::detectCores() < 2L,
      yes = 1L,
      no = parallel::detectCores()
    )
  )
}
options("mlexperiments.bayesian.max_init" = 4L)
options("mlexperiments.optim.xgb.nrounds" = 20L)
options("mlexperiments.optim.xgb.early_stopping_rounds" = 5L)data_split <- splitTools::partition(
y = dataset[, get(target_col)],
p = c(train = 0.7, test = 0.3),
type = "stratified",
seed = seed
)
train_x <- model.matrix(
  ~ -1 + .,
  dataset[data_split$train, .SD, .SDcols = feature_cols]
)
train_y <- as.integer(dataset[data_split$train, get(target_col)]) - 1L

test_x <- model.matrix(
  ~ -1 + .,
  dataset[data_split$test, .SD, .SDcols = feature_cols]
)
test_y <- as.integer(dataset[data_split$test, get(target_col)]) - 1L

fold_list <- splitTools::create_folds(
  y = train_y,
  k = 3,
  type = "stratified",
  seed = seed
)
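
By default, splitTools::create_folds() returns a list with the in-sample (training) row indices per fold. As an optional check (output not shown), the fold sizes can be inspected:

# number of training observations per fold
sapply(fold_list, length)
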
# required learner arguments, not optimized
learner_args <- list(
  objective = "binary:logistic",
  eval_metric = "logloss"
)
# set arguments for predict function and performance metric,
# required for mlexperiments::MLCrossValidation and
# mlexperiments::MLNestedCV
predict_args <- NULL
performance_metric <- metric("auc")
performance_metric_args <- list(positive = "1", negative = "0")
return_models <- FALSE
# required for grid search and initialization of bayesian optimization
parameter_grid <- expand.grid(
  subsample = seq(0.6, 1, .2),
  colsample_bytree = seq(0.6, 1, .2),
  min_child_weight = seq(1, 5, 4),
  learning_rate = seq(0.1, 0.2, 0.1),
  max_depth = seq(1, 5, 4)
)
# reduce to a maximum of 10 rows
if (nrow(parameter_grid) > 10) {
  set.seed(123)
  sample_rows <- sample(seq_len(nrow(parameter_grid)), 10, FALSE)
  parameter_grid <- kdry::mlh_subset(parameter_grid, sample_rows)
}
# required for bayesian optimization
parameter_bounds <- list(
  subsample = c(0.2, 1),
  colsample_bytree = c(0.2, 1),
  min_child_weight = c(1L, 10L),
  learning_rate = c(0.1, 0.2),
  max_depth = c(1L, 10L)
)
optim_args <- list(
  n_iter = ncores,
  kappa = 3.5,
  acq = "ucb"
)
tuner <- mlexperiments::MLTuneParameters$new(
  learner = mllrnrs::LearnerXgboost$new(
    metric_optimization_higher_better = FALSE
  ),
  strategy = "grid",
  ncores = ncores,
  seed = seed
)
tuner$parameter_grid <- parameter_grid
tuner$learner_args <- learner_args
tuner$split_type <- "stratified"
tuner$set_data(
  x = train_x,
  y = train_y
)
tuner_results_grid <- tuner$execute(k = 3)
#>
#> Parameter settings [==================>-----------------------------------------------------------------------------] 2/10 ( 20%)
#> Parameter settings [============================>-------------------------------------------------------------------] 3/10 ( 30%)
#> Parameter settings [=====================================>----------------------------------------------------------] 4/10 ( 40%)
#> Parameter settings [===============================================>------------------------------------------------] 5/10 ( 50%)
#> Parameter settings [=========================================================>--------------------------------------] 6/10 ( 60%)
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> Parameter settings [===============================================================================================] 10/10 (100%)
head(tuner_results_grid)
#> setting_id metric_optim_mean nrounds subsample colsample_bytree min_child_weight learning_rate max_depth objective eval_metric
#> <int> <num> <int> <num> <num> <num> <num> <num> <char> <char>
#> 1: 1 0.3977489 62 0.6 0.8 5 0.2 1 binary:logistic logloss
#> 2: 2 0.3915203 67 1.0 0.8 5 0.1 5 binary:logistic logloss
#> 3: 3 0.3972711 96 0.8 0.8 5 0.1 1 binary:logistic logloss
#> 4: 4 0.3951791 62 0.6 0.8 5 0.2 5 binary:logistic logloss
#> 5: 5 0.3786375 44 1.0 0.8 1 0.1 5 binary:logistic logloss
#> 6: 6 0.3956902 75 0.8 0.8 5 0.1 5 binary:logistic logloss
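
The best hyperparameter combination identified by the tuner is stored in its results field and is reused further below to parameterize the cross-validation:

# best setting found during tuning (the first element is the setting_id)
tuner$results$best.setting
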
tuner <- mlexperiments::MLTuneParameters$new(
  learner = mllrnrs::LearnerXgboost$new(
    metric_optimization_higher_better = FALSE
  ),
  strategy = "bayesian",
  ncores = ncores,
  seed = seed
)
tuner$parameter_grid <- parameter_grid
tuner$parameter_bounds <- parameter_bounds
tuner$learner_args <- learner_args
tuner$optim_args <- optim_args
tuner$split_type <- "stratified"
tuner$set_data(
  x = train_x,
  y = train_y
)
tuner_results_bayesian <- tuner$execute(k = 3)
#>
#> Registering parallel backend using 4 cores.
head(tuner_results_bayesian)
#> Epoch setting_id subsample colsample_bytree min_child_weight learning_rate max_depth gpUtility acqOptimum inBounds Elapsed
#> <num> <int> <num> <num> <num> <num> <num> <num> <lgcl> <lgcl> <num>
#> 1: 0 1 0.6 0.8 5 0.2 1 NA FALSE TRUE 0.890
#> 2: 0 2 1.0 0.8 5 0.1 5 NA FALSE TRUE 0.892
#> 3: 0 3 0.8 0.8 5 0.1 1 NA FALSE TRUE 0.991
#> 4: 0 4 0.6 0.8 5 0.2 5 NA FALSE TRUE 0.911
#> 5: 0 5 1.0 0.8 1 0.1 5 NA FALSE TRUE 0.173
#> 6: 0 6 0.8 0.8 5 0.1 5 NA FALSE TRUE 0.183
#> Score metric_optim_mean nrounds errorMessage objective eval_metric
#> <num> <num> <int> <lgcl> <char> <char>
#> 1: -0.3977489 0.3977489 62 NA binary:logistic logloss
#> 2: -0.3915203 0.3915203 67 NA binary:logistic logloss
#> 3: -0.3972711 0.3972711 96 NA binary:logistic logloss
#> 4: -0.3951791 0.3951791 62 NA binary:logistic logloss
#> 5: -0.3786375 0.3786375 44 NA binary:logistic logloss
#> 6: -0.3956902 0.3956902 75 NA binary:logistic logloss
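
Note that Score is simply the negated metric_optim_mean: the logloss is minimized (metric_optimization_higher_better = FALSE), while the Bayesian optimizer maximizes its objective.
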
validator <- mlexperiments::MLCrossValidation$new(
  learner = mllrnrs::LearnerXgboost$new(
    metric_optimization_higher_better = FALSE
  ),
  fold_list = fold_list,
  ncores = ncores,
  seed = seed
)
validator$learner_args <- tuner$results$best.setting[-1]
validator$predict_args <- predict_args
validator$performance_metric <- performance_metric
validator$performance_metric_args <- performance_metric_args
validator$return_models <- return_models
validator$set_data(
  x = train_x,
  y = train_y
)
validator_results <- validator$execute()
#>
#> CV fold: Fold1
#>
#> CV fold: Fold2
#>
#> CV fold: Fold3
head(validator_results)
#> fold performance subsample colsample_bytree min_child_weight learning_rate max_depth nrounds objective eval_metric
#> <char> <num> <num> <num> <num> <num> <num> <int> <char> <char>
#> 1: Fold1 0.8947647 1 0.8 1 0.1 5 44 binary:logistic logloss
#> 2: Fold2 0.8720254 1 0.8 1 0.1 5 44 binary:logistic logloss
#> 3: Fold3 0.9010741 1 0.8 1 0.1 5 44 binary:logistic logloss
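
To summarize the cross-validation, the fold-wise performances (here: AUC) can be averaged; a minimal sketch, assuming validator_results is a data.table as the printed column types suggest:

# mean AUC across the three folds
validator_results[, mean(performance)]
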
validator <- mlexperiments::MLNestedCV$new(
  learner = mllrnrs::LearnerXgboost$new(
    metric_optimization_higher_better = FALSE
  ),
  strategy = "grid",
  fold_list = fold_list,
  k_tuning = 3L,
  ncores = ncores,
  seed = seed
)
validator$parameter_grid <- parameter_grid
validator$learner_args <- learner_args
validator$split_type <- "stratified"
validator$predict_args <- predict_args
validator$performance_metric <- performance_metric
validator$performance_metric_args <- performance_metric_args
validator$return_models <- return_models
validator$set_data(
  x = train_x,
  y = train_y
)
validator_results <- validator$execute()
#>
#> CV fold: Fold1
#>
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> Parameter settings [===============================================================================================] 10/10 (100%)
#> CV fold: Fold2
#> CV progress [====================================================================>-----------------------------------] 2/3 ( 67%)
#>
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> Parameter settings [===============================================================================================] 10/10 (100%)
#> CV fold: Fold3
#> CV progress [========================================================================================================] 3/3 (100%)
#>
#> Parameter settings [===============================================>------------------------------------------------] 5/10 ( 50%)
#> Parameter settings [=========================================================>--------------------------------------] 6/10 ( 60%)
#> Parameter settings [==================================================================>-----------------------------] 7/10 ( 70%)
#> Parameter settings [============================================================================>-------------------] 8/10 ( 80%)
#> Parameter settings [=====================================================================================>----------] 9/10 ( 90%)
#> Parameter settings [===============================================================================================] 10/10 (100%)
head(validator_results)
#> fold performance nrounds subsample colsample_bytree min_child_weight learning_rate max_depth objective eval_metric
#> <char> <num> <int> <num> <num> <num> <num> <num> <char> <char>
#> 1: Fold1 0.8714966 40 0.6 1.0 1 0.2 1 binary:logistic logloss
#> 2: Fold2 0.8754627 35 1.0 1.0 5 0.1 5 binary:logistic logloss
#> 3: Fold3 0.8883550 41 0.8 0.8 5 0.1 1 binary:logistic logloss
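
The same summary can be computed for the nested cross-validation, whose outer-fold performance is typically a less optimistic estimate than the non-nested result above:

# mean AUC across the outer folds of the nested CV
validator_results[, mean(performance)]
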
validator <- mlexperiments::MLNestedCV$new(
  learner = mllrnrs::LearnerXgboost$new(
    metric_optimization_higher_better = FALSE
  ),
  strategy = "bayesian",
  fold_list = fold_list,
  k_tuning = 3L,
  ncores = ncores,
  seed = seed
)
validator$parameter_grid <- parameter_grid
validator$learner_args <- learner_args
validator$split_type <- "stratified"
validator$parameter_bounds <- parameter_bounds
validator$optim_args <- optim_args
validator$predict_args <- predict_args
validator$performance_metric <- performance_metric
validator$performance_metric_args <- performance_metric_args
validator$return_models <- TRUE
validator$set_data(
  x = train_x,
  y = train_y
)
validator_results <- validator$execute()
#>
#> CV fold: Fold1
#>
#> Registering parallel backend using 4 cores.
#>
#> CV fold: Fold2
#> CV progress [====================================================================>-----------------------------------] 2/3 ( 67%)
#>
#> Registering parallel backend using 4 cores.
#>
#> CV fold: Fold3
#> CV progress [========================================================================================================] 3/3 (100%)
#>
#> Registering parallel backend using 4 cores.
head(validator_results)
#> fold performance subsample colsample_bytree min_child_weight learning_rate max_depth nrounds objective eval_metric
#> <char> <num> <num> <num> <num> <num> <num> <int> <char> <char>
#> 1: Fold1 0.8714966 0.6 1.0000000 1 0.2000000 1 40 binary:logistic logloss
#> 2: Fold2 0.8754627 1.0 1.0000000 5 0.1000000 5 35 binary:logistic logloss
#> 3: Fold3 0.8810062 1.0 0.6293304 1 0.1034034 1 56 binary:logistic logloss
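
Finally, the fold models retained via return_models <- TRUE are applied to the hold-out test data:
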
preds_xgboost <- mlexperiments::predictions(
  object = validator,
  newdata = test_x
)

perf_xgboost <- mlexperiments::performance(
  object = validator,
  prediction_results = preds_xgboost,
  y_ground_truth = test_y,
  type = "binary"
)
perf_xgboost
#> model performance AUC Brier BrierScaled BAC TP TN FP FN TPR TNR FPR FNR
#> <char> <num> <num> <num> <num> <num> <int> <int> <int> <int> <num> <num> <num> <num>
#> 1: Fold1 0.7913015 0.7913015 0.1743251 0.2121706 0.6994482 20 70 9 19 0.5128205 0.8860759 0.1139241 0.4871795
#> 2: Fold2 0.7745862 0.7745862 0.1856610 0.1609401 0.6481662 16 70 9 23 0.4102564 0.8860759 0.1139241 0.5897436
#> 3: Fold3 0.7917884 0.7917884 0.1739823 0.2137198 0.6609867 17 70 9 22 0.4358974 0.8860759 0.1139241 0.5641026
#> PPV NPV FDR MCC F1 GMEAN GPR ACC MMCE BER
#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
#> 1: 0.6896552 0.7865169 0.3103448 0.4358249 0.5882353 0.6740904 0.5947010 0.7627119 0.2372881 0.3005518
#> 2: 0.6400000 0.7526882 0.3600000 0.3411249 0.5000000 0.6029248 0.5124101 0.7288136 0.2711864 0.3518338
#> 3: 0.6538462 0.7608696 0.3461538 0.3654140 0.5230769 0.6214807 0.5338631 0.7372881 0.2627119 0.3390133
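
As an optional wrap-up (not part of the original output), the hold-out AUC of the three fold models can be averaged; this assumes perf_xgboost retains the performance column shown above:

# mean test-set AUC across the fold models
mean(perf_xgboost$performance)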