###
### Benchmarking experiment ###
###
# Pipeline: OpenML for datasets, mlr for the benchmark, batchtools for parallelisation.
# TODO before running:
# !!! Take care of the package versions, especially mlr (see packs below) !!!
# !!! Take care of makeClusterFunctions (see end of script) !!!
# Setup ---------------------------------------------------
library(checkpoint) # snapshot of the package versions used
checkpoint("2019-04-17", project = getwd())
library(batchtools)
library(OpenML) # needed in the main script to download the datasets
library(mlr)    # needed in the main script to create the survival tasks
packs <- c("mlr", "pec", "glmnet", "mboost", "prioritylasso", "CoxBoost",
"ipflasso", "randomForestSRC", "ranger", "dplyr", "data.table", "survival",
"GRridge", "tuneRanger", "SGL", "blockForest", "survAUC", "riskRegression")
# unlink("bench_exp", recursive = TRUE) # delete an existing registry if needed
regis <- makeExperimentRegistry("bench_exp",
                                packages = packs,
                                source = "ancillary_code_bench.R")
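# When resuming a partly finished run, the existing registry can be loaded
# instead of creating a new one (a sketch, not part of the original pipeline):
# regis <- loadRegistry("bench_exp", writeable = TRUE)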
# 0. Partial reproduction -----------------------------------------------------
# If only parts of the benchmark experiment are to be run, e.g. to reproduce
# single results given the long computation times, change the settings as
# desired (here: the full benchmark experiment is run).
# Important: change the seed to randomly choose the datasets/learners for
# which the results should be reproduced.
# n_lrns <- 14
# n_datsets <- 18
# seed <- 124
#
# partition <- sample_reproduction(n_lrns = n_lrns,
#                                  n_datsets = n_datsets,
#                                  seed = seed)
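# sample_reproduction() is defined in ancillary_code_bench.R. Conceptually it
# draws random learner/dataset indices, roughly like this sketch (the actual
# implementation may differ):
# sample_reproduction <- function(n_lrns, n_datsets, seed) {
#   set.seed(seed)
#   list(lrns = sort(sample(14, n_lrns)), datsets = sort(sample(18, n_datsets)))
# }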
# 1. Tasks and problems--------------------------------------------------------
# OpenML dataset ids to query
load("data/datset_ids.RData")
nams <- c("LAML", "BLCA", "LGG", "BRCA", "COAD", "ESCA",
"HNSC", "KIRC", "KIRP", "LIHC", "LUAD", "LUSC",
"OV", "PAAD", "SARC", "SKCM", "STAD", "UCEC")
# For partial reproduction:
# nams <- nams[partition$datsets]
for (nam in nams) {
  # download the dataset parts from OpenML and combine the data blocks
  dat_part1 <- getOMLDataSet(datset_ids[[nam]][[1]])$data
  dat_part2 <- getOMLDataSet(datset_ids[[nam]][[2]])$data
  dat <- cbind.data.frame(dat_part1, dat_part2)
  if (nam == "BRCA") { # BRCA is stored in three parts
    dat_part3 <- getOMLDataSet(datset_ids[[nam]][[3]])$data
    dat <- cbind.data.frame(dat, dat_part3)
  }
  # convert to mlr survival task
  task <- makeSurvTask(id = nam,
                       data = dat[, -1], # delete patient code
                       target = c("time", "status"))
  # add the task as a batchtools problem
  addProblem(name = getTaskId(task), data = task)
}
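# Optional sanity check (a sketch, not part of the original pipeline): the
# created tasks can be inspected with mlr getters, e.g. for the last task:
# getTaskSize(task)   # number of patients
# getTaskNFeats(task) # number of features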
# ------------------
# get data from disk
# for (nam in nams) {
#   load(paste0("data/", nam, ".RData"))
#   dat <- get(nam)[, -1] # delete patient code
#   task <- makeSurvTask(id = nam, data = dat, target = c("time", "status"))
#   addProblem(name = getTaskId(task), data = task)
# }
# ------------------
# the loaded datasets occupy a lot of memory, so delete the loop objects
rm(list = intersect(c("dat", "dat_part1", "dat_part2", "dat_part3", "task"), ls()))
# 2. Algorithms and learners --------------------------------------------------
# For each learner a separate algorithm is defined to make parallelisation on
# the learner level possible.
# Generic learners do not depend on dataset-specific parameters (group structure).
# Task-specific learners depend on dataset-specific parameters (group structure).
#
# In general, the learners use default settings. Other learner configurations
# can be found in the following list.
# Task-specific arguments cannot be set here and are added in make_spec_lrns();
# they are listed here for a convenient overview and set to NULL.
# The learner parameters are defined in advance and looped over to keep the code DRY.
l_lrn_args = list(
  "lrn_km" = list(cl = "surv.kaplanmeier",
                  id = "Kaplanmeier",
                  predict.type = "prob"),
  "lrn_lasso" = list(cl = "surv.cv.glmnet2",
                     id = "Lasso",
                     s = "lambda.min",
                     predict.type = "prob"),
  "lrn_glmboost" = list(cl = "surv.cv.glmboost",
                        id = "glmBoost",
                        use.formula = FALSE,
                        mstop = 1L,
                        predict.type = "prob"),
  "lrn_CoxBoost" = list(cl = "surv.cv.CoxBoost2",
                        id = "CoxBoost",
                        predict.type = "prob"),
  "lrn_rfsrc" = list(cl = "surv.randomForestSRC",
                     id = "rfsrc",
                     predict.type = "prob"),
  "lrn_ranger" = list(cl = "surv.tuneMtryFast2",
                      id = "ranger",
                      predict.type = "prob",
                      write.forest = TRUE),
  "lrn_clin_ref" = list(cl = "surv.clinic_reference",
                        id = "Clinical only",
                        predict.type = "prob",
                        clinicals = NULL,
                        nfolds = 10), # only used if p_clin > n_train (which is not the case)
  "lrn_ts_prior" = list(cl = "surv.ts.priorlasso",
                        id = "Prioritylasso",
                        blocks = NULL,
                        predict.type = "prob",
                        favoring = FALSE),
  "lrn_ts_prior_fav" = list(cl = "surv.ts.priorlasso",
                            id = "Prioritylasso favoring",
                            blocks = NULL,
                            predict.type = "prob",
                            favoring = TRUE),
  "lrn_tsipf" = list(cl = "surv.ts.ipflasso",
                     id = "IPF-Lasso",
                     blocks = NULL,
                     nfolds = 10,
                     ncv = 1,
                     predict.type = "prob"),
  "lrn_cv_coxboost_unpen" = list(cl = "surv.cv.CoxBoost2",
                                 id = "Coxboost favoring",
                                 predict.type = "prob",
                                 unpen.index = NULL),
  "lrn_blockForest" = list(cl = "surv.blockForest",
                           id = "blockForest",
                           predict.type = "prob",
                           blocks = NULL),
  "lrn_SGL" = list(cl = "surv.cvSGL",
                   id = "SGL",
                   predict.type = "prob",
                   index = NULL,
                   nfold = 10),
  "lrn_grridge" = list(cl = "surv.grridge",
                       id = "grridge",
                       partitions = NULL,
                       predict.type = "prob",
                       innfold = 10,
                       standardizeX = TRUE,
                       selectionEN = TRUE,
                       maxsel = 1000)
)
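# For illustration (a sketch, not executed here): inside the algorithm function
# below, each entry of l_lrn_args is expanded into an mlr learner via do.call(),
# e.g. for the Kaplan-Meier baseline:
# lrn_km <- do.call(makeLearner, l_lrn_args[["lrn_km"]])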
# You can also choose specific learners manually (the random selection only
# takes place if no manual selection is done).
# all_lrns <- names(l_lrn_args) # default: keep all learners
# l_lrn_args <- l_lrn_args[all_lrns]
# if (length(l_lrn_args) == 14) {
#   l_lrn_args <- l_lrn_args[partition$lrns]
# }
addAlgorithm("learner", fun = function(job, data, instance, lrns, ...) {
lrnr = lrns
# 1. Task ----------------------------------------------------------------- all equal
task = data
task_id = getTaskId(task)
# 2. Learner -------------------------------------------------------------- main diffs here
if (lrnr %in% c("lrn_clin_ref", "lrn_ts_prior", "lrn_ts_prior_fav",
                "lrn_tsipf", "lrn_cv_coxboost_unpen", "lrn_blockForest",
                "lrn_SGL", "lrn_grridge")) {
  # task-specific learners get their dataset-specific arguments in make_spec_lrns()
  lrn = make_spec_lrns(task, args = l_lrn_args, lrnr = lrnr)
} else {
  # generic learners are constructed directly from the argument list
  lrn = list(do.call(makeLearner, l_lrn_args[[lrnr]]))
}
# 3. Measures ------------------------------------------------------------- all equal
mrs = list(timetrain, cindex.uno, ibrier, featselc_default, featselc_clin,
featselc_cnv, featselc_mirna, featselc_mutation, featselc_rna)
# 4. Resampling ----------------------------------------------------------- all equal
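# the larger datasets get 5 instead of 10 CV repetitions, presumably to keep
# the overall runtime manageable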
if (task_id %in% c("BRCA", "LUAD", "LUSC", "HNSC", "LGG", "UCEC", "BLCA")) {
rdesc = makeResampleDesc("RepCV", reps = 5, folds = 5, stratify = TRUE)
} else {
rdesc = makeResampleDesc("RepCV", reps = 10, folds = 5, stratify = TRUE)
}
set.seed(124)
rin = makeResampleInstance(rdesc, task = task)
if (lrnr == "lrn_cv_coxboost_unpen") {
save(rin, file = paste0(task_id, "_rin.RData"))
}
# 5. Benchmarking --------------------------------------------------------- all equal
configureMlr(on.learner.error = "warn", show.learner.output = FALSE)
set.seed(421)
# keep.pred must be TRUE to be able to merge the results later
bmr = benchmark(lrn, task, rin, mrs,
keep.pred = TRUE, models = FALSE, show.info = TRUE)
bmr
})
algo.designs <- list(learner = data.table(lrns = names(l_lrn_args)))
addExperiments(algo.designs = algo.designs)
summarizeExperiments()
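# Optional dry run (a sketch, not part of the original pipeline): test a single
# job interactively before submitting everything
# testJob(id = 1)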
# number of cores to use (depends on the system, here Linux)
regis$cluster.functions <- makeClusterFunctionsMulticore(6) # adjust to your system (?makeClusterFunctionsMulticore)
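# On other systems, swap in the matching cluster functions (sketches; the
# Slurm template file name is just a placeholder):
# regis$cluster.functions <- makeClusterFunctionsSocket(6) # e.g. on Windows
# regis$cluster.functions <- makeClusterFunctionsSlurm("slurm.tmpl") # e.g. on a Slurm HPC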
# group jobs by learners with fast, normal, and slow run time and submit sequentially
# fast <- findExperiments(
# algo.name = "learner",
# algo.pars = (
# lrns == "lrn_km" |
# lrns == "lrn_ranger" |
# lrns == "lrn_clin_ref"
# )
# )
# normal <-
# findExperiments(
# algo.name = "learner",
# algo.pars = (
# lrns != "lrn_blockForest" &
# lrns != "lrn_grridge" &
# lrns != "lrn_SGL" &
# lrns != "lrn_km" &
# lrns != "lrn_ranger" &
# lrns != "lrn_clin_ref"
# )
# )
# slow <-
# findExperiments(
# algo.name = "learner",
# algo.pars = (
# lrns == "lrn_blockForest" |
# lrns == "lrn_grridge"
# )
# )
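# With the groups above, the jobs could be submitted in stages (a sketch):
# submitJobs(fast); waitForJobs()
# submitJobs(normal); waitForJobs()
# submitJobs(slow); waitForJobs()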
submitJobs()
getStatus()
done = findDone()
erg <- reduceResultsList(done) # collect the results of all finished jobs
errors = getErrorMessages()
save(erg, errors, file = "ergebnis.RData")
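# Downstream (a sketch, assuming all jobs finished): since keep.pred = TRUE,
# the per-job BenchmarkResult objects in erg can be combined for joint analysis:
# load("ergebnis.RData")
# bmr_all <- mergeBenchmarkResults(erg)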