Diff of /R/bench_experiment.R [000000] .. [bb6f8d]

Switch to side-by-side view

--- a
+++ b/R/bench_experiment.R
@@ -0,0 +1,284 @@
+### 
+### Benchmarking experiment ###
+###
+
+# Pipeline: OpenML for datasets, mlr for benchmark, batchtools for parallelisat.
+
+# TODO
+# !!! Take care of packages version, especially mlr (s. packages)!!!
+# !!! Take care of makeClusterFunction !!!
+
+# Setup ---------------------------------------------------
+
+library(checkpoint) # get package snapshots used packages
+checkpoint("2019-04-17", project = getwd())
+
+library(batchtools)
+
+packs <- c("mlr", "pec", "glmnet", "mboost", "prioritylasso", "CoxBoost", 
+           "ipflasso", "randomForestSRC", "ranger", "dplyr", "data.table", "survival",
+           "GRridge", "tuneRanger", "SGL", "blockForest", "survAUC", "riskRegression")
+
+# unlink("bench_exp", recursive = TRUE)
+regis <- makeExperimentRegistry("bench_exp", 
+                                packages = packs,
+                                  source = "ancillary_code_bench.R")
+
+
+# 0. Partial reproduction -----------------------------------------------------
+
+# If only parts of the benchmark experiment are to be run, e.g. for reproducing 
+# single results due to computation times, change settings as desired 
+# (here full bench experiment). 
+# Important: Change seed to randomly choose the datasets/learners for which
+# the results should be reproduced.
+
+# n_lrns <- 14  
+# n_datsets <- 18
+# seed <- 124
+# 
+# partition <- sample_reproduction(n_lrns = n_lrns, 
+#                                  n_datsets = n_datsets, 
+#                                  seed = seed)
+
+# 1. Tasks and problems--------------------------------------------------------
+
+# OpenML dataset ids to querry
+
+load("data/datset_ids.RData")
+
+nams <- c("LAML", "BLCA", "LGG",  "BRCA", "COAD", "ESCA", 
+          "HNSC", "KIRC", "KIRP", "LIHC", "LUAD", "LUSC", 
+          "OV", "PAAD", "SARC", "SKCM", "STAD", "UCEC")
+
+# For partial reproduction:
+# nams <- nams[partition$datsets] 
+
+for (nam in nams) {
+
+  # download dataset
+  dat_part1 <- getOMLDataSet(datset_ids[[nam]][[1]])
+  dat_part2 <- getOMLDataSet(datset_ids[[nam]][[2]])
+
+  dat <- cbind.data.frame(dat_part1, dat_part2)
+  
+  if (nam = "BRCA") {
+    dat_part3 <- getOMLDataSet(datset_ids[[nam]][[3]])
+    dat <- cbind.data.frame(dat, dat_part3)
+  }
+  
+  task <- convertOMLDataSetToMlr(obj = dat,
+                                 task.type = "Survival analysis",
+                                 target = c("time", "status"),
+                                 fix.colnames = FALSE)
+  # convert to mlr task
+  task <- makeSurvTask(id = nam, 
+                       data = task[, -1], # delete patient code
+                       target = c("time", "status"))
+
+  # adding task as batch problem
+  addProblem(name = getTaskId(task), data = task)
+}
+
+
+# ------------------
+# get data from disk
+
+# for (nam in nams) {
+#   load("data/", nam, ".RData"))
+#   task <- get(nam)[, -1]
+#   task <- makeSurvTask(id = nam, data = task, target = c("time", "status"))
+#   addProblem(name = getTaskId(task), data = task)
+# }
+# ------------------
+
+rm(list = c(nams)) # the loaded datasets occupy a lot memory, so delete
+
+
+# 2. Algorithms and learners --------------------------------------------------
+
+# For each learner an algorithm is defined to make paralleliz. on learner-level
+# possible.
+# Generic learners do not depend on dataset specific parameters (group structure)
+# Task specific learners depend on dataset specific parameters (group structure)
+#
+# In general, lrnrs use default settings. Find other lrnr-configurations
+# in the following list. 
+# Task specific arguments cannot be set here and are added in make_spec_lrns(). 
+# They are listed here for convenience overview and set to NULL.
+
+# defining lrn params in advance and using for loop to DRY
+l_lrn_args = list("lrn_km" = list(cl = "surv.kaplanmeier",
+                                  id = "Kaplanmeier",
+                                  predict.type = "prob"),
+                  "lrn_lasso" = list(cl = "surv.cv.glmnet2",
+                                     id = "Lasso",
+                                     s = "lambda.min",
+                                     predict.type = "prob"),
+                  "lrn_glmboost" = list(cl = "surv.cv.glmboost",
+                                        id = "glmBoost",
+                                        use.formula = FALSE,
+                                        mstop = 1L,
+                                        predict.type = "prob"),
+                  "lrn_CoxBoost" = list(cl = "surv.cv.CoxBoost2",
+                                        id = "CoxBoost",
+                                        predict.type = "prob"),
+                  "lrn_rfsrc" = list(cl = "surv.randomForestSRC",
+                                     id = "rfsrc",
+                                     predict.type = "prob"),
+                  "lrn_ranger" = list(cl = "surv.tuneMtryFast2",
+                                      id = "ranger",
+                                      predict.type = "prob",
+                                      write.forest = TRUE),
+                  "lrn_clin_ref" = list(cl = "surv.clinic_reference",
+                                        id = "Clinical only",
+                                        predict.type = "prob",
+                                        clinicals = NULL,
+                                        nfolds = 10), # only used if p_clin > n_train (which is not the case)
+                  "lrn_ts_prior" = list(cl = "surv.ts.priorlasso",
+                                        id = "Prioritylasso",
+                                        blocks = NULL,
+                                        predict.type = "prob",
+                                        favoring = FALSE),
+                  "lrn_ts_prior_fav" = list(cl = "surv.ts.priorlasso",
+                                            id = "Prioritylasso favoring",
+                                            blocks = NULL,
+                                            predict.type = "prob",
+                                            favoring = TRUE),
+                  "lrn_tsipf" = list(cl = "surv.ts.ipflasso",
+                                     id = "IPF-Lasso",
+                                     blocks = NULL,
+                                     nfolds = 10,
+                                     ncv = 1,
+                                     predict.type = "prob"),
+                  "lrn_cv_coxboost_unpen" = list(cl = "surv.cv.CoxBoost2",
+                                                 id = "Coxboost favoring",
+                                                 predict.type = "prob",
+                                                 unpen.index = NULL),
+                  "lrn_blockForest" = list(cl = "surv.blockForest",
+                                           id = "blockForest",
+                                           predict.type = "prob",
+                                           blocks = NULL),
+                  "lrn_SGL" = list(cl = "surv.cvSGL",
+                                   id = "SGL",
+                                   predict.type = "prob",
+                                   index = NULL,
+                                   nfold = 10),
+                  "lrn_grridge" = list(cl = "surv.grridge",
+                                       id = "grridge",
+                                       partitions = NULL,
+                                       predict.type = "prob",
+                                       innfold = 10,
+                                       standardizeX = TRUE,
+                                       selectionEN = TRUE,
+                                       maxsel = c(1000))
+)
+
+# You can also choose specific lrns (random selection will only take place if
+# no manual selection is done)
+
+# all_lrns <- names(l_lrn_args) # default is not to select manually (i.e., all lrns)
+# l_lrn_args <- l_lrn_args[all_lrns]
+
+# if (length(l_lrn_args) == 14) {
+#   l_lrn_args <- l_lrn_args[partition$datsets] # check if $datsets correct argument
+# }
+
+
+addAlgorithm("learner", fun = function(job, data, instance, lrns, ...) {
+  par.vals = list(...)
+  
+  lrnr = lrns
+  
+  # 1. Task -----------------------------------------------------------------   all equal
+  task = data
+  task_id = getTaskId(task)
+  
+  # 2. Learner --------------------------------------------------------------   main diffs here
+  if (lrnr %in% c("lrn_clin_ref", "lrn_ts_prior", "lrn_ts_prior_fav", 
+                  "lrn_tsipf","lrn_cv_coxboost_unpen", "lrn_blockForest",
+                  "lrn_SGL", "lrn_grridge")) {
+    lrn = make_spec_lrns(task, args = l_lrn_args, lrnr = lrnr)
+  } else {
+    lrn = list(assign(lrnr, do.call(makeLearner, l_lrn_args[[lrnr]])))
+  }
+  
+  
+  # 3. Measures -------------------------------------------------------------   all equal
+  mrs = list(timetrain, cindex.uno, ibrier, featselc_default, featselc_clin, 
+             featselc_cnv, featselc_mirna, featselc_mutation, featselc_rna) 
+  
+  # 4. Resampling -----------------------------------------------------------   all equal
+  if (task_id %in% c("BRCA", "LUAD", "LUSC", "HNSC", "LGG", "UCEC", "BLCA")) {
+    rdesc = makeResampleDesc("RepCV", reps = 5, folds = 5, stratify = TRUE)
+  } else {
+    rdesc = makeResampleDesc("RepCV", reps = 10, folds = 5, stratify = TRUE)    
+  }
+  
+  set.seed(124) 
+  rin = makeResampleInstance(rdesc, task = task)
+  if (lrnr == "lrn_cv_coxboost_unpen") {
+    save(rin, file = paste0(task_id, "_rin.RData"))
+  }
+  
+  # 5. Benchmarking ---------------------------------------------------------   all equal 
+  configureMlr(on.learner.error = "warn", show.learner.output = FALSE)
+  set.seed(421)
+  
+  # keep.pred must be TRUE to be able to merge the results later
+  bmr = benchmark(lrn, task, rin, mrs, 
+                  keep.pred = TRUE, models = FALSE, show.info = TRUE)
+  bmr
+})
+
+
+algo.designs <- list(learner = data.table(lrns = c(names(l_lrn_args))))
+
+addExperiments(algo.designs = algo.designs)
+summarizeExperiments()
+
+# kernels to use (depends on system, here linux)
+regis$cluster.functions <- makeClusterFunctionsMulticore(6)                    # change according to system (?makeClusterFunctionsMulticore)
+
+
+# group jobs by learners with fast, normal, and slow run time and submit sequentially
+# fast <- findExperiments(
+#   algo.name = "learner",
+#   algo.pars = (
+#     lrns == "lrn_km" |
+#       lrns == "lrn_ranger" |
+#       lrns == "lrn_clin_ref"
+#   )
+# )
+
+# normal <-
+#   findExperiments(
+#     algo.name = "learner",
+#     algo.pars = (
+#       lrns != "lrn_blockForest" &
+#         lrns != "lrn_grridge" & 
+#         lrns != "lrn_SGL" &
+#         lrns != "lrn_km" &
+#         lrns != "lrn_ranger" &
+#         lrns != "lrn_clin_ref"
+#     )
+#   )
+
+# slow <-
+#   findExperiments(
+#     algo.name = "learner",
+#     algo.pars = (
+#       lrns == "lrn_blockForest" |
+#       lrns == "lrn_grridge"
+#     )
+# )
+
+submitJobs()
+getStatus()
+
+done = findDone()
+
+erg <- reduceResultsList(findExperiments())
+errors = getErrorMessages()
+
+save(erg, errors, file = "ergebnis.RData")