R/bench_experiment.R
###
### Benchmarking experiment ###
###

# Pipeline: OpenML for the datasets, mlr for the benchmark, batchtools for parallelisation

# TODO
# !!! Take care of the package versions, especially mlr (see packs below) !!!
# !!! Take care of the cluster functions (makeClusterFunctionsMulticore, see below) !!!

# Setup ---------------------------------------------------

library(checkpoint) # snapshot of the package versions used
checkpoint("2019-04-17", project = getwd())

library(batchtools)
library(mlr)    # used below for makeSurvTask(), getTaskId()
library(OpenML) # used below for getOMLDataSet(), convertOMLDataSetToMlr()

packs <- c("mlr", "pec", "glmnet", "mboost", "prioritylasso", "CoxBoost", 
           "ipflasso", "randomForestSRC", "ranger", "dplyr", "data.table", "survival",
           "GRridge", "tuneRanger", "SGL", "blockForest", "survAUC", "riskRegression")

# unlink("bench_exp", recursive = TRUE)
regis <- makeExperimentRegistry("bench_exp", 
                                packages = packs,
                                source = "ancillary_code_bench.R")
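
# To resume an existing registry from an earlier run instead of creating a new
# one, it can be re-attached; a minimal sketch (assumes the default file
# directory "bench_exp" created above):
# regis <- loadRegistry("bench_exp", writeable = TRUE)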


# 0. Partial reproduction -----------------------------------------------------

# If only parts of the benchmark experiment are to be run, e.g. to reproduce 
# single results because of the long computation times, change the settings 
# below as desired (here: the full benchmark experiment is run). 
# Important: change the seed to randomly choose the datasets/learners for which
# the results should be reproduced.

# n_lrns <- 14  
# n_datsets <- 18
# seed <- 124
# 
# partition <- sample_reproduction(n_lrns = n_lrns, 
#                                  n_datsets = n_datsets, 
#                                  seed = seed)
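
# For orientation, a minimal sketch of what sample_reproduction() could look
# like (assumption: the actual function is defined in ancillary_code_bench.R;
# the totals 14/18 and the shape below are illustrative only):
# sample_reproduction <- function(n_lrns, n_datsets, seed) {
#   set.seed(seed)
#   list(lrns    = sample(14, n_lrns),    # random learner indices
#        datsets = sample(18, n_datsets)) # random dataset indices
# }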

# 1. Tasks and problems --------------------------------------------------------

# OpenML dataset IDs to query

load("data/datset_ids.RData")

nams <- c("LAML", "BLCA", "LGG",  "BRCA", "COAD", "ESCA", 
          "HNSC", "KIRC", "KIRP", "LIHC", "LUAD", "LUSC", 
          "OV", "PAAD", "SARC", "SKCM", "STAD", "UCEC")

# For partial reproduction:
# nams <- nams[partition$datsets] 

for (nam in nams) {

  # download the dataset parts from OpenML
  dat_part1 <- getOMLDataSet(datset_ids[[nam]][[1]])
  dat_part2 <- getOMLDataSet(datset_ids[[nam]][[2]])

  dat <- cbind.data.frame(dat_part1, dat_part2)
  
  if (nam == "BRCA") { # BRCA is stored in three parts
    dat_part3 <- getOMLDataSet(datset_ids[[nam]][[3]])
    dat <- cbind.data.frame(dat, dat_part3)
  }
  
  # convert to an mlr task
  task <- convertOMLDataSetToMlr(obj = dat,
                                 task.type = "Survival analysis",
                                 target = c("time", "status"),
                                 fix.colnames = FALSE)
  # rebuild the survival task without the patient code column
  task <- makeSurvTask(id = nam, 
                       data = task[, -1], # delete patient code
                       target = c("time", "status"))

  # add the task as a batchtools problem
  addProblem(name = getTaskId(task), data = task)
}


# ------------------
# Alternative: get the data from disk instead of OpenML

# for (nam in nams) {
#   load(paste0("data/", nam, ".RData"))
#   task <- get(nam)[, -1]
#   task <- makeSurvTask(id = nam, data = task, target = c("time", "status"))
#   addProblem(name = getTaskId(task), data = task)
# }
# ------------------

rm(list = c(nams)) # the loaded datasets occupy a lot of memory, so delete them (relevant when loading from disk)


# 2. Algorithms and learners --------------------------------------------------

# For each learner a separate algorithm is defined so that parallelization on
# the learner level is possible.
# Generic learners do not depend on dataset-specific parameters (group structure).
# Task-specific learners do depend on dataset-specific parameters (group structure).
#
# In general, the learners use default settings; other learner configurations
# are given in the following list. 
# Task-specific arguments cannot be set here and are added in make_spec_lrns()
# (a conceptual sketch of make_spec_lrns() is given right after the list).
# They are listed here as a convenient overview and set to NULL.

# Define the learner parameters in advance in a single list to avoid repetition (DRY).
l_lrn_args = list("lrn_km" = list(cl = "surv.kaplanmeier",
                                  id = "Kaplanmeier",
                                  predict.type = "prob"),
                  "lrn_lasso" = list(cl = "surv.cv.glmnet2",
                                     id = "Lasso",
                                     s = "lambda.min",
                                     predict.type = "prob"),
                  "lrn_glmboost" = list(cl = "surv.cv.glmboost",
                                        id = "glmBoost",
                                        use.formula = FALSE,
                                        mstop = 1L,
                                        predict.type = "prob"),
                  "lrn_CoxBoost" = list(cl = "surv.cv.CoxBoost2",
                                        id = "CoxBoost",
                                        predict.type = "prob"),
                  "lrn_rfsrc" = list(cl = "surv.randomForestSRC",
                                     id = "rfsrc",
                                     predict.type = "prob"),
                  "lrn_ranger" = list(cl = "surv.tuneMtryFast2",
                                      id = "ranger",
                                      predict.type = "prob",
                                      write.forest = TRUE),
                  "lrn_clin_ref" = list(cl = "surv.clinic_reference",
                                        id = "Clinical only",
                                        predict.type = "prob",
                                        clinicals = NULL,
                                        nfolds = 10), # only used if p_clin > n_train (which is not the case)
                  "lrn_ts_prior" = list(cl = "surv.ts.priorlasso",
                                        id = "Prioritylasso",
                                        blocks = NULL,
                                        predict.type = "prob",
                                        favoring = FALSE),
                  "lrn_ts_prior_fav" = list(cl = "surv.ts.priorlasso",
                                            id = "Prioritylasso favoring",
                                            blocks = NULL,
                                            predict.type = "prob",
                                            favoring = TRUE),
                  "lrn_tsipf" = list(cl = "surv.ts.ipflasso",
                                     id = "IPF-Lasso",
                                     blocks = NULL,
                                     nfolds = 10,
                                     ncv = 1,
                                     predict.type = "prob"),
                  "lrn_cv_coxboost_unpen" = list(cl = "surv.cv.CoxBoost2",
                                                 id = "Coxboost favoring",
                                                 predict.type = "prob",
                                                 unpen.index = NULL),
                  "lrn_blockForest" = list(cl = "surv.blockForest",
                                           id = "blockForest",
                                           predict.type = "prob",
                                           blocks = NULL),
                  "lrn_SGL" = list(cl = "surv.cvSGL",
                                   id = "SGL",
                                   predict.type = "prob",
                                   index = NULL,
                                   nfold = 10),
                  "lrn_grridge" = list(cl = "surv.grridge",
                                       id = "grridge",
                                       partitions = NULL,
                                       predict.type = "prob",
                                       innfold = 10,
                                       standardizeX = TRUE,
                                       selectionEN = TRUE,
                                       maxsel = c(1000))
)
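
# For orientation, a conceptual sketch of make_spec_lrns() (assumption: the
# actual function is defined in ancillary_code_bench.R; get_block_structure()
# below is a hypothetical helper that maps the task features to the omics
# blocks, and not every argument applies to every learner):
# make_spec_lrns <- function(task, args, lrnr) {
#   lrn_args <- args[[lrnr]]
#   lrn_args$blocks <- get_block_structure(task) # analogously: index, partitions, clinicals, unpen.index
#   list(do.call(makeLearner, lrn_args))
# }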

# You can also choose specific learners manually (the random selection for
# partial reproduction only takes place if no manual selection is done).

# all_lrns <- names(l_lrn_args) # default is not to select manually (i.e., all learners)
# l_lrn_args <- l_lrn_args[all_lrns]

# if (length(l_lrn_args) == 14) {
#   l_lrn_args <- l_lrn_args[partition$lrns]
# }


addAlgorithm("learner", fun = function(job, data, instance, lrns, ...) {
  par.vals = list(...) # additional arguments from the experiment design (not used here)
  
  lrnr = lrns
  
  # 1. Task -----------------------------------------------------------------   identical for all learners
  task = data
  task_id = getTaskId(task)
  
  # 2. Learner --------------------------------------------------------------   main differences here
  if (lrnr %in% c("lrn_clin_ref", "lrn_ts_prior", "lrn_ts_prior_fav", 
                  "lrn_tsipf", "lrn_cv_coxboost_unpen", "lrn_blockForest",
                  "lrn_SGL", "lrn_grridge")) {
    # task-specific learners: dataset-dependent parameters are filled in here
    lrn = make_spec_lrns(task, args = l_lrn_args, lrnr = lrnr)
  } else {
    # generic learners: construct directly from the argument list
    lrn = list(do.call(makeLearner, l_lrn_args[[lrnr]]))
  }
  
  
  # 3. Measures -------------------------------------------------------------   identical for all learners
  # the featselc_* measures are custom and defined in ancillary_code_bench.R
  mrs = list(timetrain, cindex.uno, ibrier, featselc_default, featselc_clin, 
             featselc_cnv, featselc_mirna, featselc_mutation, featselc_rna) 
  
  # 4. Resampling -----------------------------------------------------------   identical for all learners
  # fewer CV repetitions for these (larger) datasets to limit computation time
  if (task_id %in% c("BRCA", "LUAD", "LUSC", "HNSC", "LGG", "UCEC", "BLCA")) {
    rdesc = makeResampleDesc("RepCV", reps = 5, folds = 5, stratify = TRUE)
  } else {
    rdesc = makeResampleDesc("RepCV", reps = 10, folds = 5, stratify = TRUE)
  }
  
  set.seed(124) 
  rin = makeResampleInstance(rdesc, task = task)
  # save the resample instance once per task (it is identical across learners
  # because of the fixed seed)
  if (lrnr == "lrn_cv_coxboost_unpen") {
    save(rin, file = paste0(task_id, "_rin.RData"))
  }
  
  # 5. Benchmarking ---------------------------------------------------------   identical for all learners
  configureMlr(on.learner.error = "warn", show.learner.output = FALSE)
  set.seed(421)
  
  # keep.pred must be TRUE to be able to merge the results later
  bmr = benchmark(lrn, task, rin, mrs, 
                  keep.pred = TRUE, models = FALSE, show.info = TRUE)
  bmr
})


algo.designs <- list(learner = data.table(lrns = names(l_lrn_args)))

addExperiments(algo.designs = algo.designs)
summarizeExperiments()
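
# A single experiment can optionally be test-run in the current R session
# before submitting everything (sketch; job id 1 is just an example):
# testJob(1)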

# number of CPU cores to use (depends on the system; here Linux)
regis$cluster.functions <- makeClusterFunctionsMulticore(6)                    # change according to system (?makeClusterFunctionsMulticore)
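
# Sketches for other setups (assumptions; adjust to the available scheduler/OS):
# regis$cluster.functions <- makeClusterFunctionsSocket(6)      # e.g. on Windows, where forking is unavailable
# regis$cluster.functions <- makeClusterFunctionsSlurm("slurm") # on a Slurm cluster, with a matching template file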

# Group jobs by learners with fast, normal, and slow run times and submit them
# sequentially (see the submission sketch below).
# fast <- findExperiments(
#   algo.name = "learner",
#   algo.pars = (
#     lrns == "lrn_km" |
#       lrns == "lrn_ranger" |
#       lrns == "lrn_clin_ref"
#   )
# )

# normal <-
#   findExperiments(
#     algo.name = "learner",
#     algo.pars = (
#       lrns != "lrn_blockForest" &
#         lrns != "lrn_grridge" & 
#         lrns != "lrn_SGL" &
#         lrns != "lrn_km" &
#         lrns != "lrn_ranger" &
#         lrns != "lrn_clin_ref"
#     )
#   )

# slow <-
#   findExperiments(
#     algo.name = "learner",
#     algo.pars = (
#       lrns == "lrn_blockForest" |
#       lrns == "lrn_grridge"
#     )
#   )
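
# With the groups above, the jobs could be submitted in stages instead of all
# at once; a sketch (assumes fast/normal/slow were created by uncommenting the
# code above):
# submitJobs(fast);   waitForJobs(fast)
# submitJobs(normal); waitForJobs(normal)
# submitJobs(slow);   waitForJobs(slow)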

submitJobs()
getStatus()

done <- findDone()

# collect all results (a list of single-learner BenchmarkResult objects) and errors
erg <- reduceResultsList(findExperiments())
errors <- getErrorMessages()

save(erg, errors, file = "ergebnis.RData")
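
# The per-job results in erg can later be combined into one BenchmarkResult,
# e.g. (sketch) with mlr::mergeBenchmarkResults(); this is why keep.pred = TRUE
# is set in the algorithm above:
# bmr_all <- mergeBenchmarkResults(erg)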