|
a |
|
b/R/bench_experiment.R |
|
|
1 |
### |
|
|
2 |
### Benchmarking experiment ### |
|
|
3 |
### |
|
|
4 |
|
|
|
5 |
# Pipeline: OpenML for datasets, mlr for benchmark, batchtools for parallelisat. |
|
|
6 |
|
|
|
7 |
# TODO |
|
|
8 |
# !!! Take care of packages version, especially mlr (s. packages)!!! |
|
|
9 |
# !!! Take care of makeClusterFunction !!! |
|
|
10 |
|
|
|
11 |
# Setup --------------------------------------------------- |
|
|
12 |
|
|
|
13 |
library(checkpoint) # get package snapshots used packages |
|
|
14 |
checkpoint("2019-04-17", project = getwd()) |
|
|
15 |
|
|
|
16 |
library(batchtools) |
|
|
17 |
|
|
|
18 |
packs <- c("mlr", "pec", "glmnet", "mboost", "prioritylasso", "CoxBoost", |
|
|
19 |
"ipflasso", "randomForestSRC", "ranger", "dplyr", "data.table", "survival", |
|
|
20 |
"GRridge", "tuneRanger", "SGL", "blockForest", "survAUC", "riskRegression") |
|
|
21 |
|
|
|
22 |
# unlink("bench_exp", recursive = TRUE) |
|
|
23 |
regis <- makeExperimentRegistry("bench_exp", |
|
|
24 |
packages = packs, |
|
|
25 |
source = "ancillary_code_bench.R") |
|
|
26 |
|
|
|
27 |
|
|
|
28 |
# 0. Partial reproduction ----------------------------------------------------- |
|
|
29 |
|
|
|
30 |
# If only parts of the benchmark experiment are to be run, e.g. for reproducing |
|
|
31 |
# single results due to computation times, change settings as desired |
|
|
32 |
# (here full bench experiment). |
|
|
33 |
# Important: Change seed to randomly choose the datasets/learners for which |
|
|
34 |
# the results should be reproduced. |
|
|
35 |
|
|
|
36 |
# n_lrns <- 14 |
|
|
37 |
# n_datsets <- 18 |
|
|
38 |
# seed <- 124 |
|
|
39 |
# |
|
|
40 |
# partition <- sample_reproduction(n_lrns = n_lrns, |
|
|
41 |
# n_datsets = n_datsets, |
|
|
42 |
# seed = seed) |
|
|
43 |
|
|
|
44 |
# 1. Tasks and problems-------------------------------------------------------- |
|
|
45 |
|
|
|
46 |
# OpenML dataset ids to querry |
|
|
47 |
|
|
|
48 |
load("data/datset_ids.RData") |
|
|
49 |
|
|
|
50 |
nams <- c("LAML", "BLCA", "LGG", "BRCA", "COAD", "ESCA", |
|
|
51 |
"HNSC", "KIRC", "KIRP", "LIHC", "LUAD", "LUSC", |
|
|
52 |
"OV", "PAAD", "SARC", "SKCM", "STAD", "UCEC") |
|
|
53 |
|
|
|
54 |
# For partial reproduction: |
|
|
55 |
# nams <- nams[partition$datsets] |
|
|
56 |
|
|
|
57 |
for (nam in nams) { |
|
|
58 |
|
|
|
59 |
# download dataset |
|
|
60 |
dat_part1 <- getOMLDataSet(datset_ids[[nam]][[1]]) |
|
|
61 |
dat_part2 <- getOMLDataSet(datset_ids[[nam]][[2]]) |
|
|
62 |
|
|
|
63 |
dat <- cbind.data.frame(dat_part1, dat_part2) |
|
|
64 |
|
|
|
65 |
if (nam = "BRCA") { |
|
|
66 |
dat_part3 <- getOMLDataSet(datset_ids[[nam]][[3]]) |
|
|
67 |
dat <- cbind.data.frame(dat, dat_part3) |
|
|
68 |
} |
|
|
69 |
|
|
|
70 |
task <- convertOMLDataSetToMlr(obj = dat, |
|
|
71 |
task.type = "Survival analysis", |
|
|
72 |
target = c("time", "status"), |
|
|
73 |
fix.colnames = FALSE) |
|
|
74 |
# convert to mlr task |
|
|
75 |
task <- makeSurvTask(id = nam, |
|
|
76 |
data = task[, -1], # delete patient code |
|
|
77 |
target = c("time", "status")) |
|
|
78 |
|
|
|
79 |
# adding task as batch problem |
|
|
80 |
addProblem(name = getTaskId(task), data = task) |
|
|
81 |
} |
|
|
82 |
|
|
|
83 |
|
|
|
84 |
# ------------------ |
|
|
85 |
# get data from disk |
|
|
86 |
|
|
|
87 |
# for (nam in nams) { |
|
|
88 |
# load("data/", nam, ".RData")) |
|
|
89 |
# task <- get(nam)[, -1] |
|
|
90 |
# task <- makeSurvTask(id = nam, data = task, target = c("time", "status")) |
|
|
91 |
# addProblem(name = getTaskId(task), data = task) |
|
|
92 |
# } |
|
|
93 |
# ------------------ |
|
|
94 |
|
|
|
95 |
rm(list = c(nams)) # the loaded datasets occupy a lot memory, so delete |
|
|
96 |
|
|
|
97 |
|
|
|
98 |
# 2. Algorithms and learners -------------------------------------------------- |
|
|
99 |
|
|
|
100 |
# For each learner an algorithm is defined to make paralleliz. on learner-level |
|
|
101 |
# possible. |
|
|
102 |
# Generic learners do not depend on dataset specific parameters (group structure) |
|
|
103 |
# Task specific learners depend on dataset specific parameters (group structure) |
|
|
104 |
# |
|
|
105 |
# In general, lrnrs use default settings. Find other lrnr-configurations |
|
|
106 |
# in the following list. |
|
|
107 |
# Task specific arguments cannot be set here and are added in make_spec_lrns(). |
|
|
108 |
# They are listed here for convenience overview and set to NULL. |
|
|
109 |
|
|
|
110 |
# defining lrn params in advance and using for loop to DRY |
|
|
111 |
l_lrn_args = list("lrn_km" = list(cl = "surv.kaplanmeier", |
|
|
112 |
id = "Kaplanmeier", |
|
|
113 |
predict.type = "prob"), |
|
|
114 |
"lrn_lasso" = list(cl = "surv.cv.glmnet2", |
|
|
115 |
id = "Lasso", |
|
|
116 |
s = "lambda.min", |
|
|
117 |
predict.type = "prob"), |
|
|
118 |
"lrn_glmboost" = list(cl = "surv.cv.glmboost", |
|
|
119 |
id = "glmBoost", |
|
|
120 |
use.formula = FALSE, |
|
|
121 |
mstop = 1L, |
|
|
122 |
predict.type = "prob"), |
|
|
123 |
"lrn_CoxBoost" = list(cl = "surv.cv.CoxBoost2", |
|
|
124 |
id = "CoxBoost", |
|
|
125 |
predict.type = "prob"), |
|
|
126 |
"lrn_rfsrc" = list(cl = "surv.randomForestSRC", |
|
|
127 |
id = "rfsrc", |
|
|
128 |
predict.type = "prob"), |
|
|
129 |
"lrn_ranger" = list(cl = "surv.tuneMtryFast2", |
|
|
130 |
id = "ranger", |
|
|
131 |
predict.type = "prob", |
|
|
132 |
write.forest = TRUE), |
|
|
133 |
"lrn_clin_ref" = list(cl = "surv.clinic_reference", |
|
|
134 |
id = "Clinical only", |
|
|
135 |
predict.type = "prob", |
|
|
136 |
clinicals = NULL, |
|
|
137 |
nfolds = 10), # only used if p_clin > n_train (which is not the case) |
|
|
138 |
"lrn_ts_prior" = list(cl = "surv.ts.priorlasso", |
|
|
139 |
id = "Prioritylasso", |
|
|
140 |
blocks = NULL, |
|
|
141 |
predict.type = "prob", |
|
|
142 |
favoring = FALSE), |
|
|
143 |
"lrn_ts_prior_fav" = list(cl = "surv.ts.priorlasso", |
|
|
144 |
id = "Prioritylasso favoring", |
|
|
145 |
blocks = NULL, |
|
|
146 |
predict.type = "prob", |
|
|
147 |
favoring = TRUE), |
|
|
148 |
"lrn_tsipf" = list(cl = "surv.ts.ipflasso", |
|
|
149 |
id = "IPF-Lasso", |
|
|
150 |
blocks = NULL, |
|
|
151 |
nfolds = 10, |
|
|
152 |
ncv = 1, |
|
|
153 |
predict.type = "prob"), |
|
|
154 |
"lrn_cv_coxboost_unpen" = list(cl = "surv.cv.CoxBoost2", |
|
|
155 |
id = "Coxboost favoring", |
|
|
156 |
predict.type = "prob", |
|
|
157 |
unpen.index = NULL), |
|
|
158 |
"lrn_blockForest" = list(cl = "surv.blockForest", |
|
|
159 |
id = "blockForest", |
|
|
160 |
predict.type = "prob", |
|
|
161 |
blocks = NULL), |
|
|
162 |
"lrn_SGL" = list(cl = "surv.cvSGL", |
|
|
163 |
id = "SGL", |
|
|
164 |
predict.type = "prob", |
|
|
165 |
index = NULL, |
|
|
166 |
nfold = 10), |
|
|
167 |
"lrn_grridge" = list(cl = "surv.grridge", |
|
|
168 |
id = "grridge", |
|
|
169 |
partitions = NULL, |
|
|
170 |
predict.type = "prob", |
|
|
171 |
innfold = 10, |
|
|
172 |
standardizeX = TRUE, |
|
|
173 |
selectionEN = TRUE, |
|
|
174 |
maxsel = c(1000)) |
|
|
175 |
) |
|
|
176 |
|
|
|
177 |
# You can also choose specific lrns (random selection will only take place if |
|
|
178 |
# no manual selection is done) |
|
|
179 |
|
|
|
180 |
# all_lrns <- names(l_lrn_args) # default is not to select manually (i.e., all lrns) |
|
|
181 |
# l_lrn_args <- l_lrn_args[all_lrns] |
|
|
182 |
|
|
|
183 |
# if (length(l_lrn_args) == 14) { |
|
|
184 |
# l_lrn_args <- l_lrn_args[partition$datsets] # check if $datsets correct argument |
|
|
185 |
# } |
|
|
186 |
|
|
|
187 |
|
|
|
188 |
addAlgorithm("learner", fun = function(job, data, instance, lrns, ...) { |
|
|
189 |
par.vals = list(...) |
|
|
190 |
|
|
|
191 |
lrnr = lrns |
|
|
192 |
|
|
|
193 |
# 1. Task ----------------------------------------------------------------- all equal |
|
|
194 |
task = data |
|
|
195 |
task_id = getTaskId(task) |
|
|
196 |
|
|
|
197 |
# 2. Learner -------------------------------------------------------------- main diffs here |
|
|
198 |
if (lrnr %in% c("lrn_clin_ref", "lrn_ts_prior", "lrn_ts_prior_fav", |
|
|
199 |
"lrn_tsipf","lrn_cv_coxboost_unpen", "lrn_blockForest", |
|
|
200 |
"lrn_SGL", "lrn_grridge")) { |
|
|
201 |
lrn = make_spec_lrns(task, args = l_lrn_args, lrnr = lrnr) |
|
|
202 |
} else { |
|
|
203 |
lrn = list(assign(lrnr, do.call(makeLearner, l_lrn_args[[lrnr]]))) |
|
|
204 |
} |
|
|
205 |
|
|
|
206 |
|
|
|
207 |
# 3. Measures ------------------------------------------------------------- all equal |
|
|
208 |
mrs = list(timetrain, cindex.uno, ibrier, featselc_default, featselc_clin, |
|
|
209 |
featselc_cnv, featselc_mirna, featselc_mutation, featselc_rna) |
|
|
210 |
|
|
|
211 |
# 4. Resampling ----------------------------------------------------------- all equal |
|
|
212 |
if (task_id %in% c("BRCA", "LUAD", "LUSC", "HNSC", "LGG", "UCEC", "BLCA")) { |
|
|
213 |
rdesc = makeResampleDesc("RepCV", reps = 5, folds = 5, stratify = TRUE) |
|
|
214 |
} else { |
|
|
215 |
rdesc = makeResampleDesc("RepCV", reps = 10, folds = 5, stratify = TRUE) |
|
|
216 |
} |
|
|
217 |
|
|
|
218 |
set.seed(124) |
|
|
219 |
rin = makeResampleInstance(rdesc, task = task) |
|
|
220 |
if (lrnr == "lrn_cv_coxboost_unpen") { |
|
|
221 |
save(rin, file = paste0(task_id, "_rin.RData")) |
|
|
222 |
} |
|
|
223 |
|
|
|
224 |
# 5. Benchmarking --------------------------------------------------------- all equal |
|
|
225 |
configureMlr(on.learner.error = "warn", show.learner.output = FALSE) |
|
|
226 |
set.seed(421) |
|
|
227 |
|
|
|
228 |
# keep.pred must be TRUE to be able to merge the results later |
|
|
229 |
bmr = benchmark(lrn, task, rin, mrs, |
|
|
230 |
keep.pred = TRUE, models = FALSE, show.info = TRUE) |
|
|
231 |
bmr |
|
|
232 |
}) |
|
|
233 |
|
|
|
234 |
|
|
|
235 |
algo.designs <- list(learner = data.table(lrns = c(names(l_lrn_args)))) |
|
|
236 |
|
|
|
237 |
addExperiments(algo.designs = algo.designs) |
|
|
238 |
summarizeExperiments() |
|
|
239 |
|
|
|
240 |
# kernels to use (depends on system, here linux) |
|
|
241 |
regis$cluster.functions <- makeClusterFunctionsMulticore(6) # change according to system (?makeClusterFunctionsMulticore) |
|
|
242 |
|
|
|
243 |
|
|
|
244 |
# group jobs by learners with fast, normal, and slow run time and submit sequentially |
|
|
245 |
# fast <- findExperiments( |
|
|
246 |
# algo.name = "learner", |
|
|
247 |
# algo.pars = ( |
|
|
248 |
# lrns == "lrn_km" | |
|
|
249 |
# lrns == "lrn_ranger" | |
|
|
250 |
# lrns == "lrn_clin_ref" |
|
|
251 |
# ) |
|
|
252 |
# ) |
|
|
253 |
|
|
|
254 |
# normal <- |
|
|
255 |
# findExperiments( |
|
|
256 |
# algo.name = "learner", |
|
|
257 |
# algo.pars = ( |
|
|
258 |
# lrns != "lrn_blockForest" & |
|
|
259 |
# lrns != "lrn_grridge" & |
|
|
260 |
# lrns != "lrn_SGL" & |
|
|
261 |
# lrns != "lrn_km" & |
|
|
262 |
# lrns != "lrn_ranger" & |
|
|
263 |
# lrns != "lrn_clin_ref" |
|
|
264 |
# ) |
|
|
265 |
# ) |
|
|
266 |
|
|
|
267 |
# slow <- |
|
|
268 |
# findExperiments( |
|
|
269 |
# algo.name = "learner", |
|
|
270 |
# algo.pars = ( |
|
|
271 |
# lrns == "lrn_blockForest" | |
|
|
272 |
# lrns == "lrn_grridge" |
|
|
273 |
# ) |
|
|
274 |
# ) |
|
|
275 |
|
|
|
276 |
submitJobs() |
|
|
277 |
getStatus() |
|
|
278 |
|
|
|
279 |
done = findDone() |
|
|
280 |
|
|
|
281 |
erg <- reduceResultsList(findExperiments()) |
|
|
282 |
errors = getErrorMessages() |
|
|
283 |
|
|
|
284 |
save(erg, errors, file = "ergebnis.RData") |