|
# R/IL_Utilities.R
|
|
1 |
############################### |
|
|
2 |
#### Print learner summary #### |
|
|
3 |
############################### |
|
|
4 |
|
|
|
5 |
print.learner <- function(res,...){ |
|
|
6 |
num_layers <- length(res$X_train_layers) |
|
|
7 |
|
|
|
8 |
cat("Time for model fit :",res$time,"minutes \n") |
|
|
9 |
if(res$family=="binomial"){ |
|
|
10 |
|
|
|
11 |
cat("========================================\n") |
|
|
12 |
cat("Model fit for individual layers:",res$base_learner,"\n") |
|
|
13 |
if(res$run_stacked==TRUE){cat("Model fit for stacked layer:",res$meta_learner,"\n")} |
|
|
14 |
if(res$run_concat==TRUE){cat("Model fit for concatenated layer:",res$base_learner,"\n")} |
|
|
15 |
|
|
|
16 |
cat("========================================\n") |
|
|
17 |
cat("AUC metric for training data: \n") |
|
|
18 |
cat("Individual layers: \n") |
|
|
19 |
print(res$AUC.train[1:num_layers]) |
|
|
20 |
cat("======================\n") |
|
|
21 |
|
|
|
22 |
if(res$run_stacked==TRUE){ |
|
|
23 |
cat("Stacked model:") |
|
|
24 |
cat(as.numeric(res$AUC.train["stacked"]),"\n") |
|
|
25 |
cat("======================\n") |
|
|
26 |
|
|
|
27 |
} |
|
|
28 |
if(res$run_concat==TRUE){ |
|
|
29 |
cat("Concatenated model:") |
|
|
30 |
cat(as.numeric(res$AUC.train["concatenated"]),"\n") |
|
|
31 |
cat("======================\n") |
|
|
32 |
|
|
|
33 |
} |
|
|
34 |
cat("========================================\n") |
|
|
35 |
if(res$test==TRUE){ |
|
|
36 |
|
|
|
37 |
#cat("======================\n") |
|
|
38 |
cat("AUC metric for test data: \n") |
|
|
39 |
cat("Individual layers: \n") |
|
|
40 |
print(res$AUC.test[1:num_layers]) |
|
|
41 |
cat("======================\n") |
|
|
42 |
|
|
|
43 |
if(res$run_stacked==TRUE){ |
|
|
44 |
cat("Stacked model:") |
|
|
45 |
cat(as.numeric(res$AUC.test["stacked"]),"\n") |
|
|
46 |
cat("======================\n") |
|
|
47 |
|
|
|
48 |
} |
|
|
49 |
if(res$run_concat==TRUE){ |
|
|
50 |
cat("Concatenated model:") |
|
|
51 |
cat(as.numeric(res$AUC.test["concatenated"]),"\n") |
|
|
52 |
cat("======================\n") |
|
|
53 |
|
|
|
54 |
} |
|
|
55 |
cat("========================================\n") |
|
|
56 |
|
|
|
57 |
} |
|
|
58 |
|
|
|
59 |
} else if(res$family=="gaussian"){ |
|
|
60 |
|
|
|
61 |
|
|
|
62 |
cat("========================================\n") |
|
|
63 |
cat("Model fit for individual layers:",res$base_learner,"\n") |
|
|
64 |
if(res$run_stacked==TRUE){cat("Model fit for stacked layer:",res$meta_learner,"\n")} |
|
|
65 |
if(res$run_concat==TRUE){cat("Model fit for concatenated layer:",res$base_learner,"\n")} |
|
|
66 |
|
|
|
67 |
cat("========================================\n") |
|
|
68 |
cat("R^2 for training data: \n") |
|
|
69 |
cat("Individual layers: \n") |
|
|
70 |
print(res$R2.train[1:num_layers]) |
|
|
71 |
cat("======================\n") |
|
|
72 |
if(res$run_stacked==TRUE){ |
|
|
73 |
cat("Stacked model:") |
|
|
74 |
cat(as.numeric(res$R2.train["stacked"]),"\n") |
|
|
75 |
cat("======================\n") |
|
|
76 |
|
|
|
77 |
} |
|
|
78 |
if(res$run_concat==TRUE){ |
|
|
79 |
cat("Concatenated model:") |
|
|
80 |
cat(as.numeric(res$R2.train["concatenated"]),"\n") |
|
|
81 |
cat("======================\n") |
|
|
82 |
} |
|
|
83 |
cat("========================================\n") |
|
|
84 |
if(res$test==TRUE){ |
|
|
85 |
#cat("======================\n") |
|
|
86 |
cat("R^2 for test data: \n") |
|
|
87 |
cat("Individual layers: \n") |
|
|
88 |
print(res$R2.test[1:num_layers]) |
|
|
89 |
cat("======================\n") |
|
|
90 |
if(res$run_stacked==TRUE){ |
|
|
91 |
cat("Stacked model:") |
|
|
92 |
cat(as.numeric(res$R2.test["stacked"]),"\n") |
|
|
93 |
cat("======================\n") |
|
|
94 |
|
|
|
95 |
} |
|
|
96 |
if(res$run_concat==TRUE){ |
|
|
97 |
cat("Concatenated model:") |
|
|
98 |
cat(as.numeric(res$R2.test["concatenated"]),"\n") |
|
|
99 |
cat("======================\n") |
|
|
100 |
} |
|
|
101 |
cat("========================================\n") |
|
|
102 |
|
|
|
103 |
} |
|
|
104 |
|
|
|
105 |
} |
|
|
106 |
|
|
|
107 |
if(res$meta_learner=="SL.nnls.auc" & res$run_stacked){ |
|
|
108 |
|
|
|
109 |
cat("Weights for individual layers predictions in IntegratedLearner: \n") |
|
|
110 |
print(round(res$weights,digits=3)) |
|
|
111 |
cat("========================================\n") |
|
|
112 |
|
|
|
113 |
} |
|
|
114 |
|
|
|
115 |
} |
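# Illustrative usage sketch (not run): print.learner() is the S3 print method
# intended to be dispatched when an IntegratedLearner fit is printed, e.g.
#
# fit <- IntegratedLearner(feature_table, sample_metadata, feature_metadata)
# print(fit)  # reports fit time, per-layer AUC/R^2, and stacking weights
#
# The inputs above are hypothetical placeholders for the user's multi-omics
# tables; see the IntegratedLearner() documentation for their required format.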
|
|
116 |
|
|
|
117 |
expit <- function(x){ |
|
|
118 |
return(1/(1+exp(-x))) |
|
|
119 |
} |
|
|
120 |
|
|
|
121 |
##################################################################### |
|
|
122 |
# Rename after adding serialize = TRUE in bartMachine2 (from ck37r) # |
|
|
123 |
##################################################################### |
|
|
124 |
|
|
|
125 |
# Temporary wrapper that needs to be fixed in SuperLearner |
|
|
126 |
#' Wrapper for bartMachine learner |
|
|
127 |
#' |
|
|
128 |
#' Supports Bayesian additive regression trees via the bartMachine package.
|
|
129 |
#' |
|
|
130 |
#' @param Y Outcome variable |
|
|
131 |
#' @param X Covariate dataframe |
|
|
132 |
#' @param newX Optional dataframe to predict the outcome |
|
|
133 |
#' @param obsWeights Optional observation-level weights (supported but not tested) |
|
|
134 |
#' @param id Optional id to group observations from the same unit (not used |
|
|
135 |
#' currently). |
|
|
136 |
#' @param family "gaussian" for regression, "binomial" for binary |
|
|
137 |
#' classification |
|
|
138 |
#' @param num_trees The number of trees to be grown in the sum-of-trees model. |
|
|
139 |
#' @param num_burn_in Number of MCMC samples to be discarded as "burn-in". |
|
|
140 |
#' @param num_iterations_after_burn_in Number of MCMC samples to draw from the |
|
|
141 |
#' posterior distribution of f(x). |
|
|
142 |
#' @param alpha Base hyperparameter in tree prior for whether a node is |
|
|
143 |
#' nonterminal or not. |
|
|
144 |
#' @param beta Power hyperparameter in tree prior for whether a node is |
|
|
145 |
#' nonterminal or not. |
|
|
146 |
#' @param k For regression, k determines the prior probability that E(Y|X) is |
|
|
147 |
#' contained in the interval (y_{min}, y_{max}), based on a normal |
|
|
148 |
#' distribution. For example, when k=2, the prior probability is 95\%. For |
|
|
149 |
#' classification, k determines the prior probability that E(Y|X) is between |
|
|
150 |
#' (-3,3). Note that a larger value of k results in more shrinkage and a more |
|
|
151 |
#' conservative fit. |
|
|
152 |
#' @param q Quantile of the prior on the error variance at which the data-based |
|
|
153 |
#' estimate is placed. Note that the larger the value of q, the more |
|
|
154 |
#' aggressive the fit as you are placing more prior weight on values lower |
|
|
155 |
#' than the data-based estimate. Not used for classification. |
|
|
156 |
#' @param nu Degrees of freedom for the inverse chi^2 prior. Not used for |
|
|
157 |
#' classification. |
|
|
158 |
#' @param verbose Prints information about progress of the algorithm to the |
|
|
159 |
#' screen. |
|
|
160 |
#' @param serialize If TRUE, bartMachine results can be saved to a file, but |
|
|
161 |
#' will require additional RAM. |
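#' @param seed Random seed passed to bartMachine for reproducibility.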
|
|
162 |
#' @param ... Additional arguments (not used) |
|
|
163 |
#' |
|
|
164 |
#' @encoding UTF-8
|
|
165 |
#' @export |
|
|
166 |
SL.BART <- function(Y, X, newX, family, obsWeights, id, |
|
|
167 |
num_trees = 50, num_burn_in = 250, verbose = FALSE,
|
|
168 |
alpha = 0.95, beta = 2, k = 2, q = 0.9, nu = 3, |
|
|
169 |
num_iterations_after_burn_in = 1000, |
|
|
170 |
serialize = TRUE, seed=5678, |
|
|
171 |
...) { |
|
|
172 |
#.SL.require("bartMachine") |
|
|
173 |
|
|
|
174 |
################ |
|
|
175 |
### CK changes: |
|
|
176 |
if (family$family == "binomial") { |
|
|
177 |
# Need to convert Y to a factor, otherwise bartMachine does regression. |
|
|
178 |
# And importantly, bartMachine expects the first level to be the positive |
|
|
179 |
# class, so we have to specify levels. |
|
|
180 |
Y = factor(Y, levels = c("1", "0")) |
|
|
181 |
} |
|
|
182 |
model = bartMachine::bartMachine(X, Y, num_trees = num_trees, |
|
|
183 |
num_burn_in = num_burn_in, verbose = verbose, |
|
|
184 |
alpha = alpha, beta = beta, k = k, q = q, nu = nu, |
|
|
185 |
num_iterations_after_burn_in = num_iterations_after_burn_in, |
|
|
186 |
serialize = serialize,seed=seed) |
|
|
187 |
# pred returns predicted responses (on the scale of the outcome) |
|
|
188 |
#pred <- bartMachine:::predict.bartMachine(model, newX) |
|
|
189 |
pred <- predict(model, newX) |
|
|
190 |
|
|
|
191 |
fit <- list(object = model) |
|
|
192 |
class(fit) <- c("SL.BART") |
|
|
193 |
|
|
|
194 |
out <- list(pred = pred, fit = fit) |
|
|
195 |
return(out) |
|
|
196 |
} |
|
|
197 |
|
|
|
198 |
#' Predict function for SL.BART |
|
|
199 |
#' |
|
|
200 |
#' @param object Fitted SL.BART model object
|
|
201 |
#' @param newdata Data frame of new observations to predict on
|
|
202 |
#' |
|
|
203 |
#' @return Prediction from the SL.BART |
|
|
204 |
#' @export |
|
|
205 |
predict.SL.BART <- function(object, newdata, family, X = NULL, Y = NULL,...) { |
|
|
206 |
#.SL.require("bartMachine") |
|
|
207 |
pred <- predict(object$object, newdata) |
|
|
208 |
return(pred) |
|
|
209 |
} |
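# Illustrative usage sketch (not run): SL.BART can be supplied to SuperLearner
# like any other learner wrapper. The dataset below is only a placeholder.
#
# data(PimaIndiansDiabetes2, package = "mlbench")
# d <- na.omit(PimaIndiansDiabetes2)
# Y <- as.numeric(d$diabetes == "pos")
# X <- subset(d, select = -diabetes)
# sl <- SuperLearner::SuperLearner(Y, X, family = binomial(),
#                                  SL.library = c("SL.mean", "SL.BART"))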
|
|
210 |
|
|
|
211 |
|
|
|
212 |
|
|
|
213 |
########################################## |
|
|
214 |
# Mixed BART implementation using mxBART # |
|
|
215 |
########################################## |
|
|
216 |
SL.mxBART <- function(Y, X, newX, family, obsWeights, id, |
|
|
217 |
sparse=FALSE,ntree=50, |
|
|
218 |
ndpost=1000,nskip=100,keepevery=10, |
|
|
219 |
mxps=list(list(prior=1,df=3,scale=1)), |
|
|
220 |
...) { |
|
|
221 |
#.SL.require("mxBART") |
|
|
222 |
|
|
|
223 |
if(family$family=="gaussian"){
|
|
224 |
type='wbart' |
|
|
225 |
}else if(family$family=="binomial"){ |
|
|
226 |
type='pbart' |
|
|
227 |
} |
|
|
228 |
|
|
|
229 |
model = mxBART::mxbart(y.train=Y, |
|
|
230 |
x.train=X, |
|
|
231 |
x.test = newX, |
|
|
232 |
id.train=list(id), |
|
|
233 |
sparse=FALSE,ntree=ntree,type=type, |
|
|
234 |
ndpost=ndpost,nskip=nskip,keepevery=keepevery, |
|
|
235 |
mxps = mxps) |
|
|
236 |
|
|
|
237 |
# pred returns predicted responses (on the scale of the outcome) |
|
|
238 |
if(family$family=="gaussian"){ |
|
|
239 |
pred <- model$fhat.test.mean |
|
|
240 |
|
|
|
241 |
}else if(family$family=="binomial"){ |
|
|
242 |
pred <- model$prob.test.mean |
|
|
243 |
} |
|
|
244 |
|
|
|
245 |
fit <- list(object = model) |
|
|
246 |
class(fit) <- c("SL.mxBART") |
|
|
247 |
|
|
|
248 |
out <- list(pred = pred, fit = fit) |
|
|
249 |
return(out) |
|
|
250 |
} |
|
|
251 |
|
|
|
252 |
|
|
|
253 |
predict.SL.mxBART <- function(object, newdata,family=family, X=X, Y=Y, |
|
|
254 |
obsWeights, id, |
|
|
255 |
sparse=FALSE,ntree=50, |
|
|
256 |
ndpost=1000,nskip=100,keepevery=10, |
|
|
257 |
mxps=list(list(prior=1,df=3,scale=1)), |
|
|
258 |
...) { |
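  # Predictions are obtained by refitting the mixed BART model with `newdata`
  # supplied as x.test and returning the posterior test-set summaries below.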
|
|
259 |
#.SL.require("mxbart") |
|
|
260 |
if(family$family=="gaussian"){
|
|
261 |
type='wbart' |
|
|
262 |
}else if(family$family=="binomial"){ |
|
|
263 |
type='pbart' |
|
|
264 |
} |
|
|
265 |
|
|
|
266 |
model = mxBART::mxbart(y.train=Y, |
|
|
267 |
x.train=X, |
|
|
268 |
x.test = newdata, |
|
|
269 |
id.train=list(id), |
|
|
270 |
sparse=FALSE,ntree=ntree,type=type, |
|
|
271 |
ndpost=ndpost,nskip=nskip,keepevery=keepevery, |
|
|
272 |
mxps = mxps) |
|
|
273 |
|
|
|
274 |
if(family$family=="gaussian"){ |
|
|
275 |
pred <- model$fhat.test.mean |
|
|
276 |
|
|
|
277 |
}else if(family$family=="binomial"){ |
|
|
278 |
pred <- model$prob.test.mean |
|
|
279 |
} |
|
|
280 |
|
|
|
281 |
|
|
|
282 |
return(pred) |
|
|
283 |
} |
|
|
284 |
|
|
|
285 |
|
|
|
286 |
|
|
|
287 |
|
|
|
288 |
########################################## |
|
|
289 |
# Elastic net wrapper with a fixed alpha # |
|
|
290 |
########################################## |
|
|
291 |
|
|
|
292 |
#' @title Elastic net regression, including lasso and ridge with a fixed alpha |
|
|
293 |
#' |
|
|
294 |
#' @description |
|
|
295 |
#' Penalized regression using elastic net. Alpha = 0 corresponds to ridge |
|
|
296 |
#' regression and alpha = 1 corresponds to Lasso. |
|
|
297 |
#' |
|
|
298 |
#' See \code{vignette("glmnet_beta", package = "glmnet")} for a nice tutorial on |
|
|
299 |
#' glmnet. |
|
|
300 |
#' |
|
|
301 |
#' @param Y Outcome variable |
|
|
302 |
#' @param X Covariate dataframe |
|
|
303 |
#' @param newX Dataframe to predict the outcome |
|
|
304 |
#' @param obsWeights Optional observation-level weights |
|
|
305 |
#' @param id Optional id to group observations from the same unit (not used |
|
|
306 |
#' currently). |
|
|
307 |
#' @param family "gaussian" for regression, "binomial" for binary |
|
|
308 |
#' classification. Untested options: "multinomial" for multiple classification |
|
|
309 |
#' or "mgaussian" for multiple response, "poisson" for non-negative outcome |
|
|
310 |
#' with proportional mean and variance, "cox". |
|
|
311 |
#' @param alpha Elastic net mixing parameter, range [0, 1]. 0 = ridge regression |
|
|
312 |
#' and 1 = lasso. |
|
|
313 |
#' @param nfolds Number of folds for internal cross-validation to optimize lambda. |
|
|
314 |
#' @param nlambda Number of lambda values to check, recommended to be 100 or more. |
|
|
315 |
#' @param loss Loss function, can be "deviance", "mse", or "mae". If family = |
|
|
316 |
#' binomial can also be "auc" or "class" (misclassification error). |
|
|
317 |
#' @param useMin If TRUE use lambda that minimizes risk, otherwise use 1 |
|
|
318 |
#' standard-error rule which chooses a higher penalty with performance within |
|
|
319 |
#' one standard error of the minimum (see Breiman et al. 1984 on CART for |
|
|
320 |
#' background). |
|
|
321 |
#' @param ... Any additional arguments are passed through to cv.glmnet. |
|
|
322 |
#' |
|
|
323 |
#' @examples |
|
|
324 |
#' |
|
|
325 |
#' # Load a test dataset. |
|
|
326 |
#' data(PimaIndiansDiabetes2, package = "mlbench") |
|
|
327 |
#' data = PimaIndiansDiabetes2 |
|
|
328 |
#' |
|
|
329 |
#' # Omit observations with missing data. |
|
|
330 |
#' data = na.omit(data) |
|
|
331 |
#' |
|
|
332 |
#' Y = as.numeric(data$diabetes == "pos") |
|
|
333 |
#' X = subset(data, select = -diabetes) |
|
|
334 |
#' |
|
|
335 |
#' set.seed(1, "L'Ecuyer-CMRG") |
|
|
336 |
#' |
|
|
337 |
#' sl = SuperLearner(Y, X, family = binomial(), |
|
|
338 |
#'                    SL.library = c("SL.mean", "SL.glm", "SL.glmnet2"))
|
|
339 |
#' sl |
|
|
340 |
#' |
|
|
341 |
#' @references |
|
|
342 |
#' |
|
|
343 |
#' Friedman, J., Hastie, T., & Tibshirani, R. (2010). Regularization paths for |
|
|
344 |
#' generalized linear models via coordinate descent. Journal of statistical |
|
|
345 |
#' software, 33(1), 1. |
|
|
346 |
#' |
|
|
347 |
#' Hoerl, A. E., & Kennard, R. W. (1970). Ridge regression: Biased estimation |
|
|
348 |
#' for nonorthogonal problems. Technometrics, 12(1), 55-67. |
|
|
349 |
#' |
|
|
350 |
#' Tibshirani, R. (1996). Regression shrinkage and selection via the lasso. |
|
|
351 |
#' Journal of the Royal Statistical Society. Series B (Methodological), 267-288. |
|
|
352 |
#' |
|
|
353 |
#' Zou, H., & Hastie, T. (2005). Regularization and variable selection via the |
|
|
354 |
#' elastic net. Journal of the Royal Statistical Society: Series B (Statistical |
|
|
355 |
#' Methodology), 67(2), 301-320. |
|
|
356 |
#' |
|
|
357 |
#' @seealso \code{\link{predict.SL.glmnet}} \code{\link[glmnet]{cv.glmnet}} |
|
|
358 |
#' \code{\link[glmnet]{glmnet}} |
|
|
359 |
#' |
|
|
360 |
#' @export |
|
|
361 |
SL.glmnet2 <- function(Y, X, newX, family, obsWeights, id, |
|
|
362 |
alpha = 0.5, nfolds = 10, nlambda = 100, useMin = TRUE, |
|
|
363 |
loss = "deviance", |
|
|
364 |
...) { |
|
|
365 |
#.SL.require('glmnet') |
|
|
366 |
|
|
|
367 |
# X must be a matrix, should we use model.matrix or as.matrix |
|
|
368 |
# TODO: support sparse matrices. |
|
|
369 |
if (!is.matrix(X)) { |
|
|
370 |
X <- model.matrix(~ -1 + ., X) |
|
|
371 |
newX <- model.matrix(~ -1 + ., newX) |
|
|
372 |
} |
|
|
373 |
|
|
|
374 |
# Use CV to find optimal lambda. |
|
|
375 |
fitCV <- glmnet::cv.glmnet(x = X, y = Y, weights = obsWeights, |
|
|
376 |
lambda = NULL, |
|
|
377 |
type.measure = loss, |
|
|
378 |
nfolds = nfolds, |
|
|
379 |
family = family$family, |
|
|
380 |
alpha = alpha, |
|
|
381 |
nlambda = nlambda, |
|
|
382 |
intercept = FALSE, |
|
|
383 |
...) |
|
|
384 |
|
|
|
385 |
# If we predict with the cv.glmnet object we can specify lambda using a |
|
|
386 |
# string. |
|
|
387 |
pred <- predict(fitCV, newx = newX, type = "response", |
|
|
388 |
s = ifelse(useMin, "lambda.min", "lambda.1se")) |
|
|
389 |
|
|
|
390 |
fit <- list(object = fitCV, useMin = useMin) |
|
|
391 |
class(fit) <- "SL.glmnet" |
|
|
392 |
|
|
|
393 |
out <- list(pred = pred, fit = fit) |
|
|
394 |
return(out) |
|
|
395 |
} |
|
|
396 |
|
|
|
397 |
######################################## |
|
|
398 |
# LASSO wrapper with intercept = FALSE # |
|
|
399 |
######################################## |
|
|
400 |
|
|
|
401 |
#' @title Lasso regression (elastic net with alpha fixed at 1)
|
|
402 |
#' |
|
|
403 |
#' @description |
|
|
404 |
#' Penalized regression using elastic net. Alpha = 0 corresponds to ridge |
|
|
405 |
#' regression and alpha = 1 corresponds to Lasso. |
|
|
406 |
#' |
|
|
407 |
#' See \code{vignette("glmnet_beta", package = "glmnet")} for a nice tutorial on |
|
|
408 |
#' glmnet. |
|
|
409 |
#' |
|
|
410 |
#' @param Y Outcome variable |
|
|
411 |
#' @param X Covariate dataframe |
|
|
412 |
#' @param newX Dataframe to predict the outcome |
|
|
413 |
#' @param obsWeights Optional observation-level weights |
|
|
414 |
#' @param id Optional id to group observations from the same unit (not used |
|
|
415 |
#' currently). |
|
|
416 |
#' @param family "gaussian" for regression, "binomial" for binary |
|
|
417 |
#' classification. Untested options: "multinomial" for multiple classification |
|
|
418 |
#' or "mgaussian" for multiple response, "poisson" for non-negative outcome |
|
|
419 |
#' with proportional mean and variance, "cox". |
|
|
420 |
#' @param alpha Elastic net mixing parameter, range [0, 1]. 0 = ridge regression |
|
|
421 |
#' and 1 = lasso. |
|
|
422 |
#' @param nfolds Number of folds for internal cross-validation to optimize lambda. |
|
|
423 |
#' @param nlambda Number of lambda values to check, recommended to be 100 or more. |
|
|
424 |
#' @param loss Loss function, can be "deviance", "mse", or "mae". If family = |
|
|
425 |
#' binomial can also be "auc" or "class" (misclassification error). |
|
|
426 |
#' @param useMin If TRUE use lambda that minimizes risk, otherwise use 1 |
|
|
427 |
#' standard-error rule which chooses a higher penalty with performance within |
|
|
428 |
#' one standard error of the minimum (see Breiman et al. 1984 on CART for |
|
|
429 |
#' background). |
|
|
430 |
#' @param ... Any additional arguments are passed through to cv.glmnet. |
|
|
431 |
#' |
|
|
432 |
#' @examples |
|
|
433 |
#' |
|
|
434 |
#' # Load a test dataset. |
|
|
435 |
#' data(PimaIndiansDiabetes2, package = "mlbench") |
|
|
436 |
#' data = PimaIndiansDiabetes2 |
|
|
437 |
#' |
|
|
438 |
#' # Omit observations with missing data. |
|
|
439 |
#' data = na.omit(data) |
|
|
440 |
#' |
|
|
441 |
#' Y = as.numeric(data$diabetes == "pos") |
|
|
442 |
#' X = subset(data, select = -diabetes) |
|
|
443 |
#' |
|
|
444 |
#' set.seed(1, "L'Ecuyer-CMRG") |
|
|
445 |
#' |
|
|
446 |
#' sl = SuperLearner(Y, X, family = binomial(), |
|
|
447 |
#'                    SL.library = c("SL.mean", "SL.glm", "SL.LASSO"))
|
|
448 |
#' sl |
|
|
449 |
#' |
|
|
450 |
#' @references |
|
|
451 |
#' |
|
|
452 |
#' Friedman, J., Hastie, T., & Tibshirani, R. (2010). Regularization paths for |
|
|
453 |
#' generalized linear models via coordinate descent. Journal of statistical |
|
|
454 |
#' software, 33(1), 1. |
|
|
455 |
#' |
|
|
456 |
#' Hoerl, A. E., & Kennard, R. W. (1970). Ridge regression: Biased estimation |
|
|
457 |
#' for nonorthogonal problems. Technometrics, 12(1), 55-67. |
|
|
458 |
#' |
|
|
459 |
#' Tibshirani, R. (1996). Regression shrinkage and selection via the lasso. |
|
|
460 |
#' Journal of the Royal Statistical Society. Series B (Methodological), 267-288. |
|
|
461 |
#' |
|
|
462 |
#' Zou, H., & Hastie, T. (2005). Regularization and variable selection via the |
|
|
463 |
#' elastic net. Journal of the Royal Statistical Society: Series B (Statistical |
|
|
464 |
#' Methodology), 67(2), 301-320. |
|
|
465 |
#' |
|
|
466 |
#' @seealso \code{\link{predict.SL.glmnet}} \code{\link[glmnet]{cv.glmnet}} |
|
|
467 |
#' \code{\link[glmnet]{glmnet}} |
|
|
468 |
#' |
|
|
469 |
#' @export |
|
|
470 |
SL.LASSO <- function(Y, X, newX, family, obsWeights, id, |
|
|
471 |
alpha = 1, nfolds = 10, nlambda = 100, useMin = TRUE, |
|
|
472 |
loss = "deviance", |
|
|
473 |
...) { |
|
|
474 |
#.SL.require('glmnet') |
|
|
475 |
|
|
|
476 |
# X must be a matrix, should we use model.matrix or as.matrix |
|
|
477 |
# TODO: support sparse matrices. |
|
|
478 |
if (!is.matrix(X)) { |
|
|
479 |
X <- model.matrix(~ -1 + ., X) |
|
|
480 |
newX <- model.matrix(~ -1 + ., newX) |
|
|
481 |
} |
|
|
482 |
|
|
|
483 |
# Use CV to find optimal lambda. |
|
|
484 |
fitCV <- glmnet::cv.glmnet(x = X, y = Y, weights = obsWeights, |
|
|
485 |
lambda = NULL, |
|
|
486 |
type.measure = loss, |
|
|
487 |
nfolds = nfolds, |
|
|
488 |
family = family$family, |
|
|
489 |
alpha = alpha, |
|
|
490 |
nlambda = nlambda, |
|
|
491 |
intercept = FALSE, |
|
|
492 |
...) |
|
|
493 |
|
|
|
494 |
# If we predict with the cv.glmnet object we can specify lambda using a |
|
|
495 |
# string. |
|
|
496 |
pred <- predict(fitCV, newx = newX, type = "response", |
|
|
497 |
s = ifelse(useMin, "lambda.min", "lambda.1se")) |
|
|
498 |
|
|
|
499 |
fit <- list(object = fitCV, useMin = useMin) |
|
|
500 |
class(fit) <- "SL.glmnet" |
|
|
501 |
|
|
|
502 |
out <- list(pred = pred, fit = fit) |
|
|
503 |
return(out) |
|
|
504 |
} |
|
|
505 |
|
|
|
506 |
|
|
|
507 |
############################################ |
|
|
508 |
# Elastic net wrapper with optimized alpha # |
|
|
509 |
############################################ |
|
|
510 |
|
|
|
511 |
# NOTE: COMPUTATIONALLY INTENSIVE |
|
|
512 |
|
|
|
513 |
#' @title Elastic net regression, including lasso and ridge with optimized alpha and lambda |
|
|
514 |
#' |
|
|
515 |
#' @description |
|
|
516 |
#' Penalized regression using elastic net. Alpha = 0 corresponds to ridge |
|
|
517 |
#' regression and alpha = 1 corresponds to Lasso. |
|
|
518 |
#' |
|
|
519 |
#' See \code{vignette("glmnet_beta", package = "glmnet")} for a nice tutorial on |
|
|
520 |
#' glmnet. |
|
|
521 |
#' |
|
|
522 |
#' @param Y Outcome variable |
|
|
523 |
#' @param X Covariate dataframe |
|
|
524 |
#' @param newX Dataframe to predict the outcome |
|
|
525 |
#' @param obsWeights Optional observation-level weights |
|
|
526 |
#' @param id Optional id to group observations from the same unit (not used |
|
|
527 |
#' currently). |
|
|
528 |
#' @param family "gaussian" for regression, "binomial" for binary |
|
|
529 |
#' classification. Untested options: "multinomial" for multiple classification |
|
|
530 |
#' or "mgaussian" for multiple response, "poisson" for non-negative outcome |
|
|
531 |
#' with proportional mean and variance, "cox". |
|
|
532 |
#' @param alpha Vector of candidate elastic net mixing parameters in [0, 1],
|
|
533 |
#' tuned by internal cross-validation; 0 = ridge regression and 1 = lasso.
|
|
534 |
#' @param nfolds Number of folds for internal cross-validation to optimize lambda. |
|
|
535 |
#' @param nlambda Number of lambda values to check, recommended to be 100 or more. |
|
|
536 |
#' @param loss Loss function, can be "deviance", "mse", or "mae". If family = |
|
|
537 |
#' binomial can also be "auc" or "class" (misclassification error). |
|
|
538 |
#' @param useMin If TRUE use lambda that minimizes risk, otherwise use 1 |
|
|
539 |
#' standard-error rule which chooses a higher penalty with performance within |
|
|
540 |
#' one standard error of the minimum (see Breiman et al. 1984 on CART for |
|
|
541 |
#' background). |
|
|
542 |
#' @param ... Any additional arguments are passed through to cv.glmnet. |
|
|
543 |
#' |
|
|
544 |
#' @examples |
|
|
545 |
#' |
|
|
546 |
#' # Load a test dataset. |
|
|
547 |
#' data(PimaIndiansDiabetes2, package = "mlbench") |
|
|
548 |
#' data = PimaIndiansDiabetes2 |
|
|
549 |
#' |
|
|
550 |
#' # Omit observations with missing data. |
|
|
551 |
#' data = na.omit(data) |
|
|
552 |
#' |
|
|
553 |
#' Y = as.numeric(data$diabetes == "pos") |
|
|
554 |
#' X = subset(data, select = -diabetes) |
|
|
555 |
#' |
|
|
556 |
#' set.seed(1, "L'Ecuyer-CMRG") |
|
|
557 |
#' |
|
|
558 |
#' sl = SuperLearner(Y, X, family = binomial(), |
|
|
559 |
#'                    SL.library = c("SL.mean", "SL.glm", "SL.enet"))
|
|
560 |
#' sl |
|
|
561 |
#' |
|
|
562 |
#' @references |
|
|
563 |
#' |
|
|
564 |
#' Friedman, J., Hastie, T., & Tibshirani, R. (2010). Regularization paths for |
|
|
565 |
#' generalized linear models via coordinate descent. Journal of statistical |
|
|
566 |
#' software, 33(1), 1. |
|
|
567 |
#' |
|
|
568 |
#' Hoerl, A. E., & Kennard, R. W. (1970). Ridge regression: Biased estimation |
|
|
569 |
#' for nonorthogonal problems. Technometrics, 12(1), 55-67. |
|
|
570 |
#' |
|
|
571 |
#' Tibshirani, R. (1996). Regression shrinkage and selection via the lasso. |
|
|
572 |
#' Journal of the Royal Statistical Society. Series B (Methodological), 267-288. |
|
|
573 |
#' |
|
|
574 |
#' Zou, H., & Hastie, T. (2005). Regularization and variable selection via the |
|
|
575 |
#' elastic net. Journal of the Royal Statistical Society: Series B (Statistical |
|
|
576 |
#' Methodology), 67(2), 301-320. |
|
|
577 |
#' |
|
|
578 |
#' @seealso \code{\link{predict.SL.glmnet}} \code{\link[glmnet]{cv.glmnet}} |
|
|
579 |
#' \code{\link[glmnet]{glmnet}} |
|
|
580 |
#' |
|
|
581 |
#' @export |
|
|
582 |
SL.enet <- function(Y, X, newX, family, obsWeights, id, |
|
|
583 |
alpha = seq(0, 1, 0.1), nfolds = 10, nlambda = 100, useMin = TRUE, |
|
|
584 |
loss = "deviance", |
|
|
585 |
...) { |
|
|
586 |
#.SL.require('glmnet') |
|
|
587 |
|
|
|
588 |
# X must be a matrix, should we use model.matrix or as.matrix |
|
|
589 |
# TODO: support sparse matrices. |
|
|
590 |
if (!is.matrix(X)) { |
|
|
591 |
X <- model.matrix(~ -1 + ., X) |
|
|
592 |
newX <- model.matrix(~ -1 + ., newX) |
|
|
593 |
} |
|
|
594 |
|
|
|
595 |
# Use CV to find optimal alpha. |
|
|
596 |
fit0 <- glmnetUtils::cva.glmnet(x = X, |
|
|
597 |
y = Y, |
|
|
598 |
weights = obsWeights, |
|
|
599 |
lambda = NULL, |
|
|
600 |
type.measure = loss, |
|
|
601 |
nfolds = nfolds, |
|
|
602 |
family = family$family, |
|
|
603 |
alpha = alpha, |
|
|
604 |
nlambda = nlambda, |
|
|
605 |
intercept = FALSE, |
|
|
606 |
...) |
|
|
607 |
|
|
|
608 |
# Extract best alpha |
|
|
609 |
enet_performance <- data.frame(alpha = fit0$alpha) |
|
|
610 |
models <- fit0$modlist |
|
|
611 |
enet_performance$cvm <- vapply(models, get_cvm, numeric(1)) |
|
|
612 |
minix <- which.min(enet_performance$cvm) |
|
|
613 |
best_alpha <- fit0$alpha[minix] |
|
|
614 |
|
|
|
615 |
# Use CV to find optimal lambda. |
|
|
616 |
fitCV <- glmnet::cv.glmnet(x = X, y = Y, weights = obsWeights, |
|
|
617 |
lambda = NULL, |
|
|
618 |
type.measure = loss, |
|
|
619 |
nfolds = nfolds, |
|
|
620 |
family = family$family, |
|
|
621 |
alpha = best_alpha, |
|
|
622 |
nlambda = nlambda, |
|
|
623 |
intercept=FALSE, |
|
|
624 |
...) |
|
|
625 |
fitCV$alpha <- best_alpha |
|
|
626 |
# If we predict with the cv.glmnet object we can specify lambda using a |
|
|
627 |
# string. |
|
|
628 |
pred <- predict(fitCV, newx = newX, type = "response", |
|
|
629 |
s = ifelse(useMin, "lambda.min", "lambda.1se")) |
|
|
630 |
|
|
|
631 |
fit <- list(object = fitCV, useMin = useMin) |
|
|
632 |
class(fit) <- "SL.glmnet" |
|
|
633 |
|
|
|
634 |
out <- list(pred = pred, fit = fit) |
|
|
635 |
return(out) |
|
|
636 |
} |
|
|
637 |
|
|
|
638 |
# Helper function for extracting best alpha from cva.glmnet |
|
|
639 |
get_cvm <- function(model) { |
|
|
640 |
index <- match(model$lambda.1se, model$lambda) |
|
|
641 |
model$cvm[index] |
|
|
642 |
} |
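# Illustrative sketch (not run): how SL.enet pairs glmnetUtils::cva.glmnet with
# get_cvm to select the mixing parameter alpha. Toy data are hypothetical.
#
# x_toy <- matrix(rnorm(100 * 5), nrow = 100, ncol = 5)
# y_toy <- rnorm(100)
# cva_fit <- glmnetUtils::cva.glmnet(x_toy, y_toy, alpha = seq(0, 1, 0.1))
# cvm <- vapply(cva_fit$modlist, get_cvm, numeric(1))
# best_alpha <- cva_fit$alpha[which.min(cvm)]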
|
|
643 |
|
|
|
644 |
|
|
|
645 |
|
|
|
646 |
#' Horseshoe regression |
|
|
647 |
#' |
|
|
648 |
#' @param Y Outcome variable |
|
|
649 |
#' @param X Covariate data frame |
|
|
650 |
#' @param newX Dataframe to predict the outcome |
|
|
651 |
#' @param family "gaussian" for regression, "binomial" for binary |
|
|
652 |
#' classification. Untested options: "poisson" for integer or count data.
|
|
653 |
#' @param prior prior for regression coefficients to use. "Horseshoe" by default. |
|
|
654 |
#' Untested options: ridge regression (prior="rr" or prior="ridge"), |
|
|
655 |
#' lasso regression (prior="lasso") and horseshoe+ regression (prior="hs+" |
|
|
656 |
#' or prior="horseshoe+") |
|
|
657 |
#' @param N Number of posterior samples to generate. |
|
|
658 |
#' @param burnin Number of burn-in samples. |
|
|
659 |
#' @param thinning Desired level of thinning. |
|
|
660 |
#' @param ... other parameters passed to bayesreg function |
|
|
661 |
#' |
|
|
662 |
#' @return SL object |
|
|
663 |
#' @export |
|
|
664 |
SL.horseshoe <- function(Y, X, newX, family, prior = "horseshoe", N = 20000L, burnin = 1000L, |
|
|
665 |
thinning = 1L, ...){ |
|
|
666 |
if (family$family == "binomial") { |
|
|
667 |
# Need to convert Y to a factor, otherwise bayesreg does regression.
|
|
668 |
# And importantly, bayesreg expects the second level to be the positive |
|
|
669 |
# class, so we have to specify levels. |
|
|
670 |
Y = factor(Y, levels = c("0", "1")) |
|
|
671 |
} |
|
|
672 |
|
|
|
673 |
#.SL.require('bayesreg') |
|
|
674 |
df <- data.frame(X,Y) |
|
|
675 |
model.HS <- bayesreg::bayesreg(Y ~ ., df, model = family$family, prior = prior,
|
|
676 |
                                 n.samples = N, burnin = burnin, thin = thinning)
|
|
677 |
newX <- as.matrix(newX) |
|
|
678 |
ynew.samp <- rep(1, nrow(newX)) %*% model.HS$beta0 + newX %*% model.HS$beta
|
|
679 |
if(family$family=="binomial"){ |
|
|
680 |
ynew.samp <- expit(ynew.samp) |
|
|
681 |
} |
|
|
682 |
pred <- apply(ynew.samp, 1, median) |
|
|
683 |
|
|
|
684 |
fit <- list(object = model.HS) |
|
|
685 |
class(fit) <- c("SL.horseshoe") |
|
|
686 |
|
|
|
687 |
out <- list(pred = pred, fit = fit) |
|
|
688 |
return(out) |
|
|
689 |
|
|
|
690 |
#} |
|
|
691 |
|
|
|
692 |
} |
|
|
693 |
|
|
|
694 |
predict.SL.horseshoe <- function(object, newdata, family, X = NULL, Y = NULL,...) { |
|
|
695 |
#.SL.require("bayesreg") |
|
|
696 |
model.HS <- object$object |
|
|
697 |
newdata <- as.matrix(newdata) |
|
|
698 |
ynew.samp <- rep(1, nrow(newdata)) %*% model.HS$beta0 + newdata %*% model.HS$beta
|
|
699 |
if(family$family=="binomial"){ |
|
|
700 |
ynew.samp <- expit(ynew.samp) |
|
|
701 |
} |
|
|
702 |
pred <- apply(ynew.samp, 1, median) |
|
|
703 |
|
|
|
704 |
return(pred) |
|
|
705 |
} |
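# Illustrative usage sketch (not run): the horseshoe wrapper plugs into
# SuperLearner as a base learner; Y and X below are hypothetical placeholders.
#
# sl <- SuperLearner::SuperLearner(Y, X, family = binomial(),
#                                  SL.library = c("SL.mean", "SL.horseshoe"))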
|
|
706 |
|
|
|
707 |
|
|
|
708 |
|
|
|
709 |
|
|
|
710 |
######################################################## |
|
|
711 |
# Non-negative least squares or rank loss minimization # |
|
|
712 |
######################################################## |
|
|
713 |
|
|
|
714 |
#' Meta-level objective function: rank loss (1 - AUC) for binary outcomes
|
|
715 |
#' |
|
|
716 |
#' @param b Weights vector |
|
|
717 |
#' @param X Design matrix (data frame) |
|
|
718 |
#' @param Y Outcome variable |
|
|
719 |
#' |
|
|
720 |
#' @return 1 - AUC |
|
|
721 |
#' @export |
|
|
722 |
auc.obj <- function(b,X,Y){ |
|
|
723 |
#Doesn't use observation weights in this part right now |
|
|
724 |
wavg <- as.matrix(X) %*% b |
|
|
725 |
pred = ROCR::prediction(wavg, Y) |
|
|
726 |
AUC = ROCR::performance(pred, "auc")@y.values[[1]] |
|
|
727 |
return((1-AUC)) |
|
|
728 |
|
|
|
729 |
} |
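# Illustrative sketch (not run): evaluating the rank-loss objective on toy
# per-layer predictions (hypothetical values).
#
# X_toy <- data.frame(layer1 = c(0.9, 0.2, 0.8, 0.1),
#                     layer2 = c(0.7, 0.4, 0.6, 0.3))
# Y_toy <- c(1, 0, 1, 0)
# auc.obj(b = c(0.5, 0.5), X = X_toy, Y = Y_toy)  # 0: the ranking is perfect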
|
|
730 |
|
|
|
731 |
#' NNLS function to optimize weights of several base learners |
|
|
732 |
#' |
|
|
733 |
#' @param x Design matrix of base-learner predictions (one column per learner)
|
|
734 |
#' @param y Outcome vector
|
|
735 |
#' @param wt Observation weights
|
|
736 |
#' |
|
|
737 |
#' @return Solution of the quadratic programming problem |
|
|
738 |
#' @export |
|
|
739 |
NNLS <- function(x, y, wt) { |
|
|
740 |
wX <- sqrt(wt) * x |
|
|
741 |
wY <- sqrt(wt) * y |
|
|
742 |
# Z'Z = n * cov(Z) |
|
|
743 |
D <- t(wX) %*% wX |
|
|
744 |
d <- t(t(wY) %*% wX) |
|
|
745 |
A <- rbind(rep(1,ncol(wX)),diag(ncol(wX))) |
|
|
746 |
b <- c(1,rep(0, ncol(wX))) |
|
|
747 |
# This will give an error if cov(Z) is singular, meaning at least two |
|
|
748 |
# columns are linearly dependent. |
|
|
749 |
# TODO: This will also error if any learner failed. Fix this. |
|
|
750 |
fit <- quadprog::solve.QP(Dmat = D, dvec = d, Amat = t(A), bvec = b, meq=1) |
|
|
751 |
return(fit) |
|
|
752 |
} |
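# Illustrative sketch (not run): NNLS on toy data; the constraints force the
# estimated weights to be non-negative and sum to one. Inputs are hypothetical.
#
# set.seed(1)
# x_toy <- matrix(runif(40), nrow = 20, ncol = 2)
# y_toy <- 0.3 * x_toy[, 1] + 0.7 * x_toy[, 2] + rnorm(20, sd = 0.01)
# NNLS(x = x_toy, y = y_toy, wt = rep(1, 20))$solution  # approx c(0.3, 0.7)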
|
|
753 |
|
|
|
754 |
#' Meta-learner wrapper: NNLS for gaussian outcomes, AUC maximization for binomial outcomes
|
|
755 |
#' |
|
|
756 |
#' @param Y Outcome vector
|
|
757 |
#' @param X Data frame of layer-wise predictions (one column per layer)
|
|
758 |
#' |
|
|
759 |
#' @return Estimated meta-learner coefficients and predictions |
|
|
760 |
#' @export |
|
|
761 |
SL.nnls.auc <- function(Y, X, newX, family, obsWeights,bounds = c(0, Inf), ...) { |
|
|
762 |
if(family$family=="gaussian"){ |
|
|
763 |
fit.nnls <- NNLS(x=as.matrix(X),y=Y,wt=obsWeights) |
|
|
764 |
initCoef <- fit.nnls$solution |
|
|
765 |
initCoef[is.na(initCoef)] <- 0 |
|
|
766 |
if (sum(initCoef) > 0) { |
|
|
767 |
coef <- initCoef/sum(initCoef) |
|
|
768 |
} else { |
|
|
769 |
warning("All algorithms have zero weight", call. = FALSE) |
|
|
770 |
coef <- initCoef |
|
|
771 |
} |
|
|
772 |
pred <- crossprod(t(as.matrix(newX)), coef) |
|
|
773 |
fit <- list(object = fit.nnls) |
|
|
774 |
class(fit) <- "SL.nnls.auc" |
|
|
775 |
out <- list(pred = pred, fit = fit) |
|
|
776 |
}else if (family$family=="binomial"){ |
|
|
777 |
|
|
|
778 |
|
|
|
779 |
nmethods = ncol(X) |
|
|
780 |
coef_init <- runif(nmethods) |
|
|
781 |
coef_init <- coef_init/sum(coef_init) |
|
|
782 |
fit.rankloss <- nloptr::nloptr(x0=coef_init, |
|
|
783 |
eval_f = auc.obj, |
|
|
784 |
lb=rep(bounds[1],nmethods), |
|
|
785 |
ub=rep(bounds[2],nmethods), |
|
|
786 |
opts = list(algorithm="NLOPT_LN_COBYLA",xtol_rel = 1e-8,maxeval=10000), |
|
|
787 |
X = X, |
|
|
788 |
Y = Y) |
|
|
789 |
if (fit.rankloss$status < 1 || fit.rankloss$status > 4) { |
|
|
790 |
warning(fit.rankloss$message) |
|
|
791 |
} |
|
|
792 |
coef <- fit.rankloss$solution |
|
|
793 |
if (anyNA(coef)) { |
|
|
794 |
warning("Some algorithms have weights of NA, setting to 0.") |
|
|
795 |
coef[is.na(coef)] <- 0 |
|
|
796 |
} |
|
|
797 |
if (!sum(coef) > 0) warning("All algorithms have zero weight", call. = FALSE) |
|
|
798 |
|
|
|
799 |
coef <- coef/sum(coef) |
|
|
800 |
|
|
|
801 |
# solution stores normalized coefficients |
|
|
802 |
fit.rankloss$solution <- coef |
|
|
803 |
|
|
|
804 |
pred <- crossprod(t(as.matrix(newX)), coef) |
|
|
805 |
fit <- list(object = fit.rankloss) |
|
|
806 |
class(fit) <- "SL.nnls.auc" |
|
|
807 |
out <- list(pred = pred, fit = fit) |
|
|
808 |
} |
|
|
809 |
return(out) |
|
|
810 |
} |
|
|
811 |
|
|
|
812 |
#' Predict function for SL.nnls.auc |
|
|
813 |
#' |
|
|
814 |
#' @param object Fitted SL.nnls.auc meta-learner object
|
|
815 |
#' @param newdata Data frame of layer-wise predictions for new samples
|
|
816 |
#' |
|
|
817 |
#' @return Prediction from the meta-learner |
|
|
818 |
#' @export |
|
|
819 |
predict.SL.nnls.auc <- function(object, newdata, ...) { |
|
|
820 |
initCoef <- object$object$solution |
|
|
821 |
initCoef[is.na(initCoef)] <- 0 |
|
|
822 |
if (sum(initCoef) > 0) { |
|
|
823 |
coef <- initCoef/sum(initCoef) |
|
|
824 |
} else { |
|
|
825 |
warning("All algorithms have zero weight", call. = FALSE) |
|
|
826 |
coef <- initCoef |
|
|
827 |
} |
|
|
828 |
pred <- crossprod(t(as.matrix(newdata)), coef) |
|
|
829 |
return(pred) |
|
|
830 |
} |
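# Illustrative sketch (not run): in the stacked step, SL.nnls.auc is fit on the
# per-layer cross-validated predictions (one column per omics layer) to learn
# the layer weights. The objects below are hypothetical placeholders.
#
# stack_fit <- SuperLearner::SuperLearner(Y = Y_train,
#                                         X = layer_predictions,  # n x L data frame
#                                         family = binomial(),
#                                         SL.library = "SL.nnls.auc")
# stack_fit$fitLibrary[[1]]$object$solution  # normalized layer weights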
|
|
831 |
|
|
|
832 |
|
|
|
833 |
# predict.learner <- function(fit, |
|
|
834 |
# feature_table_valid = NULL, # Feature table from validation set. Must have the exact same structure as feature_table. If missing, uses feature_table for feature_table_valid. |
|
|
835 |
# sample_metadata_valid = NULL, # Optional: Sample-specific metadata table from independent validation set. Must have the exact same structure as sample_metadata. |
|
|
836 |
# feature_metadata=NULL){ |
|
|
837 |
# |
|
|
838 |
# if(all(fit$feature.names==rownames(feature_metadata))==FALSE){ |
|
|
839 |
# stop("Both training feature_table and feature_metadata should have the same rownames.") |
|
|
840 |
# } |
|
|
841 |
# |
|
|
842 |
# |
|
|
843 |
# if(is.null(feature_table_valid)){ |
|
|
844 |
# stop("Feature table for validation set cannot be empty") |
|
|
845 |
# } |
|
|
846 |
# # if(is.null(sample_metadata_valid)){ |
|
|
847 |
# # stop("Sample metadata for validation set cannot be empty") |
|
|
848 |
# # } |
|
|
849 |
# |
|
|
850 |
# if (!is.null(feature_table_valid)){ |
|
|
851 |
# if(all(fit$feature.names==rownames(feature_table_valid))==FALSE) |
|
|
852 |
# stop("Both feature_table and feature_table_valid should have the same rownames.") |
|
|
853 |
# } |
|
|
854 |
# |
|
|
855 |
# if (!is.null(sample_metadata_valid)){ |
|
|
856 |
# if(all(colnames(feature_table_valid)==rownames(sample_metadata_valid))==FALSE) |
|
|
857 |
# stop("Row names of sample_metadata_valid must match the column names of feature_table_valid") |
|
|
858 |
# } |
|
|
859 |
# |
|
|
860 |
# |
|
|
861 |
# |
|
|
862 |
# if (!'featureID' %in% colnames(feature_metadata)){ |
|
|
863 |
# stop("feature_metadata must have a column named 'featureID' describing per-feature unique identifiers.") |
|
|
864 |
# } |
|
|
865 |
# |
|
|
866 |
# if (!'featureType' %in% colnames(feature_metadata)){ |
|
|
867 |
# stop("feature_metadata must have a column named 'featureType' describing the corresponding source layers.") |
|
|
868 |
# } |
|
|
869 |
# |
|
|
870 |
# if (!is.null(sample_metadata_valid)){ |
|
|
871 |
# if (!'subjectID' %in% colnames(sample_metadata_valid)){ |
|
|
872 |
# stop("sample_metadata_valid must have a column named 'subjectID' describing per-subject unique identifiers.") |
|
|
873 |
# } |
|
|
874 |
# |
|
|
875 |
# if (!'Y' %in% colnames(sample_metadata_valid)){ |
|
|
876 |
# stop("sample_metadata_valid must have a column named 'Y' describing the outcome of interest.") |
|
|
877 |
# } |
|
|
878 |
# } |
|
|
879 |
# |
|
|
880 |
# ############################################################################################# |
|
|
881 |
# # Extract validation Y right away (will not be used anywhere during the validation process) # |
|
|
882 |
# ############################################################################################# |
|
|
883 |
# |
|
|
884 |
# if (!is.null(sample_metadata_valid)){validY<-sample_metadata_valid['Y']} |
|
|
885 |
# |
|
|
886 |
# ##################################################################### |
|
|
887 |
# # Stacked generalization input data preparation for validation data # |
|
|
888 |
# ##################################################################### |
|
|
889 |
# feature_metadata$featureType<-as.factor(feature_metadata$featureType) |
|
|
890 |
# name_layers<-with(droplevels(feature_metadata), list(levels = levels(featureType)), |
|
|
891 |
# nlevels = nlevels(featureType))$levels |
|
|
892 |
# |
|
|
893 |
# X_test_layers <- vector("list", length(name_layers)) |
|
|
894 |
# names(X_test_layers) <- name_layers |
|
|
895 |
# |
|
|
896 |
# layer_wise_prediction_valid<-vector("list", length(name_layers)) |
|
|
897 |
# names(layer_wise_prediction_valid)<-name_layers |
|
|
898 |
# |
|
|
899 |
# for(i in seq_along(name_layers)){ |
|
|
900 |
# |
|
|
901 |
# ############################################################ |
|
|
902 |
# # Prepare single-omic validation data and save predictions # |
|
|
903 |
# ############################################################ |
|
|
904 |
# include_list<-feature_metadata %>% filter(featureType == name_layers[i]) |
|
|
905 |
# t_dat_slice_valid<-feature_table_valid[rownames(feature_table_valid) %in% include_list$featureID, ] |
|
|
906 |
# dat_slice_valid<-as.data.frame(t(t_dat_slice_valid)) |
|
|
907 |
# X_test_layers[[i]] <- dat_slice_valid |
|
|
908 |
# layer_wise_prediction_valid[[i]]<-predict.SuperLearner(fit$SL_fits$SL_fit_layers[[i]], newdata = dat_slice_valid)$pred |
|
|
909 |
# rownames(layer_wise_prediction_valid[[i]])<-rownames(dat_slice_valid) |
|
|
910 |
# rm(dat_slice_valid); rm(include_list) |
|
|
911 |
# } |
|
|
912 |
# |
|
|
913 |
# combo_valid <- as.data.frame(do.call(cbind, layer_wise_prediction_valid)) |
|
|
914 |
# names(combo_valid)<-name_layers |
|
|
915 |
# |
|
|
916 |
# if(fit$run_stacked==TRUE){ |
|
|
917 |
# stacked_prediction_valid<-predict.SuperLearner(fit$SL_fits$SL_fit_stacked, newdata = combo_valid)$pred |
|
|
918 |
# rownames(stacked_prediction_valid)<-rownames(combo_valid) |
|
|
919 |
# } |
|
|
920 |
# if(fit$run_concat==TRUE){ |
|
|
921 |
# fulldat_valid<-as.data.frame(t(feature_table_valid)) |
|
|
922 |
# concat_prediction_valid<-predict.SuperLearner(fit$SL_fits$SL_fit_concat, |
|
|
923 |
# newdata = fulldat_valid)$pred |
|
|
924 |
# rownames(concat_prediction_valid)<-rownames(fulldat_valid) |
|
|
925 |
# } |
|
|
926 |
# |
|
|
927 |
# res=list() |
|
|
928 |
# |
|
|
929 |
# if (!is.null(sample_metadata_valid)){ |
|
|
930 |
# Y_test=validY$Y |
|
|
931 |
# res$Y_test =Y_test |
|
|
932 |
# } |
|
|
933 |
# |
|
|
934 |
# if(fit$run_concat & fit$run_stacked){ |
|
|
935 |
# yhat.test <- cbind(combo_valid, stacked_prediction_valid , concat_prediction_valid) |
|
|
936 |
# colnames(yhat.test) <- c(colnames(combo_valid),"stacked","concatenated") |
|
|
937 |
# }else if(fit$run_concat & !fit$run_stacked){ |
|
|
938 |
# yhat.test <- cbind(combo_valid, concat_prediction_valid) |
|
|
939 |
# colnames(yhat.test) <- c(colnames(combo_valid),"concatenated") |
|
|
940 |
# }else if(!fit$run_concat & fit$run_stacked){ |
|
|
941 |
# yhat.test <- cbind(combo_valid, stacked_prediction_valid ) |
|
|
942 |
# colnames(yhat.test) <- c(colnames(combo_valid),"stacked") |
|
|
943 |
# }else{ |
|
|
944 |
# yhat.test <- combo_valid |
|
|
945 |
# } |
|
|
946 |
# |
|
|
947 |
# res$yhat.test <- yhat.test |
|
|
948 |
# if (!is.null(sample_metadata_valid)){ |
|
|
949 |
# if(fit$family=='binomial'){ |
|
|
950 |
# # Calculate AUC for each layer, stacked and concatenated |
|
|
951 |
# pred=apply(res$yhat.test, 2, ROCR::prediction, labels=res$Y_test) |
|
|
952 |
# AUC=vector(length = length(pred)) |
|
|
953 |
# names(AUC)=names(pred) |
|
|
954 |
# for(i in seq_along(pred)){ |
|
|
955 |
# AUC[i] = round(ROCR::performance(pred[[i]], "auc")@y.values[[1]], 3) |
|
|
956 |
# } |
|
|
957 |
# res$AUC.test <- AUC |
|
|
958 |
# |
|
|
959 |
# } |
|
|
960 |
# |
|
|
961 |
# if(fit$family=='gaussian'){ |
|
|
962 |
# # Calculate R^2 for each layer, stacked and concatenated |
|
|
963 |
# R2=vector(length = ncol(res$yhat.test)) |
|
|
964 |
# names(R2)=names(res$yhat.test) |
|
|
965 |
# for(i in seq_along(R2)){ |
|
|
966 |
# R2[i] = as.vector(cor(res$yhat.test[ ,i], res$Y_test)^2) |
|
|
967 |
# } |
|
|
968 |
# res$R2.test <- R2 |
|
|
969 |
# } |
|
|
970 |
# } |
|
|
971 |
# |
|
|
972 |
# return(res) |
|
|
973 |
# |
|
|
974 |
# } |
|
|
975 |
# |
|
|
976 |
# |
|
|
977 |
# update.learner <- function(fit, |
|
|
978 |
# feature_table_valid, # Feature table from validation set. Must have the exact same structure as feature_table. If missing, uses feature_table for feature_table_valid. |
|
|
979 |
# sample_metadata_valid=NULL, # OPTIONAL (can provide feature_table_valid and not this): Sample-specific metadata table from independent validation set. Must have the exact same structure as sample_metadata. |
|
|
980 |
# feature_metadata_valid, |
|
|
981 |
# seed = 1234, # Specify the arbitrary seed value for reproducibility. Default is 1234. |
|
|
982 |
# verbose=FALSE |
|
|
983 |
# ){ |
|
|
984 |
# # Check that feature table and feature meta data valid is not empty here |
|
|
985 |
#   if(is.null(feature_table_valid) | is.null(feature_metadata_valid)){
|
|
986 |
# stop("feature table/ feature metadata cannot be NULL for validation set in update learner") |
|
|
987 |
# } |
|
|
988 |
# |
|
|
989 |
# if(fit$family=="gaussian"){ |
|
|
990 |
# family=gaussian() |
|
|
991 |
# }else if(fit$family=="binomial"){ |
|
|
992 |
# family=binomial() |
|
|
993 |
# } |
|
|
994 |
# |
|
|
995 |
# if (!is.null(sample_metadata_valid)){ |
|
|
996 |
# validY<-sample_metadata_valid['Y'] |
|
|
997 |
# } |
|
|
998 |
# |
|
|
999 |
# |
|
|
1000 |
# feature_metadata_valid$featureType<-as.factor(feature_metadata_valid$featureType) |
|
|
1001 |
# name_layers_valid<-with(droplevels(feature_metadata_valid), list(levels = levels(featureType)), nlevels = nlevels(featureType))$levels |
|
|
1002 |
# |
|
|
1003 |
# |
|
|
1004 |
# name_layers <- names(fit$model_fits$model_layers) |
|
|
1005 |
# |
|
|
1006 |
# # If layers in validation match layers in train |
|
|
1007 |
# # Just run predict function and return its object |
|
|
1008 |
# if(length(intersect(name_layers_valid,name_layers))==length(name_layers)){ |
|
|
1009 |
# |
|
|
1010 |
# # Check if feature names are same for the train and test |
|
|
1011 |
# |
|
|
1012 |
# return(predict.learner(fit, |
|
|
1013 |
# feature_table_valid = feature_table_valid, |
|
|
1014 |
# sample_metadata_valid = sample_metadata_valid, |
|
|
1015 |
# feature_metadata = feature_metadata_valid)) |
|
|
1016 |
# }else if(length(intersect(name_layers_valid,name_layers))==0){ |
|
|
1017 |
# |
|
|
1018 |
# stop("Validation set has no layers in common with model fit") |
|
|
1019 |
# |
|
|
1020 |
# }else{ |
|
|
1021 |
# |
|
|
1022 |
# name_layers_common <- intersect(name_layers_valid,name_layers) |
|
|
1023 |
# |
|
|
1024 |
# |
|
|
1025 |
# |
|
|
1026 |
# # Extract only common name layers part of the fit object |
|
|
1027 |
# fit$model_fits$model_layers <- fit$model_fits$model_layers[name_layers_common] |
|
|
1028 |
# fit$SL_fits$SL_fit_layers <- fit$SL_fits$SL_fit_layers[name_layers_common] |
|
|
1029 |
# fit$X_train_layers <- fit$X_train_layers[name_layers_common] |
|
|
1030 |
# |
|
|
1031 |
# # Use common layers to get layer wise predictions for validation set |
|
|
1032 |
# X_test_layers <- vector("list", length(name_layers_common)) |
|
|
1033 |
# names(X_test_layers) <- name_layers_common |
|
|
1034 |
# |
|
|
1035 |
# if (!is.null(feature_table_valid)){ |
|
|
1036 |
# layer_wise_prediction_valid<-vector("list", length(name_layers_common)) |
|
|
1037 |
# names(layer_wise_prediction_valid)<-name_layers_common |
|
|
1038 |
# } |
|
|
1039 |
# |
|
|
1040 |
# |
|
|
1041 |
# for(i in seq_along(name_layers_common)){ |
|
|
1042 |
# include_list<-feature_metadata_valid %>% filter(featureType == name_layers_common[i]) |
|
|
1043 |
# |
|
|
1044 |
# # check if feature names in common layers match for train and test set |
|
|
1045 |
# if(!all(include_list$featureID==colnames(fit$X_train_layers[name_layers_common[i]]))){ |
|
|
1046 |
# stop(paste0("Validation set feature names for layer ", name_layers_common[i]," do not match with training data" )) |
|
|
1047 |
# } |
|
|
1048 |
# |
|
|
1049 |
# |
|
|
1050 |
# if (!is.null(feature_table_valid)){ |
|
|
1051 |
# t_dat_slice_valid<-feature_table_valid[rownames(feature_table_valid) %in% include_list$featureID, ] |
|
|
1052 |
# dat_slice_valid<-as.data.frame(t(t_dat_slice_valid)) |
|
|
1053 |
# X_test_layers[[i]] <- dat_slice_valid |
|
|
1054 |
# layer_wise_prediction_valid[[i]]<-predict.SuperLearner(fit$SL_fits$SL_fit_layers[[i]], newdata = dat_slice_valid)$pred |
|
|
1055 |
# rownames(layer_wise_prediction_valid[[i]])<-rownames(dat_slice_valid) |
|
|
1056 |
# fit$SL_fits$SL_fit_layers[[i]]$validX<-dat_slice_valid |
|
|
1057 |
# fit$SL_fits$SL_fit_layers[[i]]$validPrediction<-layer_wise_prediction_valid[[i]] |
|
|
1058 |
# colnames(fit$SL_fits$SL_fit_layers[[i]]$validPrediction)<-'validPrediction' |
|
|
1059 |
# rm(dat_slice_valid); rm(include_list) |
|
|
1060 |
# } |
|
|
1061 |
# } |
|
|
1062 |
# |
|
|
1063 |
# combo <- fit$yhat.train[ ,name_layers_common] |
|
|
1064 |
# |
|
|
1065 |
# if (!is.null(feature_table_valid)){ |
|
|
1066 |
# combo_valid <- as.data.frame(do.call(cbind, layer_wise_prediction_valid)) |
|
|
1067 |
# names(combo_valid)<-name_layers_valid |
|
|
1068 |
# } |
|
|
1069 |
# |
|
|
1070 |
# |
|
|
1071 |
# if(fit$run_stacked){ |
|
|
1072 |
# |
|
|
1073 |
# cat('Running new stacked model...\n') |
|
|
1074 |
# #} |
|
|
1075 |
# |
|
|
1076 |
# ################################### |
|
|
1077 |
# # Run user-specified meta learner # |
|
|
1078 |
# ################################### |
|
|
1079 |
# |
|
|
1080 |
# SL_fit_stacked<-SuperLearner::SuperLearner(Y = fit$Y_train, |
|
|
1081 |
# X = combo, |
|
|
1082 |
# cvControl = fit$cvControl, |
|
|
1083 |
# verbose = verbose, |
|
|
1084 |
# SL.library = fit$meta_learner, |
|
|
1085 |
# family=family, |
|
|
1086 |
# id=fit$id) |
|
|
1087 |
# |
|
|
1088 |
# # Extract the fit object from superlearner |
|
|
1089 |
# model_stacked <- SL_fit_stacked$fitLibrary[[1]]$object |
|
|
1090 |
# |
|
|
1091 |
# ################################################### |
|
|
1092 |
# # Append the corresponding y and X to the results # |
|
|
1093 |
# ################################################### |
|
|
1094 |
# |
|
|
1095 |
# SL_fit_stacked$Y<-fit$Y_train |
|
|
1096 |
# SL_fit_stacked$X<-combo |
|
|
1097 |
# if (!is.null(sample_metadata_valid)) SL_fit_stacked$validY<-validY |
|
|
1098 |
# |
|
|
1099 |
# ################################################################# |
|
|
1100 |
# # Prepare stacked input data for validation and save prediction #
|
|
1101 |
# ################################################################# |
|
|
1102 |
# |
|
|
1103 |
# if (!is.null(feature_table_valid)){ |
|
|
1104 |
# stacked_prediction_valid<-predict.SuperLearner(SL_fit_stacked, newdata = combo_valid)$pred |
|
|
1105 |
# rownames(stacked_prediction_valid)<-rownames(combo_valid) |
|
|
1106 |
# SL_fit_stacked$validX<-combo_valid |
|
|
1107 |
# SL_fit_stacked$validPrediction<-stacked_prediction_valid |
|
|
1108 |
# colnames(SL_fit_stacked$validPrediction)<-'validPrediction' |
|
|
1109 |
# } |
|
|
1110 |
# |
|
|
1111 |
# fit$model_fits$model_stacked <- model_stacked |
|
|
1112 |
# fit$SL_fits$SL_fit_stacked <- SL_fit_stacked |
|
|
1113 |
# fit$yhat.train$stacked <- SL_fit_stacked$Z |
|
|
1114 |
# |
|
|
1115 |
# |
|
|
1116 |
# } |
|
|
1117 |
# |
|
|
1118 |
# |
|
|
1119 |
# if(fit$run_concat){ |
|
|
1120 |
# #if (verbose) { |
|
|
1121 |
# cat('Running new concatenated model...\n') |
|
|
1122 |
# #} |
|
|
1123 |
# ################################### |
|
|
1124 |
# # Prepare concatenated input data #
|
|
1125 |
# ################################### |
|
|
1126 |
# feature_table <- Reduce(cbind.data.frame,fit$X_train_layers) |
|
|
1127 |
# feature_table <- feature_table[ ,feature_metadata_valid$featureID] |
|
|
1128 |
# fulldat<-as.data.frame(feature_table) |
|
|
1129 |
# |
|
|
1130 |
# ################################### |
|
|
1131 |
# # Run user-specified base learner # |
|
|
1132 |
# ################################### |
|
|
1133 |
# |
|
|
1134 |
# SL_fit_concat<-SuperLearner::SuperLearner(Y = fit$Y_train, |
|
|
1135 |
# X = fulldat, |
|
|
1136 |
# cvControl = fit$cvControl, |
|
|
1137 |
# verbose = verbose, |
|
|
1138 |
# SL.library = fit$base_learner, |
|
|
1139 |
# family=family, |
|
|
1140 |
# id=fit$id) |
|
|
1141 |
# |
|
|
1142 |
# # Extract the fit object from superlearner |
|
|
1143 |
# model_concat <- SL_fit_concat$fitLibrary[[1]]$object |
|
|
1144 |
# |
|
|
1145 |
# ################################################### |
|
|
1146 |
# # Append the corresponding y and X to the results # |
|
|
1147 |
# ################################################### |
|
|
1148 |
# |
|
|
1149 |
# SL_fit_concat$Y<-fit$Y_train |
|
|
1150 |
# SL_fit_concat$X<-fulldat |
|
|
1151 |
# if (!is.null(sample_metadata_valid)) SL_fit_concat$validY<-validY |
|
|
1152 |
# |
|
|
1153 |
# ######################################################################### |
|
|
1154 |
# # Prepare concatenated input data for validation set and save prediction #
|
|
1155 |
# ######################################################################### |
|
|
1156 |
# |
|
|
1157 |
# if (!is.null(feature_table_valid)){ |
|
|
1158 |
# fulldat_valid<-as.data.frame(t(feature_table_valid)) |
|
|
1159 |
# concat_prediction_valid<-predict.SuperLearner(SL_fit_concat, newdata = fulldat_valid)$pred |
|
|
1160 |
# SL_fit_concat$validX<-fulldat_valid |
|
|
1161 |
# rownames(concat_prediction_valid)<-rownames(fulldat_valid) |
|
|
1162 |
# SL_fit_concat$validPrediction<-concat_prediction_valid |
|
|
1163 |
# colnames(SL_fit_concat$validPrediction)<-'validPrediction' |
|
|
1164 |
# } |
|
|
1165 |
# |
|
|
1166 |
# fit$model_fits$model_concat <- model_concat |
|
|
1167 |
# fit$SL_fits$SL_fit_concat <- SL_fit_concat |
|
|
1168 |
# fit$yhat.train$concatenated <- SL_fit_concat$Z |
|
|
1169 |
# } |
|
|
1170 |
# |
|
|
1171 |
# |
|
|
1172 |
# if(fit$run_concat & fit$run_stacked){ |
|
|
1173 |
# fit$yhat.train <- fit$yhat.train[ ,c(name_layers_common,"stacked","concatenated")] |
|
|
1174 |
# |
|
|
1175 |
# }else if(fit$run_concat & !fit$run_stacked){ |
|
|
1176 |
# fit$yhat.train <- fit$yhat.train[ ,c(name_layers_common,"concatenated")] |
|
|
1177 |
# |
|
|
1178 |
# }else if(!fit$run_concat & fit$run_stacked){ |
|
|
1179 |
# fit$yhat.train <- fit$yhat.train[ ,c(name_layers_common,"stacked")] |
|
|
1180 |
# |
|
|
1181 |
# }else if(!fit$run_concat & !fit$run_stacked){ |
|
|
1182 |
# fit$yhat.train <- fit$yhat.train[ ,name_layers_common] |
|
|
1183 |
# |
|
|
1184 |
# } |
|
|
1185 |
# |
|
|
#
#   if(!is.null(feature_table_valid)){
#
#     if(fit$run_concat & fit$run_stacked){
#       yhat.test <- cbind(combo_valid, SL_fit_stacked$validPrediction,SL_fit_concat$validPrediction)
#       colnames(yhat.test) <- c(colnames(combo_valid),"stacked","concatenated")
#
#     }else if(fit$run_concat & !fit$run_stacked){
#       yhat.test <- cbind(combo_valid, SL_fit_concat$validPrediction)
#       colnames(yhat.test) <- c(colnames(combo_valid),"concatenated")
#
#     }else if(!fit$run_concat & fit$run_stacked){
#       yhat.test <- cbind(combo_valid, SL_fit_stacked$validPrediction)
#       colnames(yhat.test) <- c(colnames(combo_valid),"stacked")
#
#     }else if(!fit$run_concat & !fit$run_stacked){
#       yhat.test <- cbind(combo_valid)
#       colnames(yhat.test) <- c(colnames(combo_valid))
#
#     }
#     fit$yhat.test <- yhat.test
#     fit$X_test_layers <- X_test_layers
#   }
#   if(is.null(sample_metadata_valid)){
#     fit$test=FALSE
#   }else{
#     fit$test=TRUE
#   }
#   if(fit$meta_learner=="SL.nnls.auc" & fit$run_stacked){
#     fit$weights <- fit$model_fits$model_stacked$solution
#     names(fit$weights) <- colnames(combo)
#   }
#
#   if(!is.null(sample_metadata_valid)){fit$Y_test=validY$Y}
#
#   if(fit$family=="binomial"){
#     # Calculate AUC for each layer, stacked and concatenated
#     pred=apply(fit$yhat.train, 2, ROCR::prediction, labels=fit$Y_train)
#     AUC=vector(length = length(pred))
#     names(AUC)=names(pred)
#     for(i in seq_along(pred)){
#       AUC[i] = round(ROCR::performance(pred[[i]], "auc")@y.values[[1]], 3)
#     }
#     fit$AUC.train <- AUC
#
#     if(fit$test==TRUE){
#       # Calculate AUC for each layer, stacked and concatenated
#       pred=apply(fit$yhat.test, 2, ROCR::prediction, labels=fit$Y_test)
#       AUC=vector(length = length(pred))
#       names(AUC)=names(pred)
#       for(i in seq_along(pred)){
#         AUC[i] = round(ROCR::performance(pred[[i]], "auc")@y.values[[1]], 3)
#       }
#       fit$AUC.test <- AUC
#     }
#   }
#   if(fit$family=="gaussian"){
#
#     # Calculate R^2 for each layer, stacked and concatenated
#     R2=vector(length = ncol(fit$yhat.train))
#     names(R2)=names(fit$yhat.train)
#     for(i in seq_along(R2)){
#       R2[i] = as.vector(cor(fit$yhat.train[ ,i], fit$Y_train)^2)
#     }
#     fit$R2.train <- R2
#     if(fit$test==TRUE){
#       # Calculate R^2 for each layer, stacked and concatenated
#       R2=vector(length = ncol(fit$yhat.test))
#       names(R2)=names(fit$yhat.test)
#       for(i in seq_along(R2)){
#         R2[i] = as.vector(cor(fit$yhat.test[ ,i], fit$Y_test)^2)
#       }
#       fit$R2.test <- R2
#     }
#
#   }
#   fit$feature.names <- rownames(feature_table_valid)
#   print.learner(fit)
#   return(fit)
# }
#
#
# }
#
# plot.learner <- function(fit,label_size=8, label_x=0.3,vjust=0.1,rowwise=TRUE){
#
#   clean_base_learner <- str_remove_all(fit$base_learner, 'SL.')
#   clean_meta_learner <- str_remove_all(fit$meta_learner, 'SL.')
#   method <- paste(clean_base_learner,clean_meta_learner,sep=' + ')
#
#   if(fit$family=='binomial'){
#
#     # Extract ROC plot data
#     list.ROC<-vector("list", length = ncol(fit$yhat.train))
#     names(list.ROC)<-colnames(fit$yhat.train)
#
#     y <- fit$Y_train
#     # Loop over layers
#     for(k in 1:length(list.ROC)){
#       preds<-fit$yhat.train[ ,k]
#       pred = ROCR::prediction(preds, y)
#       AUC = round(ROCR::performance(pred, "auc")@y.values[[1]], 2)
#       perf = ROCR::performance(pred, "sens", "spec")
#       list.ROC[[k]] <- data.frame(sensitivity = methods::slot(perf, "y.values")[[1]],
#                                   specificity = 1 - methods::slot(perf, "x.values")[[1]],
#                                   AUC = AUC,
#                                   layer = names(list.ROC)[k],
#                                   method = method)
#     }
#
#     # Combine
#     ROC_table<-do.call('rbind', list.ROC)
#
#     # Prepare data for plotting
#     plot_data<-ROC_table
#     plot_data$displayItem<-paste(plot_data$layer, " AUC = ", plot_data$AUC, sep="")
#     plot_data$displayItem<-factor(plot_data$displayItem,
#                                   levels = unique(plot_data$displayItem))
#
#     # ROC curves
#     p1<-ggplot(plot_data,
#                aes(x=specificity,
#                    y=sensitivity,
#                    group=displayItem)) +
#       geom_line(aes(x=specificity,y=sensitivity,color=displayItem)) +
#       #ggtitle(paste('Training data: ', method, sep=''))+
#       theme(legend.position="bottom",
#             legend.background=element_blank(),
#             legend.box.background=element_rect(colour="black")) +
#       theme_bw() +
#       xlab("False Positive Rate") +
#       ylab("True Positive Rate") +
#       theme(legend.position = "right", legend.direction = "vertical") +
#       labs(color='')
#
#     if(fit$test==TRUE){
#
#       # Extract ROC plot data
#       list.ROC.valid<-vector("list", length = ncol(fit$yhat.test))
#       names(list.ROC.valid)<-colnames(fit$yhat.test)
#
#       y <- fit$Y_test
#       # Loop over layers
#       for(k in 1:length(list.ROC.valid)){
#         preds<-fit$yhat.test[ ,k]
#         pred = ROCR::prediction(preds, y)
#         AUC = round(ROCR::performance(pred, "auc")@y.values[[1]], 2)
#         perf = ROCR::performance(pred, "sens", "spec")
#         list.ROC.valid[[k]] <- data.frame(sensitivity = methods::slot(perf, "y.values")[[1]],
#                                           specificity = 1 - methods::slot(perf, "x.values")[[1]],
#                                           AUC = AUC,
#                                           layer = names(list.ROC.valid)[k],
#                                           method = method)
#       }
#
#       # Combine
#       ROC_table_valid<-do.call('rbind', list.ROC.valid)
#
#       # Prepare data for plotting
#       plot_data<-ROC_table_valid
#       plot_data$displayItem<-paste(plot_data$layer, " AUC = ", plot_data$AUC, sep="")
#       plot_data$displayItem<-factor(plot_data$displayItem,
#                                     levels = unique(plot_data$displayItem))
#
#       # ROC curves
#       p2<-ggplot(plot_data,
#                  aes(x=specificity,
#                      y=sensitivity,
#                      group=displayItem)) +
#         geom_line(aes(x=specificity,y=sensitivity,color=displayItem)) +
#         #ggtitle(paste('Test data: ', method, sep=''))+
#         theme(legend.position="bottom",
#               legend.background=element_blank(),
#               legend.box.background=element_rect(colour="black")) +
#         theme_bw() +
#         xlab("False Positive Rate") +
#         ylab("True Positive Rate") +
#         theme(legend.position = "right", legend.direction = "vertical") +
#         labs(color='')
#
#       p<-plot_grid(p1,
#                    p2,
#                    ifelse(rowwise,nrow = 2,ncol=2),
#                    labels = c(paste('A. ', fit$folds,'-fold CV',sep = ''),
#                               'B. Independent Validation'),
#                    label_size = label_size, label_x = label_x,vjust = vjust)+
#         theme(plot.margin = unit(c(1,1,1,1), "cm"))
#       print(p)
#       return(list('plot'=p,'ROC_table'=ROC_table,'ROC_table_valid'=ROC_table_valid))
#     }
#     p <- plot_grid(p1,
#                    nrow = 1,
#                    labels = c(paste('A. ', fit$folds,'-fold CV',sep = '')),
#                    label_size = label_size, label_x = label_x,vjust = vjust)+
#       theme(plot.margin = unit(c(1,1,1,1), "cm"))
#     print(p)
#     return(list('plot'=p,'ROC_table'=ROC_table))
#   }
#   else if(fit$family=='gaussian'){
#
#
#     # Extract R2 plot data
#     list.R2<-vector("list", length = ncol(fit$yhat.train))
#     names(list.R2)<-colnames(fit$yhat.train)
#
#     y <- fit$Y_train
#     # Loop over layers
#     for(k in 1:length(list.R2)){
#       preds<-fit$yhat.train[ ,k]
#       R2<- as.vector(cor(preds, y)^2)
#       list.R2[[k]] <- data.frame(R2 = R2,
#                                  layer = names(list.R2)[k],
#                                  method = method)
#     }
#
#     # Combine
#     R2_table<-do.call('rbind', list.R2)
#
#     # Plot
#     p1<-ggplot(R2_table, aes(x = method, y = R2)) +
#       geom_bar(position="dodge", stat="identity", aes(fill=layer)) +
#       xlab("") +
#       ylab(expression(paste("Prediction accuracy (", R^2, ")"))) +
#       scale_fill_discrete(name="") +
#       theme(legend.position="bottom",
#             legend.background=element_blank(),
#             legend.box.background=element_rect(colour="black")) +
#       theme_bw() +
#       guides(fill=guide_legend(title="")) +
#       theme(legend.position = "right", legend.direction = "vertical",
#             strip.background = element_blank()) +
#       labs(fill='')
#
#
#
#     if(fit$test==TRUE){
#
#
#       # Extract R2 plot data
#       list.R2.valid<-vector("list", length = ncol(fit$yhat.test))
#       names(list.R2.valid)<-colnames(fit$yhat.test)
#
#       y <- fit$Y_test
#       # Loop over layers
#       for(k in 1:length(list.R2.valid)){
#         preds<-fit$yhat.test[ ,k]
#         R2<- as.vector(cor(preds, y)^2)
#         list.R2.valid[[k]] <- data.frame(R2 = R2,
#                                          layer = names(list.R2.valid)[k],
#                                          method = method)
#       }
#
#       # Combine
#       R2_table_valid<-do.call('rbind', list.R2.valid)
#
#       # Plot
#       p2<-ggplot(R2_table_valid, aes(x = method, y = R2)) +
#         geom_bar(position="dodge", stat="identity", aes(fill=layer)) +
#         xlab("") +
#         ylab(expression(paste("Prediction accuracy (", R^2, ")"))) +
#         scale_fill_discrete(name="") +
#         theme(legend.position="bottom",
#               legend.background=element_blank(),
#               legend.box.background=element_rect(colour="black")) +
#         theme_bw() +
#         guides(fill=guide_legend(title="")) +
#         theme(legend.position = "right", legend.direction = "vertical",
#               strip.background = element_blank()) +
#         labs(fill='')
#
#       p<-plot_grid(p1,
#                    p2,
#                    ifelse(rowwise,nrow = 2,ncol=2),
#                    labels = c(paste('A. ', fit$folds,'-fold CV',sep = ''),
#                               'B. Independent Validation'),
#                    label_size = label_size, label_x = label_x,vjust = vjust)+
#         theme(plot.margin = unit(c(1,1,1,1), "cm"))
#       print(p)
#       return(list('plot'=p,'R2_table'=R2_table,'R2_table_valid'=R2_table_valid))
#
#     }
#     p <- plot_grid(p1,
#                    ncol = 1,
#                    labels = c(paste('A. ', fit$folds,'-fold CV',sep = '')),
#                    label_size = label_size, label_x = label_x,vjust = vjust)+
#       theme(plot.margin = unit(c(1,1,1,1), "cm"))
#     print(p)
#     return(list('plot'=p,'R2_table'=R2_table))
#
#   }
# }
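
# The disabled code above computes one AUC per prediction column (individual
# layers, stacked, concatenated) via ROCR. A minimal standalone sketch of that
# pattern is given below; the helper name `.sketch_auc_by_layer` is hypothetical
# and the function is not used elsewhere in the package.
.sketch_auc_by_layer <- function(yhat, y){
  # `yhat`: numeric matrix of predicted probabilities with one named column per
  # layer; `y`: binary outcome vector aligned with the rows of `yhat`.
  vapply(colnames(yhat), function(layer){
    pred <- ROCR::prediction(yhat[, layer], y)
    round(ROCR::performance(pred, "auc")@y.values[[1]], 3)
  }, numeric(1))
}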
|
|
|
# Borrowed from the mia package: stop with an informative error message when an
# optional dependency is not installed.
.require_package <- function(pkg){
  if(!requireNamespace(pkg, quietly = TRUE)){
    stop("'", pkg, "' package not found. Please install the '", pkg,
         "' package to use this function.", call. = FALSE)
  }
  return(NULL)
}
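
# Example usage (illustrative, kept as a comment so nothing runs at load time):
# call the guard before an optional dependency is first used, so callers get a
# clear install hint instead of a cryptic namespace error.
#   .require_package("ROCR")
#   pred <- ROCR::prediction(c(0.9, 0.2, 0.7, 0.4), c(1, 0, 1, 0))
#   ROCR::performance(pred, "auc")@y.values[[1]]   # 1 for this toy input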