####
#### sources & code ####
####
# 1. custom learners ----------------------------------------------------------
# 1.1 surv.clinicalreference -------------------------------------------------
makeRLearner.surv.clinic_reference = function() {
makeRLearnerSurv(
cl = "surv.clinic_reference",
package = "survival",
par.set = makeParamSet(
makeDiscreteLearnerParam(
id = "model",
default = "coxph",
values = c("coxph", "lasso", "ridge")
),
makeUntypedLearnerParam(id = "clinicals", default = NULL),
makeIntegerLearnerParam(
id = "maxit",
default = 100000,
lower = 10^4,
upper = 10^6
),
makeIntegerLearnerParam(id = "nfolds", default = 10L, lower = 3L)
),
properties = c("numerics", "factors", "weights", "prob"),
name = "Reference model for survival prediction",
short.name = "surv.clinic_ref",
note = "Computes a reference models for survival analysis. This is most
usefull in (multi-)omics contexts. The reference is a model with
clinical-variables only.
The clinical-only model can be fit as a standard coxph model
(survival::coxph) or with regularisation via lasso or ridge
regression (glmnet::cv.glmnet).
It is required to provide a vector of indices in the 'clinicals'
argument, indicating the position of clinical variables in the
feature matrix.",
callees = c("coxph", "surv")
)
}
trainLearner.surv.clinic_reference = function(.learner, .task, .subset, .weights = NULL, ...) {
mod <- getLearnerParVals(.learner)$model
if (is.null(mod)) {
mod <- "coxph"
}
clinicals <- getLearnerParVals(.learner)$clinicals
if (is.null(clinicals)) {
stop("No index vector for clinical variables supplied")
}
clinicals <- clinicals + 2 # shift indices past the time and status columns
data <- getTaskData(.task, subset = .subset)[, c(1, 2, clinicals)]
if (getTaskSize(.task) <= length(clinicals) && mod == "coxph") {
warning("There are fewer observations than clinical variables. Fitting a coxph
model is not possible; a lasso model is fitted instead.")
mod = "lasso"
}
if (mod != "coxph") {
alpha = ifelse(mod == "lasso", 1, 0)
args = list(x = as.matrix(data[, -c(1:2)]),
y = Surv(data$time, data$status),
family = "cox",
type.measure = "deviance",
alpha = alpha,
maxit = getLearnerParVals(.learner)$maxit,
nfolds = getLearnerParVals(.learner)$nfolds)
model <- do.call(glmnet::cv.glmnet, args)
} else {
model <- coxph(Surv(time, status) ~ ., data = data, x = TRUE)
}
if (.learner$predict.type == "response") {
model
} else {
if (mod != "coxph") {
# check for NA coeffs and set to zero
if (any(is.na(model$coefficients))) {
model$coefficients[is.na(model$coefficients)] = 0
}
# get index of nonzero coeffs
coeffs = as.vector(coef(model, s = model$lambda.min))
index_nz = get_nz(coeffs)
cph = compute_coxph_mod(coeffs, index_nz, data = data)
model = mlr:::attachTrainingTime(model, .task, data)
model$cph = cph
model$index_nz = index_nz
model
} else {
model = mlr:::attachTrainingTime(model, .task, data)
model
}
}
}
predictLearner.surv.clinic_reference = function(.learner, .model, .newdata, ...) {
coxreg <- inherits(.model$learner.model, "coxph")
newdata = .newdata[, getLearnerParVals(.learner)$clinicals, drop = FALSE]
if (coxreg) {
preds = predict(.model$learner.model,
newdata = newdata,
type = "lp",
...)
} else {
preds = as.numeric(predict(
.model$learner.model,
newx = as.matrix(newdata),
type = "link",
...
))
}
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
if (coxreg) {
if (any(is.na(.model$learner.model$coefficients))) {
.model$learner.model$coefficients[is.na(.model$learner.model$coefficients)] = 0
}
probs = pec::predictSurvProb(.model$learner.model, newdata = newdata, times = train.times)
list(preds = preds,
probs = probs,
train.times = train.times)
} else {
probs = pec::predictSurvProb(.model$learner.model$cph,
newdata = newdata[, .model$learner.model$index_nz, drop = FALSE],
times = train.times)
list(preds = preds,
probs = probs,
train.times = train.times)
}
}
}
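# Usage sketch (not run when sourcing; data set and index vector are
# illustrative assumptions, not part of the bench setup): once the three S3
# methods above are defined, the learner works like any built-in mlr learner.
if (FALSE) {
library(mlr)
library(survival)
lung2 = na.omit(lung[, c("time", "status", "age", "sex", "ph.ecog")])
lung2$status = lung2$status - 1 # recode 1/2 to 0/1
task = makeSurvTask(data = lung2, target = c("time", "status"))
# age and sex (columns 1 and 2 of the feature matrix) act as the clinical block
lrn = makeLearner("surv.clinic_reference", model = "coxph", clinicals = c(1, 2))
mod = train(lrn, task)
pred = predict(mod, task)
}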
# 1.2 surv.cv.glmnet (alias for lasso) -------------------------------------------
makeRLearner.surv.cv.glmnet2 = function() {
makeRLearnerSurv(
cl = "surv.cv.glmnet2",
package = "glmnet",
par.set = makeParamSet(
makeNumericLearnerParam(id = "alpha", default = 1, lower = 0, upper = 1),
makeIntegerLearnerParam(id = "nfolds", default = 10L, lower = 3L),
makeDiscreteLearnerParam(id = "type.measure", values = "deviance", default = "deviance"),
makeLogicalLearnerParam(id = "grouped", default = TRUE),
makeDiscreteLearnerParam(id = "s", values = c("lambda.1se", "lambda.min"), default = "lambda.1se", when = "predict"),
makeIntegerLearnerParam(id = "nlambda", default = 100L, lower = 1L),
makeNumericLearnerParam(id = "lambda.min.ratio", lower = 0, upper = 1),
makeLogicalLearnerParam(id = "standardize", default = TRUE),
makeLogicalLearnerParam(id = "intercept", default = TRUE),
makeNumericLearnerParam(id = "thresh", default = 1e-07, lower = 0),
makeIntegerLearnerParam(id = "dfmax", lower = 0L),
makeIntegerLearnerParam(id = "pmax", lower = 0L),
makeIntegerVectorLearnerParam(id = "exclude", lower = 1L), # this is like a subset?
makeNumericVectorLearnerParam(id = "penalty.factor", lower = 0, upper = 1),
makeNumericVectorLearnerParam(id = "lower.limits", upper = 0),
makeNumericVectorLearnerParam(id = "upper.limits", lower = 0),
makeIntegerLearnerParam(id = "maxit", default = 100000L, lower = 1L),
makeNumericLearnerParam(id = "fdev", default = 1.0e-5, lower = 0, upper = 1),
makeNumericLearnerParam(id = "devmax", default = 0.999, lower = 0, upper = 1),
makeNumericLearnerParam(id = "eps", default = 1.0e-6, lower = 0, upper = 1),
makeNumericLearnerParam(id = "big", default = 9.9e35),
makeIntegerLearnerParam(id = "mnlam", default = 5, lower = 1),
makeNumericLearnerParam(id = "pmin", default = 1.0e-9, lower = 0, upper = 1),
makeNumericLearnerParam(id = "exmx", default = 250.0),
makeNumericLearnerParam(id = "prec", default = 1e-10),
makeIntegerLearnerParam(id = "mxit", default = 100, lower = 1),
makeLogicalLearnerParam(id = "factory", default = FALSE)
),
properties = c("numerics", "factors", "ordered", "weights", "prob"),
name = "GLM with Regularization (Cross Validated Lambda)",
short.name = "cvglmnet",
note = "Factors automatically get converted to dummy columns, ordered factors to integer.",
callees = c("cv.glmnet", "glmnet", "glmnet.control", "predict.glmnet")
)
}
trainLearner.surv.cv.glmnet2 = function(.learner, .task, .subset, .weights = NULL, ...) {
d = getTaskData(.task,
subset = .subset,
target.extra = TRUE,
recode.target = "surv")
info = mlr:::getFixDataInfo(d$data,
factors.to.dummies = TRUE,
ordered.to.int = TRUE)
args = c(list(x = as.matrix(mlr:::fixDataForLearner(d$data, info)),
y = d$target,
family = "cox",
parallel = FALSE),
list(...))
if (!is.null(.weights))
args$weights = .weights
saved.ctrl = glmnet::glmnet.control()
is.ctrl.arg = names(args) %in% names(saved.ctrl)
if (any(is.ctrl.arg)) {
on.exit(do.call(glmnet::glmnet.control, saved.ctrl))
do.call(glmnet::glmnet.control, args[is.ctrl.arg])
args = args[!is.ctrl.arg]
}
model = mlr:::attachTrainingInfo(do.call(glmnet::cv.glmnet, args), info)
if (.learner$predict.type == "response") {
model
} else {
dat = getTaskData(.task, subset = .subset)
# computing baseline survival
coeffs = as.vector(coef(model, s = model$lambda.min))
index_nz = get_nz(coeffs)
cph = compute_coxph_mod(coeffs, index_nz, data = dat)
model = mlr:::attachTrainingTime(model, .task, dat)
model$cph = cph
model$index_nz <- index_nz
model
}
}
predictLearner.surv.cv.glmnet2 = function(.learner, .model, .newdata, ...) {
info = mlr:::getTrainingInfo(.model)
.newdata = mlr:::fixDataForLearner(.newdata, info)
preds = as.numeric(predict(.model$learner.model, newx = as.matrix(.newdata), type = "link", ...))
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = pec::predictSurvProb(.model$learner.model$cph,
newdata = .newdata[, .model$learner.model$index_nz, drop = FALSE],
times = train.times)
list(preds = preds, probs = probs, train.times = train.times)
}
}
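# Sketch of the control-argument routing above (assumes glmnet is installed):
# any hyperparameter whose name matches an entry of glmnet.control(), such as
# fdev or mnlam, is applied via glmnet.control() and stripped from the
# cv.glmnet() call; on.exit() then restores the previous control settings.
if (FALSE) {
saved = glmnet::glmnet.control()
args = list(fdev = 1e-6, nfolds = 5L)
is.ctrl = names(args) %in% names(saved)
do.call(glmnet::glmnet.control, args[is.ctrl]) # applies fdev only
args = args[!is.ctrl] # nfolds remains for cv.glmnet
do.call(glmnet::glmnet.control, saved) # restore the defaults
}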
# 1.3 surv.ts.ipflasso -------------------------------------------------------
makeRLearner.surv.ts.ipflasso = function() {
makeRLearnerSurv(
cl = "surv.ts.ipflasso",
package = "ipflasso",
par.set = makeParamSet(
makeLogicalLearnerParam(id = "standardize", default = TRUE),
makeNumericLearnerParam(id = "alpha", default = 1, lower = 0, upper = 1),
makeUntypedLearnerParam(id = "blocks"),
# makeNumericVectorLearnerParam(id = "pf", tunable = FALSE),
makeIntegerLearnerParam(id = "nfolds", default = 5L, lower = 3L),
makeIntegerLearnerParam(id = "ncv", default = 10L)
),
properties = c("numerics", "factors", "prob"),
name = "Two Step IPF-Lasso with prespecified penalty factors",
short.name = "tsipflasso",
note = "The penalty factors get prespecified via ridge regression on the training data. The block
structure has to be supplied."
)
}
trainLearner.surv.ts.ipflasso = function(.learner, .task, .subset, ...) {
# data, info and arguments
d = getTaskData(
.task,
subset = .subset,
target.extra = TRUE,
recode.target = "surv"
)
info = mlr:::getFixDataInfo(d$data,
factors.to.dummies = TRUE,
ordered.to.int = TRUE)
# blocks <- getLearnerParVals(.learner)$blocks
# new args for cvr.adaptive.lasso
args <- c(
list(
X = as.matrix(d$data),
Y = d$target,
family = "cox", # target variable datatype, i.e. survival
type.measure = "deviance", # CV-criterion to optimize in cvr.ipflasso
standardize = TRUE, #
alpha = 0, # Ridge regression in the first step
type.step1 = "sep" # separate models in the first step
),
list(...) # blocks, ncv and nfolds are provided within the bench experiment routine (ncv = 1, nfolds = 10)
)
rm(d)
# model fitting
mod <- do.call(ipflasso::cvr.adaptive.ipflasso, args) # 1. and 2. step
if (.learner$predict.type == "response") {
if (sum(mod$coeff[ , mod$ind.bestlambda] != 0) == 0) {
class(mod) <- "FailureModel"
} else {
class(mod) <- "cvr.ipflasso"
}
mod
} else {
coeffs <- mod$coeff[-1, mod$ind.bestlambda] # -1 to remove intercept (which is NA for Cox!)
index_nz = get_nz(coeffs) # if all coefs are zero, failure model results
dat <- getTaskData(.task, subset = .subset)
cph <- compute_coxph_mod(coeffs, index_nz, dat)
mod$cph <- cph
mod$index_nz <- index_nz
mod = mlr:::attachTrainingTime(mod, .task, dat)
class(mod) <- "cvr.ipflasso"
mod
}
}
predictLearner.surv.ts.ipflasso = function(.learner, .model, .newdata, ...) {
mod = .model$learner.model
preds = as.matrix(.newdata) %*% mod$coeff[-1, mod$ind.bestlambda]
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = pec::predictSurvProb(.model$learner.model$cph,
newdata = .newdata[, .model$learner.model$index_nz, drop = FALSE],
times = train.times)
list(preds = preds, probs = probs, train.times = train.times)
}
}
# 1.4 surv.ts.priorlasso -----------------------------------------------------
makeRLearner.surv.ts.priorlasso = function() {
makeRLearnerSurv(
cl = "surv.ts.priorlasso",
package = "prioritylasso",
par.set = makeParamSet(
makeUntypedLearnerParam(id = "blocks"),
makeLogicalLearnerParam(id = "block1.penalization", default = TRUE),
makeDiscreteLearnerParam(
id = "lambda.type",
default = "lambda.min",
values = c("lambda.min", "lambda.1se"),
tunable = FALSE
),
makeLogicalLearnerParam(id = "standardize", default = TRUE),
makeIntegerLearnerParam(
id = "nfolds",
default = 10L,
lower = 2L
),
makeLogicalLearnerParam(id = "cvoffset", default = FALSE),
makeIntegerLearnerParam(
id = "cvoffsetnfolds",
default = 10L,
lower = 2L,
requires = quote(cvoffset == TRUE)
),
makeLogicalLearnerParam(id = "favoring", default = FALSE)
),
properties = c("numerics", "factors", "prob"),
name = "Two step Priority Lasso",
short.name = "ts.priorlasso",
note = "Prioritylasso variante determining the priority datadriven in an separte step. Similiar to the 1. step
in tsipf-lasso in the 1. step penalty factors for each modality are fit. According to these penalty factors the
priortiy order is determined. The smaller (near zero) the penalty factor is, the greater is the importance of the
associated modality. Ordering the penalty factors implies the priority order of the modalities."
)
}
trainLearner.surv.ts.priorlasso = function(.learner, .task, .subset, ...) {
d = getTaskData(
.task,
subset = .subset,
target.extra = TRUE,
recode.target = "surv"
)
info = mlr:::getFixDataInfo(d$data,
factors.to.dummies = TRUE,
ordered.to.int = TRUE)
blocks <- getLearnerParVals(.learner)$blocks
favoring <- getLearnerParVals(.learner)$favoring
if (is.null(names(blocks))) {
warning("The list of bocks is not named. Naming the blocks by 'block_i'
with i in {1, ..., number of blocks}")
names(blocks) <- paste("block", seq(1:length(blocks)), sep = "_")
}
l_pf <- prespecify_pf(
X = as.matrix(d$data),
Y = d$target,
blocks = blocks,
separate = TRUE,
alpha = 0
)
if(length(blocks) != length(l_pf$pf)) {
stop("Length of penalty factors unequal to number of blocks.")
}
print("Penalty factors successfully computed. Continuing with model fitting")
args <- list(
X = as.matrix(d$data),
Y = d$target,
family = "cox",
type.measure = "deviance"
)
# should clinical variables be favored
if (favoring) {
priority_order <- blocks[c(names(l_pf$pf[1]), names(sort(l_pf$pf[-1])))]
args$blocks <- priority_order
args$block1.penalization <- FALSE
} else {
priority_order <- blocks[names(sort(l_pf$pf))]
args$blocks <- priority_order
}
print(c("Priortiy order: ", names(sort(l_pf$pf))), quote = FALSE)
model <- do.call(prioritylasso::prioritylasso, args)
model$priority_order <- names(priority_order)
if (.learner$predict.type == "response") {
model
} else {
coeffs <- model$coefficients
index_nz = get_nz(coeffs)
dat <- getTaskData(.task, subset = .subset)
cph <- compute_coxph_mod(coeffs, index_nz, dat)
model$cph = cph
model$index_nz <- index_nz
model = mlr:::attachTrainingTime(model, .task, dat)
model
}
}
predictLearner.surv.ts.priorlasso = function(.learner, .model, .newdata, ...) {
preds <- predict(.model$learner.model, newdata = .newdata, type = "link", ...)
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = pec::predictSurvProb(.model$learner.model$cph,
newdata = .newdata[, .model$learner.model$index_nz, drop = FALSE],
times = train.times)
list(preds = preds, probs = probs, train.times = train.times)
}
}
# 1.5 surv.cv.glmboost -------------------------------------------------------
makeRLearner.surv.cv.glmboost = function() {
makeRLearnerSurv(
cl = "surv.cv.glmboost",
package = c("!survival", "mboost"),
par.set = makeParamSet(
makeDiscreteLearnerParam(
id = "family",
default = "CoxPH",
values = c(
"CoxPH",
"Weibull",
"Loglog",
"Lognormal",
"Gehan",
"custom.family"
)
),
makeNumericVectorLearnerParam(
id = "nuirange",
default = c(0, 100),
requires = quote(family %in% c("Weibull", "Loglog", "Lognormal"))
),
makeUntypedLearnerParam(
id = "custom.family.definition",
requires = quote(family == "custom.family")),
makeIntegerLearnerParam(
id = "mstop",
default = 100L,
lower = 1L
),
makeNumericLearnerParam(
id = "nu",
default = 0.1,
lower = 0,
upper = 1
),
makeDiscreteLearnerParam(
id = "risk",
values = c("inbag", "oobag", "none")
),
makeLogicalLearnerParam(
id = "stopintern",
default = FALSE
),
# 'risk' and 'stopintern' are kept for the sake of completeness
makeLogicalLearnerParam(
id = "center",
default = FALSE
),
makeLogicalLearnerParam(
id = "use.formula",
default = TRUE,
when = "both"
),
makeLogicalLearnerParam(
id = "trace",
default = FALSE,
tunable = FALSE
)
),
par.vals = list(
family = "CoxPH",
use.formula = TRUE
),
properties = c("numerics", "factors", "ordered", "weights", "prob"),
name = "Gradient Boosting with Componentwise Linear Models and automatically
tuned mstop.",
short.name = "cv.glmboost",
note = "`family` has been set to `CoxPH()` by default.",
callees = c("glmboost", "mboost_fit", "boost_control", "CoxPH",
"Weibull", "Loglog", "Lognormal", "Gehan", "cvrisk")
)
}
trainLearner.surv.cv.glmboost = function(.learner,
.task,
.subset,
.weights = NULL,
nuirange = c(0, 100),
family,
custom.family.definition,
mstop,
nu,
risk,
stopintern,
trace,
use.formula,
...) {
ctrl = learnerArgsToControl(
mboost::boost_control,
mstop,
nu,
risk,
trace,
stopintern
)
family = switch(
family,
CoxPH = mboost::CoxPH(),
Weibull = mboost::Weibull(nuirange = nuirange),
Loglog = mboost::Loglog(nuirange = nuirange),
Lognormal = mboost::Lognormal(nuirange = nuirange),
Gehan = mboost::Gehan(),
custom.family = custom.family.definition
)
if (use.formula) {
f = getTaskFormula(.task)
model = if (is.null(.weights)) {
mboost::glmboost(
f,
data = getTaskData(.task, subset = .subset, recode.target = "surv"),
control = ctrl,
family = family,
...
)
} else {
mboost::glmboost(
f,
data = getTaskData(.task, subset = .subset, recode.target = "surv"),
control = ctrl,
weights = .weights,
family = family,
...
)
}
} else {
data = getTaskData(
.task,
subset = .subset,
target.extra = TRUE,
recode.target = "surv"
)
info = mlr:::getFixDataInfo(data$data,
factors.to.dummies = TRUE,
ordered.to.int = TRUE)
data$data = as.matrix(mlr:::fixDataForLearner(data$data, info))
data$data = cbind(intercept = 1, data$data)
model = if (is.null(.weights)) {
mboost::glmboost(
x = data$data,
y = data$target,
control = ctrl,
family = family,
...
)
} else {
mboost::glmboost(
x = data$data,
y = data$target,
control = ctrl,
weights = .weights,
family = family,
...
)
}
}
# compute optimal number of boosting iterations and adjust model accordingly
cv10f <- cv(model.weights(model), type = "kfold")
cvm = cvrisk(model, folds = cv10f, grid = 0L:1000L, papply = lapply)
mstop(model) <- mstop(cvm)
# training info only exists for the matrix interface
if (!use.formula) {
model = mlr:::attachTrainingInfo(model, info)
}
if (.learner$predict.type == "response") {
model
} else {
coeffs = coef(model)
features = names(coeffs)
# rebuild the training data (dummy-encoded for the matrix interface)
if (use.formula) {
dat = getTaskData(.task, subset = .subset)
} else {
dat = cbind(getTaskData(.task, subset = .subset)[, getTaskTargetNames(.task)],
as.data.frame(data$data))
}
sub_data = dat[, c("time", "status", features)]
cph = coxph(
Surv(time, status) ~ .,
data = sub_data,
init = coeffs,
iter.max = 0,
x = TRUE
)
model$cph = cph
model$features = features
model = mlr:::attachTrainingTime(model, .task, dat)
model
}
}
predictLearner.surv.cv.glmboost = function(.learner,
.model,
.newdata,
use.formula,
...) {
if (!use.formula) {
info = mlr:::getTrainingInfo(.model)
newdata = cbind(intercept = 1, mlr:::fixDataForLearner(.newdata, info))
preds = predict(.model$learner.model, newdata = as.matrix(newdata), type = "link")
} else {
newdata = .newdata
preds = predict(.model$learner.model, newdata = newdata, type = "link")
}
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = pec::predictSurvProb(
.model$learner.model$cph,
newdata = newdata[, .model$learner.model$features, drop = FALSE],
times = train.times
)
list(preds = preds,
probs = probs,
train.times = train.times)
}
}
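# Sketch of the mstop tuning used above (assumes mboost is attached): cvrisk()
# evaluates the out-of-fold empirical risk over a grid of boosting iterations
# and mstop(cvm) returns the minimizer, to which the model is then set.
if (FALSE) {
library(mboost)
m = glmboost(dist ~ speed, data = cars)
cvm = cvrisk(m, folds = cv(model.weights(m), type = "kfold"), grid = 1:200)
mstop(m) <- mstop(cvm) # shrinks or grows the model to the tuned mstop
}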
# 1.6 surv.cv.coxboost -------------------------------------------------------
makeRLearner.surv.cv.CoxBoost2 = function() {
makeRLearnerSurv(
cl = "surv.cv.CoxBoost2",
package = "!CoxBoost",
par.set = makeParamSet(
makeIntegerLearnerParam(id = "maxstepno",
default = 100L,
lower = 0L),
makeIntegerLearnerParam(id = "K",
default = 10L,
lower = 1L),
makeDiscreteLearnerParam(id = "type",
default = "verweij",
values = c("verweij", "naive")),
makeLogicalLearnerParam(id = "parallel",
default = FALSE,
tunable = FALSE),
makeLogicalLearnerParam(id = "upload.x",
default = FALSE,
tunable = FALSE),
makeLogicalLearnerParam(id = "multicore",
default = FALSE,
tunable = FALSE),
makeIntegerVectorLearnerParam(id = "unpen.index"),
makeLogicalLearnerParam(id = "standardize", default = TRUE),
makeNumericLearnerParam(id = "penalty", lower = 0),
makeDiscreteLearnerParam(id = "criterion",
default = "pscore",
values = c("pscore", "score", "hpscore", "hscore")),
makeNumericLearnerParam(id = "stepsize.factor",
default = 1,
lower = 0),
makeLogicalLearnerParam(id = "trace",
default = FALSE,
tunable = FALSE)
),
properties = c("numerics", "factors", "weights", "prob"),
name = "Cox Proportional Hazards Model with Componentwise Likelihood based
Boosting, tuned for the optimal number of boosting steps",
short.name = "cv.CoxBoost",
note = "Factors automatically get converted to dummy columns,
ordered factors to integer.",
callees = c("cv.CoxBoost", "CoxBoost")
)
}
trainLearner.surv.cv.CoxBoost2 = function(.learner,
.task,
.subset,
.weights = NULL,
penalty = NULL,
unpen.index = NULL,
...) {
# preparing data and params -------------------------------------------------
data = getTaskData(.task,
subset = .subset,
target.extra = TRUE,
recode.target = "surv")
info = mlr:::getFixDataInfo(data$data,
factors.to.dummies = TRUE,
ordered.to.int = TRUE)
unpen.index = getLearnerParVals(.learner)$unpen.index
if (is.null(penalty))
penalty = 9 * sum(data$target[, 2L])
pars = c(list(time = data$target[, 1L],
status = data$target[, 2L],
x = as.matrix(mlr:::fixDataForLearner(data$data, info)),
penalty = penalty,
weights = .weights,
unpen.index = unpen.index),
list(...))
rm(data)
# tuning --------------------------------------------------------------------
# determining the optimal number of boosting iterations
res = do.call(CoxBoost::cv.CoxBoost, pars)
if (res$optimal.step == 0L)
warning("Could not determine the optimal step number in cv.CoxBoost")
pars = BBmisc::insert(pars, list(stepno = res$optimal.step))
pars$maxstepno = NULL
# model fitting -------------------------------------------------------------
# with optimal number of boosting iterations
model = mlr:::attachTrainingInfo(do.call(CoxBoost::CoxBoost, pars), info)
if (.learner$predict.type == "response") {
model
} else {
dat = getTaskData(.task, subset = .subset)
model = mlr:::attachTrainingTime(model, .task, dat)
model
}
}
predictLearner.surv.cv.CoxBoost2 = function(.learner, .model, .newdata, ...) {
info = mlr:::getTrainingInfo(.model)
.newdata = as.matrix(mlr:::fixDataForLearner(.newdata, info))
preds = as.numeric(predict(.model$learner.model,
newdata = .newdata,
type = "lp"))
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = predict(.model$learner.model,
newdata = .newdata, type = "risk",
times = train.times)
list(preds = preds,
probs = probs,
train.times = train.times)
}
}
# 1.7 surv.blockForest -------------------------------------------------------
makeRLearner.surv.blockForest = function() {
makeRLearnerSurv(
cl = "surv.blockForest",
package = "blockForest",
par.set = makeParamSet(
makeUntypedLearnerParam(id = "blocks"),
makeIntegerLearnerParam(id = "num.trees", default = 2000L, lower = 1L),
makeIntegerLearnerParam(id = "mtry", default = NULL, special.vals = list(NULL)),
makeIntegerLearnerParam(id = "nsets", default = 300, lower = 1),
makeIntegerLearnerParam(id = "num.trees.pre", default = 1500, lower = 1),
makeIntegerLearnerParam(id = "num.threads", default = NULL, special.vals = list(NULL)),
makeDiscreteLearnerParam(id = "splitrule", default = "extratrees", values = c("extratrees", "logrank", "C", "maxstat"))
),
properties = c("missings", "numerics", "factors", "weights", "prob", "rcens"),
name = "Random Forest variants for block-structured covariate data",
short.name = "blockForest",
note = ""
)
}
trainLearner.surv.blockForest = function (.learner, .task, .subset, .weights = NULL, ...) {
d = getTaskData(.task, subset = .subset, target.extra = TRUE)
info = mlr:::getFixDataInfo(d$data,
factors.to.dummies = TRUE,
ordered.to.int = TRUE)
args = c(list(X = as.matrix(d$data),
y = as.matrix(d$target),
block.method = "BlockForest"),
list(...))
model = do.call(blockForest::blockfor, args)
dat = getTaskData(.task, subset = .subset)
model = mlr:::attachTrainingTime(model, .task, dat)
model
}
predictLearner.surv.blockForest = function (.learner, .model, .newdata, ...) {
p = predict(object = .model$learner.model$forest, data = .newdata)
preds = rowMeans(p$chf)
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
ptemp = p$survival
pos = prodlim::sindex(jump.times = p$unique.death.times, eval.times = train.times)
probs = cbind(1, ptemp)[, pos + 1, drop = FALSE]
list(preds = preds, probs = probs, train.times = train.times)
}
}
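# Minimal illustration of the survival-curve lookup above (assumes prodlim is
# installed): sindex() counts, for each evaluation time, the jump times lying
# at or before it, so cbind(1, surv)[, pos + 1] evaluates the step function,
# returning probability 1 before the first observed event.
if (FALSE) {
jump = c(2, 5, 9) # unique death times in the forest
eval = c(0, 1, 2, 6, 10) # training times to evaluate at
prodlim::sindex(jump.times = jump, eval.times = eval) # 0 0 1 2 3
}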
# 1.8 surv.cvSGL ---------------------------------------------------------------
makeRLearner.surv.cvSGL = function() {
makeRLearnerSurv(
cl = "surv.cvSGL",
package = "SGL",
par.set = makeParamSet(
makeUntypedLearnerParam(id = "index", default = NULL),
makeIntegerLearnerParam(id = "maxit", default = 1000),
makeNumericLearnerParam(id = "thresh", default = 0.001),
makeNumericLearnerParam(id = "min.frac", default = 0.1),
makeIntegerLearnerParam(id = "nlam", default = 20),
makeNumericLearnerParam(id = "gamma", default = 0.8, lower = 0, upper = 1),
makeIntegerLearnerParam(id = "nfold", default = 10L, lower = 3),
makeLogicalLearnerParam(id = "standardize", default = TRUE),
makeLogicalLearnerParam(id = "verbose", default = FALSE),
makeNumericLearnerParam(id = "step", default = 1, lower = 0, upper = 1),
makeIntegerLearnerParam(id = "reset", default = 10),
makeNumericLearnerParam(id = "alpha", default = 0.95, lower = 0, upper = 1),
makeUntypedLearnerParam(id = "lambda", default = NULL)
),
properties = c("numerics", "factors", "prob"),
name = "Cross-validated Sparse Group Lasso",
short.name = "cvSGL",
note = ""
)
}
trainLearner.surv.cvSGL = function(.learner, .task, .subset, ...) {
d = getTaskData(.task, subset = .subset, target.extra = TRUE)
info = mlr:::getFixDataInfo(d$data, factors.to.dummies = TRUE, ordered.to.int = TRUE)
# adjustment for all-zero-vars ---------------------------------------------------
all_zero = vapply(d$data, function(x) all(x == 0), logical(1L))
if (any(all_zero)) {
d$data = d$data[, !all_zero, drop = FALSE]
}
# block adjustment ---------------------------------------------------------------
blks = get_blocks2(d$data)
bl_sizes = vapply(blks, length, FUN.VALUE = integer(1L))
sgl_blocks = unlist(mapply(rep.int, times = bl_sizes, x = seq_along(blks)))
ind = which(names(list(...)) == "index")
# prepare args -------------------------------------------------------------------
args = c(list(data = list(x = as.matrix(d$data),
time = d$target$time,
status = d$target$status),
type = "cox",
index = sgl_blocks),
list(...)[-ind])
# SGL model --------------------------------------------------------------------------
mod = do.call(SGL::cvSGL, args)
# coefficient extraction and check----------------------------------------------------
index_nz <- FALSE
count <- which.min(mod$lldiff)
while (!any(index_nz)) {
if (count > length(mod$lldiff)) {
stop("Null model fitted.")
}
coeffs = mod$fit$beta[, count]
index_nz = get_nz(coeffs, check_all_zero = FALSE)
count = count + 1
}
# coxph model for predicted probs ------------------------------------------------
dat = cbind(d$target, d$data)
cph = compute_coxph_mod(coeffs, index_nz, data = dat)
model = mlr:::attachTrainingTime(mod, .task, dat)
model$cph = cph
model$index_nz = index_nz
model
}
predictLearner.surv.cvSGL = function(.learner, .model, .newdata, ...) {
lmod = .model$learner.model
# adjust the new data matrix for the all-zero variables removed during training
i_selected = names(.newdata) %in% colnames(lmod$fit$X)
newX = .newdata[, i_selected, drop = FALSE]
# compute linear predictors from the coxph representation
coeffs = lmod$cph$coefficients
vars = names(coeffs)
tempX = as.matrix(newX[vars])
eta = tempX %*% coeffs
preds = exp(eta)
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = pec::predictSurvProb(.model$learner.model$cph,
newdata = newX[, .model$learner.model$index_nz, drop = FALSE],
times = train.times)
list(preds = preds,
probs = probs,
train.times = train.times)
}
}
# 1.9 surv.grridge ----------------------------------------------------------
makeRLearner.surv.grridge = function() {
makeRLearnerSurv(
cl = "surv.grridge",
package = "GRridge",
par.set = makeParamSet(
makeUntypedLearnerParam(id = "partitions"), # normal blocks (or more general list of list)
makeUntypedLearnerParam(id = "unpenal"), # needs to be formula
makeDiscreteLearnerParam(id = "method",
default = "exactstable",
values = c("exactstable", "stable", "exact")),
makeIntegerLearnerParam(id = "niter",
default = 10,
lower = 1),
makeLogicalLearnerParam(id = "monotone",
default = NULL,
special.vals = list(NULL)),
makeNumericLearnerParam(id = "oplt",
lower = 0,
upper = Inf,
default = NULL,
special.vals = list(NULL)),
makeIntegerLearnerParam(id = "innfold",
lower = 0,
default = NULL,
special.vals = list(NULL)),
makeLogicalLearnerParam(id = "selectionEN",
default = FALSE,
special.vals = list(NULL)), # needs to be true for bench experiment
makeIntegerVectorLearnerParam(id = "maxsel",
default = c(25,100),
special.vals = list(NULL)), # if set to NULL the max. no of selc. features equals nrow
makeUntypedLearnerParam(id = "dataunpen"),
makeLogicalLearnerParam(id = "standardizeX",
default = TRUE,
special.vals = list(NULL)),
makeDiscreteLearnerParam(id = "savepredobj",
default = "all",
values = c("all", "last", "none"))
),
properties = c("missings", "numerics", "factors", "weights", "prob", "rcens"),
name = "Group-regularized (logistic) ridge regression",
short.name = "GRridge",
note = ""
)
}
trainLearner.surv.grridge = function (.learner, .task, .subset, .weights = NULL, ...)
{
d = getTaskData(.task, subset = .subset, target.extra = TRUE, recode.target = "surv")
info = mlr:::getFixDataInfo(d$data, factors.to.dummies = TRUE, ordered.to.int = TRUE)
# adjustment for all-zero variables ------------------------------------------------
zero_check <- apply(d$data, 2, function(x) all(x == 0))
if (any(zero_check)) {
all_zeros <- which(zero_check)
d$data = d$data[, -all_zeros, drop = FALSE]
}
args = c(list(highdimdata = t(d$data),
response = d$target),
list(...))
if (is.null(args$maxsel))
args$maxsel <- ncol(d$data)
# 1. First CV the lambdas separately for the groups of different variables.
# 2. The square-root of the inverses of these 5 penalties would then be the
# initial group weights, wt in the code below
# 3. Then apply these as initial weight for GRridge
# 4. GRidge will then moderate the penalty weights, thereby appropriately
# accounting for possible collinearities across the 5 groups of variables
blocks <- get_blocks2(d$data)
lambdas <- vector("integer", length = length(blocks))
args_optL2 <- list(response = args$response,
fold = 5L)
scaled_dat <- as.data.frame(scale(d$data))
for (opt in seq_along(blocks)) {
args_optL2$penalized <- scaled_dat[, blocks[[opt]]]
lambdas[opt] <- do.call(penalized::optL2, args = args_optL2)$lambda
}
ngr <- vapply(blocks, length, FUN.VALUE = integer(1))
wt <- function(i) rep(1/sqrt(lambdas[i]), ngr[i])
weights0 <- unlist(sapply(seq_along(blocks), wt))
weights <- weights0/mean(weights0)
args$highdimdata <- t(d$data) * weights # scale each variable (row after transposing) by its initial weight
args$partitions <- blocks
args$method <- "exactstable"
model <- do.call(GRridge::grridge, args)
# model coefficients
coeffs <- model$betas
# setting NA-coeffs to zero for stability. Will not be used in model
if (any(is.na(coeffs))) {
coeffs[is.na(coeffs)] = 0
}
# NOTE: this routine is only safe for a single maxsel value; the first entry of resEN is used
index_nz <- names(d$data) %in% names(model$resEN[[1]]$betasEN) # logicals instead of indices, since compute_coxph_mod needs logicals
# stop if all coeffs are zero (mlr turns the error into a FailureModel)
if (!any(index_nz)) {
stop("Null model fitted.")
}
dat = cbind(getTaskData(.task, subset = .subset)[, getTaskTargetNames(.task)], d$data)
cph <- compute_coxph_mod(coeffs, index_nz, dat) # may crash R if too many non-zero coefficients
model$cph = cph
model$index_nz <- index_nz
model = mlr:::attachTrainingTime(model, .task, dat) # check for problems
model$weights <- weights
if (exists("all_zeros")) {
model$all_zeros <- all_zeros
} else {
model$all_zeros <- NULL
}
class(model) <- "grridge"
model
}
predictLearner.surv.grridge = function (.learner, .model, .newdata, ...) {
lmod <- .model$learner.model
# drop the all-zero variables removed during training so ncol matches the
# design matrix, then apply the column weights from the training routine
if (!is.null(lmod$all_zeros)) {
newX <- .newdata[, -lmod$all_zeros, drop = FALSE]
} else {
newX <- .newdata
}
newX <- sweep(newX, 2, lmod$weights, "*") # scale each column by its weight
# subset to the variables kept by the elastic-net selection
i_selected <- names(newX) %in% names(lmod$resEN[[1]]$betasEN)
newX <- newX[, i_selected, drop = FALSE]
coeffs <- coef(lmod)
vars <- names(coeffs)
tempX <- as.matrix(newX[vars])
eta <- tempX %*% coeffs
preds <- exp(eta)
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = pec::predictSurvProb(.model$learner.model$cph,
newdata = as.data.frame(newX),
times = train.times)
list(preds = preds, probs = probs, train.times = train.times)
}
}
# 1.10 surv.tunedrfsrc ---------------------------------------------------------
makeRLearner.surv.randomForestSRC2 = function() {
makeRLearnerSurv(
cl = "surv.randomForestSRC2",
package = c("survival", "randomForestSRC"),
par.set = makeParamSet(
makeIntegerLearnerParam(id = "ntree", default = 1000L, lower = 1L),
makeDiscreteLearnerParam(id = "bootstrap", default = "by.root",
values = c("by.root", "by.node", "none")),
makeIntegerLearnerParam(id = "mtry", lower = 1L),
makeIntegerLearnerParam(id = "nodesize", lower = 1L, default = 3L),
makeIntegerLearnerParam(id = "nodedepth", default = -1L),
makeDiscreteLearnerParam(id = "splitrule", default = "logrank",
values = c("logrank", "logrankscore", "random")),
makeIntegerLearnerParam(id = "nsplit", lower = 0L, default = 0L,
requires = quote(splitrule != "random")), # nsplit is ignored and internally set to 1 for splitrule = "random"
makeLogicalLearnerParam(id = "split.null", default = FALSE),
makeDiscreteLearnerParam(id = "importance", default = FALSE, tunable = FALSE,
values = list(`FALSE` = FALSE, `TRUE` = TRUE, "none", "permute", "random",
"anti", "permute.ensemble", "random.ensemble", "anti.ensemble")),
makeDiscreteLearnerParam(id = "na.action", default = "na.impute",
values = c("na.omit", "na.impute"), when = "both"),
makeIntegerLearnerParam(id = "nimpute", default = 1L, lower = 1L),
makeUntypedLearnerParam(id = "ntime"), # can be a single integer with number of time points or a numeric vector of time values
makeDiscreteLearnerParam(id = "proximity", default = FALSE, tunable = FALSE,
values = list("inbag", "oob", "all", `TRUE` = TRUE, `FALSE` = FALSE)),
makeIntegerLearnerParam(id = "sampsize", lower = 1L,
requires = quote(bootstrap == "by.root")),
makeDiscreteLearnerParam(id = "samptype", default = "swr", values = c("swr", "swor"),
requires = quote(bootstrap == "by.root")),
makeNumericVectorLearnerParam(id = "xvar.wt", lower = 0),
makeLogicalLearnerParam(id = "forest", default = TRUE, tunable = FALSE),
makeDiscreteLearnerParam(id = "var.used", default = FALSE, tunable = FALSE,
values = list(`FALSE` = FALSE, "all.trees", "by.tree")),
makeDiscreteLearnerParam(id = "split.depth", default = FALSE, tunable = FALSE,
values = list(`FALSE` = FALSE, "all.trees", "by.tree")),
makeIntegerLearnerParam(id = "seed", upper = 0L, tunable = FALSE),
makeLogicalLearnerParam(id = "do.trace", default = FALSE, tunable = FALSE, when = "both"), # is currently ignored
makeLogicalLearnerParam(id = "membership", default = TRUE, tunable = FALSE),
makeLogicalLearnerParam(id = "statistics", default = FALSE, tunable = FALSE),
makeLogicalLearnerParam(id = "tree.err", default = FALSE, tunable = FALSE)
),
par.vals = list(na.action = "na.impute"),
properties = c("missings", "numerics", "factors", "ordered", "weights", "oobpreds", "featimp", "prob"),
name = "Random Forest",
short.name = "rfsrc",
note = '`na.action` has been set to `"na.impute"` by default to allow missing data support.',
callees = "rfsrc"
)
}
trainLearner.surv.randomForestSRC2 = function(.learner, .task, .subset, .weights = NULL, ...) {
data = getTaskData(.task, subset = .subset)
f = getTaskFormula(.task)
model = randomForestSRC::tune(f, data = data, case.wt = .weights, doBest = TRUE, ...)
# tune() returns a list; the forest refit with the tuned parameters is in $rf
model = model$rf
if (.learner$predict.type == "response") {
model
} else {
model = mlr:::attachTrainingTime(model, .task, data)
model
}
}
predictLearner.surv.randomForestSRC2 = function(.learner, .model, .newdata, ...) {
preds = predict(.model$learner.model, newdata = .newdata, membership = FALSE, ...)$predicted
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
probs = pec::predictSurvProb(.model$learner.model, newdata = .newdata, times = train.times)
list(preds = preds, probs = probs, train.times = train.times)
}
}
# 1.11 surv.tunedranger -------------------------------------------------------
makeRLearner.surv.tuneMtryFast2 = function() {
makeRLearnerSurv(
cl = "surv.tuneMtryFast2",
package = "tuneRanger",
par.set = makeParamSet(
makeIntegerLearnerParam(id = "mtryStart", lower = 1L),
makeIntegerLearnerParam(id = "num.treesTry", lower = 1L, default = 50L),
makeNumericLearnerParam(id = "stepFactor", lower = 0L),
makeNumericLearnerParam(id = "improve", lower = 0L),
makeLogicalLearnerParam(id = "trace", default = TRUE),
makeIntegerLearnerParam(id = "min.node.size", lower = 1L),
makeLogicalLearnerParam(id = "replace", default = TRUE),
makeNumericLearnerParam(id = "sample.fraction", lower = 0L, upper = 1L),
makeNumericVectorLearnerParam(id = "split.select.weights", lower = 0, upper = 1),
makeUntypedLearnerParam(id = "always.split.variables"),
makeDiscreteLearnerParam("respect.unordered.factors", values = c("ignore", "order", "partition"), default = "ignore"),
makeDiscreteLearnerParam(id = "importance", values = c("none", "impurity", "permutation"), default = "none", tunable = FALSE),
makeLogicalLearnerParam(id = "write.forest", default = TRUE, tunable = FALSE),
makeLogicalLearnerParam(id = "scale.permutation.importance", default = FALSE, requires = quote(importance == "permutation"), tunable = FALSE),
makeIntegerLearnerParam(id = "num.threads", lower = 1L, when = "both", tunable = FALSE),
makeLogicalLearnerParam(id = "save.memory", default = FALSE, tunable = FALSE),
makeLogicalLearnerParam(id = "verbose", default = TRUE, when = "both", tunable = FALSE),
makeIntegerLearnerParam(id = "seed", when = "both", tunable = FALSE),
makeDiscreteLearnerParam(id = "splitrule", values = c("gini", "extratrees"), default = "gini"),
makeIntegerLearnerParam(id = "num.random.splits", lower = 1L, default = 1L, requires = quote(splitrule == "extratrees")),
makeLogicalLearnerParam(id = "keep.inbag", default = FALSE, tunable = FALSE)
),
properties = c("numerics", "factors", "ordered", "weights", "prob"),
name = "tuneMtryFast for ranger",
short.name = "tuneMtryFast",
note = ""
)
}
trainLearner.surv.tuneMtryFast2 = function(.learner, .task, .subset, .weights = NULL, ...) {
data = getTaskData(.task, .subset)
tn = getTaskTargetNames(.task)
model = tuneRanger::tuneMtryFast(
formula = NULL,
data = data,
dependent.variable.name = tn[1L],
status.variable.name = tn[2L],
doBest = TRUE,
case.weights = .weights,
...
)
model = mlr:::attachTrainingTime(model, .task, data)
model
}
predictLearner.surv.tuneMtryFast2 = function(.learner, .model, .newdata, ...) {
p = predict(object = .model$learner.model, data = .newdata)
preds = rowMeans(p$chf)
if (.learner$predict.type == "response") {
preds
} else {
train.times = sort(unique(c(0, .model$learner.model$times)))
ptemp = p$survival
pos = prodlim::sindex(jump.times = p$unique.death.times, eval.times = train.times)
probs = cbind(1, ptemp)[, pos + 1, drop = FALSE]
list(preds = preds, probs = probs, train.times = train.times)
}
}
# Help functions --------------------------------------------------------------
compute_coxph_mod <- function(coeffs, index_nz, data, ...) {
index_nz_y <- c(TRUE, TRUE, index_nz) # always keep the time and status columns
sub_data <- data[, index_nz_y, drop = FALSE]
coxph(
Surv(time, status) ~ .,
data = sub_data,
init = coeffs[index_nz],
iter.max = 0,
x = TRUE
)
}
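# Why init = coeffs and iter.max = 0: coxph() performs no Newton steps, so the
# returned model carries exactly the penalized coefficients while providing the
# baseline hazard machinery that pec::predictSurvProb() needs. A minimal sketch
# with made-up coefficients (survival attached; the non-convergence warning is
# expected and can be ignored):
if (FALSE) {
d = na.omit(survival::lung[, c("time", "status", "age", "sex")])
d$status = d$status - 1
compute_coxph_mod(coeffs = c(0.02, -0.5), index_nz = c(TRUE, TRUE), data = d)
}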
check_coeffs <- function(coeffs) {
# check for NA coeffs and set to zero
if (any(is.na(coeffs))) {
coeffs[is.na(coeffs)] = 0
}
coeffs
}
make_nz <- function(coeffs, check_all_zero = TRUE) {
index_nz <- coeffs != 0
if (check_all_zero) {
if (!any(index_nz)) {
stop("Null model fitted.")
}
}
index_nz
}
get_nz <- function(coeffs, check_all_zero = TRUE) {
coeffs %>%
check_coeffs() %>%
make_nz(check_all_zero = check_all_zero)
}
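# Quick illustration of the coefficient-cleaning pipeline (assumes magrittr's
# pipe is available): NA coefficients are treated as zero before indexing.
if (FALSE) {
get_nz(c(1.2, 0, NA, -0.3)) # TRUE FALSE FALSE TRUE
}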
# End 1. ---------------------------------------------------------------------
# 2. custom measures ----------------------------------------------------------
get_selected_feats = function(task, model, pred, feats, extra.args) {
if (inherits(model, "FailureModel")) {
return(NA_real_)
}
if (inherits(model$learner.model, "cv.glmnet")) {
coefs <- coef.cv.glmnet2(model$learner.model)
} else {
coefs <- coef(model$learner.model)
}
if (any(is.na(coefs))) {
coefs[is.na(coefs)] <- 0
}
if (any(coefs == 0)) {
coefs <- coefs[coefs != 0]
}
# get selected features
if (extra.args$group == "default") {
length(coefs)
} else {
sel_gr_feats <- grep(pattern = extra.args$group,
x = names(coefs),
value = TRUE)
length(sel_gr_feats)
}
}
# 2.1 group features ----------------------------------------------------------
group_meas <-
c("featselc_cnv",
"featselc_rna",
"featselc_mirna",
"featselc_clin",
"featselc_mutation")
properties = c(
"classif",
"classif.multi",
"multilabel",
"regr",
"surv",
"costsens",
"cluster",
"req.model",
"req.pred"
)
for (m in group_meas) {
grp <- strsplit(m, "_")[[1]][2]
temp <- makeMeasure(
id = m,
minimize = TRUE,
best = 0,
worst = Inf,
properties = properties,
name = paste0("Amount of features of feature group '", grp, "' used for the model"),
note = "Useful for feature selection. The feature group is specified via extra.arg.
The names of the features within the group must contain that string. Make sure names of
features of other groups do not contain this string.",
fun = get_selected_feats,
extra.args = list(group = paste0("_", grp)),
aggr = test.mean
)
assign(m, temp)
}
# 2.2 featselc_default -------------------------------------------------------
featselc_default <-
makeMeasure(
id = "featselc_default",
minimize = TRUE,
best = 0,
worst = Inf,
properties = properties,
name = "Overall amount of selected features.",
note = "Useful for feature selection.",
fun = get_selected_feats,
extra.args = list(group = "default"),
aggr = test.mean
)
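# Usage sketch (illustrative; assumes feature names carry group suffixes such
# as "_clin", and task/lrn as in the sketch after learner 1.1): the measures
# plug into mlr's resampling machinery next to standard survival measures.
if (FALSE) {
res = resample(lrn, task, resampling = cv3,
measures = list(cindex, featselc_default, featselc_clin))
res$aggr
}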
# Help functions --------------------------------------------------------------
coef.cv.SGL <- function(model) {
model$cph$coefficients
}
coef.grridge <- function(model) {
model$resEN[[1]]$betasEN
}
coef.cv.glmnet2 <- function(model) {
lambda_ind <- which(model$lambda == model$lambda.min)
index_nz <- model$glmnet.fit$beta[, lambda_ind] != 0
model$glmnet.fit$beta[index_nz, lambda_ind]
}
coef.cvr.ipflasso <- function(model) {
model$coeff[, model$ind.bestlambda]
}
# End 2.----------------------------------------------------------------------
# 3. task specific learners ---------------------------------------------------
# 3.1 Main function ----------------------------------------------------------
make_spec_lrns <- function(task, lrnr, args) {
# derive blocks -------------------------------------------------------------
data <- getTaskData(task)
blocks <- get_blocks2(data)
clinicals <- blocks$clinical
# create task specific learners ---------------------------------------------
args <- args[[lrnr]]
# Clinical reference
if (lrnr == "lrn_clin_ref") {
args$clinicals <- clinicals
lrn_clin_ref <- do.call(makeLearner, args = args)
}
# TS priority lasso
if (lrnr == "lrn_ts_prior") {
args$blocks <- blocks
lrn_ts_prior <- do.call(makeLearner, args = args)
}
# TS priority lasso favoring
if (lrnr == "lrn_ts_prior_fav") {
args$blocks <- blocks
lrn_ts_prior_fav <- do.call(makeLearner, args = args)
}
# IPF-Lasso
if (lrnr == "lrn_tsipf") {
args$blocks <- blocks
lrn_tsipf <- do.call(makeLearner, args = args)
}
# mandatory boostings
if (lrnr == "lrn_cv_coxboost_unpen") {
args$unpen.index <- clinicals
lrn_cv_coxboost_unpen <- do.call(makeLearner, args = args)
}
# blockForest
if (lrnr == "lrn_blockForest") {
args$blocks <- blocks
lrn_blockForest <- do.call(makeLearner, args = args)
}
# SGL
if (lrnr == "lrn_SGL") {
args$index <- blocks
lrn_SGL <- do.call(makeLearner, args = args)
}
# grridge
if (lrnr == "lrn_grridge") {
args$partitions <- blocks
lrn_grridge <- do.call(makeLearner, args = args)
}
list(get(lrnr))
}
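# Usage sketch (the args layout is an assumption: a named list with one entry
# per learner id holding its makeLearner() arguments; task as elsewhere):
if (FALSE) {
args = list(lrn_blockForest = list(cl = "surv.blockForest",
predict.type = "prob"))
lrns = make_spec_lrns(task, lrnr = "lrn_blockForest", args = args)
}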
# 3.2 Help functions ---------------------------------------------------------
# 3.2.1 Get group sizes ------------------------------------------------------
get_grp_size = function(group, data, sep = "_") {
gr_vars <- grep(paste0(sep, group), names(data))
length(gr_vars)
}
# 3.2.2 Get indices w.r.t. block membership ---------------------------------
get_blocks2 <- function(data, groups = c("clinical", "cnv", "mirna", "mutation", "rna")) {
blength <- vapply(groups, function(group) get_grp_size(group, data), integer(1L)) # atomic vector, safe for rep()
nam <- groups
if (any(blength == 0)) {
ind <- which(blength == 0)
blength <- blength[-ind]
nam <- nam[-ind]
}
blocks <- rep(seq_along(blength), times = blength)
blocks <- lapply(seq_along(blength), function(x) which(blocks == x))
names(blocks) <- nam
blocks
}
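# Illustration: block membership is derived purely from column-name suffixes.
# Note the implicit assumption that the columns of 'data' are ordered by group
# in the order given in 'groups'; empty groups are dropped.
if (FALSE) {
toy = data.frame(age_clinical = 1, sex_clinical = 1, m1_mirna = 1,
g1_rna = 1, g2_rna = 1)
get_blocks2(toy)
# $clinical: 1 2, $mirna: 3, $rna: 4 5
}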
# 3.2.4 Prespecify penalty factors for 2-step ipflasso/priorlasso------------
# 3.2.4.1 Main function ----------------------------------------------------
prespecify_pf <- function(X,
Y, # Needs to be Surv-Object
family = "cox",
standardize = TRUE,
nfolds = 5,
type_measure = "deviance",
blocks,
separate = TRUE,
alpha = 1,
mean_type = "arith") {
l_args <- list(x = as.matrix(X),
y = Y,
family = family,
standardize = standardize,
nfolds = nfolds,
type.measure = type_measure,
alpha = alpha)
# compute mean coeffs
if (separate) {
means <- compute_coeffs_sep(l_args = l_args, blocks = blocks, mean_type = mean_type)
} else {
means <- compute_coeffs_comb(l_args = l_args, blocks = blocks, mean_type = mean_type)
}
# get penalty factors and subset to non zero effect variables
if (any(means != 0)) {
means_check <- 1/means
exc <- create_exclusion_vector(blocks = blocks, means_check = means_check)
pf <- means_check[is.finite(means_check)]
new_blocks <- subset_blocks(blocks = blocks, means_check)
return(list(exc = exc, pf = pf, new_blocks = new_blocks))
} else {
warning("All step 1 coefficients are zero")
return(NULL)
}
}
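# Sketch on toy data (assumes glmnet and survival are available): a ridge model
# is fit per block, the mean absolute coefficients are inverted into penalty
# factors, and blocks whose coefficients are all zero are marked for exclusion.
if (FALSE) {
set.seed(1)
X = matrix(rnorm(100 * 10), 100, 10, dimnames = list(NULL, paste0("v", 1:10)))
Y = survival::Surv(rexp(100), rbinom(100, 1, 0.7))
blocks = list(block_1 = 1:4, block_2 = 5:10)
prespecify_pf(X = X, Y = Y, blocks = blocks, separate = TRUE, alpha = 0)
}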
# 3.2.4.2 Help functions ---------------------------------------------------
compute_coeffs_comb <- function(l_args, mean_type, blocks) {
# model
mod <- do.call("cv.glmnet", l_args)
# mod <- cv.glmnet(x = X, y = Y, family = family, standardize = standardize, nfolds = nfolds, type.measure = type.measure, alpha = alpha)
# coeffs
coeffs <- mod %>%
coef(s = "lambda.min") %>%
abs() %>%
as.matrix()
# indexing blocks
block_length <- vapply(blocks, length, FUN.VALUE = integer(1))
index <- rep(seq_along(blocks), times = block_length)
# means
if (mean_type == "arith") {
means <- as.numeric(tapply(coeffs[,1], index, mean))
} else {
means <- as.numeric(tapply(coeffs[,1], index, geometric.mean))
}
return(means)
}
compute_coeffs_sep <- function(l_args, blocks, mean_type) {
means <- rep(0, length(blocks))
names(means) <- names(blocks)
# compute a separate model for every modality
for (l in seq_along(blocks)) {
temp_args <- l_args
temp_args[[1]] <- temp_args[[1]][, blocks[[l]], drop = FALSE]
# model
mod <- do.call(glmnet::cv.glmnet, temp_args)
# coefficients
abs_coeffs <- abs(coef(mod, s = "lambda.min"))
# mean
if (mean_type == "arith") {
means[l] <- mean(abs_coeffs)
} else {
means[l] <- geometric.mean(abs_coeffs)
}
}
means
}
create_exclusion_vector <- function(blocks, means_check) {
exc <- NULL
a <- 1
for (m in seq_along(blocks)) {
if (is.infinite(means_check[m])) {
exc <- append(exc, a:(a + length(blocks[[m]]) - 1))
}
a <- a + length(blocks[[m]])
}
exc
}
subset_blocks <- function(blocks, means_check) {
# drop blocks whose step 1 coefficients were all zero (infinite penalty factor)
blocks <- blocks[is.finite(means_check)]
# renumber the remaining blocks consecutively over the reduced design matrix
a <- 1
for (q in seq_along(blocks)) {
blocks[[q]] <- a:(a + length(blocks[[q]]) - 1)
a <- a + length(blocks[[q]])
}
blocks
}
# End 3. ---------------------------------------------------------------------
# 4. sample reproduction ------------------------------------------------------
# sample_reproduction <- function(n_lrns = 12L, n_datsets = 1L, seed = 1234L) {
# set.seed(seed)
# lrns <- sort(sample(1:12, n_lrns))
# datsets <- sort(sample(1:18, n_datsets))
#
# list("lrns" = lrns, "datsets" = datsets)
# }
# End 4. ---------------------------------------------------------------------