BloodCancerMultiOmics2017 / Git / [226bc8] /R/part11.R

Models:
WandaB/
BloodCancerMultiOmics2017
Downloads: 1
[226bc8]: / R / part11.R
History
Download this file
476 lines (411 with data), 18.9 kB

# Script contains helper functions for drug response prediction

# label the drugs with proper u
giveDrugLabel3 = function(drid, dtab=BloodCancerMultiOmics2017::drugs,
                          ctab=BloodCancerMultiOmics2017::conctab) {
  vapply(strsplit(drid, "_"), function(x) {
    k <- paste(x[1:2], collapse="_")
    paste0(dtab[k, "name"], " ",
           switch(x[3], "1:5"="c1:5", "4:5"="c4:5",
                  paste0(ctab[k, as.integer(x[3])], " \u00B5","M")))
  }, character(1))
}

# select features from the lpd object to use as response and predictors in the model
featureSelectionForLasso = function(objective, predictors, lpd) {
  
  # quiets concerns of R CMD check "no visible binding for global variable"
  dataType=NULL
  
  dimobj = dimnames(objective)
  objectiveName = dimobj[[1]][1]
  
  #design matix
  fx = t(Biobase::exprs(lpd)[predictors, ])

    # check the objective
  stopifnot(identical(rownames(fx), dimobj[[2]]))
  fy = objective[,,drop=TRUE]
  
  # select predictors
  fx = fx[, (fData(lpd)[predictors, "type"] %in% c("Methylation_Cluster",
                                                   "IGHV","gen", "pretreat")),
          drop=FALSE]
  
  #  make sure that there are no NAs in the table
  stopifnot(all(!is.na(fx)))
  
  # print table with numer of predictors from each group
  # message("Objective: ", objectiveName, "\n")
  prefix = paste(ifelse(dataType %in% unique(fData(lpd)[colnames(fx), "type"]),
                        names(dataType), "_"), collapse="")
  
  n = length(unique(fy))
  family = "gaussian"
  return(list(fx=fx, fy=fy, objectiveName=objectiveName, prefix=prefix,
              family=family))
}

# plotting the predictions
plotPredictions = function(fx, fy, objective, pred, coeffs, lpd, nm, lim,
                           objectiveName, colors) {

  # quiets concerns of R CMD check "no visible binding for global variable"
  X=NULL; Y=NULL; Measure=NULL
                           
  # PREPARE DATA FOR PLOTTING
  
  stopifnot( identical(dim(pred), c(length(fy), 1L)), identical(rownames(pred),
                                                                names(fy)) )
  ordy <- order(fy, pred[, 1])
  # design matrix of selected predictors (unscaled)
  mat  <- t(fx[ordy, names(coeffs), drop=FALSE])
  # human-readable names where available & drugs with concentrations
  nicename = names(coeffs) %>% `names<-`(names(coeffs))
  idx = grepl("D_", nicename)
  nicename[idx] = giveDrugLabel3(nicename[idx])
  nicename[!idx] = fData(lpd)[names(nicename)[!idx], "name"] %>%
    `names<-`(names(nicename)[!idx])
  nicename = ifelse(!is.na(nicename) & (nicename!=""), nicename, names(nicename))
  nicename = gsub("Methylation_Cluster", "Methylation cluster", nicename)
  rownames(mat) = nicename[rownames(mat)]
  # prepare plot title
  title=nm
  
  # CREATE INDIVIDUAL PARTS OF THE FIGURE  
  
  # bar plot
  stopifnot(all(coeffs<lim & coeffs>-lim))
  part1df = data.frame(coeffs,
                       nm=factor(names(coeffs), levels=names(rev(coeffs))))
  part1df$col= ifelse(rownames(part1df)=="IGHV", "I",
                      ifelse(rownames(part1df)=="Methylation_Cluster", "M",
                             ifelse(rownames(part1df)=="Pretreatment", "P","G")))
  
  part1 = ggplot(data=part1df, aes(x=nm, y=coeffs, fill=col)) +
    geom_bar(stat="identity", colour="black", position = "identity",
             width=.66, size=0.2) +theme_bw() +
    geom_hline(yintercept=0, size=0.3) + scale_x_discrete(expand=c(0,0.5)) +
    scale_y_continuous(expand=c(0,0)) + coord_flip(ylim=c(-lim,lim)) +
    theme(panel.grid.major=element_blank(),
          panel.background=element_blank(),
          panel.grid.minor = element_blank(),
          axis.text=element_text(size=8),
          panel.border=element_blank()) +
    xlab("") + ylab("Model Coefficients") +
    geom_vline(xintercept=c(0.5), color="black", size=0.6)+
    scale_fill_manual(c("M", "I", "G", "P"),
                      values=c(M=colors[["M"]][2],
                               I=colors[["I"]],
                               G=colors[["G"]],
                               P=colors[["P"]]))
  
  
  # heat map
  # mat contains selected predictors with status for each patient
  # (e.g. 0/1 for mutations and IGHV, 0/0.5/1 for M)
  # to assign colors Gosia added 5 to meth and 2 to IGHV values,resuting in
  # 5 LP, 5.5 IP, 6 HP, 2 unmut IGHV or mut, 4 IGHV mut, 3 mut, 7 pre-treatment
  idx = grep("Methylation cluster", rownames(mat))
  mat[idx,] = mat[idx,]+5
  ## ighv
  idx = grep("IGHV", rownames(mat))
  mat[idx,] = (mat[idx,]*2)+2
  ## gene
  rnm = sapply(rownames(mat), function(nm) strsplit(nm," ")[[1]][1])
  idx = rownames(fData(lpd))[fData(lpd)$type=="gen" & rownames(lpd) %in% rnm]
  idx = match(idx, rnm)
  mat[idx,] = mat[idx,]+2
  ## pretreat
  idx = grep("Pretreatment", rownames(mat))
  mat[idx,] = ifelse(mat[idx,]==0,2,7)
  
  mat2 = meltWholeDF(mat)
  mat2$Measure = factor(mat2$Measure, levels=sort(unique(mat2$Measure)))
  mat2$X = factor(mat2$X, levels=colnames(mat))
  mat2$Y = factor(mat2$Y, levels=rev(rownames(mat)))
  
  part2 = ggplot(mat2, aes(x=X, y=Y, fill=Measure)) +
    geom_tile() + theme_bw() +
    scale_fill_manual(name="Mutated",
                      values=c(`2`="gray96", `3`=paste0(colors["G"], "E5"),
                               `5`=colors[["M"]][1], `5.5`=colors[["M"]][2],
                               `6`=colors[["M"]][3], `7`=colors[["P"]],
                               `4`=paste0(colors["I"],"E5")), guide=FALSE) +
    scale_y_discrete(expand=c(0,0)) +
    theme(axis.text.y=element_text(hjust=0, size=14),
          axis.text.x=element_blank(),
          axis.ticks=element_blank(),
          panel.border=element_rect(colour="gainsboro"),
          plot.title=element_text(size=12),
          legend.title=element_text(size=12),
          legend.text=element_text(size=12),
          panel.background=element_blank(),
          panel.grid.major=element_blank(),
          panel.grid.minor=element_blank()) +
    xlab("patients") + ylab("") + ggtitle(title)
  if(length(levels(mat2$Y)) > 1) {
    part2 = part2 + geom_hline(yintercept=seq(1.5, length(levels(mat2$Y)), 1),
                               colour="gainsboro", size=0.2)
  }
  
  # scatter plot
  mat3 = fy[colnames(mat)]
  mat3 = data.frame(X=factor(names(mat3), levels=names(mat3)), Y=mat3*100)
  Yrange = range(mat3$Y)
  Yhangs = diff(Yrange)*0.05
  Ylims = c(Yrange[1]-Yhangs, Yrange[2]+Yhangs)
  Yran = diff(Yrange)
  Ybreaks = if(Yran<=15) 5 else if(Yran>15 & Yran<=30) 10 else if(Yran>30 & Yran<=40) 15 else if(Yran>40 & Yran<=60) 20 else 40
  
  part4 = ggplot(mat3, aes(x=X, y=Y)) +
    geom_point(shape=21, fill="dimgrey", colour="black", size=1.2) +
    theme_bw() +
    theme(panel.grid.minor=element_blank(),
          panel.grid.major.x=element_blank(),
          axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.text.y=element_text(size=8),
          panel.border=element_rect(colour="dimgrey", size=0.1),
          panel.background=element_rect(fill="gray96")) +
    ylab("Viability [%]") +
    scale_y_continuous(limits=c(Yrange[1]-Yhangs, Yrange[2]+Yhangs),
                       breaks=seq(-200,200,Ybreaks))
  
  
  # MERGE PARTS OF THE FIGURE
  # construct the gtable
  wdths = c(4, 0.5, 0.05*ncol(mat), 0.05)
  hghts = c(0.3, 0.25*nrow(mat), 0.1, 0.4, 0.35)
  gt = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in"))
  ## make grobs
  gg1 = ggplotGrob(part1)
  gg2 = ggplotGrob(part2)
  gg4 = ggplotGrob(part4)
  ## fill in the gtable
  gt = gtable_add_grob(gt, gtable_filter(gg1, "panel"), 2, 1) # add boxplot
  gt = gtable_add_grob(gt, gtable_filter(gg2, "panel"), 2, 3) # add heatmap
  gt = gtable_add_grob(gt, gtable_filter(gg4, "panel"), 4, 3) # add scatterplot
  gt = gtable_add_grob(gt, gtable_filter(gg2, "xlab"), 5, 3)
  gt = gtable_add_grob(gt, gg2$grobs[[whichInGrob(gg2, "title")]], 1, 3)
  gt = gtable_add_grob(gt, gg4$grobs[[whichInGrob(gg4, "axis-l")]], 4, 2)
  gt = gtable_add_grob(gt, gg1$grobs[[whichInGrob(gg1, "axis-b")]], 3, 1)
  gt = gtable_add_grob(gt, gtable_filter(gg1, "xlab-b"), 4, 1)
  
  # now complicated: add the axis-l labels from gg2
  gia = which(gg2$layout$name == "axis-l")
  gga = gg2$grobs[[gia]]
  gax = gga$children[[2]]
  gax$widths = rev(gax$widths)
  gax$grobs = rev(gax$grobs)
  gt = gtable_add_cols(gt, gg2$widths[gg2$layout[gia, ]$l])
  gt = gtable_add_grob(gt, gax, 2, 5)
  
  wdth = convertUnit(gt$widths, "in", valueOnly=TRUE)[5]
  # add column on the right of appropriate width
  maxwdth = 2
  gt = gtable_add_cols(gt, unit(maxwdth-wdth, "in"))
  
  return(list(plot=gt, width=sum(wdths)+maxwdth, height=sum(hghts),
              name=make.names(objectiveName)))
}


# main function to fit Lasso and produce barplots to find genetic
# determinants of drug response
doLasso = function(objective, predictors, lpd,suffix="",
                  nm=NA, lim=0.21, ncv=10, nfolds=10, std=FALSE,
                  adaLasso = TRUE, colors) {
  
  #construct design and response matrix
  out = featureSelectionForLasso(objective, predictors, lpd)
  
  # for simplification
  fy = out[["fy"]]
  fx = out[["fx"]]
  family = out[["family"]]
  objectiveName = out[["objectiveName"]]
  prefix = out[["prefix"]]
  fxdim = dim(fx)
  print(sprintf("Prediction for: %s; #samples: %d; #features: %d",
                objectiveName, fxdim[1], fxdim[2]))
  
  # adaptive lasso for a more stable feature selection
  set.seed(19087)
  if(adaLasso){
    if(ncol(fx)>= nrow(fx)) {
      RidgeFit <- cv.glmnet(fx, fy, alpha = 0, standardize = std,
                            family = family, nfolds=10)
      # wRidge <- pmin(1/abs((coef(RidgeFit, s = RidgeFit$lambda.min))), 1e+300)
      wRidge <- 1/abs(coef(RidgeFit, s = RidgeFit$lambda.min))
      wRidge <- wRidge[-1]
      weights <- wRidge
    } else {
      lmFit <- lm(fy ~ fx)
      # wLM <- pmin(1/abs(coefficients(lmFit)[-1]), 1e+300)
      wLM <- 1/abs(coefficients(lmFit)[-1])
      weights <- wLM
    }
    excludedFeatures <- which(weights==Inf)
  } else {
    weights <- rep(1, ncol(fx))
    excludedFeatures <- NULL
  }
  
  #perform repeated cross-validation to find an optimal penalisatio
  #parameter minimizing the cross-validated MSE
  cv.out <- cvr.glmnet(Y=fy, X=fx, family=family,
                       alpha=1, nfolds=nfolds,
                       ncv=ncv, standardize=std,
                       exclude = excludedFeatures,
                       type.measure = "mse", penalty.factor = weights)
  
  # #fit Lasso model for optimal lambda                   
  fit = glmnet(y=fy, x=fx, family=family,
               alpha=1,
               standardize=std,
               exclude = excludedFeatures,
               lambda=cv.out$lambda, penalty.factor = weights)
  
  #get optimal lambda and corresponding predictors with coefficients
  lambda <- cv.out$lambda[which.min(cv.out$cvm)]
  coeffs <- coef(fit, lambda)
  coeffs_all <- coeffs
  coeffs <- as.vector(coeffs) %>%
    `names<-`(rownames(coeffs))   # cast from sparse matrix to ordinary vector
  coeffs <- coeffs[ coeffs!=0 ]
  
  # remove intercept term
  stopifnot(names(coeffs)[1]=="(Intercept)")
  if (length(coeffs) > 1) {
    coeffs <- coeffs[-1]
  } else {
    print("No (0) predictors for given parameters!")
    return(0)
  }
  coeffs <- sort(coeffs)
  
  # Residuals in the model
  pred <- predict(fit, newx = fx, s = lambda, type = "response")
  residuals <- pred[,1]-fy
  
  plot = plotPredictions(fx, fy, objective, pred, coeffs, lpd, nm, lim,
                         objectiveName, colors)
  
  return(list(residuals=residuals, coeffs=coeffs_all, plot=plot))
}

# Make list of predictors for the given lpd
makeListOfPredictors = function(lpd) {
  return(list(
    predictorsM = rownames(fData(lpd))[fData(lpd)$type=="Methylation_Cluster"],
    predictorsG = rownames(fData(lpd))[fData(lpd)$type=="gen"],
    predictorsI = rownames(fData(lpd))[fData(lpd)$type=="IGHV"],
    predictorsP = rownames(fData(lpd))[fData(lpd)$type=="pretreat"]
  ))
}


# Pre-process data: explore what is available & feature selection
prepareLPD = function(lpd, minNumSamplesPerGroup, withMC=TRUE) {
  
  # PRETREATMENT
  # update the expression set by adding row about pretreatment
  pretreated <- t(matrix(ifelse(
    BloodCancerMultiOmics2017::patmeta[colnames(lpd),
    "IC50beforeTreatment"], 0, 1),
    dimnames=list(colnames(lpd), "Pretreatment"))) 
  fdata_pretreat <- data.frame(name=NA, type="pretreat", id=NA, subtype=NA,
                               row.names="Pretreatment")
  lpd <- ExpressionSet(assayData=rbind(Biobase::exprs(lpd), pretreated),
                          phenoData=new("AnnotatedDataFrame", data=pData(lpd)),
                          featureData=new("AnnotatedDataFrame",
                                          rbind(fData(lpd), fdata_pretreat)))
  # METHYLATION
  Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",] =
    Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",]/2
  
  # IGHV: changing name from Uppsala to IGHV
  rownames(lpd)[which(fData(lpd)$type=="IGHV")] = "IGHV"
  
  # GENETICS
  # changing name od del13q any to del13q & remove the rest of del13q (mono, single)
  idx = which(rownames(lpd) %in% c("del13q14_bi", "del13q14_mono"))
  lpd = lpd[-idx,]
  rownames(lpd)[which(rownames(lpd)=="del13q14_any")] = "del13q14"
  
  # remove CHROMOTHRYPSIS
  if("Chromothripsis" %in% rownames(lpd))
    lpd = lpd[-which(rownames(lpd)=="Chromothripsis"),]
  
  # SELECT GOOD SAMPLES
  idx = !is.na(Biobase::exprs(lpd)["IGHV",])
  if(withMC) idx = idx & !is.na(Biobase::exprs(lpd)["Methylation_Cluster",])
  # cut out the data to have information about Methylation_Cluster and IGHV for all samples
  lpd = lpd[, idx]
  # for the genetics - remove the genes which do not have enough samples
  which2remove = names(
    which(!apply(Biobase::exprs(lpd)[rownames(lpd)[fData(lpd)$type %in%
                                            c("gen")],], 1, function(cl) {
    if(all(is.na(cl))) return(FALSE)
    if(sum(is.na(cl)) >= 0.1*length(cl)) return(FALSE)
    tmp = table(cl)
    return(length(tmp)==2 & all(tmp>=minNumSamplesPerGroup))
  })))
  lpd = lpd[-match(which2remove, rownames(lpd)),]
  
  # for the ones with NA put 0 instead
  featOther = rownames(lpd)[fData(lpd)$type %in%
                              c("IGHV", "Methylation_Cluster", "gen", "viab",
                                "pretreat")]
  tmp = Biobase::exprs(lpd)[featOther,]
  tmp[is.na(tmp)] = 0
  Biobase::exprs(lpd)[featOther,] = tmp
  
  return(lpd)
}


# wrapper functions to do Lasso model fitting, plotting and prediction
makePredictions = function(drs, frq, lpd, predictorList, lim, std=FALSE,
                           adaLasso = TRUE, colors) {
                           
  res = lapply(names(drs), function(typ) {
    setNames(lapply(drs[[typ]], function(dr) {
      if(typ=="1:5")
        nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"],
                     " (average of all concentrations)")
        else if(typ=="4:5")
          nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"],
                       " (average of ", paste(
                       BloodCancerMultiOmics2017::conctab[dr,4:5]*1000,
                         collapse = " and "), " nM)")
      # G & I & M & P
      doLasso(Biobase::exprs(lpd)[grepl(dr, rownames(lpd)) &
                           fData(lpd)$subtype==typ,, drop=FALSE], 
              predictors=with(predictorList,
                              c(predictorsI, predictorsG, predictorsM,
                                predictorsP)), 
              lpd=lpd,
              suffix=paste0("_","th0", "_c",gsub(":","-",typ)),
              nm=nm, lim=lim, colors=colors)
    }), nm=drs[[typ]])
  })
  return(res)
}


# Function to plot the legends
makeLegends = function(legendFor, colors) {
  
  # quiets concerns of R CMD check "no visible binding for global variable"
  x=NULL; y=NULL
  
  # select the colors needed
  colors = colors[names(colors) %in% legendFor]
  
  nleg = length(colors)
  wdths = rep(2, length.out=nleg); hghts = c(2)
  gtl = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in"))
  n=1

  # M
  if("M" %in% names(colors)) {
    Mgg = ggplot(data=data.frame(x=1, y=factor(c("LP","IP","HP"),
                                               levels=c("LP","IP","HP"))),
                 aes(x=x, y=y, fill=y)) + geom_tile() +
      scale_fill_manual(name="Methylation cluster",
                        values=setNames(colors[["M"]], nm=c("LP","IP","HP"))) +
      theme(legend.title=element_text(size=12),
            legend.text=element_text(size=12))
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Mgg), "guide-box"), 1, n)
    n = n+1
  }
  
  # I
  if("I" %in% names(colors)) {
    Igg = ggplot(data=data.frame(x=1,
                                 y=factor(c("unmutated","mutated"),
                                          levels=c("unmutated","mutated"))),
                 aes(x=x, y=y, fill=y)) + geom_tile() +
      scale_fill_manual(name="IGHV",
                        values=setNames(c("gray96",
                                          paste0(colors["I"], c("E5"))),
                                        nm=c("unmutated","mutated"))) +
      theme(legend.title=element_text(size=12),
            legend.text=element_text(size=12))
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Igg), "guide-box"), 1, n)
    n = n+1
  }
  
  # G
  if("G" %in% names(colors)) {
    Ggg = ggplot(data=data.frame(x=1,
                                 y=factor(c("wild type","mutated"),
                                          levels=c("wild type","mutated"))),
                 aes(x=x, y=y, fill=y)) + geom_tile() +
      scale_fill_manual(name="Gene",
                        values=setNames(c("gray96",
                                          paste0(colors["G"], c("E5"))),
                                        nm=c("wild type","mutated"))) +
      theme(legend.title=element_text(size=12),
            legend.text=element_text(size=12))
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Ggg), "guide-box"), 1, n)
    n = n+1
  }
  
  # P
  if("P" %in% names(colors)) {
    Pgg = ggplot(data=data.frame(x=1,
                                 y=factor(c("no","yes"),
                                          levels=c("no","yes"))),
                 aes(x=x, y=y, fill=y)) + geom_tile() +
      scale_fill_manual(name="Pretreatment",
                        values=setNames(c(colors[["P"]], "white"),
                                        nm=c("yes","no"))) +
      theme(legend.title=element_text(size=12),
            legend.text=element_text(size=12))
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Pgg), "guide-box"), 1, n)
    n = n+1
  }
  
  return(list(plot=gtl, width=sum(wdths), height=sum(hghts)))
}