--- a +++ b/R/part11.R @@ -0,0 +1,475 @@ +# Script contains helper functions for drug response prediction + +# label the drugs with proper u +giveDrugLabel3 = function(drid, dtab=BloodCancerMultiOmics2017::drugs, + ctab=BloodCancerMultiOmics2017::conctab) { + vapply(strsplit(drid, "_"), function(x) { + k <- paste(x[1:2], collapse="_") + paste0(dtab[k, "name"], " ", + switch(x[3], "1:5"="c1:5", "4:5"="c4:5", + paste0(ctab[k, as.integer(x[3])], " \u00B5","M"))) + }, character(1)) +} + +# select features from the lpd object to use as response and predictors in the model +featureSelectionForLasso = function(objective, predictors, lpd) { + + # quiets concerns of R CMD check "no visible binding for global variable" + dataType=NULL + + dimobj = dimnames(objective) + objectiveName = dimobj[[1]][1] + + #design matix + fx = t(Biobase::exprs(lpd)[predictors, ]) + + # check the objective + stopifnot(identical(rownames(fx), dimobj[[2]])) + fy = objective[,,drop=TRUE] + + # select predictors + fx = fx[, (fData(lpd)[predictors, "type"] %in% c("Methylation_Cluster", + "IGHV","gen", "pretreat")), + drop=FALSE] + + # make sure that there are no NAs in the table + stopifnot(all(!is.na(fx))) + + # print table with numer of predictors from each group + # message("Objective: ", objectiveName, "\n") + prefix = paste(ifelse(dataType %in% unique(fData(lpd)[colnames(fx), "type"]), + names(dataType), "_"), collapse="") + + n = length(unique(fy)) + family = "gaussian" + return(list(fx=fx, fy=fy, objectiveName=objectiveName, prefix=prefix, + family=family)) +} + +# plotting the predictions +plotPredictions = function(fx, fy, objective, pred, coeffs, lpd, nm, lim, + objectiveName, colors) { + + # quiets concerns of R CMD check "no visible binding for global variable" + X=NULL; Y=NULL; Measure=NULL + + # PREPARE DATA FOR PLOTTING + + stopifnot( identical(dim(pred), c(length(fy), 1L)), identical(rownames(pred), + names(fy)) ) + ordy <- order(fy, pred[, 1]) + # design matrix of selected predictors (unscaled) + mat <- t(fx[ordy, names(coeffs), drop=FALSE]) + # human-readable names where available & drugs with concentrations + nicename = names(coeffs) %>% `names<-`(names(coeffs)) + idx = grepl("D_", nicename) + nicename[idx] = giveDrugLabel3(nicename[idx]) + nicename[!idx] = fData(lpd)[names(nicename)[!idx], "name"] %>% + `names<-`(names(nicename)[!idx]) + nicename = ifelse(!is.na(nicename) & (nicename!=""), nicename, names(nicename)) + nicename = gsub("Methylation_Cluster", "Methylation cluster", nicename) + rownames(mat) = nicename[rownames(mat)] + # prepare plot title + title=nm + + # CREATE INDIVIDUAL PARTS OF THE FIGURE + + # bar plot + stopifnot(all(coeffs<lim & coeffs>-lim)) + part1df = data.frame(coeffs, + nm=factor(names(coeffs), levels=names(rev(coeffs)))) + part1df$col= ifelse(rownames(part1df)=="IGHV", "I", + ifelse(rownames(part1df)=="Methylation_Cluster", "M", + ifelse(rownames(part1df)=="Pretreatment", "P","G"))) + + part1 = ggplot(data=part1df, aes(x=nm, y=coeffs, fill=col)) + + geom_bar(stat="identity", colour="black", position = "identity", + width=.66, size=0.2) +theme_bw() + + geom_hline(yintercept=0, size=0.3) + scale_x_discrete(expand=c(0,0.5)) + + scale_y_continuous(expand=c(0,0)) + coord_flip(ylim=c(-lim,lim)) + + theme(panel.grid.major=element_blank(), + panel.background=element_blank(), + panel.grid.minor = element_blank(), + axis.text=element_text(size=8), + panel.border=element_blank()) + + xlab("") + ylab("Model Coefficients") + + geom_vline(xintercept=c(0.5), color="black", size=0.6)+ + scale_fill_manual(c("M", "I", "G", "P"), + values=c(M=colors[["M"]][2], + I=colors[["I"]], + G=colors[["G"]], + P=colors[["P"]])) + + + # heat map + # mat contains selected predictors with status for each patient + # (e.g. 0/1 for mutations and IGHV, 0/0.5/1 for M) + # to assign colors Gosia added 5 to meth and 2 to IGHV values,resuting in + # 5 LP, 5.5 IP, 6 HP, 2 unmut IGHV or mut, 4 IGHV mut, 3 mut, 7 pre-treatment + idx = grep("Methylation cluster", rownames(mat)) + mat[idx,] = mat[idx,]+5 + ## ighv + idx = grep("IGHV", rownames(mat)) + mat[idx,] = (mat[idx,]*2)+2 + ## gene + rnm = sapply(rownames(mat), function(nm) strsplit(nm," ")[[1]][1]) + idx = rownames(fData(lpd))[fData(lpd)$type=="gen" & rownames(lpd) %in% rnm] + idx = match(idx, rnm) + mat[idx,] = mat[idx,]+2 + ## pretreat + idx = grep("Pretreatment", rownames(mat)) + mat[idx,] = ifelse(mat[idx,]==0,2,7) + + mat2 = meltWholeDF(mat) + mat2$Measure = factor(mat2$Measure, levels=sort(unique(mat2$Measure))) + mat2$X = factor(mat2$X, levels=colnames(mat)) + mat2$Y = factor(mat2$Y, levels=rev(rownames(mat))) + + part2 = ggplot(mat2, aes(x=X, y=Y, fill=Measure)) + + geom_tile() + theme_bw() + + scale_fill_manual(name="Mutated", + values=c(`2`="gray96", `3`=paste0(colors["G"], "E5"), + `5`=colors[["M"]][1], `5.5`=colors[["M"]][2], + `6`=colors[["M"]][3], `7`=colors[["P"]], + `4`=paste0(colors["I"],"E5")), guide=FALSE) + + scale_y_discrete(expand=c(0,0)) + + theme(axis.text.y=element_text(hjust=0, size=14), + axis.text.x=element_blank(), + axis.ticks=element_blank(), + panel.border=element_rect(colour="gainsboro"), + plot.title=element_text(size=12), + legend.title=element_text(size=12), + legend.text=element_text(size=12), + panel.background=element_blank(), + panel.grid.major=element_blank(), + panel.grid.minor=element_blank()) + + xlab("patients") + ylab("") + ggtitle(title) + if(length(levels(mat2$Y)) > 1) { + part2 = part2 + geom_hline(yintercept=seq(1.5, length(levels(mat2$Y)), 1), + colour="gainsboro", size=0.2) + } + + # scatter plot + mat3 = fy[colnames(mat)] + mat3 = data.frame(X=factor(names(mat3), levels=names(mat3)), Y=mat3*100) + Yrange = range(mat3$Y) + Yhangs = diff(Yrange)*0.05 + Ylims = c(Yrange[1]-Yhangs, Yrange[2]+Yhangs) + Yran = diff(Yrange) + Ybreaks = if(Yran<=15) 5 else if(Yran>15 & Yran<=30) 10 else if(Yran>30 & Yran<=40) 15 else if(Yran>40 & Yran<=60) 20 else 40 + + part4 = ggplot(mat3, aes(x=X, y=Y)) + + geom_point(shape=21, fill="dimgrey", colour="black", size=1.2) + + theme_bw() + + theme(panel.grid.minor=element_blank(), + panel.grid.major.x=element_blank(), + axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + axis.text.y=element_text(size=8), + panel.border=element_rect(colour="dimgrey", size=0.1), + panel.background=element_rect(fill="gray96")) + + ylab("Viability [%]") + + scale_y_continuous(limits=c(Yrange[1]-Yhangs, Yrange[2]+Yhangs), + breaks=seq(-200,200,Ybreaks)) + + + # MERGE PARTS OF THE FIGURE + # construct the gtable + wdths = c(4, 0.5, 0.05*ncol(mat), 0.05) + hghts = c(0.3, 0.25*nrow(mat), 0.1, 0.4, 0.35) + gt = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in")) + ## make grobs + gg1 = ggplotGrob(part1) + gg2 = ggplotGrob(part2) + gg4 = ggplotGrob(part4) + ## fill in the gtable + gt = gtable_add_grob(gt, gtable_filter(gg1, "panel"), 2, 1) # add boxplot + gt = gtable_add_grob(gt, gtable_filter(gg2, "panel"), 2, 3) # add heatmap + gt = gtable_add_grob(gt, gtable_filter(gg4, "panel"), 4, 3) # add scatterplot + gt = gtable_add_grob(gt, gtable_filter(gg2, "xlab"), 5, 3) + gt = gtable_add_grob(gt, gg2$grobs[[whichInGrob(gg2, "title")]], 1, 3) + gt = gtable_add_grob(gt, gg4$grobs[[whichInGrob(gg4, "axis-l")]], 4, 2) + gt = gtable_add_grob(gt, gg1$grobs[[whichInGrob(gg1, "axis-b")]], 3, 1) + gt = gtable_add_grob(gt, gtable_filter(gg1, "xlab-b"), 4, 1) + + # now complicated: add the axis-l labels from gg2 + gia = which(gg2$layout$name == "axis-l") + gga = gg2$grobs[[gia]] + gax = gga$children[[2]] + gax$widths = rev(gax$widths) + gax$grobs = rev(gax$grobs) + gt = gtable_add_cols(gt, gg2$widths[gg2$layout[gia, ]$l]) + gt = gtable_add_grob(gt, gax, 2, 5) + + wdth = convertUnit(gt$widths, "in", valueOnly=TRUE)[5] + # add column on the right of appropriate width + maxwdth = 2 + gt = gtable_add_cols(gt, unit(maxwdth-wdth, "in")) + + return(list(plot=gt, width=sum(wdths)+maxwdth, height=sum(hghts), + name=make.names(objectiveName))) +} + + +# main function to fit Lasso and produce barplots to find genetic +# determinants of drug response +doLasso = function(objective, predictors, lpd,suffix="", + nm=NA, lim=0.21, ncv=10, nfolds=10, std=FALSE, + adaLasso = TRUE, colors) { + + #construct design and response matrix + out = featureSelectionForLasso(objective, predictors, lpd) + + # for simplification + fy = out[["fy"]] + fx = out[["fx"]] + family = out[["family"]] + objectiveName = out[["objectiveName"]] + prefix = out[["prefix"]] + fxdim = dim(fx) + print(sprintf("Prediction for: %s; #samples: %d; #features: %d", + objectiveName, fxdim[1], fxdim[2])) + + # adaptive lasso for a more stable feature selection + set.seed(19087) + if(adaLasso){ + if(ncol(fx)>= nrow(fx)) { + RidgeFit <- cv.glmnet(fx, fy, alpha = 0, standardize = std, + family = family, nfolds=10) + # wRidge <- pmin(1/abs((coef(RidgeFit, s = RidgeFit$lambda.min))), 1e+300) + wRidge <- 1/abs(coef(RidgeFit, s = RidgeFit$lambda.min)) + wRidge <- wRidge[-1] + weights <- wRidge + } else { + lmFit <- lm(fy ~ fx) + # wLM <- pmin(1/abs(coefficients(lmFit)[-1]), 1e+300) + wLM <- 1/abs(coefficients(lmFit)[-1]) + weights <- wLM + } + excludedFeatures <- which(weights==Inf) + } else { + weights <- rep(1, ncol(fx)) + excludedFeatures <- NULL + } + + #perform repeated cross-validation to find an optimal penalisatio + #parameter minimizing the cross-validated MSE + cv.out <- cvr.glmnet(Y=fy, X=fx, family=family, + alpha=1, nfolds=nfolds, + ncv=ncv, standardize=std, + exclude = excludedFeatures, + type.measure = "mse", penalty.factor = weights) + + # #fit Lasso model for optimal lambda + fit = glmnet(y=fy, x=fx, family=family, + alpha=1, + standardize=std, + exclude = excludedFeatures, + lambda=cv.out$lambda, penalty.factor = weights) + + #get optimal lambda and corresponding predictors with coefficients + lambda <- cv.out$lambda[which.min(cv.out$cvm)] + coeffs <- coef(fit, lambda) + coeffs_all <- coeffs + coeffs <- as.vector(coeffs) %>% + `names<-`(rownames(coeffs)) # cast from sparse matrix to ordinary vector + coeffs <- coeffs[ coeffs!=0 ] + + # remove intercept term + stopifnot(names(coeffs)[1]=="(Intercept)") + if (length(coeffs) > 1) { + coeffs <- coeffs[-1] + } else { + print("No (0) predictors for given parameters!") + return(0) + } + coeffs <- sort(coeffs) + + # Residuals in the model + pred <- predict(fit, newx = fx, s = lambda, type = "response") + residuals <- pred[,1]-fy + + plot = plotPredictions(fx, fy, objective, pred, coeffs, lpd, nm, lim, + objectiveName, colors) + + return(list(residuals=residuals, coeffs=coeffs_all, plot=plot)) +} + +# Make list of predictors for the given lpd +makeListOfPredictors = function(lpd) { + return(list( + predictorsM = rownames(fData(lpd))[fData(lpd)$type=="Methylation_Cluster"], + predictorsG = rownames(fData(lpd))[fData(lpd)$type=="gen"], + predictorsI = rownames(fData(lpd))[fData(lpd)$type=="IGHV"], + predictorsP = rownames(fData(lpd))[fData(lpd)$type=="pretreat"] + )) +} + + +# Pre-process data: explore what is available & feature selection +prepareLPD = function(lpd, minNumSamplesPerGroup, withMC=TRUE) { + + # PRETREATMENT + # update the expression set by adding row about pretreatment + pretreated <- t(matrix(ifelse( + BloodCancerMultiOmics2017::patmeta[colnames(lpd), + "IC50beforeTreatment"], 0, 1), + dimnames=list(colnames(lpd), "Pretreatment"))) + fdata_pretreat <- data.frame(name=NA, type="pretreat", id=NA, subtype=NA, + row.names="Pretreatment") + lpd <- ExpressionSet(assayData=rbind(Biobase::exprs(lpd), pretreated), + phenoData=new("AnnotatedDataFrame", data=pData(lpd)), + featureData=new("AnnotatedDataFrame", + rbind(fData(lpd), fdata_pretreat))) + # METHYLATION + Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",] = + Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",]/2 + + # IGHV: changing name from Uppsala to IGHV + rownames(lpd)[which(fData(lpd)$type=="IGHV")] = "IGHV" + + # GENETICS + # changing name od del13q any to del13q & remove the rest of del13q (mono, single) + idx = which(rownames(lpd) %in% c("del13q14_bi", "del13q14_mono")) + lpd = lpd[-idx,] + rownames(lpd)[which(rownames(lpd)=="del13q14_any")] = "del13q14" + + # remove CHROMOTHRYPSIS + if("Chromothripsis" %in% rownames(lpd)) + lpd = lpd[-which(rownames(lpd)=="Chromothripsis"),] + + # SELECT GOOD SAMPLES + idx = !is.na(Biobase::exprs(lpd)["IGHV",]) + if(withMC) idx = idx & !is.na(Biobase::exprs(lpd)["Methylation_Cluster",]) + # cut out the data to have information about Methylation_Cluster and IGHV for all samples + lpd = lpd[, idx] + # for the genetics - remove the genes which do not have enough samples + which2remove = names( + which(!apply(Biobase::exprs(lpd)[rownames(lpd)[fData(lpd)$type %in% + c("gen")],], 1, function(cl) { + if(all(is.na(cl))) return(FALSE) + if(sum(is.na(cl)) >= 0.1*length(cl)) return(FALSE) + tmp = table(cl) + return(length(tmp)==2 & all(tmp>=minNumSamplesPerGroup)) + }))) + lpd = lpd[-match(which2remove, rownames(lpd)),] + + # for the ones with NA put 0 instead + featOther = rownames(lpd)[fData(lpd)$type %in% + c("IGHV", "Methylation_Cluster", "gen", "viab", + "pretreat")] + tmp = Biobase::exprs(lpd)[featOther,] + tmp[is.na(tmp)] = 0 + Biobase::exprs(lpd)[featOther,] = tmp + + return(lpd) +} + + +# wrapper functions to do Lasso model fitting, plotting and prediction +makePredictions = function(drs, frq, lpd, predictorList, lim, std=FALSE, + adaLasso = TRUE, colors) { + + res = lapply(names(drs), function(typ) { + setNames(lapply(drs[[typ]], function(dr) { + if(typ=="1:5") + nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"], + " (average of all concentrations)") + else if(typ=="4:5") + nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"], + " (average of ", paste( + BloodCancerMultiOmics2017::conctab[dr,4:5]*1000, + collapse = " and "), " nM)") + # G & I & M & P + doLasso(Biobase::exprs(lpd)[grepl(dr, rownames(lpd)) & + fData(lpd)$subtype==typ,, drop=FALSE], + predictors=with(predictorList, + c(predictorsI, predictorsG, predictorsM, + predictorsP)), + lpd=lpd, + suffix=paste0("_","th0", "_c",gsub(":","-",typ)), + nm=nm, lim=lim, colors=colors) + }), nm=drs[[typ]]) + }) + return(res) +} + + +# Function to plot the legends +makeLegends = function(legendFor, colors) { + + # quiets concerns of R CMD check "no visible binding for global variable" + x=NULL; y=NULL + + # select the colors needed + colors = colors[names(colors) %in% legendFor] + + nleg = length(colors) + wdths = rep(2, length.out=nleg); hghts = c(2) + gtl = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in")) + n=1 + + # M + if("M" %in% names(colors)) { + Mgg = ggplot(data=data.frame(x=1, y=factor(c("LP","IP","HP"), + levels=c("LP","IP","HP"))), + aes(x=x, y=y, fill=y)) + geom_tile() + + scale_fill_manual(name="Methylation cluster", + values=setNames(colors[["M"]], nm=c("LP","IP","HP"))) + + theme(legend.title=element_text(size=12), + legend.text=element_text(size=12)) + gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Mgg), "guide-box"), 1, n) + n = n+1 + } + + # I + if("I" %in% names(colors)) { + Igg = ggplot(data=data.frame(x=1, + y=factor(c("unmutated","mutated"), + levels=c("unmutated","mutated"))), + aes(x=x, y=y, fill=y)) + geom_tile() + + scale_fill_manual(name="IGHV", + values=setNames(c("gray96", + paste0(colors["I"], c("E5"))), + nm=c("unmutated","mutated"))) + + theme(legend.title=element_text(size=12), + legend.text=element_text(size=12)) + gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Igg), "guide-box"), 1, n) + n = n+1 + } + + # G + if("G" %in% names(colors)) { + Ggg = ggplot(data=data.frame(x=1, + y=factor(c("wild type","mutated"), + levels=c("wild type","mutated"))), + aes(x=x, y=y, fill=y)) + geom_tile() + + scale_fill_manual(name="Gene", + values=setNames(c("gray96", + paste0(colors["G"], c("E5"))), + nm=c("wild type","mutated"))) + + theme(legend.title=element_text(size=12), + legend.text=element_text(size=12)) + gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Ggg), "guide-box"), 1, n) + n = n+1 + } + + # P + if("P" %in% names(colors)) { + Pgg = ggplot(data=data.frame(x=1, + y=factor(c("no","yes"), + levels=c("no","yes"))), + aes(x=x, y=y, fill=y)) + geom_tile() + + scale_fill_manual(name="Pretreatment", + values=setNames(c(colors[["P"]], "white"), + nm=c("yes","no"))) + + theme(legend.title=element_text(size=12), + legend.text=element_text(size=12)) + gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Pgg), "guide-box"), 1, n) + n = n+1 + } + + return(list(plot=gtl, width=sum(wdths), height=sum(hghts))) +} +