Diff of /R/part11.R [000000] .. [226bc8]

Switch to unified view

a b/R/part11.R
1
# Script contains helper functions for drug response prediction
2
3
# label the drugs with proper u
4
giveDrugLabel3 = function(drid, dtab=BloodCancerMultiOmics2017::drugs,
5
                          ctab=BloodCancerMultiOmics2017::conctab) {
6
  vapply(strsplit(drid, "_"), function(x) {
7
    k <- paste(x[1:2], collapse="_")
8
    paste0(dtab[k, "name"], " ",
9
           switch(x[3], "1:5"="c1:5", "4:5"="c4:5",
10
                  paste0(ctab[k, as.integer(x[3])], " \u00B5","M")))
11
  }, character(1))
12
}
13
14
# select features from the lpd object to use as response and predictors in the model
15
featureSelectionForLasso = function(objective, predictors, lpd) {
16
  
17
  # quiets concerns of R CMD check "no visible binding for global variable"
18
  dataType=NULL
19
  
20
  dimobj = dimnames(objective)
21
  objectiveName = dimobj[[1]][1]
22
  
23
  #design matix
24
  fx = t(Biobase::exprs(lpd)[predictors, ])
25
26
    # check the objective
27
  stopifnot(identical(rownames(fx), dimobj[[2]]))
28
  fy = objective[,,drop=TRUE]
29
  
30
  # select predictors
31
  fx = fx[, (fData(lpd)[predictors, "type"] %in% c("Methylation_Cluster",
32
                                                   "IGHV","gen", "pretreat")),
33
          drop=FALSE]
34
  
35
  #  make sure that there are no NAs in the table
36
  stopifnot(all(!is.na(fx)))
37
  
38
  # print table with numer of predictors from each group
39
  # message("Objective: ", objectiveName, "\n")
40
  prefix = paste(ifelse(dataType %in% unique(fData(lpd)[colnames(fx), "type"]),
41
                        names(dataType), "_"), collapse="")
42
  
43
  n = length(unique(fy))
44
  family = "gaussian"
45
  return(list(fx=fx, fy=fy, objectiveName=objectiveName, prefix=prefix,
46
              family=family))
47
}
48
49
# plotting the predictions
50
plotPredictions = function(fx, fy, objective, pred, coeffs, lpd, nm, lim,
51
                           objectiveName, colors) {
52
53
  # quiets concerns of R CMD check "no visible binding for global variable"
54
  X=NULL; Y=NULL; Measure=NULL
55
                           
56
  # PREPARE DATA FOR PLOTTING
57
  
58
  stopifnot( identical(dim(pred), c(length(fy), 1L)), identical(rownames(pred),
59
                                                                names(fy)) )
60
  ordy <- order(fy, pred[, 1])
61
  # design matrix of selected predictors (unscaled)
62
  mat  <- t(fx[ordy, names(coeffs), drop=FALSE])
63
  # human-readable names where available & drugs with concentrations
64
  nicename = names(coeffs) %>% `names<-`(names(coeffs))
65
  idx = grepl("D_", nicename)
66
  nicename[idx] = giveDrugLabel3(nicename[idx])
67
  nicename[!idx] = fData(lpd)[names(nicename)[!idx], "name"] %>%
68
    `names<-`(names(nicename)[!idx])
69
  nicename = ifelse(!is.na(nicename) & (nicename!=""), nicename, names(nicename))
70
  nicename = gsub("Methylation_Cluster", "Methylation cluster", nicename)
71
  rownames(mat) = nicename[rownames(mat)]
72
  # prepare plot title
73
  title=nm
74
  
75
  # CREATE INDIVIDUAL PARTS OF THE FIGURE  
76
  
77
  # bar plot
78
  stopifnot(all(coeffs<lim & coeffs>-lim))
79
  part1df = data.frame(coeffs,
80
                       nm=factor(names(coeffs), levels=names(rev(coeffs))))
81
  part1df$col= ifelse(rownames(part1df)=="IGHV", "I",
82
                      ifelse(rownames(part1df)=="Methylation_Cluster", "M",
83
                             ifelse(rownames(part1df)=="Pretreatment", "P","G")))
84
  
85
  part1 = ggplot(data=part1df, aes(x=nm, y=coeffs, fill=col)) +
86
    geom_bar(stat="identity", colour="black", position = "identity",
87
             width=.66, size=0.2) +theme_bw() +
88
    geom_hline(yintercept=0, size=0.3) + scale_x_discrete(expand=c(0,0.5)) +
89
    scale_y_continuous(expand=c(0,0)) + coord_flip(ylim=c(-lim,lim)) +
90
    theme(panel.grid.major=element_blank(),
91
          panel.background=element_blank(),
92
          panel.grid.minor = element_blank(),
93
          axis.text=element_text(size=8),
94
          panel.border=element_blank()) +
95
    xlab("") + ylab("Model Coefficients") +
96
    geom_vline(xintercept=c(0.5), color="black", size=0.6)+
97
    scale_fill_manual(c("M", "I", "G", "P"),
98
                      values=c(M=colors[["M"]][2],
99
                               I=colors[["I"]],
100
                               G=colors[["G"]],
101
                               P=colors[["P"]]))
102
  
103
  
104
  # heat map
105
  # mat contains selected predictors with status for each patient
106
  # (e.g. 0/1 for mutations and IGHV, 0/0.5/1 for M)
107
  # to assign colors Gosia added 5 to meth and 2 to IGHV values,resuting in
108
  # 5 LP, 5.5 IP, 6 HP, 2 unmut IGHV or mut, 4 IGHV mut, 3 mut, 7 pre-treatment
109
  idx = grep("Methylation cluster", rownames(mat))
110
  mat[idx,] = mat[idx,]+5
111
  ## ighv
112
  idx = grep("IGHV", rownames(mat))
113
  mat[idx,] = (mat[idx,]*2)+2
114
  ## gene
115
  rnm = sapply(rownames(mat), function(nm) strsplit(nm," ")[[1]][1])
116
  idx = rownames(fData(lpd))[fData(lpd)$type=="gen" & rownames(lpd) %in% rnm]
117
  idx = match(idx, rnm)
118
  mat[idx,] = mat[idx,]+2
119
  ## pretreat
120
  idx = grep("Pretreatment", rownames(mat))
121
  mat[idx,] = ifelse(mat[idx,]==0,2,7)
122
  
123
  mat2 = meltWholeDF(mat)
124
  mat2$Measure = factor(mat2$Measure, levels=sort(unique(mat2$Measure)))
125
  mat2$X = factor(mat2$X, levels=colnames(mat))
126
  mat2$Y = factor(mat2$Y, levels=rev(rownames(mat)))
127
  
128
  part2 = ggplot(mat2, aes(x=X, y=Y, fill=Measure)) +
129
    geom_tile() + theme_bw() +
130
    scale_fill_manual(name="Mutated",
131
                      values=c(`2`="gray96", `3`=paste0(colors["G"], "E5"),
132
                               `5`=colors[["M"]][1], `5.5`=colors[["M"]][2],
133
                               `6`=colors[["M"]][3], `7`=colors[["P"]],
134
                               `4`=paste0(colors["I"],"E5")), guide=FALSE) +
135
    scale_y_discrete(expand=c(0,0)) +
136
    theme(axis.text.y=element_text(hjust=0, size=14),
137
          axis.text.x=element_blank(),
138
          axis.ticks=element_blank(),
139
          panel.border=element_rect(colour="gainsboro"),
140
          plot.title=element_text(size=12),
141
          legend.title=element_text(size=12),
142
          legend.text=element_text(size=12),
143
          panel.background=element_blank(),
144
          panel.grid.major=element_blank(),
145
          panel.grid.minor=element_blank()) +
146
    xlab("patients") + ylab("") + ggtitle(title)
147
  if(length(levels(mat2$Y)) > 1) {
148
    part2 = part2 + geom_hline(yintercept=seq(1.5, length(levels(mat2$Y)), 1),
149
                               colour="gainsboro", size=0.2)
150
  }
151
  
152
  # scatter plot
153
  mat3 = fy[colnames(mat)]
154
  mat3 = data.frame(X=factor(names(mat3), levels=names(mat3)), Y=mat3*100)
155
  Yrange = range(mat3$Y)
156
  Yhangs = diff(Yrange)*0.05
157
  Ylims = c(Yrange[1]-Yhangs, Yrange[2]+Yhangs)
158
  Yran = diff(Yrange)
159
  Ybreaks = if(Yran<=15) 5 else if(Yran>15 & Yran<=30) 10 else if(Yran>30 & Yran<=40) 15 else if(Yran>40 & Yran<=60) 20 else 40
160
  
161
  part4 = ggplot(mat3, aes(x=X, y=Y)) +
162
    geom_point(shape=21, fill="dimgrey", colour="black", size=1.2) +
163
    theme_bw() +
164
    theme(panel.grid.minor=element_blank(),
165
          panel.grid.major.x=element_blank(),
166
          axis.title.x=element_blank(),
167
          axis.text.x=element_blank(),
168
          axis.ticks.x=element_blank(),
169
          axis.text.y=element_text(size=8),
170
          panel.border=element_rect(colour="dimgrey", size=0.1),
171
          panel.background=element_rect(fill="gray96")) +
172
    ylab("Viability [%]") +
173
    scale_y_continuous(limits=c(Yrange[1]-Yhangs, Yrange[2]+Yhangs),
174
                       breaks=seq(-200,200,Ybreaks))
175
  
176
  
177
  # MERGE PARTS OF THE FIGURE
178
  # construct the gtable
179
  wdths = c(4, 0.5, 0.05*ncol(mat), 0.05)
180
  hghts = c(0.3, 0.25*nrow(mat), 0.1, 0.4, 0.35)
181
  gt = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in"))
182
  ## make grobs
183
  gg1 = ggplotGrob(part1)
184
  gg2 = ggplotGrob(part2)
185
  gg4 = ggplotGrob(part4)
186
  ## fill in the gtable
187
  gt = gtable_add_grob(gt, gtable_filter(gg1, "panel"), 2, 1) # add boxplot
188
  gt = gtable_add_grob(gt, gtable_filter(gg2, "panel"), 2, 3) # add heatmap
189
  gt = gtable_add_grob(gt, gtable_filter(gg4, "panel"), 4, 3) # add scatterplot
190
  gt = gtable_add_grob(gt, gtable_filter(gg2, "xlab"), 5, 3)
191
  gt = gtable_add_grob(gt, gg2$grobs[[whichInGrob(gg2, "title")]], 1, 3)
192
  gt = gtable_add_grob(gt, gg4$grobs[[whichInGrob(gg4, "axis-l")]], 4, 2)
193
  gt = gtable_add_grob(gt, gg1$grobs[[whichInGrob(gg1, "axis-b")]], 3, 1)
194
  gt = gtable_add_grob(gt, gtable_filter(gg1, "xlab-b"), 4, 1)
195
  
196
  # now complicated: add the axis-l labels from gg2
197
  gia = which(gg2$layout$name == "axis-l")
198
  gga = gg2$grobs[[gia]]
199
  gax = gga$children[[2]]
200
  gax$widths = rev(gax$widths)
201
  gax$grobs = rev(gax$grobs)
202
  gt = gtable_add_cols(gt, gg2$widths[gg2$layout[gia, ]$l])
203
  gt = gtable_add_grob(gt, gax, 2, 5)
204
  
205
  wdth = convertUnit(gt$widths, "in", valueOnly=TRUE)[5]
206
  # add column on the right of appropriate width
207
  maxwdth = 2
208
  gt = gtable_add_cols(gt, unit(maxwdth-wdth, "in"))
209
  
210
  return(list(plot=gt, width=sum(wdths)+maxwdth, height=sum(hghts),
211
              name=make.names(objectiveName)))
212
}
213
214
215
# main function to fit Lasso and produce barplots to find genetic
216
# determinants of drug response
217
doLasso = function(objective, predictors, lpd,suffix="",
218
                  nm=NA, lim=0.21, ncv=10, nfolds=10, std=FALSE,
219
                  adaLasso = TRUE, colors) {
220
  
221
  #construct design and response matrix
222
  out = featureSelectionForLasso(objective, predictors, lpd)
223
  
224
  # for simplification
225
  fy = out[["fy"]]
226
  fx = out[["fx"]]
227
  family = out[["family"]]
228
  objectiveName = out[["objectiveName"]]
229
  prefix = out[["prefix"]]
230
  fxdim = dim(fx)
231
  print(sprintf("Prediction for: %s; #samples: %d; #features: %d",
232
                objectiveName, fxdim[1], fxdim[2]))
233
  
234
  # adaptive lasso for a more stable feature selection
235
  set.seed(19087)
236
  if(adaLasso){
237
    if(ncol(fx)>= nrow(fx)) {
238
      RidgeFit <- cv.glmnet(fx, fy, alpha = 0, standardize = std,
239
                            family = family, nfolds=10)
240
      # wRidge <- pmin(1/abs((coef(RidgeFit, s = RidgeFit$lambda.min))), 1e+300)
241
      wRidge <- 1/abs(coef(RidgeFit, s = RidgeFit$lambda.min))
242
      wRidge <- wRidge[-1]
243
      weights <- wRidge
244
    } else {
245
      lmFit <- lm(fy ~ fx)
246
      # wLM <- pmin(1/abs(coefficients(lmFit)[-1]), 1e+300)
247
      wLM <- 1/abs(coefficients(lmFit)[-1])
248
      weights <- wLM
249
    }
250
    excludedFeatures <- which(weights==Inf)
251
  } else {
252
    weights <- rep(1, ncol(fx))
253
    excludedFeatures <- NULL
254
  }
255
  
256
  #perform repeated cross-validation to find an optimal penalisatio
257
  #parameter minimizing the cross-validated MSE
258
  cv.out <- cvr.glmnet(Y=fy, X=fx, family=family,
259
                       alpha=1, nfolds=nfolds,
260
                       ncv=ncv, standardize=std,
261
                       exclude = excludedFeatures,
262
                       type.measure = "mse", penalty.factor = weights)
263
  
264
  # #fit Lasso model for optimal lambda                   
265
  fit = glmnet(y=fy, x=fx, family=family,
266
               alpha=1,
267
               standardize=std,
268
               exclude = excludedFeatures,
269
               lambda=cv.out$lambda, penalty.factor = weights)
270
  
271
  #get optimal lambda and corresponding predictors with coefficients
272
  lambda <- cv.out$lambda[which.min(cv.out$cvm)]
273
  coeffs <- coef(fit, lambda)
274
  coeffs_all <- coeffs
275
  coeffs <- as.vector(coeffs) %>%
276
    `names<-`(rownames(coeffs))   # cast from sparse matrix to ordinary vector
277
  coeffs <- coeffs[ coeffs!=0 ]
278
  
279
  # remove intercept term
280
  stopifnot(names(coeffs)[1]=="(Intercept)")
281
  if (length(coeffs) > 1) {
282
    coeffs <- coeffs[-1]
283
  } else {
284
    print("No (0) predictors for given parameters!")
285
    return(0)
286
  }
287
  coeffs <- sort(coeffs)
288
  
289
  # Residuals in the model
290
  pred <- predict(fit, newx = fx, s = lambda, type = "response")
291
  residuals <- pred[,1]-fy
292
  
293
  plot = plotPredictions(fx, fy, objective, pred, coeffs, lpd, nm, lim,
294
                         objectiveName, colors)
295
  
296
  return(list(residuals=residuals, coeffs=coeffs_all, plot=plot))
297
}
298
299
# Make list of predictors for the given lpd
300
makeListOfPredictors = function(lpd) {
301
  return(list(
302
    predictorsM = rownames(fData(lpd))[fData(lpd)$type=="Methylation_Cluster"],
303
    predictorsG = rownames(fData(lpd))[fData(lpd)$type=="gen"],
304
    predictorsI = rownames(fData(lpd))[fData(lpd)$type=="IGHV"],
305
    predictorsP = rownames(fData(lpd))[fData(lpd)$type=="pretreat"]
306
  ))
307
}
308
309
310
# Pre-process data: explore what is available & feature selection
311
prepareLPD = function(lpd, minNumSamplesPerGroup, withMC=TRUE) {
312
  
313
  # PRETREATMENT
314
  # update the expression set by adding row about pretreatment
315
  pretreated <- t(matrix(ifelse(
316
    BloodCancerMultiOmics2017::patmeta[colnames(lpd),
317
    "IC50beforeTreatment"], 0, 1),
318
    dimnames=list(colnames(lpd), "Pretreatment"))) 
319
  fdata_pretreat <- data.frame(name=NA, type="pretreat", id=NA, subtype=NA,
320
                               row.names="Pretreatment")
321
  lpd <- ExpressionSet(assayData=rbind(Biobase::exprs(lpd), pretreated),
322
                          phenoData=new("AnnotatedDataFrame", data=pData(lpd)),
323
                          featureData=new("AnnotatedDataFrame",
324
                                          rbind(fData(lpd), fdata_pretreat)))
325
  # METHYLATION
326
  Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",] =
327
    Biobase::exprs(lpd)[fData(lpd)$type=="Methylation_Cluster",]/2
328
  
329
  # IGHV: changing name from Uppsala to IGHV
330
  rownames(lpd)[which(fData(lpd)$type=="IGHV")] = "IGHV"
331
  
332
  # GENETICS
333
  # changing name od del13q any to del13q & remove the rest of del13q (mono, single)
334
  idx = which(rownames(lpd) %in% c("del13q14_bi", "del13q14_mono"))
335
  lpd = lpd[-idx,]
336
  rownames(lpd)[which(rownames(lpd)=="del13q14_any")] = "del13q14"
337
  
338
  # remove CHROMOTHRYPSIS
339
  if("Chromothripsis" %in% rownames(lpd))
340
    lpd = lpd[-which(rownames(lpd)=="Chromothripsis"),]
341
  
342
  # SELECT GOOD SAMPLES
343
  idx = !is.na(Biobase::exprs(lpd)["IGHV",])
344
  if(withMC) idx = idx & !is.na(Biobase::exprs(lpd)["Methylation_Cluster",])
345
  # cut out the data to have information about Methylation_Cluster and IGHV for all samples
346
  lpd = lpd[, idx]
347
  # for the genetics - remove the genes which do not have enough samples
348
  which2remove = names(
349
    which(!apply(Biobase::exprs(lpd)[rownames(lpd)[fData(lpd)$type %in%
350
                                            c("gen")],], 1, function(cl) {
351
    if(all(is.na(cl))) return(FALSE)
352
    if(sum(is.na(cl)) >= 0.1*length(cl)) return(FALSE)
353
    tmp = table(cl)
354
    return(length(tmp)==2 & all(tmp>=minNumSamplesPerGroup))
355
  })))
356
  lpd = lpd[-match(which2remove, rownames(lpd)),]
357
  
358
  # for the ones with NA put 0 instead
359
  featOther = rownames(lpd)[fData(lpd)$type %in%
360
                              c("IGHV", "Methylation_Cluster", "gen", "viab",
361
                                "pretreat")]
362
  tmp = Biobase::exprs(lpd)[featOther,]
363
  tmp[is.na(tmp)] = 0
364
  Biobase::exprs(lpd)[featOther,] = tmp
365
  
366
  return(lpd)
367
}
368
369
370
# wrapper functions to do Lasso model fitting, plotting and prediction
371
makePredictions = function(drs, frq, lpd, predictorList, lim, std=FALSE,
372
                           adaLasso = TRUE, colors) {
373
                           
374
  res = lapply(names(drs), function(typ) {
375
    setNames(lapply(drs[[typ]], function(dr) {
376
      if(typ=="1:5")
377
        nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"],
378
                     " (average of all concentrations)")
379
        else if(typ=="4:5")
380
          nm <- paste0(BloodCancerMultiOmics2017::drugs[dr, "name"],
381
                       " (average of ", paste(
382
                       BloodCancerMultiOmics2017::conctab[dr,4:5]*1000,
383
                         collapse = " and "), " nM)")
384
      # G & I & M & P
385
      doLasso(Biobase::exprs(lpd)[grepl(dr, rownames(lpd)) &
386
                           fData(lpd)$subtype==typ,, drop=FALSE], 
387
              predictors=with(predictorList,
388
                              c(predictorsI, predictorsG, predictorsM,
389
                                predictorsP)), 
390
              lpd=lpd,
391
              suffix=paste0("_","th0", "_c",gsub(":","-",typ)),
392
              nm=nm, lim=lim, colors=colors)
393
    }), nm=drs[[typ]])
394
  })
395
  return(res)
396
}
397
398
399
# Function to plot the legends
400
makeLegends = function(legendFor, colors) {
401
  
402
  # quiets concerns of R CMD check "no visible binding for global variable"
403
  x=NULL; y=NULL
404
  
405
  # select the colors needed
406
  colors = colors[names(colors) %in% legendFor]
407
  
408
  nleg = length(colors)
409
  wdths = rep(2, length.out=nleg); hghts = c(2)
410
  gtl = gtable(widths=unit(wdths, "in"), heights=unit(hghts, "in"))
411
  n=1
412
413
  # M
414
  if("M" %in% names(colors)) {
415
    Mgg = ggplot(data=data.frame(x=1, y=factor(c("LP","IP","HP"),
416
                                               levels=c("LP","IP","HP"))),
417
                 aes(x=x, y=y, fill=y)) + geom_tile() +
418
      scale_fill_manual(name="Methylation cluster",
419
                        values=setNames(colors[["M"]], nm=c("LP","IP","HP"))) +
420
      theme(legend.title=element_text(size=12),
421
            legend.text=element_text(size=12))
422
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Mgg), "guide-box"), 1, n)
423
    n = n+1
424
  }
425
  
426
  # I
427
  if("I" %in% names(colors)) {
428
    Igg = ggplot(data=data.frame(x=1,
429
                                 y=factor(c("unmutated","mutated"),
430
                                          levels=c("unmutated","mutated"))),
431
                 aes(x=x, y=y, fill=y)) + geom_tile() +
432
      scale_fill_manual(name="IGHV",
433
                        values=setNames(c("gray96",
434
                                          paste0(colors["I"], c("E5"))),
435
                                        nm=c("unmutated","mutated"))) +
436
      theme(legend.title=element_text(size=12),
437
            legend.text=element_text(size=12))
438
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Igg), "guide-box"), 1, n)
439
    n = n+1
440
  }
441
  
442
  # G
443
  if("G" %in% names(colors)) {
444
    Ggg = ggplot(data=data.frame(x=1,
445
                                 y=factor(c("wild type","mutated"),
446
                                          levels=c("wild type","mutated"))),
447
                 aes(x=x, y=y, fill=y)) + geom_tile() +
448
      scale_fill_manual(name="Gene",
449
                        values=setNames(c("gray96",
450
                                          paste0(colors["G"], c("E5"))),
451
                                        nm=c("wild type","mutated"))) +
452
      theme(legend.title=element_text(size=12),
453
            legend.text=element_text(size=12))
454
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Ggg), "guide-box"), 1, n)
455
    n = n+1
456
  }
457
  
458
  # P
459
  if("P" %in% names(colors)) {
460
    Pgg = ggplot(data=data.frame(x=1,
461
                                 y=factor(c("no","yes"),
462
                                          levels=c("no","yes"))),
463
                 aes(x=x, y=y, fill=y)) + geom_tile() +
464
      scale_fill_manual(name="Pretreatment",
465
                        values=setNames(c(colors[["P"]], "white"),
466
                                        nm=c("yes","no"))) +
467
      theme(legend.title=element_text(size=12),
468
            legend.text=element_text(size=12))
469
    gtl = gtable_add_grob(gtl, gtable_filter(ggplotGrob(Pgg), "guide-box"), 1, n)
470
    n = n+1
471
  }
472
  
473
  return(list(plot=gtl, width=sum(wdths), height=sum(hghts)))
474
}
475