a b/Fig7_multivariable_regression_eNet_survival.R
1
GIT_HOME="/research/users/ppolonen/git_home/ImmunogenomicLandscape-BloodCancers/"
2
source(file.path(GIT_HOME, "common_scripts/visualisation/plotting_functions.R"))
3
source(file.path(GIT_HOME, "common_scripts/statistics/functions_statistics.R"))
4
source(file.path(GIT_HOME, "common_scripts/statistics/statistics_wrappers.R"))
5
source(file.path(GIT_HOME, "common_scripts/featurematrix/compute.pairwise.R"))
6
source(file.path(GIT_HOME, "common_scripts/featurematrix/functions_generate_fm.R"))
7
8
library(RColorBrewer)
9
library(survival)
10
library(data.table)
11
library(ggplot2)
12
library(ComplexHeatmap)
13
14
validate.model=function(dataset,coef, ind=1, months=F, summary="3quantiles", filt=NULL){
15
  load(dataset)
16
  
17
  if(!is.null(filt)){
18
    logicalv=list(filt&logicalv[[ind]])
19
    ind=1
20
  }
21
  
22
  time=TIME[logicalv[[ind]]]
23
  time[time==0]=0.1
24
  status=STATUS[logicalv[[ind]]]
25
  
26
  genelist=rownames(coef)
27
  
28
  test.data=cbind(gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F])
29
  
30
  if(!(all(genelist%in%colnames(test.data)))){
31
    warning(paste("Features not found from test data:", paste(genelist[!genelist%in%colnames(test.data)], collapse = ",")))
32
    coef=coef[rownames(coef)%in%colnames(test.data),,drop=F]
33
    genelist=rownames(coef)
34
  }
35
  
36
  m=test.data[,match(genelist, colnames(test.data))]
37
  riskPI=as.numeric(coef) %*% data.matrix(t(m))
38
  
39
  print(summary(coxph(Surv(time, status) ~ PI.test, data.frame("PI.test"=as.numeric(riskPI)))))
40
  
41
  # plot validation data:
42
  print(fun.kapplanMeier(time, status, CONTINUOUS = as.numeric(riskPI), conf.int = F, MONTHS=months, PVAL=1,LWD = 0.5, CONTINUOUS_SUMMARY = summary, INDIVIDUAL_GROUPS=F, NAME = ""))
43
  
44
  return(riskPI)
45
}
46
47
validate.model.cox=function(dataset,coef, ind=1, months=F, summary="3quantiles", filt=NUL, NAME=""){
48
  load(dataset)
49
  
50
  if(!is.null(filt)){
51
    logicalv=list(filt&logicalv[[ind]])
52
    ind=1
53
  }
54
  
55
  time=TIME[logicalv[[ind]]]
56
  time[time==0]=0.1
57
  status=STATUS[logicalv[[ind]]]
58
  
59
  genelist=rownames(coef)
60
  
61
  test.data=cbind(gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F])
62
  
63
  if(!(all(genelist%in%colnames(test.data)))){
64
    warning(paste("Features not found from test data:", paste(genelist[!genelist%in%colnames(test.data)], collapse = ",")))
65
    coef=coef[rownames(coef)%in%colnames(test.data),,drop=F]
66
    genelist=rownames(coef)
67
  }
68
  
69
  m=test.data[,match(genelist, colnames(test.data))]
70
  riskPI=as.numeric(coef) %*% data.matrix(t(m))
71
  
72
  fit=coxph(Surv(time, status) ~ PI.test, data.frame("PI.test"=as.numeric(riskPI)))
73
74
  a=summary(fit)
75
  
76
  pval=a$coefficients[,5]
77
  coef=a$conf.int[,c(1,3,4)]
78
  univ=data.frame(rownames(a$coefficients),t(coef), pval, a$concordance[1],NAME, stringsAsFactors = F)
79
  rownames(univ)=NULL
80
  colnames(univ)=c("Feature", "exp(coef)", "lower .95", "upper .95", "P", "concordance", "Name")
81
  return(univ)
82
}
83
84
85
cols=data.frame("name"=c("Subtype","ImmunoScores","Inhibitory ligand", "Stimulatory ligand", "Stromal/cancer gene (Rho > 0)", "Stromal/cancer gene (Rho < 0)","CTL/NK gene","Clinical", "CGA", "MDS-signature gene"),
86
                "color"=c("#acb839","#5e2883","#1f78b4","#b2df8a","#377eb8","grey50","#e41a1c","brown", "#d7a85b", "indianred"), stringsAsFactors = F)
87
88
89
Plot.model=function(dataset, coef, type.feat, ind=1, NAME=NULL){
90
  load(dataset)
91
  
92
  time=TIME[logicalv[[ind]]]
93
  time[time==0]=0.1
94
  status=STATUS[logicalv[[ind]]]
95
  
96
  genelist=rownames(coef)
97
  
98
  gexp=gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F]
99
  immunoscore=immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F]
100
  samp=samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F]
101
  
102
  test.data=cbind(gexp, immunoscore, samp)
103
  
104
  if(!(all(genelist%in%colnames(test.data)))){
105
    warning(paste("Features not found from test data:", paste(genelist[!genelist%in%colnames(test.data)], collapse = ",")))
106
    coef=coef[rownames(coef)%in%colnames(test.data),,drop=F]
107
    genelist=rownames(coef)
108
  }
109
  
110
  m=test.data[,match(genelist, colnames(test.data))]
111
  riskPI=as.numeric(coef) %*% data.matrix(t(m))
112
  
113
  gene.annot=data.frame(type.feat[match(rownames(coef), type.feat[,1]),], "HR"=as.numeric(exp(coef)), stringsAsFactors = F)
114
  gene.annot=gene.annot[order(gene.annot$Type, -gene.annot$HR),]
115
  fm.m=t(m)
116
  fm.m=fm.m[match(gene.annot$Feature, rownames(fm.m)),]
117
  
118
  rownames(fm.m)[rownames(fm.m)%in%colnames(gexp)]=paste0("N:GEXP:", rownames(fm.m)[rownames(fm.m)%in%colnames(gexp)])
119
  rownames(fm.m)[rownames(fm.m)%in%colnames(immunoscore)]=paste0("N:GSVA:", rownames(fm.m)[rownames(fm.m)%in%colnames(immunoscore)])
120
  rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Subtype")]]=paste0("B:SAMP:", rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Subtype")]])
121
  rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Clinical")]]=paste0("N:CLIN:", rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Clinical")]])
122
  
123
  rownames(gene.annot)=rownames(fm.m)
124
  HR=prettyNum(signif(gene.annot[,3], 3))
125
  names(HR)=rownames(gene.annot)
126
  
127
  annotdf=data.frame("OS"=status, "RiskPI"=t(riskPI), stringsAsFactors = F)
128
  rownames(annotdf)=colnames(fm.m)
129
  
130
  plot.complexHM.fm(feats = rownames(gene.annot), text_annot = HR, feats.barplot = c("RiskPI"), split.columns = F, annotdf = annotdf, fm.f = fm.m, order_columns = colnames(fm.m)[order(as.numeric(riskPI))], use_raster = F, order_rows = F, NAME = NAME)
131
132
}
133
134
fun_forestplot=function(data, NAME="data", BOX=0.1,cex=2, colorv="black"){
135
  library(forestplot)
136
  
137
  txt.df=data.frame("Feature"=data$Feature, "Cohort"=gsub("_", " ", data$Name), "P"=signif(data$P, 2), stringsAsFactors = F)
138
  txt.df$Feature[duplicated(txt.df$Feature)]=""
139
  txt.df=rbind(c("Feature", "Cohort", "P"), txt.df)
140
  
141
  coef.df=data.frame("HR"=data$`exp(coef)`, "lower .95"=data$`lower .95`, "upper .95"=data$`upper .95`)
142
  coef.df=rbind(c(NA, NA, NA), coef.df)
143
  
144
  xticks=seq(ifelse(min(data$`lower .95`)<0.5, 0, 0.5), min(c(5, max(data$`upper .95`))), by = 0.5)
145
  
146
  attr(xticks, "labels") = xticks%in%seq(-4, 4, by=1)
147
  
148
  forestplot(txt.df,coef.df, new_page = T, zero = c(0.98, 1.02), 
149
             clip =c(-1, 2), is.summary = c(T, rep(F, length(data$Name))),
150
             xticks=xticks,
151
             boxsize=BOX,
152
             xlab="HR",
153
             col=fpColors(box=colorv),
154
             txt_gp = fpTxtGp(label = list(gpar(fontfamily = "Helvetica", cex=cex*1.25),
155
                                           gpar(fontfamily = "Helvetica", col = "black", cex=cex)),
156
                              summary = gpar(fontfamily = "Helvetica", cex=cex*1.5),
157
                              ticks = gpar(fontfamily = "Helvetica", cex=cex),
158
                              xlab  = gpar(fontfamily = "Helvetica", cex = cex*1.5)))
159
  
160
}
161
162
# survival analysis for all scores:
163
# many coefficients in survival analysis, using regularization and elastic net to select features for cox model.
164
165
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/Published_data_figures")
166
167
#**************************************** training data DLBCL **************************************** 
168
load("Hemap_DLBCL_survival_data.Rdata")
169
170
# significant genes:
171
files=list.files(pattern = "tableS7")
172
173
# significant genes:
174
univariate.results=lapply(files[grepl("signif", files)], fread, data.table=F)
175
names(univariate.results)=files[grepl("signif", files)]
176
177
DLBCL=do.call(rbind, univariate.results[grep("DLBCL", names(univariate.results))])
178
DLBCL$Name=gsub("_RCHOP", "", DLBCL$Name)
179
DLBCL$Feature=gsub("-", ".", DLBCL$Feature)
180
181
# genes with FDR<0.2, beta to same direction and observed in 2 cohorts:
182
genes=table(DLBCL$Feature, DLBCL$`exp(coef)`>1)>1
183
genelist=rownames(genes)[rowSums(genes)==1]
184
185
# combine data:
186
time=TIME[logicalv[[5]]]
187
status=STATUS[logicalv[[5]]]
188
189
regression.data=cbind(gexp[logicalv[[5]], colnames(gexp)%in%genelist,], immunoscore[logicalv[[5]], colnames(immunoscore)%in%genelist,], samp[logicalv[[5]], colnames(samp)%in%genelist,])
190
all(genelist%in%colnames(regression.data))
191
192
# results_dlbcl=fun.cox.elasticnet(DATA_ORG = regression.data, time, status, summary.km = "3quantile", cores = 10, REPEATS = 100, percentage = 0, nfold = 10, min.elnet = 0, max.elnet = 0.1)
193
# save(results_dlbcl, file="Hemap_DLBCL_cox_datasets_filt_adj20_revision.Rdata")
194
load("Hemap_DLBCL_cox_datasets_filt_adj20_revision.Rdata")
195
196
# revision
197
# [1] 0.1
198
# [1] 0.0745611
199
# [1] 11.60345
200
201
annot=get(load("GSE98588_annot.Rdata"))
202
203
pdf("FigS7D_DLBCL_model_cox.pdf", height = 2.5, width = 2)
204
riskInd_Chapuy=validate.model("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients)
205
riskInd_Hemap=validate.model("Hemap_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, 5)
206
dev.off()
207
208
DLBCL_risk1=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$IPI%in%c(0, 1, 2), NAME = "DLBCL IPI 0-2")
209
DLBCL_risk2=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$IPI%in%c(3), NAME = "DLBCL IPI 3")
210
DLBCL_risk3=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$IPI%in%c(4,5), NAME = "DLBCL IPI 4-5")
211
DLBCL_risk4=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$COO_byGEP=="ABC", NAME = "DLBCL ABC")
212
DLBCL_risk5=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$COO_byGEP=="GCB", NAME = "DLBCL GCB")
213
214
type.feat=unique(DLBCL[,c("Feature", "Type")])
215
type.feat=type.feat[order(type.feat[,2]),]
216
217
Plot.model(dataset = "GSE98588_DLBCL_survival_data.Rdata", coef = results_dlbcl$coefficients, type.feat = type.feat, ind = 1,NAME = "Fig7F_Chapuy_model")
218
# Plot.model(dataset = "Reddy_DLBCL_survival_data.Rdata", coef = results_dlbcl$coefficients, type.feat = type.feat, ind = 1,NAME = "Reddy_model")
219
220
#**************************************** training data MM **************************************** 
221
load("Hemap_MM_survival_data.Rdata")
222
223
# significant genes:
224
files=list.files(pattern = "tableS7")
225
226
# significant genes:
227
univariate.results=lapply(files[grepl("signif", files)], fread, data.table=F)
228
names(univariate.results)=files[grepl("signif", files)]
229
230
MM=do.call(rbind, univariate.results[grep("MM", names(univariate.results))])
231
MM$Feature=gsub("-", ".", MM$Feature)
232
233
# genes with FDR<0.2, beta to same direction and observed in 2 cohorts:
234
# use GSE19784 Cancer_Myeloma as training and GSE16716,GSE24080 Cancer_Myeloma as test set
235
genes=table(MM$Feature, MM$`exp(coef)`>1)>1
236
genelist=rownames(genes)[rowSums(genes)==1]
237
genelist=genelist[!genelist%in%"WHSC1_FGFR3_Ig"]
238
239
# combine data:
240
ind=1
241
time=TIME[logicalv[[ind]]]
242
time[time==0]=0.1
243
status=STATUS[logicalv[[ind]]]
244
245
regression.data=cbind(gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F])
246
all(genelist%in%colnames(regression.data))
247
248
# results_MM=fun.cox.elasticnet(DATA_ORG = regression.data, time, status, summary.km = "3quantile", cores = 10, REPEATS = 100, percentage = 0, nfold = 10)
249
# save(results_MM, file="Hemap_MM_cox_datasets_filt_adj20_revision.Rdata")
250
# [1] 0.05
251
# [1] 0.1089963
252
# [1] 14.10799
253
254
load("Hemap_MM_cox_datasets_filt_adj20_revision.Rdata")
255
256
pdf("FigS7C_MM_model_cox.pdf", height = 2.5, width = 2)
257
riskInd_CoMMpass=validate.model(dataset = "Hemap_MM_survival_data.Rdata", coef = results_MM$coefficients, ind=1, summary="75th_25th_percentile")
258
riskInd_CoMMpass=validate.model("CoMMpass_survival_data.Rdata", results_MM$coefficients, summary="75th_25th_percentile", months = T)
259
dev.off()
260
261
type.feat=unique(MM[,c("Feature", "Type")])
262
type.feat=type.feat[order(type.feat[,2]),]
263
264
Plot.model(dataset = "CoMMpass_survival_data.Rdata", coef = results_MM$coefficients, type.feat = type.feat, ind = 1,NAME = "Fig7E_CoMMpass_model")
265
266
load("CoMMpass_MM_subtypes.Rdata")
267
268
subtype=coordinates.subtype[match(colnames(riskInd_CoMMpass)[order(-riskInd_CoMMpass)],coordinates.subtype$ID),]
269
subtype$subtype[subtype$cluster=="CGA_Prolif"]="CGA_Prolif"
270
271
col=data.frame("subtype"=c("CCND1_Ig", "WHSC1_FGFR3_Ig", "Hyperdiploid_gain11q", "Hyperdiploid_gain1q", "MAF_Ig", "TRAF3_Aberrated", "CGA_Prolif"),
272
"color"=c("#e41a1b", "#357eb8", "#5eb45b", "#9b53a4", "#ff7d00", "#f8f875", "darkred"), stringsAsFactors = F)
273
274
load("CoMMpass_survival_data.Rdata")
275
MM_risk1=validate.model.cox("CoMMpass_survival_data.Rdata", results_MM$coefficients, filt=samp$ISS1==1, NAME = "CoMMpass ISS 1")
276
MM_risk2=validate.model.cox("CoMMpass_survival_data.Rdata", results_MM$coefficients, filt=samp$ISS2==1, NAME = "CoMMpass ISS 2")
277
MM_risk3=validate.model.cox("CoMMpass_survival_data.Rdata", results_MM$coefficients, filt=samp$ISS3==1, NAME = "CoMMpass ISS 3")
278
279
#**************************************** training data AML **************************************** 
280
load("Hemap_AML_survival_data.Rdata")
281
282
# significant genes:
283
files=list.files(pattern = "tableS7")
284
285
# significant genes:
286
univariate.results=lapply(files[grepl("signif", files)], fread, data.table=F)
287
names(univariate.results)=files[grepl("signif", files)]
288
289
AML=do.call(rbind, univariate.results[grep("AML", names(univariate.results))])
290
AML$Feature=gsub("-", ".", AML$Feature)
291
292
# genes with FDR<0.2, beta to same direction and observed in 2 cohorts:
293
genes=table(AML$Feature, AML$`exp(coef)`>1)>1
294
genelist=rownames(genes)[rowSums(genes)==1]
295
genelist=gsub("-", ".", genelist)
296
297
# combine data:
298
time=TIME[logicalv[[1]]]
299
time[time==0]=0.1
300
status=STATUS[logicalv[[1]]]
301
302
regression.data=cbind(gexp[logicalv[[1]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[1]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[1]], colnames(samp)%in%genelist,drop=F])
303
all(genelist%in%colnames(regression.data))
304
305
# results_AML=fun.cox.elasticnet(DATA_ORG = regression.data, time, status, summary.km = "3quantile", cores = 10, REPEATS = 100, percentage = 0, nfold = 5)
306
307
# model revision
308
# [1] 0.06
309
# [1] 0.1982066
310
# [1] 11.88285
311
312
# save(results_AML, file="Hemap_AML_cox_datasets_filt_adj20_revision.Rdata")
313
load("Hemap_AML_cox_datasets_filt_adj20_revision.Rdata")
314
exp(results_AML$coefficients)
315
316
pdf("FigS7E_Risk_AML_validation.pdf", height = 2.5, width = 2)
317
riskInd_Hemap_AML=validate.model("Hemap_AML_survival_data.Rdata", results_AML$coefficients, summary = "75th_25th_percentile")
318
riskInd_BeatAML=validate.model("BeatAML_survival_data.Rdata", results_AML$coefficients, months=T, summary = "75th_25th_percentile")
319
riskInd_TCGA_AML=validate.model("TCGA_AML_survival_data.Rdata", results_AML$coefficients, summary = "75th_25th_percentile")
320
dev.off()
321
322
type.feat=unique(AML[,c("Feature", "Type")])
323
type.feat=type.feat[order(type.feat[,2]),]
324
325
# Plot.model(dataset = "Hemap_AML_survival_data.Rdata", coef = results_AML$coefficients, type.feat = type.feat, ind = 1,NAME = "Hemap_AML_model")
326
Plot.model(dataset = "BeatAML_survival_data.Rdata", coef = results_AML$coefficients, type.feat = type.feat, ind = 1,NAME = "FigS7G_BeatAML_model")
327
# Plot.model(dataset = "TCGA_AML_survival_data.Rdata", coef = results_AML$coefficients, type.feat = type.feat, ind = 1,NAME = "TCGA_AML_model")
328
329
load("BeatAML_survival_data.Rdata")
330
annot=get(load("BeatAML_fm_annot.Rdata"))
331
332
AML_risk1=validate.model.cox("BeatAML_survival_data.Rdata", results_AML$coefficients, filt=grepl("Adverse", annot$ELN2017), NAME = "BeatAML Adverse")
333
AML_risk2=validate.model.cox("BeatAML_survival_data.Rdata", results_AML$coefficients, filt=annot$ELN2017=="Intermediate", NAME = "BeatAML Intermediate")
334
AML_risk3=validate.model.cox("BeatAML_survival_data.Rdata", results_AML$coefficients, filt=grepl("Favorable", annot$ELN2017), NAME = "BeatAML Favorable")
335
336
dat=rbind(DLBCL_risk1, DLBCL_risk2, DLBCL_risk3, DLBCL_risk4, DLBCL_risk5, MM_risk1, MM_risk2, MM_risk3, AML_risk1, AML_risk2, AML_risk3)
337
338
pdf("FigS7F_Subset_data_model.pdf", width = 4, height = 3)
339
fun_forestplot(dat, "AML, MM, DLBCL", BOX=0.5, cex=0.5, colorv ="black")
340
dev.off()