GIT_HOME="/research/users/ppolonen/git_home/ImmunogenomicLandscape-BloodCancers/"
source(file.path(GIT_HOME, "common_scripts/visualisation/plotting_functions.R"))
source(file.path(GIT_HOME, "common_scripts/statistics/functions_statistics.R"))
source(file.path(GIT_HOME, "common_scripts/statistics/statistics_wrappers.R"))
source(file.path(GIT_HOME, "common_scripts/featurematrix/compute.pairwise.R"))
source(file.path(GIT_HOME, "common_scripts/featurematrix/functions_generate_fm.R"))
library(RColorBrewer)
library(survival)
library(data.table)
library(ggplot2)
library(ComplexHeatmap)
validate.model=function(dataset,coef, ind=1, months=F, summary="3quantiles", filt=NULL){
load(dataset)
if(!is.null(filt)){
logicalv=list(filt&logicalv[[ind]])
ind=1
}
time=TIME[logicalv[[ind]]]
time[time==0]=0.1
status=STATUS[logicalv[[ind]]]
genelist=rownames(coef)
test.data=cbind(gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F])
if(!(all(genelist%in%colnames(test.data)))){
warning(paste("Features not found from test data:", paste(genelist[!genelist%in%colnames(test.data)], collapse = ",")))
coef=coef[rownames(coef)%in%colnames(test.data),,drop=F]
genelist=rownames(coef)
}
m=test.data[,match(genelist, colnames(test.data))]
riskPI=as.numeric(coef) %*% data.matrix(t(m))
print(summary(coxph(Surv(time, status) ~ PI.test, data.frame("PI.test"=as.numeric(riskPI)))))
# plot validation data:
print(fun.kapplanMeier(time, status, CONTINUOUS = as.numeric(riskPI), conf.int = F, MONTHS=months, PVAL=1,LWD = 0.5, CONTINUOUS_SUMMARY = summary, INDIVIDUAL_GROUPS=F, NAME = ""))
return(riskPI)
}
validate.model.cox=function(dataset,coef, ind=1, months=F, summary="3quantiles", filt=NUL, NAME=""){
load(dataset)
if(!is.null(filt)){
logicalv=list(filt&logicalv[[ind]])
ind=1
}
time=TIME[logicalv[[ind]]]
time[time==0]=0.1
status=STATUS[logicalv[[ind]]]
genelist=rownames(coef)
test.data=cbind(gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F])
if(!(all(genelist%in%colnames(test.data)))){
warning(paste("Features not found from test data:", paste(genelist[!genelist%in%colnames(test.data)], collapse = ",")))
coef=coef[rownames(coef)%in%colnames(test.data),,drop=F]
genelist=rownames(coef)
}
m=test.data[,match(genelist, colnames(test.data))]
riskPI=as.numeric(coef) %*% data.matrix(t(m))
fit=coxph(Surv(time, status) ~ PI.test, data.frame("PI.test"=as.numeric(riskPI)))
a=summary(fit)
pval=a$coefficients[,5]
coef=a$conf.int[,c(1,3,4)]
univ=data.frame(rownames(a$coefficients),t(coef), pval, a$concordance[1],NAME, stringsAsFactors = F)
rownames(univ)=NULL
colnames(univ)=c("Feature", "exp(coef)", "lower .95", "upper .95", "P", "concordance", "Name")
return(univ)
}
cols=data.frame("name"=c("Subtype","ImmunoScores","Inhibitory ligand", "Stimulatory ligand", "Stromal/cancer gene (Rho > 0)", "Stromal/cancer gene (Rho < 0)","CTL/NK gene","Clinical", "CGA", "MDS-signature gene"),
"color"=c("#acb839","#5e2883","#1f78b4","#b2df8a","#377eb8","grey50","#e41a1c","brown", "#d7a85b", "indianred"), stringsAsFactors = F)
Plot.model=function(dataset, coef, type.feat, ind=1, NAME=NULL){
load(dataset)
time=TIME[logicalv[[ind]]]
time[time==0]=0.1
status=STATUS[logicalv[[ind]]]
genelist=rownames(coef)
gexp=gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F]
immunoscore=immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F]
samp=samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F]
test.data=cbind(gexp, immunoscore, samp)
if(!(all(genelist%in%colnames(test.data)))){
warning(paste("Features not found from test data:", paste(genelist[!genelist%in%colnames(test.data)], collapse = ",")))
coef=coef[rownames(coef)%in%colnames(test.data),,drop=F]
genelist=rownames(coef)
}
m=test.data[,match(genelist, colnames(test.data))]
riskPI=as.numeric(coef) %*% data.matrix(t(m))
gene.annot=data.frame(type.feat[match(rownames(coef), type.feat[,1]),], "HR"=as.numeric(exp(coef)), stringsAsFactors = F)
gene.annot=gene.annot[order(gene.annot$Type, -gene.annot$HR),]
fm.m=t(m)
fm.m=fm.m[match(gene.annot$Feature, rownames(fm.m)),]
rownames(fm.m)[rownames(fm.m)%in%colnames(gexp)]=paste0("N:GEXP:", rownames(fm.m)[rownames(fm.m)%in%colnames(gexp)])
rownames(fm.m)[rownames(fm.m)%in%colnames(immunoscore)]=paste0("N:GSVA:", rownames(fm.m)[rownames(fm.m)%in%colnames(immunoscore)])
rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Subtype")]]=paste0("B:SAMP:", rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Subtype")]])
rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Clinical")]]=paste0("N:CLIN:", rownames(fm.m)[rownames(fm.m)%in%gene.annot$Feature[gene.annot$Type%in%c("Clinical")]])
rownames(gene.annot)=rownames(fm.m)
HR=prettyNum(signif(gene.annot[,3], 3))
names(HR)=rownames(gene.annot)
annotdf=data.frame("OS"=status, "RiskPI"=t(riskPI), stringsAsFactors = F)
rownames(annotdf)=colnames(fm.m)
plot.complexHM.fm(feats = rownames(gene.annot), text_annot = HR, feats.barplot = c("RiskPI"), split.columns = F, annotdf = annotdf, fm.f = fm.m, order_columns = colnames(fm.m)[order(as.numeric(riskPI))], use_raster = F, order_rows = F, NAME = NAME)
}
fun_forestplot=function(data, NAME="data", BOX=0.1,cex=2, colorv="black"){
library(forestplot)
txt.df=data.frame("Feature"=data$Feature, "Cohort"=gsub("_", " ", data$Name), "P"=signif(data$P, 2), stringsAsFactors = F)
txt.df$Feature[duplicated(txt.df$Feature)]=""
txt.df=rbind(c("Feature", "Cohort", "P"), txt.df)
coef.df=data.frame("HR"=data$`exp(coef)`, "lower .95"=data$`lower .95`, "upper .95"=data$`upper .95`)
coef.df=rbind(c(NA, NA, NA), coef.df)
xticks=seq(ifelse(min(data$`lower .95`)<0.5, 0, 0.5), min(c(5, max(data$`upper .95`))), by = 0.5)
attr(xticks, "labels") = xticks%in%seq(-4, 4, by=1)
forestplot(txt.df,coef.df, new_page = T, zero = c(0.98, 1.02),
clip =c(-1, 2), is.summary = c(T, rep(F, length(data$Name))),
xticks=xticks,
boxsize=BOX,
xlab="HR",
col=fpColors(box=colorv),
txt_gp = fpTxtGp(label = list(gpar(fontfamily = "Helvetica", cex=cex*1.25),
gpar(fontfamily = "Helvetica", col = "black", cex=cex)),
summary = gpar(fontfamily = "Helvetica", cex=cex*1.5),
ticks = gpar(fontfamily = "Helvetica", cex=cex),
xlab = gpar(fontfamily = "Helvetica", cex = cex*1.5)))
}
# survival analysis for all scores:
# many coefficients in survival analysis, using regularization and elastic net to select features for cox model.
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/Published_data_figures")
#**************************************** training data DLBCL ****************************************
load("Hemap_DLBCL_survival_data.Rdata")
# significant genes:
files=list.files(pattern = "tableS7")
# significant genes:
univariate.results=lapply(files[grepl("signif", files)], fread, data.table=F)
names(univariate.results)=files[grepl("signif", files)]
DLBCL=do.call(rbind, univariate.results[grep("DLBCL", names(univariate.results))])
DLBCL$Name=gsub("_RCHOP", "", DLBCL$Name)
DLBCL$Feature=gsub("-", ".", DLBCL$Feature)
# genes with FDR<0.2, beta to same direction and observed in 2 cohorts:
genes=table(DLBCL$Feature, DLBCL$`exp(coef)`>1)>1
genelist=rownames(genes)[rowSums(genes)==1]
# combine data:
time=TIME[logicalv[[5]]]
status=STATUS[logicalv[[5]]]
regression.data=cbind(gexp[logicalv[[5]], colnames(gexp)%in%genelist,], immunoscore[logicalv[[5]], colnames(immunoscore)%in%genelist,], samp[logicalv[[5]], colnames(samp)%in%genelist,])
all(genelist%in%colnames(regression.data))
# results_dlbcl=fun.cox.elasticnet(DATA_ORG = regression.data, time, status, summary.km = "3quantile", cores = 10, REPEATS = 100, percentage = 0, nfold = 10, min.elnet = 0, max.elnet = 0.1)
# save(results_dlbcl, file="Hemap_DLBCL_cox_datasets_filt_adj20_revision.Rdata")
load("Hemap_DLBCL_cox_datasets_filt_adj20_revision.Rdata")
# revision
# [1] 0.1
# [1] 0.0745611
# [1] 11.60345
annot=get(load("GSE98588_annot.Rdata"))
pdf("FigS7D_DLBCL_model_cox.pdf", height = 2.5, width = 2)
riskInd_Chapuy=validate.model("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients)
riskInd_Hemap=validate.model("Hemap_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, 5)
dev.off()
DLBCL_risk1=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$IPI%in%c(0, 1, 2), NAME = "DLBCL IPI 0-2")
DLBCL_risk2=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$IPI%in%c(3), NAME = "DLBCL IPI 3")
DLBCL_risk3=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$IPI%in%c(4,5), NAME = "DLBCL IPI 4-5")
DLBCL_risk4=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$COO_byGEP=="ABC", NAME = "DLBCL ABC")
DLBCL_risk5=validate.model.cox("GSE98588_DLBCL_survival_data.Rdata", results_dlbcl$coefficients, filt=annot$COO_byGEP=="GCB", NAME = "DLBCL GCB")
type.feat=unique(DLBCL[,c("Feature", "Type")])
type.feat=type.feat[order(type.feat[,2]),]
Plot.model(dataset = "GSE98588_DLBCL_survival_data.Rdata", coef = results_dlbcl$coefficients, type.feat = type.feat, ind = 1,NAME = "Fig7F_Chapuy_model")
# Plot.model(dataset = "Reddy_DLBCL_survival_data.Rdata", coef = results_dlbcl$coefficients, type.feat = type.feat, ind = 1,NAME = "Reddy_model")
#**************************************** training data MM ****************************************
load("Hemap_MM_survival_data.Rdata")
# significant genes:
files=list.files(pattern = "tableS7")
# significant genes:
univariate.results=lapply(files[grepl("signif", files)], fread, data.table=F)
names(univariate.results)=files[grepl("signif", files)]
MM=do.call(rbind, univariate.results[grep("MM", names(univariate.results))])
MM$Feature=gsub("-", ".", MM$Feature)
# genes with FDR<0.2, beta to same direction and observed in 2 cohorts:
# use GSE19784 Cancer_Myeloma as training and GSE16716,GSE24080 Cancer_Myeloma as test set
genes=table(MM$Feature, MM$`exp(coef)`>1)>1
genelist=rownames(genes)[rowSums(genes)==1]
genelist=genelist[!genelist%in%"WHSC1_FGFR3_Ig"]
# combine data:
ind=1
time=TIME[logicalv[[ind]]]
time[time==0]=0.1
status=STATUS[logicalv[[ind]]]
regression.data=cbind(gexp[logicalv[[ind]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[ind]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[ind]], colnames(samp)%in%genelist,drop=F])
all(genelist%in%colnames(regression.data))
# results_MM=fun.cox.elasticnet(DATA_ORG = regression.data, time, status, summary.km = "3quantile", cores = 10, REPEATS = 100, percentage = 0, nfold = 10)
# save(results_MM, file="Hemap_MM_cox_datasets_filt_adj20_revision.Rdata")
# [1] 0.05
# [1] 0.1089963
# [1] 14.10799
load("Hemap_MM_cox_datasets_filt_adj20_revision.Rdata")
pdf("FigS7C_MM_model_cox.pdf", height = 2.5, width = 2)
riskInd_CoMMpass=validate.model(dataset = "Hemap_MM_survival_data.Rdata", coef = results_MM$coefficients, ind=1, summary="75th_25th_percentile")
riskInd_CoMMpass=validate.model("CoMMpass_survival_data.Rdata", results_MM$coefficients, summary="75th_25th_percentile", months = T)
dev.off()
type.feat=unique(MM[,c("Feature", "Type")])
type.feat=type.feat[order(type.feat[,2]),]
Plot.model(dataset = "CoMMpass_survival_data.Rdata", coef = results_MM$coefficients, type.feat = type.feat, ind = 1,NAME = "Fig7E_CoMMpass_model")
load("CoMMpass_MM_subtypes.Rdata")
subtype=coordinates.subtype[match(colnames(riskInd_CoMMpass)[order(-riskInd_CoMMpass)],coordinates.subtype$ID),]
subtype$subtype[subtype$cluster=="CGA_Prolif"]="CGA_Prolif"
col=data.frame("subtype"=c("CCND1_Ig", "WHSC1_FGFR3_Ig", "Hyperdiploid_gain11q", "Hyperdiploid_gain1q", "MAF_Ig", "TRAF3_Aberrated", "CGA_Prolif"),
"color"=c("#e41a1b", "#357eb8", "#5eb45b", "#9b53a4", "#ff7d00", "#f8f875", "darkred"), stringsAsFactors = F)
load("CoMMpass_survival_data.Rdata")
MM_risk1=validate.model.cox("CoMMpass_survival_data.Rdata", results_MM$coefficients, filt=samp$ISS1==1, NAME = "CoMMpass ISS 1")
MM_risk2=validate.model.cox("CoMMpass_survival_data.Rdata", results_MM$coefficients, filt=samp$ISS2==1, NAME = "CoMMpass ISS 2")
MM_risk3=validate.model.cox("CoMMpass_survival_data.Rdata", results_MM$coefficients, filt=samp$ISS3==1, NAME = "CoMMpass ISS 3")
#**************************************** training data AML ****************************************
load("Hemap_AML_survival_data.Rdata")
# significant genes:
files=list.files(pattern = "tableS7")
# significant genes:
univariate.results=lapply(files[grepl("signif", files)], fread, data.table=F)
names(univariate.results)=files[grepl("signif", files)]
AML=do.call(rbind, univariate.results[grep("AML", names(univariate.results))])
AML$Feature=gsub("-", ".", AML$Feature)
# genes with FDR<0.2, beta to same direction and observed in 2 cohorts:
genes=table(AML$Feature, AML$`exp(coef)`>1)>1
genelist=rownames(genes)[rowSums(genes)==1]
genelist=gsub("-", ".", genelist)
# combine data:
time=TIME[logicalv[[1]]]
time[time==0]=0.1
status=STATUS[logicalv[[1]]]
regression.data=cbind(gexp[logicalv[[1]], colnames(gexp)%in%genelist,drop=F], immunoscore[logicalv[[1]], colnames(immunoscore)%in%genelist,drop=F], samp[logicalv[[1]], colnames(samp)%in%genelist,drop=F])
all(genelist%in%colnames(regression.data))
# results_AML=fun.cox.elasticnet(DATA_ORG = regression.data, time, status, summary.km = "3quantile", cores = 10, REPEATS = 100, percentage = 0, nfold = 5)
# model revision
# [1] 0.06
# [1] 0.1982066
# [1] 11.88285
# save(results_AML, file="Hemap_AML_cox_datasets_filt_adj20_revision.Rdata")
load("Hemap_AML_cox_datasets_filt_adj20_revision.Rdata")
exp(results_AML$coefficients)
pdf("FigS7E_Risk_AML_validation.pdf", height = 2.5, width = 2)
riskInd_Hemap_AML=validate.model("Hemap_AML_survival_data.Rdata", results_AML$coefficients, summary = "75th_25th_percentile")
riskInd_BeatAML=validate.model("BeatAML_survival_data.Rdata", results_AML$coefficients, months=T, summary = "75th_25th_percentile")
riskInd_TCGA_AML=validate.model("TCGA_AML_survival_data.Rdata", results_AML$coefficients, summary = "75th_25th_percentile")
dev.off()
type.feat=unique(AML[,c("Feature", "Type")])
type.feat=type.feat[order(type.feat[,2]),]
# Plot.model(dataset = "Hemap_AML_survival_data.Rdata", coef = results_AML$coefficients, type.feat = type.feat, ind = 1,NAME = "Hemap_AML_model")
Plot.model(dataset = "BeatAML_survival_data.Rdata", coef = results_AML$coefficients, type.feat = type.feat, ind = 1,NAME = "FigS7G_BeatAML_model")
# Plot.model(dataset = "TCGA_AML_survival_data.Rdata", coef = results_AML$coefficients, type.feat = type.feat, ind = 1,NAME = "TCGA_AML_model")
load("BeatAML_survival_data.Rdata")
annot=get(load("BeatAML_fm_annot.Rdata"))
AML_risk1=validate.model.cox("BeatAML_survival_data.Rdata", results_AML$coefficients, filt=grepl("Adverse", annot$ELN2017), NAME = "BeatAML Adverse")
AML_risk2=validate.model.cox("BeatAML_survival_data.Rdata", results_AML$coefficients, filt=annot$ELN2017=="Intermediate", NAME = "BeatAML Intermediate")
AML_risk3=validate.model.cox("BeatAML_survival_data.Rdata", results_AML$coefficients, filt=grepl("Favorable", annot$ELN2017), NAME = "BeatAML Favorable")
dat=rbind(DLBCL_risk1, DLBCL_risk2, DLBCL_risk3, DLBCL_risk4, DLBCL_risk5, MM_risk1, MM_risk2, MM_risk3, AML_risk1, AML_risk2, AML_risk3)
pdf("FigS7F_Subset_data_model.pdf", width = 4, height = 3)
fun_forestplot(dat, "AML, MM, DLBCL", BOX=0.5, cex=0.5, colorv ="black")
dev.off()