[8e0848]: / FigS2AB_microenvironment_analysis_GSEA_GSVA.R

Download this file

133 lines (97 with data), 6.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
GIT_HOME="/research/users/ppolonen/git_home/ImmunogenomicLandscape-BloodCancers/"
source(file.path(GIT_HOME, "common_scripts/featurematrix/functions_generate_fm.R"))
source(file.path(GIT_HOME, "common_scripts/visualisation/plotting_functions.R"))
source(file.path(GIT_HOME, "common_scripts/pathway_analysis/functions.GSEA.R"))
library(GSVA)
library(parallel)
# FigureS2 A-B related analysis
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/Published_data_figures")
# annotations
annot=get(load("Hemap_immunology_Annotations.Rdata"))
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/data9544_with_gene_symbols.RData")
data = t(data[rownames(data)%in%annot$GSM.identifier..sample.,])
subclass2=get.logical(annovector = list(annot$subclasses), filterv = annot$CELLS_SORTED==0&annot$Sample.type%in%c("Prolif", "Cancer"))
tbly2=get.logical(annovector = list(annot$tbLY), filterv = annot$CELLS_SORTED==0&annot$Sample.type%in%c("Prolif", "Cancer"))
tbly2=tbly2[!names(tbly2)%in%"Lymphoma_BCL"]
list_cancers=lapply(c(subclass2, tbly2), list)
list_cancers=unlist(list_cancers, recursive=F)
l=c("AML", "BCL_DLBCL", "CLL", "CML", "Lymphoma_BCL_CHL", "Lymphoma_BCL_MCL", "Lymphoma_BCL_FL", "pre-B-ALL", "Prolif_Myeloproliferative_MDS", "T-ALL")
logicalvectors=list_cancers[names(list_cancers)%in%l]
#*********************** run GSEA using cytolyticScore as ranking metric on each of the disease: ***********************
GENESETS=file.path(getwd(), "HALLMARKS.gmt")
WD=file.path(getwd(), "GSEA")
OUTDIR=file.path(getwd(), "GSEA")
names(logicalvectors)=gsub("-", "_", names(logicalvectors))
res=parallel::mclapply(seq(logicalvectors), function(i){
filt=logicalvectors[[i]]
command=run.GSEA(data=data[,filt], cls.vector = as.numeric(annot$CytolyticScore[filt]), datatype = "N", GENESETS = GENESETS, dataname = paste0(names(logicalvectors)[i]), clsname = "CytolyticScore", WD=WD, OUTDIR=OUTDIR)
try(system(command))
}, mc.cores=5)
#***********************************************************************************************************************
#************** run GSEA using cytolyticScore as ranking metric on TNK cells vs other normal ******************
cls=annot$immunoNormals%in%c("CD8+Tcell", "NKCell")
filt=!annot$immunoNormals%in%c("")
command=run.GSEA(data=data[,filt], cls.vector = as.numeric(cls[filt]), datatype = "B", GENESETS = GENESETS, dataname = "CD8_NK_pathways", clsname = "CD8_NK", WD=WD, OUTDIR=OUTDIR)
try(system(command))
#***********************************************************************************************************************
# combine results to same matrix:
datp=list.files(path = OUTDIR, pattern = "gsea_report_for_1_*.*.xls|gsea_report_for_feat_pos.*..xls", recursive = T, full.names = T)
datn=list.files(path = OUTDIR, pattern = "gsea_report_for_0_*.*.xls|gsea_report_for_feat_neg.*..xls", recursive = T, full.names = T)
# filter to contain only significant
pwnames=scan(pipe("cut -f1 HALLMARKS.gmt"), "genesets")
FUN=function(f){
dat=try(read.delim(f, stringsAsFactors=F, header=T), silent = T)
dat$FDR.q.val[dat$FDR.q.val==0]=0.0001
# variable.1, variable.2, features (y name), id (x name)
d=data.frame("features"=gsub("-MSIGDB_HALLMARKS", "", dat$NAME), "variable.1"=dat$NES, "variable.2"=-log10(as.numeric(dat$FDR.q.val)), "id"=gsub("_CytolyticScore_.*.|_pathways_.*.|Prolif_Myeloproliferative_|Lymphoma_BCL_|^/BCL_|/","", gsub(eval(OUTDIR), "", f)))
}
pwp=mclapply(datp, FUN, mc.cores=10)
pwp2=mclapply(datn, FUN, mc.cores=10)
dat=do.call(rbind, pwp)
dat2=do.call(rbind, pwp2)
dat_comb=rbind(dat, dat2)
dat_comb$id=factor(gsub("_", "-", dat_comb$id), levels=c("DLBCL","CHL", "FL", "MCL", "CLL", "CML", "AML", "MDS", "pre-B-ALL", "T-ALL", "CD8-NK"))
pdf("FigureS2A.pdf", width = 5.5, height = 6.5)
plot.DotPlot.df(data.plot = dat_comb, cols = c("blue", "white", "red"), fontsize = 8, name.variable.1 = "NES", name.variable.2 = "-log10 FDR", dot.scale = 3)
dev.off()
# plot correlation plot between protein and Pathway
fm=get(load("TCGA_DLBCL_FM_DUFVA.Rdata"))
# run GSVA for certain pathways and add to matrix:
gexp=data.matrix(fm[grepl("N:GEXP", rownames(fm)),])
rownames(gexp)=gsub(":.*.", "", gsub("N:GEXP:", "",rownames(gexp)))
# gene sets:
GENESETS="Combined_pathway_signatures_2017_filtered_robust.gmt"
# get GSVA visualization for the pathways
# Geneset list
Onc.pathways=read.delim(GENESETS, stringsAsFactors = FALSE, header=F, col.names = paste("V",1:max(count.fields(GENESETS, sep = '\t'), na.rm = T)), fill = TRUE)
# Make list
listA=mclapply(1:length(Onc.pathways[,1]), function(i){A=as.character(Onc.pathways[i,3:length(Onc.pathways),])
B=A[!A==""&!A=="NA"]}, mc.cores=6)
names(listA) <- toupper(Onc.pathways[,1])
viz_scores=gsva(expr = gexp, gset.idx.list = listA, parallel.sz=8, tau=0.25)
rownames(viz_scores)=paste0("N:GSVA:", rownames(viz_scores))
fm.add=rbind(fm, viz_scores)
fm.add=fm.add[!(!grepl("^B:GNAB:*.*chr*.*y_n_somatic", rownames(fm.add))&grepl("^B:GNAB:", rownames(fm.add))),!is.na(fm.add["N:RPPA:PARP1:chr1:226548392:226595780:-:PARP_cleaved",])]
annotdf=data.frame("CytolyticScore"=as.numeric(fm["N:SAMP:CytolyticScore",]), "CASP7"=as.numeric(fm["N:RPPA:CASP7:chr10:115438942:115490662:+:Caspase-7_cleavedD198",]), "IRF1"=as.numeric(fm["N:RPPA:IRF1:chr5:131817301:131826490:-:IRF-1",]))
rownames(annotdf)=colnames(fm)
annotdf=annotdf[!is.na(annotdf$CASP7),]
pdf("FigureS2B.pdf", height = 2.5, width = 2.5)
ggscatter(annotdf, x = "CytolyticScore", y = "CASP7",
size = 1.5,
add = "reg.line", # Add regression line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE # Add confidence interval
) +
stat_cor(method = "pearson", label.sep = "\n") +
xlab("Cytolytic Score") +
ylab("CASP7")
ggscatter(annotdf, x = "CytolyticScore", y = "IRF1",
size = 1.5,
add = "reg.line", # Add regression line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE # Add confidence interval
) +
stat_cor(method = "pearson", label.sep = "\n") +
xlab("Cytolytic Score") +
ylab("IRF1")
dev.off()