--- a
+++ b/Fig2_microenvironment_analysis.R
@@ -0,0 +1,356 @@
+GIT_HOME="/research/users/ppolonen/git_home/ImmunogenomicLandscape-BloodCancers/"
+source(file.path(GIT_HOME, "common_scripts/featurematrix/functions_generate_fm.R"))
+source(file.path(GIT_HOME, "common_scripts/visualisation/plotting_functions.R"))
+source(file.path(GIT_HOME, "common_scripts/statistics/functions_statistics.R"))
+source(file.path(GIT_HOME, "common_scripts/statistics/statistics_wrappers.R"))
+library(parallel)
+library(ggplot2)
+library(reshape2)
+library(RColorBrewer)
+library(ggrepel)
+library(gridExtra)
+library(survcomp)
+
+setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/Published_data_figures")
+
+# annotations
+annot=get(load("Hemap_immunology_Annotations.Rdata"))
+
+# data
+load("data9544_with_gene_symbols.RData")
+data = t(data[rownames(data)%in%annot$GSM.identifier..sample.,])
+profile=data.matrix(get(load("mixtureM_profile.Rdata")))
+profile[profile==-1] = 0
+profile=profile[,colnames(profile)%in%colnames(data)]
+
+# geometric mean:
+gm=annot$CytolyticScore
+
+# logical vectors for all diseases:
+subclass2=get.logical(annovector = list(annot$subclasses), filterv = annot$CELLS_SORTED==0&annot$Sample.type%in%c("Prolif", "Cancer"))
+tbly2=get.logical(annovector = list(annot$tbLY), filterv = annot$CELLS_SORTED==0&annot$Sample.type%in%c("Prolif", "Cancer"))
+tbly2=tbly2[!names(tbly2)%in%"Lymphoma_BCL"]
+
+list_cancers=lapply(c(subclass2, tbly2), list)
+list_cancers=unlist(list_cancers, recursive=F)
+
+l=c("BCL_DLBCL", "Lymphoma_BCL_CHL", "Lymphoma_BCL_MCL", "Lymphoma_BCL_FL","CLL", "CML", "AML","Prolif_Myeloproliferative_MDS", "pre-B-ALL",  "T-ALL")
+logicalvectors=list_cancers[match(l, names(list_cancers))]
+names(logicalvectors)=gsub("^BCL_|Lymphoma_BCL_|Prolif_Myeloproliferative_", "", names(logicalvectors))
+
+#********************************** compute summary statistics: **********************************
+
+# Then compute cor, cor.test and TNK FC for same cancers:
+res=do.call(cbind, mclapply(seq(logicalvectors), function(i){
+  logv=logicalvectors[[i]]
+  NAME=names(logicalvectors)[i]
+  cors=cor(t(data[,logv]), as.numeric(gm)[logv], method = "spearman", use = "complete.obs")
+}, mc.cores=10))
+
+NK_log=grepl("CD8|NaturalKiller", annot[,5])
+
+fc_res=do.call(cbind, mclapply(seq(logicalvectors), function(i){
+  logv=logicalvectors[[i]]
+  NAME=names(logicalvectors)[i]
+
+  FC=apply(data[,], 1, function(v){
+    mean(v[NK_log])-mean(v[logv])
+  })
+
+}, mc.cores=10))
+
+res_pval=do.call(cbind, mclapply(seq(logicalvectors), function(i){
+  logv=logicalvectors[[i]]
+  NAME=names(logicalvectors)[i]
+  pvals=-log10(apply(data[,logv,drop=F], 1, cor.test.p, as.numeric(gm)[logv]))
+  pvals[!is.finite(pvals)]=224
+  return(pvals)
+}, mc.cores=10))
+
+# these matrices can be used for plotting:
+colnames(res)=names(logicalvectors)
+colnames(fc_res)=names(logicalvectors)
+colnames(res_pval)=names(logicalvectors)
+
+# sorted samples, see if genes are expressed in cancer
+subclass2=get.logical(annovector = list(annot$subclasses), filterv = annot$CELLS_SORTED==1&annot$Sample.type%in%c("Prolif", "Cancer"))
+
+# diseases used minus FL with 7 samples
+l=c("AML", "BCL", "CLL", "CML", "pre-B-ALL", "Prolif_Myeloproliferative_MDS", "T-ALL")
+logicalvectors.sorted=subclass2[names(subclass2)%in%l]
+names(logicalvectors.sorted)=gsub("^BCL_|Lymphoma_BCL_|Prolif_Myeloproliferative_", "", names(logicalvectors.sorted))
+
+pval.sorted=fisher.matrix.lv(logicalVector.list = logicalvectors.sorted, data = profile, to.log10 = T, adjust.method = "BH", fisher.alternative = "greater")
+pval.sorted=pval.sorted[match(rownames(res), rownames(pval.sorted)),]
+
+# All tests
+save(list = c("pval.sorted", "fc_res", "res", "res_pval", "pval.sorted"), file = "Hemap_microenvironment_summary_statistics.Rdata")
+
+#*****************************************************************************************************
+
+# load from above R object:
+load("Hemap_microenvironment_summary_statistics.Rdata")
+
+res_all=do.call(rbind, mclapply(seq(logicalvectors), function(i){
+  logv=logicalvectors[[i]]
+  NAME=names(logicalvectors)[i]
+  
+  # correlate genes with cytolytic activity
+  pvals=apply(data[,logv,drop=F], 1, cor.test.p, as.numeric(gm)[logv])
+  FC=fc_res[match(names(pvals), rownames(fc_res)),NAME]
+  cors=res[match(names(pvals), rownames(res)),NAME]
+  
+  mean.exp=apply(data[,logv,drop=F], 1, median)
+  
+  data_cor=data.frame("gene"=names(cors), "Rho"=as.numeric(cors), "pval"=pvals, "adj.pval"=p.adjust(pvals), "FCtoTNK"=as.numeric(FC), "disease"=NAME, "mean.expression"=mean.exp, stringsAsFactors = F)
+  data_cor=data_cor[order(data_cor[,2], decreasing = T),]
+  
+  if(!dim(data_cor)[1])return(NULL)
+  
+  # significant
+  data_cor$significant=data_cor$adj.pval<0.01&abs(data_cor$FCtoTNK)>2&abs(data_cor$Rho)>0.2
+  
+  # add category:
+  data_cor$category=""
+  data_cor$category[data_cor$FCtoTNK>0&data_cor$Rho>0&data_cor$significant]="CTL/NK gene"
+  data_cor$category[data_cor$FCtoTNK<0&data_cor$Rho>0&data_cor$significant]="Stromal/cancer gene (Rho > 0)"
+  data_cor$category[data_cor$FCtoTNK<0&data_cor$Rho<0&data_cor$significant]="Stromal/cancer gene (Rho < 0)"
+  
+  return(data_cor)
+}, mc.cores=10))
+
+res_all.filt=res_all[res_all$significant,]
+
+res_all.filt=res_all.filt[order(res_all.filt$category, abs(res_all.filt$Rho), abs(res_all.filt$FCtoTNK), -res_all.filt$adj.pval, decreasing = T),]
+
+write.table(res_all[res_all$gene%in%res_all.filt$gene,], "Hemap_cytolytic_correlated_genes_TableS2.txt", col.names=T,row.names=F, quote = F, sep="\t")
+
+for(i in 2:5)res_all.filt[,i]=prettyNum(signif(res_all.filt[,i], 2))
+
+write.table(res_all.filt, "TableS2.txt", col.names=T,row.names=F, quote = F, sep="\t")
+save(res_all.filt, file="Hemap_cytolytic_correlated_genes_TableS2_onlysignif.Rdata")
+
+#***************************************************************************************************************************
+fun.plot=function(dat, main, xlab, ylab, aval=2, bval=0.2,cval=0.01,genelist=NULL, MAX=25, SIZE=3, T.SIZE=8, height=7.5, width=6){
+
+  if(!is.null(genelist)){
+    dat$significant=dat$gene%in%genelist
+  }else{
+    dat$significant=dat$adj.pval<cval&abs(dat$Rho)>bval&abs(dat$FCtoTNK)>aval
+  }
+  
+  groups=unique(dat$category)
+  dat$plot=F
+  
+  for(g in groups){
+    a2=dat$FCtoTNK[dat$category%in%g]
+    b2=dat$Rho[dat$category%in%g]
+    gene=dat$gene[dat$category%in%g]
+    test=tail(gene[order(abs(a2^2*b2^2))], MAX)
+    dat$plot[dat$gene%in%test&dat$category%in%g&dat$significant]=T
+  }
+  
+  dat=dat[order(dat$plot),]
+  
+  dat$size.point=ifelse(dat$plot, 2.4, 0.6)
+  
+  myColors <- data.frame(c("grey85","#E41A1C", "grey85","#377EB8"), c("", "CTL/NK gene", "Stromal/cancer gene (Rho < 0)", "Stromal/cancer gene (Rho > 0)"), stringsAsFactors = F)
+  myColors=myColors[match(unique(dat$category), myColors[,2]),1]
+  names(myColors) <- unique(dat$category)
+  
+  colScale <- scale_colour_manual(name = "category", values = myColors)
+  
+  library(ggrastr)
+  
+  p=ggplot(dat, aes(FCtoTNK, Rho, colour = category)) +
+    geom_point_rast(aes(size = dat$size.point), raster.width = width, raster.height = height) +
+    scale_size_continuous(range = c(0.6, 2.5))+
+    colScale +
+    theme_classic(base_size = T.SIZE) +
+    ggtitle(main) +
+    labs(x = xlab, y = ylab, fill = "") +
+    scale_y_continuous(breaks = pretty(dat$Rho, n = 6))+
+    
+    geom_text_repel(
+      data = subset(dat, dat$plot),
+      aes(label = subset(dat, dat$plot)$gene),
+      size = SIZE,
+      color="black",
+      force=1.5,
+      segment.size = 0.1,
+      box.padding = unit(0.01, "lines"),
+      point.padding = unit(0.01, "lines"),
+      family="Helvetica"
+    )+
+    theme(legend.position = "none", legend.direction = "horizontal",
+          axis.line = element_line(size=0.5, colour = "black"),
+          panel.grid.major = element_line(colour = "#d3d3d3"),
+          panel.grid.minor = element_blank(),
+          panel.border = element_blank(), panel.background = element_blank(),
+          plot.title = element_text(size = T.SIZE, face = "bold", family="Helvetica"),
+          text=element_text()
+    )+
+    theme(axis.text.x = element_text(colour="grey20",size=T.SIZE,face="plain", family="Helvetica"),
+          axis.text.y = element_text(colour="grey20",size=T.SIZE,face="plain", family="Helvetica"),  
+          axis.title.x = element_text(colour="grey20",size=T.SIZE,face="plain", family="Helvetica"),
+          axis.title.y = element_text(colour="grey20",size=T.SIZE,face="plain", family="Helvetica"))  
+  return(p)
+}
+
+
+# Figure2A:
+res_comb=d13=rowMeans(res[,c("DLBCL", "MCL", "FL", "CHL")])
+fc_res_comb=rowMeans(fc_res[,c("DLBCL", "MCL", "FL", "CHL")])
+
+pval_comb=apply(10^-res_pval[,c("DLBCL", "MCL", "FL", "CHL")], 1, combine.test, method = "z.transform")
+
+data_comb=data.frame("gene"=names(res_comb), "Rho"=as.numeric(res_comb), "adj.pval"=pval_comb, "FCtoTNK"=as.numeric(fc_res_comb), "disease"="BCL", stringsAsFactors = F)
+
+# significant
+data_comb$significant=data_comb$adj.pval<0.01&abs(data_comb$FCtoTNK)>1.5&abs(data_comb$Rho)>0.2
+
+# add category:
+data_comb$category=""
+data_comb$category[data_comb$FCtoTNK>0&data_comb$Rho>0&data_comb$significant]="CTL/NK gene"
+data_comb$category[data_comb$FCtoTNK<0&data_comb$Rho>0&data_comb$significant]="Stromal/cancer gene (Rho > 0)"
+data_comb$category[data_comb$FCtoTNK<0&data_comb$Rho<0&data_comb$significant]="Stromal/cancer gene (Rho < 0)"
+
+# these were picked from above analysis, as they were linked to immunological regulation and myeloid infiltrate
+monoc=c(c("CD14", "LYZ", "S100A8", "CD68", "CD163","MS4A4A", "MS4A6A","TLR8","LILRB2","IDO1","IFI27", "CXCL9", "CXCL10", "CXCL11", "CCL8", "CCL18", "CCL19", "TNFSF13B","CXCL13", "C1QA", "C1QB", "C1QC", "HBB"), c("IFNG", "CCL5","TNFSF10", "CD226", "CD160","KLRF1","KLRG1", "KLRC3", "EOMES", "GZMA", "PRF1", "CD8A"))
+
+lymphomagenes=unique(res_all.filt$gene[res_all.filt$disease%in%c("DLBCL", "MCL", "FL", "CHL")])
+
+NAME="Figure2A.pdf"
+dat=fun.plot(dat = data_comb,
+             main="B cell lymphomas",
+             genelist=unique(monoc[monoc%in%lymphomagenes]),
+             # genelist2=unique(lymphomagenes),
+             aval=1.5, 
+             bval=0.35,
+             cval = 0.01,
+             MAX = 60,
+             SIZE=2.5,
+             T.SIZE = 10,
+             ylab="Average correlation with cytolytic score",
+             xlab= paste0("Average expression fold change\n(CTL/NK cells vs. ", "BCL", ")"))
+
+ggsave(filename = NAME, plot = dat, width = 85, height = 120, units = "mm", dpi=150)
+
+
+# FigureS2:
+NAME="FigureS2C.pdf"
+p.all=lapply(seq(logicalvectors)[-4], function(i){
+  fun.plot(dat = res_all[res_all$disease%in%names(logicalvectors)[i],],
+           main=names(logicalvectors)[i],
+           aval=2, 
+           bval=0.2,
+           cval = 0.01,
+           MAX = 20,
+           SIZE=1.75,
+           T.SIZE = 8,
+           ylab="correlation with cytolytic activity",
+           xlab= paste0("expression fold change\n(CTL/NK cells vs. ", names(logicalvectors)[i], ")"))
+})
+ggsave(NAME, ggpubr::ggarrange(plotlist = p.all, ncol = 3, nrow = 3),  width = 152, height = 200, units = "mm", dpi=300)
+
+# plot dotplot:
+data.plot=data.frame(res_all$gene, res_all$disease, res_all$Rho, abs(res_all$FCtoTNK), "category"=res_all$category)
+data.plot=data.plot[data.plot$res_all.gene%in%res_all.filt$gene,]
+colnames(data.plot)=c("features", "id", "variable.1", "variable.2", "category")
+data.plot$id=factor(data.plot$id,  levels=c("DLBCL","CHL", "FL", "MCL", "CLL", "CML", "AML", "MDS", "pre-B-ALL", "T-ALL"))
+
+#************************************ Figure 2B: ***********************************
+
+TNK.genes=rownames(pval.sorted)[rowSums(pval.sorted>2)==0&rownames(pval.sorted)%in%res_all.filt$gene[res_all.filt$category=="CTL/NK gene"]]
+
+monoc=c(c("IFNG", "CCL5","TNFSF10", "CD226", "CD160","KLRF1","KLRG1", "KLRC3", "EOMES"), c("CD14", "LYZ", "S100A8", "CD68", "CD163","MS4A4A", "MS4A6A","TLR8","LILRB2","IDO1","IFI27", "CXCL9", "CXCL10", "CXCL11", "CCL8", "CCL18", "CCL19", "TNFSF13B","CXCL13", "C1QA", "C1QB", "C1QC", "HBB"))
+
+d=data.plot[data.plot$features%in%monoc,]
+d=d[order(match(d$features, monoc)),]
+
+d$features=factor(as.character(d$features), levels = monoc)
+
+# pdf("Hemap_microenvironment_Figure2_stroma.pdf", height = 4, width = 3.25)
+# plot.DotPlot.df(data.plot = d, name.variable.1 = "Rho", name.variable.2 = "FC", cols = c("blue", "white", "red"), dot.scale = 3, col.min = -0.5, col.max = 0.75)
+# dev.off()
+# 
+# pdf("Hemap_microenvironment_TNK_genes.pdf", height = 7, width = 3.25)
+# plot.DotPlot.df(data.plot = data.plot[data.plot$features%in%TNK.genes,], name.variable.1 = "Rho", name.variable.2 = "FC", cols = c("blue", "white", "red"), dot.scale = 3, col.min = -0.5, col.max = 0.75)
+# dev.off()
+
+# plot dotplot:
+data.plot=data.frame(res_all$gene, res_all$disease, res_all$Rho, res_all$mean.expression, "category"=res_all$category)
+data.plot=data.plot[data.plot$res_all.gene%in%res_all.filt$gene,]
+colnames(data.plot)=c("features", "id", "variable.1", "variable.2", "category")
+data.plot$id=factor(data.plot$id,  levels=c("DLBCL","CHL", "FL", "MCL", "CLL", "CML", "AML", "MDS", "pre-B-ALL", "T-ALL"))
+
+d=data.plot[data.plot$features%in%monoc,]
+d=d[order(match(d$features, monoc)),]
+d$features=factor(as.character(d$features), levels = monoc)
+
+
+pdf("Figure2B.pdf", height = 4, width = 4.25)
+plot.DotPlot.df(data.plot = d, name.variable.1 = "Rho", name.variable.2 = "Median expression", cols = c("blue", "white", "red"), dot.scale = 3, col.min = -0.5, col.max = 0.75)
+dev.off()
+
+
+#*********************** Plot genes in normal cells ************************
+normals2=get.logical(annovector = list(annot$immunoNormals),filterv = !annot$immunoNormals=="")
+normals2=normals2[!names(normals2)%in%c("TcellActivatedMononuclear", "LangerhansCell", "Gamma-delta-Tcell", "AlveolarMacrophage")]
+
+normals=get.logical(annovector = list(annot$plotNormals),filterv = !annot$plotNormals=="")
+
+normals=append(normals, get.logical(annovector = list(annot$immunoNormals),filterv = grepl("Macrophage", annot$immunoNormals)))
+normals=normals[!names(normals)%in%c("PBMC", "Langerhans Cell", "Langerhans cell",  "Gamma-delta-Tcell", "Macrophage", "AlveolarMacrophage")]
+
+normals=append(normals, normals2[names(normals2)%in%"CD4+Tcell"])
+
+median_res=do.call(cbind, mclapply(seq(normals), function(i){
+  logv=normals[[i]]
+  NAME=names(normals)[i]
+  
+  FC=apply(data[,], 1, function(v){
+    median(v[logv])
+  })
+  
+}, mc.cores=10))
+
+colnames(median_res)=names(normals)
+
+subclass2=get.logical(annovector = list(annot$subclasses), filterv = annot$CELLS_SORTED==1&annot$Sample.type%in%c("Prolif", "Cancer"))
+l=c("AML", "BCL", "CLL", "CML", "pre-B-ALL", "Prolif_Myeloproliferative_MDS", "T-ALL")
+logicalvectors.sorted=subclass2[names(subclass2)%in%l]
+names(logicalvectors.sorted)=gsub("^BCL_|Lymphoma_BCL_|Prolif_Myeloproliferative_", "", names(logicalvectors.sorted))
+
+median_res_sorted=do.call(cbind, mclapply(seq(logicalvectors.sorted), function(i){
+  logv=logicalvectors.sorted[[i]]
+  NAME=names(logicalvectors.sorted)[i]
+  
+  FC=apply(data[,], 1, function(v){
+    median(v[logv])
+  })
+  
+}, mc.cores=10))
+
+colnames(median_res_sorted)=names(logicalvectors.sorted)
+
+# picked, plot
+d=t(scale(t(median_res[match(monoc, rownames(median_res)),])))
+d[d>2]=2
+d[d<(-2)]=2
+
+pdf("Figre2B_NormalCells.pdf", width = 4, height = 7)
+Heatmap(d, cluster_rows = F)
+Heatmap(median_res[match(monoc, rownames(median_res)),], cluster_rows = F)
+Heatmap(median_res_sorted[match(monoc, rownames(median_res_sorted)),], cluster_rows = F)
+dev.off()
+
+d=t(scale(t(median_res[match(TNK.genes, rownames(median_res_sorted)),])))
+d[d>2]=2
+d[d<(-2)]=2
+
+# pdf("Fig2_normal_cells_allTNK_genes.pdf", width = 4, height = 15)
+# Heatmap(d, cluster_rows = F)
+# Heatmap(median_res[match(TNK.genes, rownames(median_res)),], cluster_rows = F)
+# Heatmap(median_res_sorted[match(TNK.genes, rownames(median_res_sorted)),], cluster_rows = F)
+# dev.off()