--- a
+++ b/FigS4EF_CIITA_methylation_validation_ERRBS.R
@@ -0,0 +1,240 @@
+library(methylSig)
+library(ggplot2)
+library(ComplexHeatmap)
+library(circlize)
+
+# Instructions to use the package (v 0.1):
+# http://sartorlab.ccmb.med.umich.edu/sites/default/files/methylSig.pdf
+# https://qcb.ucla.edu/wp-content/uploads/sites/14/2017/02/Workshop-6-WGBS-D1.pdf # check also 2 and 3
+# note, v 0.1 of the package is not available in https://github.com/sartorlab/methylSig. Current version of the package is very different.
+
+setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/Published_data_figures")
+
+# annotations:
+annot=read.delim("GSE86952_gsm2patient.txt", header = T, stringsAsFactors = F)
+annot$X.Sample_title=gsub("AML blast | AML blast ", "", annot$X.Sample_title)
+
+# use same samples as cancermap
+coords=read.delim("cancermap_GSE86952_AML_15pct_genes_BH-SNE_mean-shift_BW1.5.txt", header=T, stringsAsFactors = F)
+coords=coords[match(annot$X.Sample_title, as.character(coords$ID)),]
+annot=annot[!is.na(coords[,1]),]
+
+meth=get(load("meth_hlalow_pml.Rdata"))
+myDiffSigboth=get(load("myDiffSigboth_hlalow_pml.Rdata"))
+
+#******************************* visualize location **********************************
+
+# wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz
+# gunzip *.gz
+refGeneInfo = getRefgeneInfo(system.file("annotation", "refGene.txt",
+                                         package = "methylSig"))
+
+
+refGeneAnn = refGeneAnnotation(refGeneInfo, myDiffSigboth)
+
+cpgInfo = getCpGInfo("cpgIslandExt.txt")
+
+# DE methylated cites:
+myDiffDMCs = myDiffSigboth[myDiffSigboth[,"qvalue"] < 0.05
+                           & abs(myDiffSigboth[,"meth.diff"]) > 25,]
+
+# Annotate
+cpgAnn = cpgAnnotation(cpgInfo,myDiffSigboth)
+
+# annotate refgene
+refGeneInfo = getRefgeneInfo("refGene.txt")
+
+refGeneAnn = refGeneAnnotation(refGeneInfo, myDiffSigboth)
+
+# what kin are DMCs
+refGeneAnnotationPlot(refGeneAnn,main="ALL",
+                      priority=c("promoter","cds", "noncoding", "5'utr", "3'utr"))
+
+refGeneAnnotationPlot(refGeneAnnDmc, main="DMC",
+                      priority=c("promoter","cds", "noncoding", "5'utr", "3'utr"))
+
+# TF cites:
+tfbsInfo = getTFBSInfo("ENCODE_AwgTfbs.hg19.txt")
+
+DMCIndex = (myDiffSigboth[,"qvalue"] < 0.05
+            & abs(myDiffSigboth[,"meth.diff"]) > 25)
+
+pvalue = methylSig.tfbsEnrichTest(myDiffSigboth, DMCIndex, tfbsInfo, plot = F)
+
+highest=sort(pvalue)
+plot_h=data.frame("n"=names(highest)[highest<1E-5], "v"=-1*log10(highest)[highest<1E-5]) # wrote this out
+
+write.table(rownames(plot_h), "ENCODE_AwgTfbs.hg19_inDMC.txt", row.names = F, col.names = F, quote = F, sep="\t")
+
+system("grep -f Binomial_TFs_enriched_DMC.txt ENCODE_AwgTfbs.hg19.txt >ENCODE_AwgTfbs.hg19_inDMC.txt")
+
+tfbsInfo2 = getTFBSInfo("ENCODE_AwgTfbs.hg19_inDMC.txt")
+
+# CIITA and enhancers: chr16:10960573-11019050
+# CIITA only chr16:10970055-11018840
+
+pdf("FigureSE_CIITA_methylation_RRBS.pdf", width = 8, height = 12)
+
+p <-ggplot(plot_h, aes(n, v))
+p=p +geom_bar(stat = "identity", aes(fill = n), position = "dodge") +
+  xlab("TFbinding_site") + ylab("-log10 P-value") +
+  ggtitle("Binomial test, DMC vs TFbinding ENCODE") +
+  theme_bw() +
+  theme(axis.text.x=element_text(angle = 45, hjust = 1))
+p
+
+methylSigPlot(meth, "chr16", c(10960573, 11019050), groups=c(1,0),
+              cpgInfo=cpgInfo, refGeneInfo=refGeneInfo, myDiff=myDiffSigboth,
+              tfbsInfo=tfbsInfo, tfbsDense=T, sigQ=0.01, noGroupEst = F, noDataLine=T, cex=0.1)
+
+
+methylSigPlot(meth, "chr16", c(10960055, 10990055), groups=c(1,0),
+              cpgInfo=cpgInfo, refGeneInfo=refGeneInfo, myDiff=myDiffSigboth,
+              tfbsInfo=tfbsInfo, tfbsDense=T, sigQ=0.01, noGroupEst = F, noDataLine=T, cex=0.1)
+
+# only significant binding cites
+methylSigPlot(meth, "chr16", c(10960055, 10990055), groups=c(1,0),
+              cpgInfo=cpgInfo, refGeneInfo=refGeneInfo, myDiff=myDiffSigboth,
+              tfbsInfo=tfbsInfo2, tfbsDense=F, sigQ=0.01, noGroupEst = F, noDataLine=T, cex=0.1)
+
+dev.off()
+
+coord=data.frame(refGeneInfo[[2]], stringsAsFactors = F)
+
+coord=coord[grepl("^HLA", coord$gene)&!grepl("_", coord$chr)&!grepl("HLA-F", coord$gene),]
+coord$chr=as.character(coord$chr)
+
+for(i in 1:dim(coord)[1]){
+  pdf(paste0(getwd(), "/", coord$gene[i], "_methylation_RRBS.pdf"), width = 8, height = 12)
+  
+  
+  methylSigPlot(meth, chr = coord[i,1], loc.range = c(coord[i,3]-2000, coord[i,4]+2000), groups=c(1,0),
+                cpgInfo=cpgInfo, refGeneInfo=refGeneInfo, myDiff=myDiffSigboth,
+                tfbsInfo=tfbsInfo, tfbsDense=T, sigQ=0.01, noGroupEst = F, noDataLine=T, cex=0.1)
+  dev.off()
+  
+}
+
+
+#******************************* visualize as heatmap **********************************
+
+# summarize per group:
+
+# take DE meth regions and extract from data
+x = meth[,"numCs"]/meth[, "coverage"]
+colnames(x) = meth@sample.ids
+rownames(x) = meth@data.ids
+
+listInMeth = match(myDiffDMCs@data.ids, meth@data.ids)
+y = x[listInMeth,]
+colnames(y) = meth@sample.ids
+
+hlalow_samples=colnames(y)%in%hlalow
+HLAlow_mean=rowMeans(y[,hlalow_samples], na.rm=T)
+HLArest_mean=rowMeans(y[,!hlalow_samples], na.rm=T)
+
+y2=data.frame("chr"=as.character(myDiffDMCs@data.chr), "start"=myDiffDMCs@data.start, "end"=myDiffDMCs@data.end,"ID"=myDiffDMCs@data.ids, stringsAsFactors=F)
+
+refgene=data.frame(refGeneInfo[[2]], stringsAsFactors = F)
+i <- sapply(refgene, is.factor)
+refgene[i] <- lapply(refgene[i], as.character)
+
+refgene$start[refgene$strand=="+"]=refgene$start[refgene$strand=="+"]-1000
+refgene$end[refgene$strand=="-"]=refgene$end[refgene$strand=="-"]+1000
+refgene$start[refgene$start<0]=0
+
+# intersect this with methylation regions, return name of the region and gene, transcript name:
+bed=cbind(refgene[,1], refgene[,3], refgene[,4], refgene[,8], refgene[,9])
+
+temp1=tempfile()
+temp2=tempfile()
+temp3=tempfile()
+
+write.table(bed, temp1, row.names = F, col.names = F, quote = F, sep="\t")
+write.table(y2, temp2, row.names = F, col.names = F, quote = F, sep="\t")
+
+command=paste0("bedtools intersect -a ",temp1, " -b ",temp2, " -wa -wb |awk -v OFS=\"\t\" -F\"\t\" '{print $4, $5, $9}' |sort -u > ", temp3)
+cat(command,"\n")
+try(system(command))
+
+matched_data=read.delim(temp3, header = F, stringsAsFactors = F)
+unlink(c(temp1, temp2, temp3))
+
+y4=y[y2$ID%in%matched_data[grepl("CIITA|HLA-D", matched_data[,2]),3],]
+
+dat=y[,order(colnames(y)%in%hlalow, decreasing = T)]
+dat_sub=dat[rownames(dat)%in%rownames(y4),]
+
+fab=annot$X.Sample_characteristics_ch1.4
+cyto=annot$X.Sample_characteristics_ch1.5
+npm=annot$X.Sample_characteristics_ch1.18=="npm1: Pos"
+pml=annot$X.Sample_characteristics_ch1.9=="t(15;17): Pos"
+cbfb=annot$X.Sample_characteristics_ch1.11=="inv(16): Pos"
+runx1_t1=annot$X.Sample_characteristics_ch1.10=="t(8;21): Pos"
+mll=annot$X.Sample_characteristics_ch1.6=="t(v;11q23): Pos"
+del5q=grepl("cytogenetics: -5", annot$X.Sample_characteristics_ch1.5)
+idh1=annot$X.Sample_characteristics_ch1.23=="idh1: Pos"
+idh2=annot$X.Sample_characteristics_ch1.24=="idh2: Pos"
+dnmt3a=annot$X.Sample_characteristics_ch1.25=="dnmt3a: Pos"
+dnmt3a_type=annot$X.Sample_characteristics_ch1.26
+evi1=annot$X.Sample_characteristics_ch1.27=="evi1 overexpression: Pos"
+kit=annot$X.Sample_characteristics_ch1.22=="kit: Pos"
+cebpa=annot$X.Sample_characteristics_ch1.21 =="cebpa: Sil"
+cebpaDM=annot$X.Sample_characteristics_ch1.21 =="cebpa: DM" 
+kras=annot$X.Sample_characteristics_ch1.20=="kras: Pos"
+nras=annot$X.Sample_characteristics_ch1.19=="nras: Pos"
+flt1=annot$X.Sample_characteristics_ch1.17=="flt3 tkd: Pos"
+flt2=annot$X.Sample_characteristics_ch1.16=="flt3 itd: Pos"
+normal=annot$X.Sample_characteristics_ch1.15=="normal karyotype: Pos"
+complex=grepl("complex|Complex", annot$X.Sample_characteristics_ch1.5)
+
+load("GSE6891_immunoscores.Rdata")
+immunoscores=immunoscores[,match(annot$X.Sample_title, colnames(immunoscores))]
+colnames(immunoscores)=annot$X.Sample_title
+coords=read.delim("cancermap_GSE86952_AML_15pct_genes_BH-SNE_mean-shift_BW1.5.txt", header=T, stringsAsFactors = F)
+coords=coords[match(annot$X.Sample_title, as.character(coords$ID)),]
+
+hlalow=scan("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/GSE86952_AML_RRBS/HLAII_low_samples_Zscore_minus1.txt", "a")
+hlalow.s=(pml|annot$X.Sample_title%in%hlalow)*1
+ha=data.frame("HLAlow"=hlalow.s,pml,cebpa,cebpaDM,idh1,idh2,dnmt3a,mll, cbfb,runx1_t1,del5q, fab,cyto,npm,evi1,kit,kras,nras,flt1,flt2,normal,complex, t(immunoscores), "cluster"=as.character(coords$cluster), "CIITA Meth Mean"=colMeans(dat_sub, na.rm = T), stringsAsFactors = F)
+
+# remove MDS samples:
+dat=dat[,!is.na(coords$cluster)]
+dat_sub=dat_sub[,!is.na(coords$cluster)]
+
+ha=ha[!is.na(ha$cluster),]
+
+# order based on mutual exclusivity and HLA groups
+m=cbind(ha$HLAlow, ha$pml, ha$cebpa, ha$cebpaDM, ha$idh1, ha$idh2, ha$dnmt3a, ha$mll, ha$cbfb, ha$runx1_t1, ha$del5q,  ha$npm, ha$kras, ha$nras, ha$flt1, ha$flt2)
+rownames(m)=rownames(ha)
+colnames(m)=c("HLAIIlow", "PML-RARA", "CEBPA", "CEBPA_DM", "IDH1", "IDH2", "DNMT3A","MLL", "CBFB-MYH11", "RUNX1_RUNX1T1", "-5/7(q)", "NPM1", "KRAS", "NRAS", "FLT3tkd", "FLT3itd")
+
+m=m[do.call(order, -as.data.frame(m)),,drop=F]
+sample_ord=rownames(m)
+
+# order samples
+ha=ha[match(sample_ord, rownames(ha)),]
+dat=dat[,match(sample_ord, colnames(dat))]
+dat_sub=dat_sub[,match(sample_ord, colnames(dat_sub))]
+
+# add row annotations:
+cpgInfo = getCpGInfo("cpgIslandExt.txt")
+cpgAnn = cpgAnnotation(cpgInfo,myDiffSigboth)
+
+rowannot=cpgAnn[myDiffSigboth@data.ids%in%rownames(dat_sub)]
+rowannot=as.character(rowannot)
+rownames(dat_sub)=apply(cbind(rowannot,matched_data[match(rownames(dat_sub), matched_data[,3]),2]), 1, paste, collapse="_")
+
+# simplify annotations for heatmap:
+ha2=data.frame(ha$HLAlow,ha$fab,ha$DUFVA_HLAI_SCORE, ha$DUFVA_HLAII_SCORE, ha$DUFVA_CYTOLYTIC_SCORE,ha$CIITA, ha$normal,ha$complex, ha$cluster, ha$CIITA.Meth.Mean)
+
+# Final plot:
+pdf("FigureS4F_ComplexHeatmap_DMC_methylation.pdf", height = 15, width = 15)
+Heatmap(dat_sub, top_annotation = HeatmapAnnotation(df = ha2), name = "DMC", cluster_columns = F, cluster_rows = F, col = colorRamp2(c(0, 0.5, 1), c("brown","lightgray", "orange")), use_raster = F)+
+  Heatmap(t(scale(data.frame(ha$DUFVA_HLAI_SCORE, ha$DUFVA_HLAII_SCORE, ha$DUFVA_CYTOLYTIC_SCORE, ha$CIITA))),name = "DMC", cluster_columns = F, cluster_rows = F, use_raster = F)
+dev.off()
+
+source("/research/users/ppolonen/git_home/common_scripts/visualisation/oncoprint_memo.R")
+pdf("FigureS4F_ComplexHeatmap_DMC_methylation_genetics.pdf", height = 15, width = 15)
+oncoPrint(t(m), sort=F)
+dev.off()
\ No newline at end of file