--- a
+++ b/12-ELMER.R
@@ -0,0 +1,363 @@
+#Installing 
+# devtools::install_github(repo = "tiagochst/ELMER.data")
+# devtools::install_github(repo = "tiagochst/ELMER")
+if (!requireNamespace("BiocManager", quietly=TRUE))
+  install.packages("BiocManager")
+BiocManager::install("ELMER")
+# BiocManager::install(version='devel')
+# BiocManager::install("TCGAbiolinks")
+# BiocManager::install("TCGAbiolinks", version = "2.5.9") 
+# source("https://bioconductor.org/biocLite.R")
+# biocLite("TCGAbiolinks")
+library(ELMER)
+library(MultiAssayExperiment)
+library(ELMER.data)
+library(sesameData)
+library(tibble)
+library(TCGAbiolinks)
+#example
+{
+  distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
+  head(distal.probes)
+  data(LUSC_RNA_refined,package = "ELMER.data")
+  data(LUSC_meth_refined,package = "ELMER.data")
+  GeneExp[1:5,1:5]
+  Meth[1:5,1:5]
+  mae <- createMAE(exp = GeneExp, 
+                   met = Meth,
+                   save = TRUE,
+                   linearize.exp = TRUE,
+                   save.filename = "tmp.rda",
+                   filter.probes = distal.probes,
+                   met.platform = "450K",
+                   genome = "hg19",
+                   TCGA = TRUE)
+  tmp = as.data.frame(colData(mae)[1:5,])
+  View(tmp)
+}
+#data: RNA and methy
+{
+  setwd("E:/workplace/mywork/methy/dbgap/chf/data_chf_contr/early_chf/c1_UMN_JHU/RNA/")
+  load(file="./Gene.Rdata")
+  load(file="./exon.Rdata")
+  exon[1:4,1:4];dim(exon)
+  Gene[1:4,1:4];dim(Gene)
+  exon$id <- paste("sample",exon$X,sep="_")
+  exon <- column_to_rownames(exon,"id")
+  exon <- exon[,-c(1:2)]
+  Gene$id <- paste("sample",Gene$X,sep="_")
+  Gene <- column_to_rownames(Gene,"id")
+  Gene <- Gene[,-c(1:2)]
+  
+  setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/")
+  load("CorrectedBeta.Rdata")
+  CorrectedBeta[1:4,1:4]
+  dim(CorrectedBeta)
+  data_cpg <- read.csv("../champ_rawdata_train.csv")
+  colnames(CorrectedBeta) <- paste("sample",data_cpg$shareid,sep="_")
+  
+  id = setdiff(rownames(exon),colnames(CorrectedBeta))
+  exon_data = exon[!(rownames(exon) %in% id),]
+  exon_data <- t(exon_data)
+  exon_data[1:4,1:4];dim(exon_data)
+  
+  Gene_data = Gene[!(rownames(Gene) %in% id),]
+  Gene_data <- t(Gene_data)
+  Gene_data[1:4,1:4];dim(Gene_data)
+  
+  CorrectedBeta <- subset(CorrectedBeta,select = colnames(exon_data))
+  dim(CorrectedBeta);dim(exon_data);dim(Gene_data)
+  CorrectedBeta[1:4,1:4]
+  exon_data[1:4,1:4]
+  Gene_data[1:4,1:4]
+  save(CorrectedBeta,exon_data,Gene_data,file="result/MAE.Rdata")
+  {
+    library(stringr) 
+    exon_data[1:4,1:5]
+    ids=data.frame(ensembl_id=str_split(rownames(exon_data),'[.]',simplify = T)[,1],
+                   median=apply(exon_data,1,median))
+    head(ids)
+    head(ids$ensembl_id)
+    library(org.Hs.eg.db)
+    g2s=unique(toTable(org.Hs.egSYMBOL))
+    head(g2s)
+    g2e=unique(toTable(org.Hs.egENSEMBL)) 
+    head(g2e)
+    s2e=merge(g2e,g2s,by='gene_id')
+    head(s2e)
+    table(ids$ensembl_id %in% s2e$symbol)
+    # FALSE  TRUE 
+    # 2866 15448 
+    ids=ids[ids$ensembl_id %in% s2e$symbol,]
+    ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2]
+    length(unique(ids$ENSEMBL))
+    head(ids) 
+    ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),]
+    ids=ids[!duplicated(ids$ENSEMBL),]
+    dim(ids) 
+    exon_data = as.data.frame(exon_data)
+    exon_data = rownames_to_column(exon_data,"ensembl_id")
+    exon_data = merge(ids,exon_data,by="ensembl_id")
+    exon_data = column_to_rownames(exon_data,"ENSEMBL")
+    exon_data = exon_data[,-c(1,2)]
+    exon_data[1:4,1:4]  
+    dim(exon_data)
+  }
+  {
+    library(stringr) 
+    Gene_data[1:4,1:5]
+    ids=data.frame(ensembl_id=str_split(rownames(Gene_data),'[.]',simplify = T)[,1],
+                   median=apply(Gene_data,1,median))
+    head(ids)
+    head(ids$ensembl_id)
+    library(org.Hs.eg.db)
+    g2s=unique(toTable(org.Hs.egSYMBOL))
+    head(g2s)
+    g2e=unique(toTable(org.Hs.egENSEMBL)) 
+    head(g2e)
+    s2e=merge(g2e,g2s,by='gene_id')
+    head(s2e)
+    table(ids$ensembl_id %in% s2e$symbol)
+    # FALSE  TRUE 
+    # 2529 14870 
+    ids=ids[ids$ensembl_id %in% s2e$symbol,]
+    ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2]
+    length(unique(ids$ENSEMBL))
+    head(ids) 
+    ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),]
+    ids=ids[!duplicated(ids$ENSEMBL),]
+    dim(ids) 
+    Gene_data = as.data.frame(Gene_data)
+    Gene_data = rownames_to_column(Gene_data,"ensembl_id")
+    Gene_data = merge(ids,Gene_data,by="ensembl_id")
+    Gene_data = column_to_rownames(Gene_data,"ENSEMBL")
+    Gene_data = Gene_data[,-c(1,2)]
+    Gene_data[1:4,1:4]  
+    dim(Gene_data)
+  }
+  dim(Gene_data);dim(exon_data)
+  save(exon_data,Gene_data,file="result/MAE_addENSEMBL.Rdata")
+}
+#creat mea
+{
+  load("result/MAE_addENSEMBL.Rdata") #exon_data,Gene_data
+  distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
+  assay <- c(rep("DNA methylation", ncol(CorrectedBeta)),
+             rep("Gene expression", ncol(exon_data)))
+  primary <- c(colnames(CorrectedBeta),colnames(exon_data))
+  data_cpg <- read.csv("../champ_rawdata_train.csv")
+  data_cpg$shareid <- paste("sample",data_cpg$shareid,sep="_")
+  colname <- data_cpg[colnames(CorrectedBeta) %in% data_cpg$shareid ,"Sample_Group"]
+  colname <- c(colnames(CorrectedBeta),colnames(exon_data))
+  sampleMap <- data.frame(assay,primary,colname)
+  #distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
+  colData <- data.frame(sample = colnames(CorrectedBeta))
+  tmp = data_cpg[data_cpg$shareid %in% colData$sample ,]
+  colData <- merge(colData,tmp[,c(1,11)],by.x="sample",by.y="shareid")
+  colData = unique(colData)
+  rownames(colData) <- colData$sample
+  mae <- createMAE(exp = exon_data, 
+                   met = CorrectedBeta,
+                   save = TRUE,
+                   filter.probes = distal.probes,
+                   colData = colData,
+                   #sampleMap = sampleMap,
+                   linearize.exp = TRUE,
+                   save.filename = "mae.rda",
+                   met.platform = "450K",
+                   genome = "hg19",
+                   TCGA = FALSE)
+  save(mae,"result/mae.rda")
+}
+#DMP
+{
+  mae <- get(load("result/mae.rda"))
+  sig.diff <- get.diff.meth(data = mae, 
+                            group.col = "Sample_Group",
+                            group1 =  "chf",
+                            group2 = "nochf",
+                            # if supervised mode set to 1
+                            mode = "supervised",
+                            minSubgroupFrac = 1, 
+                            sig.dif = 0,
+                            diff.dir = "both", # “hypo” and “hyper” 
+                            cores = 1, 
+                            dir.out ="result", 
+                            save =  FALSE ,
+                            pvalue = 0.05#0.05#1
+                            )
+  head(sig.diff);
+  dim(sig.diff)#pvalue = 0.05,19;0.5,8537
+  write.table(sig.diff,"result/getMethdiff.both.probes.csv")#pvalue = 0.5,8537
+  write.table(sig.diff,"result/getMethdiff.both.probes.significant.csv")#pvalue = 0.05,19
+  ##"getMethdiff.hypo.probes.csv"            
+  ##"getMethdiff.hypo.probes.significant.csv"
+  save(sig.diff,file="result/sig_diff.Rdata")#pvalue = 0.5,8537
+}
+#Identifying putative probe-gene pairs
+{
+  setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/")
+  mae <- get(load("result/mae.rda"))
+  #sig.diff <- read.csv("result/getMethdiff.both.probes.significant.csv",sep = " ")
+  nearGenes <- GetNearGenes(data = mae, 
+                            probes = sig.diff$probe, 
+                            numFlankingGenes = 20) # 10 upstream and 10 dowstream genes
+  save(nearGenes,file="result/nearGenes.Rdata")
+  Hypo.pair <- get.pair(data = mae,
+                        group.col = "Sample_Group",
+                        group1 =  "chf",
+                        group2 = "nochf",
+                        nearGenes = nearGenes,
+                        #minSubgroupFrac = 1,
+                        mode = "unsupervised",#supervised or unsupervised
+                        permu.dir = "result/permu",
+                        #Please set to 100000 to get significant results
+                        #permu.size = 100000,
+                        raw.pvalue = 0.05,#1,
+                        #Please set to 0.001 to get significant results
+                        Pe = 0.05,#1, 
+                        cores = 1,
+                        #See preAssociationProbeFiltering function
+                        filter.probes = FALSE, 
+                        # filter.percentage = 0.01,
+                        # filter.portion = 0.3,
+                        dir.out = "result",
+                        label = "both",
+                        save = FALSE)
+  dim(Hypo.pair)
+  write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic5.csv")
+  write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic.significant.csv")
+  #"getPair.hypo.all.pairs.statistic.csv"                  
+  #"getPair.hypo.pairs.significant.csv"                    
+  #"getPair.hypo.pairs.statistic.with.empirical.pvalue.csv"
+}
+#motif
+{
+  # Load results from previous sections
+  mae <- get(load("result/mae.rda"))
+  Hypo.pair1 = read.csv("result/getPair.both.all.pairs.statistic1.csv",sep=" ")
+  Hypo.pair2 = read.csv("result/getPair.both.all.pairs.statistic2.csv",sep=" ")
+  Hypo.pair3 = read.csv("result/getPair.both.all.pairs.statistic3.csv",sep=" ")
+  Hypo.pair4 = read.csv("result/getPair.both.all.pairs.statistic4.csv",sep=" ")
+  Hypo.pair5 = read.csv("result/getPair.both.all.pairs.statistic5.csv",sep=" ")
+  pair <- rbind(rbind(rbind(rbind(Hypo.pair1,Hypo.pair2),Hypo.pair3),Hypo.pair4),Hypo.pair5)
+  head(pair) # significantly hypomethylated probes with putative target genes
+  pair = Hypo.pair
+  # Identify enriched motif for significantly hypomethylated probes which 
+  # have putative target genes.
+  enriched.motif <- get.enriched.motif(data = mae,
+                                       #probes = pair$Probe, 
+                                       probes = sig.diff$probe, 
+                                       dir.out = "result", 
+                                       label = "both",
+                                       min.incidence = 0,
+                                       lower.OR = 1,
+                                       save = FALSE)
+  names(enriched.motif)
+  head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif.
+  write.table(enriched.motif$FOSL2_HUMAN.H11MO.0.A,"result/getMotif.both.motif.enrichment.txt")
+  save(enriched.motif,file="result/getMotif.both.enriched.motifs.rda")
+  #"getMotif.hypo.enriched.motifs.rda"  "getMotif.both.motif.enrichment.csv" "motif.enrichment.pdf") 
+}
+#TF
+{
+  # Load results from previous sections
+  mae <- get(load("mae.rda"))
+  load("result/getMotif.both.enriched.motifs12.rda")
+  TF <- get.TFs(data = mae, 
+                group.col = "Sample_Group",
+                group1 =  "chf",
+                group2 = "nochf",
+                mode = "unsupervised",
+                enriched.motif = enriched.motif,
+                dir.out = "result", 
+                cores = 1, 
+                label = "both",
+                save =  FALSE )
+  ##"getTF.hypo.TFs.with.motif.pvalue.rda"             
+  ##"getTF.hypo.significant.TFs.with.motif.summary.csv"
+  write.table(TF,"result/getTF.both.significant.TFs.with.motif.summary.csv")
+  save(TF,file="result/getTF.both.TFs.with.motif.pvalue.rda")
+}
+#Scatter plots
+{
+  mae <- get(load("mae.rda"))
+  load("result/getMotif.hypo.enriched.motifs.rda")
+  scatter.plot(data = mae,
+               byProbe = list(probe = c("cg27401945"), numFlankingGenes = 20), 
+               category = "Sample_Group", 
+               lm = TRUE, # Draw linear regression curve
+               save = TRUE) 
+  scatter.plot(data = mae,
+               byPair = list(probe = c("cg27401945"), gene = c("ENSG00000148704")), 
+               category = "Sample_Group", save = TRUE, lm_line = TRUE) 
+  
+  load("result/getMotif.hypo.enriched.motifs.rda")
+  names(enriched.motif)[1]
+  scatter.plot(data = mae,
+               byTF = list(TF = c("VAX1","DIP2C"),
+                           probe = enriched.motif[[names(enriched.motif)[2]]]), 
+               category = "Sample_Group",
+               save = TRUE, 
+               lm_line = TRUE)
+}
+#XX 
+{
+  # Load results from previous sections
+  mae <- get(load("mae.rda"))
+  #pair <- read.csv("result/getPair.hypo.pairs.significant.csv")
+  schematic.plot(pair = pair, 
+                 data = mae,
+                 group.col = "Sample_Group",
+                 byProbe = pair$Probe[1],
+                 save = FALSE)
+  schematic.plot(pair = pair, 
+                 data = mae,   
+                 group.col = "Sample_Group", 
+                 byGene = pair$GeneID[1],
+                 save = FALSE)
+}
+#XX motif
+{
+  motif.enrichment.plot(motif.enrichment = enriched.motif,
+                        significant = list(OR = 1.5,lowerOR = 1.3),
+                        label = "both",
+                        save = FALSE)
+  motif.enrichment.plot(motif.enrichment = "result/getMotif.hypo.motif.enrichment.csv",
+                        significant = list(OR = 1.5,lowerOR = 1.3),
+                        label = "hypo",
+                        summary = TRUE,
+                        save = FALSE)  
+}
+#TF
+{
+  load("result/getTF.hypo.TFs.with.motif.pvalue.rda")
+  motif <- colnames(TF.meth.cor)[1]
+  TF.rank.plot(motif.pvalue = TF.meth.cor,
+               motif = motif,
+               save = FALSE)
+}
+#XX heatmap
+{
+  # Load results from previous sections
+  mae <- get(load("mae.rda"))
+  pair <- read.csv("result/getPair.hypo.pairs.significant.csv")
+  heatmapPairs(data = mae, 
+               group.col = "Sample_Group",
+               group1 = "Chf", 
+               #annotation.col = c("years_smoked","gender"),
+               group2 = "Nochf",
+               pairs = pair,
+               filename =  NULL)
+}
+
+{
+  library(plyr)
+  A<-strsplit(as.character(names(enriched.motif)), "_")
+  tmp2 <- mapply( cbind,  A )
+  df0 <- ldply (tmp2[1,], data.frame)
+  #median level of methylation estimated from all distal probes
+  length(enriched.motif$FOSL2_HUMAN.H11MO.0.A)
+  FOSL2 = enriched.motif$FOSL2_HUMAN.H11MO.0.A
+  FOSL1 = enriched.motif$FOSL1_HUMAN.H11MO.0.A
+}
\ No newline at end of file