HFmeRisk / Git / [d2c46b] /12-ELMER.R

Models:
RaymondKing/
HFmeRisk
Downloads: 1
[d2c46b]: / 12-ELMER.R
History
Download this file
363 lines (357 with data), 14.7 kB

#Installing 
# devtools::install_github(repo = "tiagochst/ELMER.data")
# devtools::install_github(repo = "tiagochst/ELMER")
if (!requireNamespace("BiocManager", quietly=TRUE))
  install.packages("BiocManager")
BiocManager::install("ELMER")
# BiocManager::install(version='devel')
# BiocManager::install("TCGAbiolinks")
# BiocManager::install("TCGAbiolinks", version = "2.5.9") 
# source("https://bioconductor.org/biocLite.R")
# biocLite("TCGAbiolinks")
library(ELMER)
library(MultiAssayExperiment)
library(ELMER.data)
library(sesameData)
library(tibble)
library(TCGAbiolinks)
#example
{
  distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
  head(distal.probes)
  data(LUSC_RNA_refined,package = "ELMER.data")
  data(LUSC_meth_refined,package = "ELMER.data")
  GeneExp[1:5,1:5]
  Meth[1:5,1:5]
  mae <- createMAE(exp = GeneExp, 
                   met = Meth,
                   save = TRUE,
                   linearize.exp = TRUE,
                   save.filename = "tmp.rda",
                   filter.probes = distal.probes,
                   met.platform = "450K",
                   genome = "hg19",
                   TCGA = TRUE)
  tmp = as.data.frame(colData(mae)[1:5,])
  View(tmp)
}
#data: RNA and methy
{
  setwd("E:/workplace/mywork/methy/dbgap/chf/data_chf_contr/early_chf/c1_UMN_JHU/RNA/")
  load(file="./Gene.Rdata")
  load(file="./exon.Rdata")
  exon[1:4,1:4];dim(exon)
  Gene[1:4,1:4];dim(Gene)
  exon$id <- paste("sample",exon$X,sep="_")
  exon <- column_to_rownames(exon,"id")
  exon <- exon[,-c(1:2)]
  Gene$id <- paste("sample",Gene$X,sep="_")
  Gene <- column_to_rownames(Gene,"id")
  Gene <- Gene[,-c(1:2)]
  
  setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/")
  load("CorrectedBeta.Rdata")
  CorrectedBeta[1:4,1:4]
  dim(CorrectedBeta)
  data_cpg <- read.csv("../champ_rawdata_train.csv")
  colnames(CorrectedBeta) <- paste("sample",data_cpg$shareid,sep="_")
  
  id = setdiff(rownames(exon),colnames(CorrectedBeta))
  exon_data = exon[!(rownames(exon) %in% id),]
  exon_data <- t(exon_data)
  exon_data[1:4,1:4];dim(exon_data)
  
  Gene_data = Gene[!(rownames(Gene) %in% id),]
  Gene_data <- t(Gene_data)
  Gene_data[1:4,1:4];dim(Gene_data)
  
  CorrectedBeta <- subset(CorrectedBeta,select = colnames(exon_data))
  dim(CorrectedBeta);dim(exon_data);dim(Gene_data)
  CorrectedBeta[1:4,1:4]
  exon_data[1:4,1:4]
  Gene_data[1:4,1:4]
  save(CorrectedBeta,exon_data,Gene_data,file="result/MAE.Rdata")
  {
    library(stringr) 
    exon_data[1:4,1:5]
    ids=data.frame(ensembl_id=str_split(rownames(exon_data),'[.]',simplify = T)[,1],
                   median=apply(exon_data,1,median))
    head(ids)
    head(ids$ensembl_id)
    library(org.Hs.eg.db)
    g2s=unique(toTable(org.Hs.egSYMBOL))
    head(g2s)
    g2e=unique(toTable(org.Hs.egENSEMBL)) 
    head(g2e)
    s2e=merge(g2e,g2s,by='gene_id')
    head(s2e)
    table(ids$ensembl_id %in% s2e$symbol)
    # FALSE  TRUE 
    # 2866 15448 
    ids=ids[ids$ensembl_id %in% s2e$symbol,]
    ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2]
    length(unique(ids$ENSEMBL))
    head(ids) 
    ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),]
    ids=ids[!duplicated(ids$ENSEMBL),]
    dim(ids) 
    exon_data = as.data.frame(exon_data)
    exon_data = rownames_to_column(exon_data,"ensembl_id")
    exon_data = merge(ids,exon_data,by="ensembl_id")
    exon_data = column_to_rownames(exon_data,"ENSEMBL")
    exon_data = exon_data[,-c(1,2)]
    exon_data[1:4,1:4]  
    dim(exon_data)
  }
  {
    library(stringr) 
    Gene_data[1:4,1:5]
    ids=data.frame(ensembl_id=str_split(rownames(Gene_data),'[.]',simplify = T)[,1],
                   median=apply(Gene_data,1,median))
    head(ids)
    head(ids$ensembl_id)
    library(org.Hs.eg.db)
    g2s=unique(toTable(org.Hs.egSYMBOL))
    head(g2s)
    g2e=unique(toTable(org.Hs.egENSEMBL)) 
    head(g2e)
    s2e=merge(g2e,g2s,by='gene_id')
    head(s2e)
    table(ids$ensembl_id %in% s2e$symbol)
    # FALSE  TRUE 
    # 2529 14870 
    ids=ids[ids$ensembl_id %in% s2e$symbol,]
    ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2]
    length(unique(ids$ENSEMBL))
    head(ids) 
    ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),]
    ids=ids[!duplicated(ids$ENSEMBL),]
    dim(ids) 
    Gene_data = as.data.frame(Gene_data)
    Gene_data = rownames_to_column(Gene_data,"ensembl_id")
    Gene_data = merge(ids,Gene_data,by="ensembl_id")
    Gene_data = column_to_rownames(Gene_data,"ENSEMBL")
    Gene_data = Gene_data[,-c(1,2)]
    Gene_data[1:4,1:4]  
    dim(Gene_data)
  }
  dim(Gene_data);dim(exon_data)
  save(exon_data,Gene_data,file="result/MAE_addENSEMBL.Rdata")
}
#creat mea
{
  load("result/MAE_addENSEMBL.Rdata") #exon_data,Gene_data
  distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
  assay <- c(rep("DNA methylation", ncol(CorrectedBeta)),
             rep("Gene expression", ncol(exon_data)))
  primary <- c(colnames(CorrectedBeta),colnames(exon_data))
  data_cpg <- read.csv("../champ_rawdata_train.csv")
  data_cpg$shareid <- paste("sample",data_cpg$shareid,sep="_")
  colname <- data_cpg[colnames(CorrectedBeta) %in% data_cpg$shareid ,"Sample_Group"]
  colname <- c(colnames(CorrectedBeta),colnames(exon_data))
  sampleMap <- data.frame(assay,primary,colname)
  #distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
  colData <- data.frame(sample = colnames(CorrectedBeta))
  tmp = data_cpg[data_cpg$shareid %in% colData$sample ,]
  colData <- merge(colData,tmp[,c(1,11)],by.x="sample",by.y="shareid")
  colData = unique(colData)
  rownames(colData) <- colData$sample
  mae <- createMAE(exp = exon_data, 
                   met = CorrectedBeta,
                   save = TRUE,
                   filter.probes = distal.probes,
                   colData = colData,
                   #sampleMap = sampleMap,
                   linearize.exp = TRUE,
                   save.filename = "mae.rda",
                   met.platform = "450K",
                   genome = "hg19",
                   TCGA = FALSE)
  save(mae,"result/mae.rda")
}
#DMP
{
  mae <- get(load("result/mae.rda"))
  sig.diff <- get.diff.meth(data = mae, 
                            group.col = "Sample_Group",
                            group1 =  "chf",
                            group2 = "nochf",
                            # if supervised mode set to 1
                            mode = "supervised",
                            minSubgroupFrac = 1, 
                            sig.dif = 0,
                            diff.dir = "both", # “hypo” and “hyper” 
                            cores = 1, 
                            dir.out ="result", 
                            save =  FALSE ,
                            pvalue = 0.05#0.05#1
                            )
  head(sig.diff);
  dim(sig.diff)#pvalue = 0.05,19;0.5,8537
  write.table(sig.diff,"result/getMethdiff.both.probes.csv")#pvalue = 0.5,8537
  write.table(sig.diff,"result/getMethdiff.both.probes.significant.csv")#pvalue = 0.05,19
  ##"getMethdiff.hypo.probes.csv"            
  ##"getMethdiff.hypo.probes.significant.csv"
  save(sig.diff,file="result/sig_diff.Rdata")#pvalue = 0.5,8537
}
#Identifying putative probe-gene pairs
{
  setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/")
  mae <- get(load("result/mae.rda"))
  #sig.diff <- read.csv("result/getMethdiff.both.probes.significant.csv",sep = " ")
  nearGenes <- GetNearGenes(data = mae, 
                            probes = sig.diff$probe, 
                            numFlankingGenes = 20) # 10 upstream and 10 dowstream genes
  save(nearGenes,file="result/nearGenes.Rdata")
  Hypo.pair <- get.pair(data = mae,
                        group.col = "Sample_Group",
                        group1 =  "chf",
                        group2 = "nochf",
                        nearGenes = nearGenes,
                        #minSubgroupFrac = 1,
                        mode = "unsupervised",#supervised or unsupervised
                        permu.dir = "result/permu",
                        #Please set to 100000 to get significant results
                        #permu.size = 100000,
                        raw.pvalue = 0.05,#1,
                        #Please set to 0.001 to get significant results
                        Pe = 0.05,#1, 
                        cores = 1,
                        #See preAssociationProbeFiltering function
                        filter.probes = FALSE, 
                        # filter.percentage = 0.01,
                        # filter.portion = 0.3,
                        dir.out = "result",
                        label = "both",
                        save = FALSE)
  dim(Hypo.pair)
  write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic5.csv")
  write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic.significant.csv")
  #"getPair.hypo.all.pairs.statistic.csv"                  
  #"getPair.hypo.pairs.significant.csv"                    
  #"getPair.hypo.pairs.statistic.with.empirical.pvalue.csv"
}
#motif
{
  # Load results from previous sections
  mae <- get(load("result/mae.rda"))
  Hypo.pair1 = read.csv("result/getPair.both.all.pairs.statistic1.csv",sep=" ")
  Hypo.pair2 = read.csv("result/getPair.both.all.pairs.statistic2.csv",sep=" ")
  Hypo.pair3 = read.csv("result/getPair.both.all.pairs.statistic3.csv",sep=" ")
  Hypo.pair4 = read.csv("result/getPair.both.all.pairs.statistic4.csv",sep=" ")
  Hypo.pair5 = read.csv("result/getPair.both.all.pairs.statistic5.csv",sep=" ")
  pair <- rbind(rbind(rbind(rbind(Hypo.pair1,Hypo.pair2),Hypo.pair3),Hypo.pair4),Hypo.pair5)
  head(pair) # significantly hypomethylated probes with putative target genes
  pair = Hypo.pair
  # Identify enriched motif for significantly hypomethylated probes which 
  # have putative target genes.
  enriched.motif <- get.enriched.motif(data = mae,
                                       #probes = pair$Probe, 
                                       probes = sig.diff$probe, 
                                       dir.out = "result", 
                                       label = "both",
                                       min.incidence = 0,
                                       lower.OR = 1,
                                       save = FALSE)
  names(enriched.motif)
  head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif.
  write.table(enriched.motif$FOSL2_HUMAN.H11MO.0.A,"result/getMotif.both.motif.enrichment.txt")
  save(enriched.motif,file="result/getMotif.both.enriched.motifs.rda")
  #"getMotif.hypo.enriched.motifs.rda"  "getMotif.both.motif.enrichment.csv" "motif.enrichment.pdf") 
}
#TF
{
  # Load results from previous sections
  mae <- get(load("mae.rda"))
  load("result/getMotif.both.enriched.motifs12.rda")
  TF <- get.TFs(data = mae, 
                group.col = "Sample_Group",
                group1 =  "chf",
                group2 = "nochf",
                mode = "unsupervised",
                enriched.motif = enriched.motif,
                dir.out = "result", 
                cores = 1, 
                label = "both",
                save =  FALSE )
  ##"getTF.hypo.TFs.with.motif.pvalue.rda"             
  ##"getTF.hypo.significant.TFs.with.motif.summary.csv"
  write.table(TF,"result/getTF.both.significant.TFs.with.motif.summary.csv")
  save(TF,file="result/getTF.both.TFs.with.motif.pvalue.rda")
}
#Scatter plots
{
  mae <- get(load("mae.rda"))
  load("result/getMotif.hypo.enriched.motifs.rda")
  scatter.plot(data = mae,
               byProbe = list(probe = c("cg27401945"), numFlankingGenes = 20), 
               category = "Sample_Group", 
               lm = TRUE, # Draw linear regression curve
               save = TRUE) 
  scatter.plot(data = mae,
               byPair = list(probe = c("cg27401945"), gene = c("ENSG00000148704")), 
               category = "Sample_Group", save = TRUE, lm_line = TRUE) 
  
  load("result/getMotif.hypo.enriched.motifs.rda")
  names(enriched.motif)[1]
  scatter.plot(data = mae,
               byTF = list(TF = c("VAX1","DIP2C"),
                           probe = enriched.motif[[names(enriched.motif)[2]]]), 
               category = "Sample_Group",
               save = TRUE, 
               lm_line = TRUE)
}
#XX 
{
  # Load results from previous sections
  mae <- get(load("mae.rda"))
  #pair <- read.csv("result/getPair.hypo.pairs.significant.csv")
  schematic.plot(pair = pair, 
                 data = mae,
                 group.col = "Sample_Group",
                 byProbe = pair$Probe[1],
                 save = FALSE)
  schematic.plot(pair = pair, 
                 data = mae,   
                 group.col = "Sample_Group", 
                 byGene = pair$GeneID[1],
                 save = FALSE)
}
#XX motif
{
  motif.enrichment.plot(motif.enrichment = enriched.motif,
                        significant = list(OR = 1.5,lowerOR = 1.3),
                        label = "both",
                        save = FALSE)
  motif.enrichment.plot(motif.enrichment = "result/getMotif.hypo.motif.enrichment.csv",
                        significant = list(OR = 1.5,lowerOR = 1.3),
                        label = "hypo",
                        summary = TRUE,
                        save = FALSE)  
}
#TF
{
  load("result/getTF.hypo.TFs.with.motif.pvalue.rda")
  motif <- colnames(TF.meth.cor)[1]
  TF.rank.plot(motif.pvalue = TF.meth.cor,
               motif = motif,
               save = FALSE)
}
#XX heatmap
{
  # Load results from previous sections
  mae <- get(load("mae.rda"))
  pair <- read.csv("result/getPair.hypo.pairs.significant.csv")
  heatmapPairs(data = mae, 
               group.col = "Sample_Group",
               group1 = "Chf", 
               #annotation.col = c("years_smoked","gender"),
               group2 = "Nochf",
               pairs = pair,
               filename =  NULL)
}

{
  library(plyr)
  A<-strsplit(as.character(names(enriched.motif)), "_")
  tmp2 <- mapply( cbind,  A )
  df0 <- ldply (tmp2[1,], data.frame)
  #median level of methylation estimated from all distal probes
  length(enriched.motif$FOSL2_HUMAN.H11MO.0.A)
  FOSL2 = enriched.motif$FOSL2_HUMAN.H11MO.0.A
  FOSL1 = enriched.motif$FOSL1_HUMAN.H11MO.0.A
}