HFmeRisk / Git / [d2c46b] /9-25CpG clusterProfiler.R

Models:
RaymondKing/
HFmeRisk
Downloads: 1
[d2c46b]: / 9-25CpG clusterProfiler.R
History
Download this file
450 lines (431 with data), 18.4 kB

setwd("E:/workplace/mywork/methy/dbgap/chf/data_chf_contr/early_chf/c1_UMN/alldata")
#methylation <- read.csv("methylation_probe_info.csv",sep=",")
#save(methylation,file="methylation.Rdata")
load(file="methylation.Rdata")

load(file="E:/workplace/mywork/methy/dbgap/chf/data_chf_contr/early_chf/c1_UMN/alldata/methylation.Rdata")
setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary")
id3 <- read.table("xgblasso_DMP.csv",sep=",",header = T)
head(id3)
id3 = as.character(id3$Feature)
venn_CpG = methylation[methylation$probe.id %in% id3[-c(2,6,17,20,21)],]
setdiff(id3[-c(2,6,17,20,21)],venn_CpG$probe.id)
#"cg06344265"
venn_CpG$probe.id = as.character(venn_CpG$probe.id)
venn_CpG$UCSC_RefGene_Name = as.character(venn_CpG$UCSC_RefGene_Name)
venn_CpG$Relation_to_UCSC_CpG_Island = as.character(venn_CpG$Relation_to_UCSC_CpG_Island)
add_cg = c("cg06344265",NA,NA,NA,NA,NA,NA,NA,NA,37,11,NA,NA,11,NA,NA,NA,NA,NA,NA,"GRIK4",NA,"TSS200",NA,"Open sea",NA,NA,NA,NA,NA,NA,NA)
venn_CpG = rbind(add_cg,venn_CpG)
write.table(venn_CpG,"venn_probe.csv",sep=",")
{
  #all dmp
  id3 = read.csv("new_champ/myDMP.txt",sep="\t")
  id3 = rownames_to_column(id3,"probe.id")#23581
  id3 <- as.character(id3$probe.id)
  venn_CpG = methylation[methylation$probe.id %in% id3,]
  setdiff(id3,venn_CpG$probe.id)
  
  id3 = read.csv("new_champ/myDMP.txt",sep="\t")
  id3 = rownames_to_column(id3,"probe.id")#23581
  logFC_cutoff2 <- with( id3, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )
  id3 <- id3[id3$adj.P.Val<0.05 & abs(id3$logFC) > logFC_cutoff2,]#318
  venn_CpG = methylation[methylation$probe.id %in% id3$probe.id,]
  setdiff(id3$probe.id,venn_CpG$probe.id)
  
}
#RefGene & Island
{
  venn_CpG$UCSC_RefGene_Group = as.character(venn_CpG$UCSC_RefGene_Group)
  venn_CpG$UCSC_RefGene_Name = as.character(venn_CpG$UCSC_RefGene_Name)
  venn_CpG$UCSC_RefGene_Group = ifelse(venn_CpG$UCSC_RefGene_Group == "", "IGR",venn_CpG$UCSC_RefGene_Group)
  venn_CpG$Relation_to_UCSC_CpG_Island = ifelse(venn_CpG$Relation_to_UCSC_CpG_Island == "", "Open sea",venn_CpG$Relation_to_UCSC_CpG_Island)
  
  #UCSC_RefGene_Group
  paste0(venn_CpG$UCSC_RefGene_Group,collapse=",")
  A<-strsplit(as.character(venn_CpG$UCSC_RefGene_Group), ";")
  tmp2 <- mapply( cbind,  A )
  df0 <- ldply (tmp2, data.frame)
  prop.table(table(df0$X..i..))
  #TSS200     IGR TSS1500    Body 1stExon   5'UTR 
  #0.050   0.200   0.250   0.275   0.050   0.175 
  #Relation_to_UCSC_CpG_Island
  paste0(venn_CpG$Relation_to_UCSC_CpG_Island,collapse=",")
  A<-strsplit(as.character(venn_CpG$Relation_to_UCSC_CpG_Island), ";")
  tmp2 <- mapply( cbind,  A )
  df1 <- ldply (tmp2, data.frame)
  prop.table(table(df1$X..i..))
  #Open sea  N_Shelf   Island  S_Shore  N_Shore  S_Shelf 
  #0.44     0.04     0.20     0.08     0.20     0.04 
  #all cpg--methylation
  methylation$UCSC_RefGene_Group = as.character(methylation$UCSC_RefGene_Group)
  methylation$Relation_to_UCSC_CpG_Island = as.character(methylation$Relation_to_UCSC_CpG_Island)
  methylation$UCSC_RefGene_Group = ifelse(methylation$UCSC_RefGene_Group == "", "IGR",methylation$UCSC_RefGene_Group)
  methylation$Relation_to_UCSC_CpG_Island = ifelse(methylation$Relation_to_UCSC_CpG_Island == "", "Open sea",methylation$Relation_to_UCSC_CpG_Island)
  
  A<-strsplit(as.character(methylation$UCSC_RefGene_Group), ";")
  tmp2 <- mapply( cbind,  A )
  df00 <- ldply (tmp2, data.frame)
  head(df00)
  prop.table(table(df00$X..i..))
  #IGR       Body      3'UTR    TSS1500      5'UTR    1stExon     TSS200 
  #0.13726348 0.35680909 0.03739231 0.15484313 0.13165007 0.06802518 0.11401675
  A<-strsplit(as.character(methylation$Relation_to_UCSC_CpG_Island), ";")
  tmp2 <- mapply( cbind,  A )
  df11 <- ldply (tmp2, data.frame)
  head(df11)
  prop.table(table(df11$X..i..))
  #Open sea    S_Shore    N_Shelf    S_Shelf    N_Shore     Island 
  #0.35281111 0.10314165 0.05029941 0.04472638 0.13191608 0.31710536 
}

#The 25 CpGs associated with HFrisk model
{
  library(ELMER)
  nearGenes <- GetNearGenes(data = mae, 
                            probes = id3[c(1,3:5,7:16,18:19,22:30)], 
                            numFlankingGenes = 20)
}
#enrichKEGG
{
  library(org.Hs.eg.db)
  library(clusterProfiler)
  library(plyr)
  library(ggplot2)
  library(forcats)
  library(enrichplot)
  library(ReactomePA)
  library(DOSE)
  library(org.Hs.eg.db)
  library(clusterProfiler)
  library(clusterProfiler)
  library(topGO)
  library(Rgraphviz)
  library(RDAVIDWebService)
  library(plyr)
  library(stringr)
  library(ggplot2)
    
  #enrichKEGG
  kegg_SYMBOL_hsa <- function(genes){ 
    gene.df <- bitr(genes, fromType = "SYMBOL",
                    toType = c("SYMBOL", "ENTREZID"),
                    OrgDb = org.Hs.eg.db)
    head(gene.df) 
    diff.kk <- enrichKEGG(gene         = gene.df$ENTREZID,
                          organism     = 'hsa',
                          pvalueCutoff = 0.99,
                          qvalueCutoff = 0.99
    )
    return( setReadable(diff.kk, OrgDb = org.Hs.eg.db,keyType = 'ENTREZID'))
  }
  paste0(venn_CpG$UCSC_RefGene_Name,collapse=",")
  A<-strsplit(as.character(venn_CpG$UCSC_RefGene_Name), ";")
  tmp2 <- mapply( cbind,  A )
  df0 <- ldply (tmp2, data.frame)
  trunk_kk=kegg_SYMBOL_hsa(df0[,1])
  trunk_df=trunk_kk@result
  write.csv(trunk_df,file = 'enrichKEGG_result.csv')
  #other
  y <- mutate(trunk_df, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
  parse_ratio <- function(ratio) {
    ratio <- sub("^\\s*", "", as.character(ratio))
    ratio <- sub("\\s*$", "", ratio)
    numerator <- as.numeric(sub("/\\d+$", "", ratio))
    denominator <- as.numeric(sub("^\\d+/", "", ratio))
    return(numerator/denominator)
  }
  y2 <- mutate(y, FoldEnrichment = parse_ratio(GeneRatio) / parse_ratio(BgRatio))
  y3 <- mutate(y2, geneRatio2 = parse_ratio(GeneRatio)) %>%
    arrange(desc(GeneRatio))
  tmp <- y2[1:20,]
  ggplot(tmp, showCategory = 20,aes(FoldEnrichment, fct_reorder(Description, FoldEnrichment))) + 
    geom_segment(aes(xend=0, yend = Description)) +
    geom_point(aes(color=pvalue, size = Count)) +
    scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
    scale_size_continuous(range=c(2, 10)) +
    theme_minimal() + 
    xlab("Fold Enrichment") + #Rich Factor
    ylab(NULL) + 
    ggtitle("Enriched Pathway Ontology")
  write.csv(y2,file = 'enrichKEGG_result2.csv')
  write.csv(y3,file = 'enrichKEGG_result3.csv')
}
#enrichGO
{
  bit_gene <- function(genes){
    gene.df <- bitr(genes, fromType = "SYMBOL",
                    toType = c("SYMBOL", "ENTREZID"),
                    OrgDb = org.Hs.eg.db)
    return(gene.df$ENTREZID)
  }
  bp <- enrichGO(bit_gene( as.character(df0[,1])), ont="all",OrgDb = "org.Hs.eg.db")
  bpp = bp@result
  bp3 <- setReadable(bp, OrgDb = org.Hs.eg.db)
  barplot(bp, split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free")
  write.table(bp@result,"enrichGO_result.csv")
  bp = as.data.frame(bp)
  enrichMap(bp)
  goplot(bp, showCategory = 10)
  bp2 <-simplify(bp, cutoff=0.7,by="p.adjust", select_fun=min)
  as.data.frame(bp2)
  write.csv(bp2,file = 'enrichGO_simplify.csv')
}
#gometh: kegg and go;gsameth
{
  library(limma)
  library(minfi)
  library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
  library(IlluminaHumanMethylation450kmanifest)
  library(RColorBrewer)
  library(missMethyl)
  library(matrixStats)
  library(minfiData)
  
  setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary")
  id3 <- read.table("xgblasso_DMP.csv",sep=",",header = T)
  head(id3)
  id3 <- as.character(id3$Feature)
  
  # Get all the CpG sites used in the analysis to form the background
  load(file= "new_champ/CorrectedBeta.Rdata")
  all <-rownames(CorrectedBeta)  
  # Total number of CpG sites tested
  length(all)
  #methylGSA
  {
    library(methylGSA)
    library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
    library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
    
    DEG_filter = read.csv("new_champ/myDMP.txt",sep="\t")
    DEG_filter = rownames_to_column(DEG_filter,"probe.id")#23581
    cpg.pval <- as.data.frame(DEG_filter[DEG_filter$probe.id %in% c(id3[-c(2,6,17,20,21)]),c("probe.id","P.Value")])#514
    c = paste(DEG_filter[DEG_filter$probe.id %in% c(id3[-c(2,6,17,20,21)]),"probe.id"],DEG_filter[DEG_filter$probe.id %in% c(id3[-c(2,6,17,20,21)]),"P.Value"],sep =  "=",collapse = ",")
    cpg.pval <- c(cg20051875=3.0845273397844e-09,cg24205914=9.48317106472548e-07,cg10083824=1.23286930624815e-06,cg03556243=4.95956762541123e-06,cg03233656=1.032434350337e-05,cg21429551=1.12835004011784e-05,cg16781992=1.33529811893581e-05,cg27401945=1.34687254097115e-05,cg08614290=3.97354136093555e-05,cg05363438=4.55923591304913e-05,cg08101977=5.07348656655275e-05,cg10556349=5.36252008917427e-05,cg13352914=9.88886388394135e-05,cg11853697=0.000116194512198325,cg00045910=0.000133863261077109,cg00495303=0.000141563110724823,cg00522231=0.000155960771094521,cg06344265=0.000258547818107834,cg21024264=0.000272396328465786,cg05481257=0.000326204702802846,cg23299445=0.000569220592642687,cg05845376=0.000637513022532287,cg17766026=0.000662632200376124,cg25755428=0.00083234067533089,cg07041999=0.000990852821362967)
    head(cpg.pval)
    library(org.Hs.eg.db)
    res1 = methylglm(cpg.pval = cpg.pval, #Names should be CpG IDs.
                     minsize = 200,
                     maxsize = 500, 
                     GS.type = "KEGG")#"GO", "KEGG", or "Reactome"
    #2 gene sets
    res6 = methylglm(cpg.pval = cpg.pval, 
                     GS.type = "Reactome", 
                     minsize = 100, 
                     maxsize = 500)
    #0 gene sets
    res7 = methylglm(cpg.pval = cpg.pval, 
                     GS.type = "GO", 
                     minsize = 100, 
                     maxsize = 500)
    #449 gene sets
    res2 = methylRRA(cpg.pval = cpg.pval, 
                     method = "ORA", 
                     minsize = 200, 
                     maxsize = 500)
    #259 gene sets
    res3 = methylRRA(cpg.pval = cpg.pval, 
                     method = "GSEA", 
                     minsize = 200, 
                     maxsize = 500)
    #259 gene sets
    res4 = methylgometh(cpg.pval = cpg.pval, 
                        sig.cut = 0.05, 
                        minsize = 200, 
                        maxsize = 500)
    data(CpG2Genetoy)
    head(CpG2Gene)  
    FullAnnot = prepareAnnot(CpG2Gene) 
    data(GSlisttoy)
    GS.list = GS.list[1:10]
    res5 = methylRRA(cpg.pval = cpg.pval, 
                     FullAnnot = FullAnnot, 
                     method = "ORA", 
                     GS.list = GS.list, 
                     GS.idtype = "SYMBOL", 
                     minsize = 100, 
                     maxsize = 300)
    #10 gene sets
  }
  #par(mfrow=c(1,1))
  gst <- gometh(sig.cpg = id3[-c(2,6,17,20,21)], all.cpg=all, collection = "GO",plot.bias=TRUE)
  GO_result = topGO(gst)
  topGSA(gst, n=10)
  write.table(GO_result,"gometh_toGO.csv",sep=",")
  kegg <- gometh(sig.cpg = id3[-c(2,6,17,20,21)], all.cpg = all, collection = "KEGG", prior.prob=TRUE)
  #kegg <- gometh(sig.cpg = set, all.cpg = all, collection = "KEGG", prior.prob=TRUE)
  KEGG_result = topKEGG(kegg)
  topGSA(kegg, n=10)
  #gst.kegg.prom <- gometh(sig.cpg=id3[-c(2,6,17,20,21)], all.cpg=all, collection="KEGG",genomic.features = c("TSS200","TSS1500","1stExon"))
  #topGSA(gst.kegg.prom, n=10)
  #gst.kegg.body <- gometh(sig.cpg=id3[-c(2,6,17,20,21)], all.cpg=all, collection="KEGG", genomic.features = c("Body"))
  #topGSA(gst.kegg.body, n=10)
  write.table(KEGG_result,"gometh_toKEGG.csv",sep=",")
  
  hallmark <- readRDS(url("http://bioinf.wehi.edu.au/MSigDB/v7.1/Hs.h.all.v7.1.entrez.rds"))
  gsa <- gsameth(sig.cpg=id3[-c(2,6,17,20,21)], all.cpg=all, collection=hallmark)
  GSA_result = topGSA(gsa)
  write.table(GSA_result,"gsameth_topGSA.csv",sep=",")
  
}
#3.enrichPathway
{
  x <- enrichPathway(gene = bit_gene(df0[,1]) ,pvalueCutoff=0.95, readable=T )
  X = as.data.frame(x)
  parse_ratio <- function(ratio) {
    ratio <- sub("^\\s*", "", as.character(ratio))
    ratio <- sub("\\s*$", "", ratio)
    numerator <- as.numeric(sub("/\\d+$", "", ratio))
    denominator <- as.numeric(sub("^\\d+/", "", ratio))
    return(numerator/denominator)
  }
  
  y <- mutate(X, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
  y3 <- mutate(y, geneRatio1 = parse_ratio(GeneRatio))
  y4 <- mutate(y3, geneRatio2 = parse_ratio(GeneRatio) / parse_ratio(BgRatio))
  library(ggplot2)
  library(forcats)
  library(enrichplot)
  tmp <- y4[1:20,]
  ggplot(tmp, showCategory = 20,aes(geneRatio2, fct_reorder(Description, geneRatio2))) + 
    geom_segment(aes(xend=0, yend = Description)) +
    geom_point(aes(color=pvalue, size = Count)) +
    scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
    scale_size_continuous(range=c(2, 10)) +
    theme_minimal() + 
    xlab("Fold Enrichment") +
    ylab(NULL) + 
    ggtitle("Enriched Pathway Ontology")
  tmp$log_pvalue <- -log10(tmp$pvalue)
  ggplot(tmp, showCategory = 20,aes(log_pvalue, fct_reorder(Description, log_pvalue))) + 
    geom_segment(aes(xend=0, yend = Description)) +
    geom_point(aes(size = Count)) +
    scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
    scale_size_continuous(range=c(2, 10)) +
    theme_minimal() + 
    xlab("Fold Enrichment") +
    ylab(NULL) + 
    ggtitle("Enriched Pathway Ontology")
  dotplot(x , color = "pvalue",showCategory = 20, font.size = 12, title = "")
  barplot(x, showCategory=10)
  emapplot(x,showCategory=20,color = "pvalue")
  heatplot(x,showCategory=20)
  #heatplot(x, foldChange=geneList)
  cnetplot(x, categorySize="pvalue")#foldChange=geneList
  upsetplot(x)
  write.csv(y4,file = 'enrichPathway.csv')
}
#4.enrichDO
{
  x <- enrichDO(gene          = bit_gene( df0[,1]), 
                ont           = "DO",
                pvalueCutoff  = 0.5,
                pAdjustMethod = "BH",
                universe      = x@universe,
                minGSSize     = 5,
                maxGSSize     = 500,
                qvalueCutoff  = 0.5,
                readable      = FALSE)
  x@result
  write.csv(x@result,file = 'enrichDO.csv')
  
  #plot
  emapplot(x)
  x <- setReadable(x, 'org.Hs.eg.db')
  head(x)
  cnetplot(x)
  write.csv(x,file = 'enrichDO2.csv')
}
#5.enrichNCG
{
  ncg <- enrichNCG(bit_gene(df0[,1]),pvalueCutoff = 0.95)
  head(ncg,10)
  dotplot(ncg, showCategory=30) + ggtitle("dotplot for GSEA")
  ridgeplot(ncg)
  gseaplot2(ncg, geneSetID = 1, title = ncg$Description[1])
  gseaplot2(ncg, geneSetID = 1:3)
  gseaplot2(ncg, geneSetID = 1:3, pvalue_table = TRUE,color = c("#E495A5", "#86B875", "#7DB0DD"), ES_geom = "dot")
  write.csv(ncg,file = 'enrichNCG.csv')
}
#6. enrichDGN
{
  dgn <- enrichDGN(bit_gene(df0[,1]),pvalueCutoff = 0.55)
  head(dgn,20)
  dgn <- data.frame(dgn)
  y <- mutate(dgn, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
  library(ggplot2)
  library(forcats)
  library(enrichplot)
  tmp <- y[1:20,]
  ggplot(tmp, showCategory = 20,aes(richFactor, fct_reorder(Description, richFactor))) + 
    geom_segment(aes(xend=0, yend = Description)) +
    geom_point(aes(color=pvalue, size = Count)) +
    scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
    scale_size_continuous(range=c(2, 10)) +
    theme_minimal() + 
    xlab("rich factor") +
    ylab(NULL) + 
    ggtitle("Enriched Pathway Ontology")
  write.csv(y,file = 'enrichDGN.csv')
  edox <- setReadable(dgn, 'org.Hs.eg.db', 'ENTREZID')
  cnetplot(edox)
  emapplot(dgn)
}
#7.enricher
{
  library(magrittr)
  library(clusterProfiler)
  
  #data(geneList, package="DOSE")
  #gene <- names(geneList)[abs(geneList) > 2]
  wpgmtfile <- system.file("extdata/wikipathways-20180810-gmt-Homo_sapiens.gmt", package="clusterProfiler")
  #wpgmtfile <- system.file("extdata/c5.cc.v5.0.entrez.gmt", package="clusterProfiler")
  wp2gene <- read.gmt(wpgmtfile)
  head(wp2gene)
  wp2gene <- wp2gene %>% tidyr::separate(ont, c("name","version","wpid","org"), "%")
  wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE
  wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME
  ewp <- enricher(bit_gene(df0[,1]), TERM2GENE = wpid2gene, TERM2NAME = wpid2name)
  head(ewp)
  write.csv(ewp,file = 'enricher_gmt_Homo_sapiens.csv')
  #======
  gmtfile <- system.file("extdata", "c5.cc.v5.0.entrez.gmt", package="clusterProfiler")
  c5 <- read.gmt(gmtfile)
  egmt <- enricher(bit_gene(df0[,1]), TERM2GENE=c5)
  head(egmt)
}
#8.msigdbr:enricher
{
  library(msigdbr)
  
  m_df <- msigdbr(species = "Homo sapiens")
  head(m_df, 2) %>% as.data.frame
  m_t2g <- msigdbr(species = "Homo sapiens", category = "C3") %>% 
    dplyr::select(gs_name, entrez_gene)
  head(m_t2g)
  em <- enricher(bit_gene(df0[,1]), TERM2GENE=m_t2g)
  head(em)
}
#9.groupGO
{
  ggo <- groupGO(gene     = bit_gene(df0[,1]),
                 OrgDb    = org.Hs.eg.db,
                 ont      = "BP",
                 level    = 3,
                 readable = TRUE)
  
  ggo2 <- groupGO(gene     = bit_gene(df0[,1]),
                  OrgDb    = org.Hs.eg.db,
                  ont      = "CC",
                  level    = 3,
                  readable = TRUE)
  ggo3 <- groupGO(gene     = bit_gene(df0[,1]),
                  OrgDb    = org.Hs.eg.db,
                  ont      = "MF",
                  level    = 3,
                  readable = TRUE)
  
  head(ggo)
  write.csv(ggo,file = 'groupGO_BP.csv')
  write.csv(ggo2,file = 'groupGO_CC.csv')
  write.csv(ggo3,file = 'groupGO_MF.csv')
}
#10.search:KEGG
{
  library(clusterProfiler)
  search_kegg_organism('Collagen', by='kegg_code')
  ecoli <- search_kegg_organism('ece', by='scientific_name')
  dim(ecoli)
}
#11.enrichMKEGG
{
  mkk <- enrichMKEGG(gene = bit_gene(df0[,1]),organism = 'hsa')
}
#12.enrichMeSH
{
  library(meshes)
  x <- enrichMeSH(bit_gene(df0[,1]), MeSHDb = "MeSH.Hsa.eg.db", database='gendoo', category = 'C')
  head(x)
}