--- a +++ b/Functions/clusterProfiler.enricher.R @@ -0,0 +1,161 @@ +#### 使用clusterProfiler进行通路富集分析 + +#' @param gene 字符向量,感兴趣的基因集合, 如"FUT7" "AIRE" "IFI35" +#' @param geneType 字符型向量,指明输入的geneList中name属性类型,即基因id的类型 +#' @param db.type 字符型向量,指明GSEA针对的数据库 +#' @param saveDir gsea输入目录 +#' @param pAdjustMethod, pvalueCutoff p值矫正方法和阈值 +#' @param MsigDB.category 指定特定的MsigDB中的数据库进行GSEA分析 +#' MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i), 指定i为H, C1, C2, C3, C4, C5, C6, C7, C8 +# 通过gs_cat gs_subcat,来选择库 +#' @param GO.ont 指定GSEA分析GO数据库时使用的条目类型,支持"BP", "MF", and "CC",以及"ALL" + +#' 功能富集分析的原理是基于hypergeometric test +#' clusterProfiler主要基于EntrezID做分析 +#' 分析过程中的universe,即背景基因的数目,是跟你选择的db中的subcategory有关系的,而不是固定不变的 +cluterProfiler.enricher <- function(gene, geneType = c("ENTREZID", "SYMBOL", "ENSEMBL"), + db.type = c("MsigDB", "GO", "KEGG"), + saveDir, title, + pAdjustMethod = "BH", + qvalueCutoff = 0.2, + pvalueCutoff = 0.05, + MsigDB.category = list(H = c("All"), C2 = c("BIOCARTA", "KEGG", "REACTOME"), C5 = c("BP")), + GO.ont = "BP", simplify = T, + minGSSize = 10, maxGSSize = 500, + showCategory = 20 + ){ + require(enrichplot) + require(clusterProfiler) + require(tidyverse) + + + geneType <- match.arg(geneType) + db.type <- match.arg(db.type) + # 如果基因名字类型不是SYMBOL,则需要进行ID转换 + if(geneType == "ENTREZID"){ + cat("不需要转换\n") + geneList <- gene + } + if(geneType %in% c("SYMBOL", "ENSEMBL")){ + genes <- IDConvert(genes = gene, fromType = geneType, toType = "ENTREZID") + cat("转换前基因数目:", length(gene), "\n") + index <- match(gene, genes[,1]) + geneList <- as.character(genes[na.omit(index),2]) + cat("转换后基因数目:", length(geneList), "\n") + } + + ## 第一种富集策略:MsigDB基因集合 + if(db.type == "MsigDB"){ + library(msigdbr) + + # 通路提取 + MsigDB.pathway <- tibble(gs_name = NULL, entrez_gene = NULL) + + # 将MsigDB所有基因作为背景基因 + #MsigDB.gene <- unique(msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name,entrez_gene)) + + if(class(MsigDB.category) == "character"){ + MsigDB.pathway <- msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name, entrez_gene) + }else{ + for(i in names(MsigDB.category)){ + if(MsigDB.category[[i]][1] == "All"){ + MsigDB.pathway <- rbind(MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i) %>% dplyr::select(gs_name, entrez_gene)) + }else{ + for(j in MsigDB.category[[i]]){ + MsigDB.pathway <- rbind(MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i, subcategory = j) %>% dplyr::select(gs_name, entrez_gene)) + } + } + } + } + em.res <- enricher(geneList, TERM2GENE = MsigDB.pathway, qvalueCutoff = qvalueCutoff, pvalueCutoff = pvalueCutoff, pAdjustMethod = pAdjustMethod, minGSSize = minGSSize, maxGSSize = maxGSSize) + } + + if(db.type == "GO"){ + em.res <- enrichGO(gene = geneList, + OrgDb = org.Hs.eg.db, + ont = GO.ont,minGSSize = minGSSize, maxGSSize = maxGSSize, + pAdjustMethod = pAdjustMethod, + pvalueCutoff = pvalueCutoff, + qvalueCutoff = qvalueCutoff) + #基于simplify method, we can remove redundant terms + if(simplify){ + em.res <- simplify(em.res, cutoff = 0.7, by="p.adjust", select_fun = min) + } + } + + if(db.type == "KEGG"){ + em.res <- enrichKEGG(gene = geneList, organism = "hsa", keyType = "kegg", + pAdjustMethod = pAdjustMethod, minGSSize = minGSSize, maxGSSize = maxGSSize, + pvalueCutoff = pvalueCutoff, + qvalueCutoff = qvalueCutoff) + } + #转换为geneSymbol + em.res.geneSymbol <- setReadable(em.res, org.Hs.eg.db, keyType = "ENTREZID") + + # if(nrow(em.res@result) > 0){ + # write.csv(em.res.geneSymbol, file = file.path(saveDir, paste0(title, "_", "enricherResult.csv"))) + # tryCatch({ + # p1 <- barplot(em.res.geneSymbol, showCategory = showCategory, main = title) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4)) + # print(p1) + # p2 <- dotplot(em.res.geneSymbol, showCategory = showCategory, title = title) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4)) + # print(p2) + # p3 <- heatplot(em.res.geneSymbol, showCategory = showCategory) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4)) + # print(p3) + # }, warning = function(w){ + # # 这里是出现warning状态时,应该怎么做,可以用print打印出来,可以执行其它命令 + # cat("Warning in plot!\n") + # }, error = function(e){ + # # 这里时出现Error状态时,应该怎么做,可以用print打印出来,也可以执行其它命令 + # cat("Error in plot!\n") + # },finally = { + # # 这是运行正常时,应该怎么做,可以用print打印出来,也可以执行其它命令 + # cat("Plot complete!\n") + # }) + # } + return(em.res = list(em.res.entrezid = em.res, em.res.genesymbol = em.res.geneSymbol)) +} + + +##### ID转换 +#author: Zhilin long +#time: 2021.1.19 + +#' @author Zhilin Long +#' @param gene 字符型向量,需要转换的ID向量 +#' @param fromType 字符向量,初始的ID类型 +#' @param toType 字符向量,需要转换的ID类型 +#' +#' @return data.frame 已经去除了重复和NA,可直接使用 +IDConvert <- function(genes, + method = c("clusterProfiler", "org.Hs.eg.db"), + fromType = c("ENTREZID", "SYMBOL", "ENSEMBL"), + toType = c("ENTREZID", "SYMBOL", "ENSEMBL") + ){ + require(org.Hs.eg.db) + require(clusterProfiler) + + #规范参数输入 + method <- match.arg(method) + fromType <- match.arg(fromType) + toType <- match.arg(toType) + + if(method == "clusterProfiler"){ + gene.df <- bitr(genes, fromType = fromType, toType = toType, OrgDb = "org.Hs.eg.db") + #默认会过滤没有map到的id,即条目会减少 + }else{ + gene.df <- mapIds(org.Hs.eg.db, + keys = genes, + column = toType, + keytype = fromType, + multiVals="first") + gene.df <- data.frame(ENSEMBL = names(gene.df), SYMBOL = gene.df) + colnames(gene.df) <- c(fromType, toType) + #没有map,则为NA,即条目不变 + gene.df <- na.omit(gene.df) #去除NA的行 + } + + #去除重复, + gene.df <- gene.df[!duplicated(gene.df[,2]),] + gene.df <- gene.df[!duplicated(gene.df[,1]),] + return(gene.df) +} \ No newline at end of file