[40a513]: / Functions / clusterProfiler.enricher.R

Download this file

161 lines (144 with data), 7.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#### 使用clusterProfiler进行通路富集分析
#' @param gene 字符向量,感兴趣的基因集合, 如"FUT7" "AIRE" "IFI35"
#' @param geneType 字符型向量,指明输入的geneList中name属性类型,即基因id的类型
#' @param db.type 字符型向量,指明GSEA针对的数据库
#' @param saveDir gsea输入目录
#' @param pAdjustMethod, pvalueCutoff p值矫正方法和阈值
#' @param MsigDB.category 指定特定的MsigDB中的数据库进行GSEA分析
#' MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i), 指定i为H, C1, C2, C3, C4, C5, C6, C7, C8
# 通过gs_cat gs_subcat,来选择库
#' @param GO.ont 指定GSEA分析GO数据库时使用的条目类型,支持"BP", "MF", and "CC",以及"ALL"
#' 功能富集分析的原理是基于hypergeometric test
#' clusterProfiler主要基于EntrezID做分析
#' 分析过程中的universe,即背景基因的数目,是跟你选择的db中的subcategory有关系的,而不是固定不变的
cluterProfiler.enricher <- function(gene, geneType = c("ENTREZID", "SYMBOL", "ENSEMBL"),
db.type = c("MsigDB", "GO", "KEGG"),
saveDir, title,
pAdjustMethod = "BH",
qvalueCutoff = 0.2,
pvalueCutoff = 0.05,
MsigDB.category = list(H = c("All"), C2 = c("BIOCARTA", "KEGG", "REACTOME"), C5 = c("BP")),
GO.ont = "BP", simplify = T,
minGSSize = 10, maxGSSize = 500,
showCategory = 20
){
require(enrichplot)
require(clusterProfiler)
require(tidyverse)
geneType <- match.arg(geneType)
db.type <- match.arg(db.type)
# 如果基因名字类型不是SYMBOL,则需要进行ID转换
if(geneType == "ENTREZID"){
cat("不需要转换\n")
geneList <- gene
}
if(geneType %in% c("SYMBOL", "ENSEMBL")){
genes <- IDConvert(genes = gene, fromType = geneType, toType = "ENTREZID")
cat("转换前基因数目:", length(gene), "\n")
index <- match(gene, genes[,1])
geneList <- as.character(genes[na.omit(index),2])
cat("转换后基因数目:", length(geneList), "\n")
}
## 第一种富集策略:MsigDB基因集合
if(db.type == "MsigDB"){
library(msigdbr)
# 通路提取
MsigDB.pathway <- tibble(gs_name = NULL, entrez_gene = NULL)
# 将MsigDB所有基因作为背景基因
#MsigDB.gene <- unique(msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name,entrez_gene))
if(class(MsigDB.category) == "character"){
MsigDB.pathway <- msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name, entrez_gene)
}else{
for(i in names(MsigDB.category)){
if(MsigDB.category[[i]][1] == "All"){
MsigDB.pathway <- rbind(MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i) %>% dplyr::select(gs_name, entrez_gene))
}else{
for(j in MsigDB.category[[i]]){
MsigDB.pathway <- rbind(MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i, subcategory = j) %>% dplyr::select(gs_name, entrez_gene))
}
}
}
}
em.res <- enricher(geneList, TERM2GENE = MsigDB.pathway, qvalueCutoff = qvalueCutoff, pvalueCutoff = pvalueCutoff, pAdjustMethod = pAdjustMethod, minGSSize = minGSSize, maxGSSize = maxGSSize)
}
if(db.type == "GO"){
em.res <- enrichGO(gene = geneList,
OrgDb = org.Hs.eg.db,
ont = GO.ont,minGSSize = minGSSize, maxGSSize = maxGSSize,
pAdjustMethod = pAdjustMethod,
pvalueCutoff = pvalueCutoff,
qvalueCutoff = qvalueCutoff)
#基于simplify method, we can remove redundant terms
if(simplify){
em.res <- simplify(em.res, cutoff = 0.7, by="p.adjust", select_fun = min)
}
}
if(db.type == "KEGG"){
em.res <- enrichKEGG(gene = geneList, organism = "hsa", keyType = "kegg",
pAdjustMethod = pAdjustMethod, minGSSize = minGSSize, maxGSSize = maxGSSize,
pvalueCutoff = pvalueCutoff,
qvalueCutoff = qvalueCutoff)
}
#转换为geneSymbol
em.res.geneSymbol <- setReadable(em.res, org.Hs.eg.db, keyType = "ENTREZID")
# if(nrow(em.res@result) > 0){
# write.csv(em.res.geneSymbol, file = file.path(saveDir, paste0(title, "_", "enricherResult.csv")))
# tryCatch({
# p1 <- barplot(em.res.geneSymbol, showCategory = showCategory, main = title) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4))
# print(p1)
# p2 <- dotplot(em.res.geneSymbol, showCategory = showCategory, title = title) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4))
# print(p2)
# p3 <- heatplot(em.res.geneSymbol, showCategory = showCategory) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4))
# print(p3)
# }, warning = function(w){
# # 这里是出现warning状态时,应该怎么做,可以用print打印出来,可以执行其它命令
# cat("Warning in plot!\n")
# }, error = function(e){
# # 这里时出现Error状态时,应该怎么做,可以用print打印出来,也可以执行其它命令
# cat("Error in plot!\n")
# },finally = {
# # 这是运行正常时,应该怎么做,可以用print打印出来,也可以执行其它命令
# cat("Plot complete!\n")
# })
# }
return(em.res = list(em.res.entrezid = em.res, em.res.genesymbol = em.res.geneSymbol))
}
##### ID转换
#author: Zhilin long
#time: 2021.1.19
#' @author Zhilin Long
#' @param gene 字符型向量,需要转换的ID向量
#' @param fromType 字符向量,初始的ID类型
#' @param toType 字符向量,需要转换的ID类型
#'
#' @return data.frame 已经去除了重复和NA,可直接使用
IDConvert <- function(genes,
method = c("clusterProfiler", "org.Hs.eg.db"),
fromType = c("ENTREZID", "SYMBOL", "ENSEMBL"),
toType = c("ENTREZID", "SYMBOL", "ENSEMBL")
){
require(org.Hs.eg.db)
require(clusterProfiler)
#规范参数输入
method <- match.arg(method)
fromType <- match.arg(fromType)
toType <- match.arg(toType)
if(method == "clusterProfiler"){
gene.df <- bitr(genes, fromType = fromType, toType = toType, OrgDb = "org.Hs.eg.db")
#默认会过滤没有map到的id,即条目会减少
}else{
gene.df <- mapIds(org.Hs.eg.db,
keys = genes,
column = toType,
keytype = fromType,
multiVals="first")
gene.df <- data.frame(ENSEMBL = names(gene.df), SYMBOL = gene.df)
colnames(gene.df) <- c(fromType, toType)
#没有map,则为NA,即条目不变
gene.df <- na.omit(gene.df) #去除NA的行
}
#去除重复,
gene.df <- gene.df[!duplicated(gene.df[,2]),]
gene.df <- gene.df[!duplicated(gene.df[,1]),]
return(gene.df)
}