|
a |
|
b/Functions/clusterProfiler.enricher.R |
|
|
1 |
#### 使用clusterProfiler进行通路富集分析 |
|
|
2 |
|
|
|
3 |
#' @param gene 字符向量,感兴趣的基因集合, 如"FUT7" "AIRE" "IFI35" |
|
|
4 |
#' @param geneType 字符型向量,指明输入的geneList中name属性类型,即基因id的类型 |
|
|
5 |
#' @param db.type 字符型向量,指明GSEA针对的数据库 |
|
|
6 |
#' @param saveDir gsea输入目录 |
|
|
7 |
#' @param pAdjustMethod, pvalueCutoff p值矫正方法和阈值 |
|
|
8 |
#' @param MsigDB.category 指定特定的MsigDB中的数据库进行GSEA分析 |
|
|
9 |
#' MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i), 指定i为H, C1, C2, C3, C4, C5, C6, C7, C8 |
|
|
10 |
# 通过gs_cat gs_subcat,来选择库 |
|
|
11 |
#' @param GO.ont 指定GSEA分析GO数据库时使用的条目类型,支持"BP", "MF", and "CC",以及"ALL" |
|
|
12 |
|
|
|
13 |
#' 功能富集分析的原理是基于hypergeometric test |
|
|
14 |
#' clusterProfiler主要基于EntrezID做分析 |
|
|
15 |
#' 分析过程中的universe,即背景基因的数目,是跟你选择的db中的subcategory有关系的,而不是固定不变的 |
|
|
16 |
cluterProfiler.enricher <- function(gene, geneType = c("ENTREZID", "SYMBOL", "ENSEMBL"), |
|
|
17 |
db.type = c("MsigDB", "GO", "KEGG"), |
|
|
18 |
saveDir, title, |
|
|
19 |
pAdjustMethod = "BH", |
|
|
20 |
qvalueCutoff = 0.2, |
|
|
21 |
pvalueCutoff = 0.05, |
|
|
22 |
MsigDB.category = list(H = c("All"), C2 = c("BIOCARTA", "KEGG", "REACTOME"), C5 = c("BP")), |
|
|
23 |
GO.ont = "BP", simplify = T, |
|
|
24 |
minGSSize = 10, maxGSSize = 500, |
|
|
25 |
showCategory = 20 |
|
|
26 |
){ |
|
|
27 |
require(enrichplot) |
|
|
28 |
require(clusterProfiler) |
|
|
29 |
require(tidyverse) |
|
|
30 |
|
|
|
31 |
|
|
|
32 |
geneType <- match.arg(geneType) |
|
|
33 |
db.type <- match.arg(db.type) |
|
|
34 |
# 如果基因名字类型不是SYMBOL,则需要进行ID转换 |
|
|
35 |
if(geneType == "ENTREZID"){ |
|
|
36 |
cat("不需要转换\n") |
|
|
37 |
geneList <- gene |
|
|
38 |
} |
|
|
39 |
if(geneType %in% c("SYMBOL", "ENSEMBL")){ |
|
|
40 |
genes <- IDConvert(genes = gene, fromType = geneType, toType = "ENTREZID") |
|
|
41 |
cat("转换前基因数目:", length(gene), "\n") |
|
|
42 |
index <- match(gene, genes[,1]) |
|
|
43 |
geneList <- as.character(genes[na.omit(index),2]) |
|
|
44 |
cat("转换后基因数目:", length(geneList), "\n") |
|
|
45 |
} |
|
|
46 |
|
|
|
47 |
## 第一种富集策略:MsigDB基因集合 |
|
|
48 |
if(db.type == "MsigDB"){ |
|
|
49 |
library(msigdbr) |
|
|
50 |
|
|
|
51 |
# 通路提取 |
|
|
52 |
MsigDB.pathway <- tibble(gs_name = NULL, entrez_gene = NULL) |
|
|
53 |
|
|
|
54 |
# 将MsigDB所有基因作为背景基因 |
|
|
55 |
#MsigDB.gene <- unique(msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name,entrez_gene)) |
|
|
56 |
|
|
|
57 |
if(class(MsigDB.category) == "character"){ |
|
|
58 |
MsigDB.pathway <- msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name, entrez_gene) |
|
|
59 |
}else{ |
|
|
60 |
for(i in names(MsigDB.category)){ |
|
|
61 |
if(MsigDB.category[[i]][1] == "All"){ |
|
|
62 |
MsigDB.pathway <- rbind(MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i) %>% dplyr::select(gs_name, entrez_gene)) |
|
|
63 |
}else{ |
|
|
64 |
for(j in MsigDB.category[[i]]){ |
|
|
65 |
MsigDB.pathway <- rbind(MsigDB.pathway, msigdbr(species = "Homo sapiens", category = i, subcategory = j) %>% dplyr::select(gs_name, entrez_gene)) |
|
|
66 |
} |
|
|
67 |
} |
|
|
68 |
} |
|
|
69 |
} |
|
|
70 |
em.res <- enricher(geneList, TERM2GENE = MsigDB.pathway, qvalueCutoff = qvalueCutoff, pvalueCutoff = pvalueCutoff, pAdjustMethod = pAdjustMethod, minGSSize = minGSSize, maxGSSize = maxGSSize) |
|
|
71 |
} |
|
|
72 |
|
|
|
73 |
if(db.type == "GO"){ |
|
|
74 |
em.res <- enrichGO(gene = geneList, |
|
|
75 |
OrgDb = org.Hs.eg.db, |
|
|
76 |
ont = GO.ont,minGSSize = minGSSize, maxGSSize = maxGSSize, |
|
|
77 |
pAdjustMethod = pAdjustMethod, |
|
|
78 |
pvalueCutoff = pvalueCutoff, |
|
|
79 |
qvalueCutoff = qvalueCutoff) |
|
|
80 |
#基于simplify method, we can remove redundant terms |
|
|
81 |
if(simplify){ |
|
|
82 |
em.res <- simplify(em.res, cutoff = 0.7, by="p.adjust", select_fun = min) |
|
|
83 |
} |
|
|
84 |
} |
|
|
85 |
|
|
|
86 |
if(db.type == "KEGG"){ |
|
|
87 |
em.res <- enrichKEGG(gene = geneList, organism = "hsa", keyType = "kegg", |
|
|
88 |
pAdjustMethod = pAdjustMethod, minGSSize = minGSSize, maxGSSize = maxGSSize, |
|
|
89 |
pvalueCutoff = pvalueCutoff, |
|
|
90 |
qvalueCutoff = qvalueCutoff) |
|
|
91 |
} |
|
|
92 |
#转换为geneSymbol |
|
|
93 |
em.res.geneSymbol <- setReadable(em.res, org.Hs.eg.db, keyType = "ENTREZID") |
|
|
94 |
|
|
|
95 |
# if(nrow(em.res@result) > 0){ |
|
|
96 |
# write.csv(em.res.geneSymbol, file = file.path(saveDir, paste0(title, "_", "enricherResult.csv"))) |
|
|
97 |
# tryCatch({ |
|
|
98 |
# p1 <- barplot(em.res.geneSymbol, showCategory = showCategory, main = title) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4)) |
|
|
99 |
# print(p1) |
|
|
100 |
# p2 <- dotplot(em.res.geneSymbol, showCategory = showCategory, title = title) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4)) |
|
|
101 |
# print(p2) |
|
|
102 |
# p3 <- heatplot(em.res.geneSymbol, showCategory = showCategory) + theme(axis.text.x = element_text(size=4), axis.text.y = element_text(size=4)) |
|
|
103 |
# print(p3) |
|
|
104 |
# }, warning = function(w){ |
|
|
105 |
# # 这里是出现warning状态时,应该怎么做,可以用print打印出来,可以执行其它命令 |
|
|
106 |
# cat("Warning in plot!\n") |
|
|
107 |
# }, error = function(e){ |
|
|
108 |
# # 这里时出现Error状态时,应该怎么做,可以用print打印出来,也可以执行其它命令 |
|
|
109 |
# cat("Error in plot!\n") |
|
|
110 |
# },finally = { |
|
|
111 |
# # 这是运行正常时,应该怎么做,可以用print打印出来,也可以执行其它命令 |
|
|
112 |
# cat("Plot complete!\n") |
|
|
113 |
# }) |
|
|
114 |
# } |
|
|
115 |
return(em.res = list(em.res.entrezid = em.res, em.res.genesymbol = em.res.geneSymbol)) |
|
|
116 |
} |
|
|
117 |
|
|
|
118 |
|
|
|
119 |
##### ID转换 |
|
|
120 |
#author: Zhilin long |
|
|
121 |
#time: 2021.1.19 |
|
|
122 |
|
|
|
123 |
#' @author Zhilin Long |
|
|
124 |
#' @param gene 字符型向量,需要转换的ID向量 |
|
|
125 |
#' @param fromType 字符向量,初始的ID类型 |
|
|
126 |
#' @param toType 字符向量,需要转换的ID类型 |
|
|
127 |
#' |
|
|
128 |
#' @return data.frame 已经去除了重复和NA,可直接使用 |
|
|
129 |
IDConvert <- function(genes, |
|
|
130 |
method = c("clusterProfiler", "org.Hs.eg.db"), |
|
|
131 |
fromType = c("ENTREZID", "SYMBOL", "ENSEMBL"), |
|
|
132 |
toType = c("ENTREZID", "SYMBOL", "ENSEMBL") |
|
|
133 |
){ |
|
|
134 |
require(org.Hs.eg.db) |
|
|
135 |
require(clusterProfiler) |
|
|
136 |
|
|
|
137 |
#规范参数输入 |
|
|
138 |
method <- match.arg(method) |
|
|
139 |
fromType <- match.arg(fromType) |
|
|
140 |
toType <- match.arg(toType) |
|
|
141 |
|
|
|
142 |
if(method == "clusterProfiler"){ |
|
|
143 |
gene.df <- bitr(genes, fromType = fromType, toType = toType, OrgDb = "org.Hs.eg.db") |
|
|
144 |
#默认会过滤没有map到的id,即条目会减少 |
|
|
145 |
}else{ |
|
|
146 |
gene.df <- mapIds(org.Hs.eg.db, |
|
|
147 |
keys = genes, |
|
|
148 |
column = toType, |
|
|
149 |
keytype = fromType, |
|
|
150 |
multiVals="first") |
|
|
151 |
gene.df <- data.frame(ENSEMBL = names(gene.df), SYMBOL = gene.df) |
|
|
152 |
colnames(gene.df) <- c(fromType, toType) |
|
|
153 |
#没有map,则为NA,即条目不变 |
|
|
154 |
gene.df <- na.omit(gene.df) #去除NA的行 |
|
|
155 |
} |
|
|
156 |
|
|
|
157 |
#去除重复, |
|
|
158 |
gene.df <- gene.df[!duplicated(gene.df[,2]),] |
|
|
159 |
gene.df <- gene.df[!duplicated(gene.df[,1]),] |
|
|
160 |
return(gene.df) |
|
|
161 |
} |