|
a |
|
b/ATAC/AnalysisPipeline/4.2.peakAnnotation.genomicLocation.R |
|
|
1 |
#' @description: peak annotation and enrichment analysis |
|
|
2 |
|
|
|
3 |
##reference: |
|
|
4 |
#https://hbctraining.github.io/Intro-to-ChIPseq/lessons/12_functional_analysis.html |
|
|
5 |
#http://bioconductor.org/packages/devel/bioc/vignettes/ChIPseeker/inst/doc/ChIPseeker.html |
|
|
6 |
library(Signac) |
|
|
7 |
library(Seurat) |
|
|
8 |
library(ggpubr) |
|
|
9 |
library(GenomicRanges) |
|
|
10 |
library(tibble) |
|
|
11 |
library(dplyr) |
|
|
12 |
set.seed(101) |
|
|
13 |
|
|
|
14 |
setwd(dir = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC") |
|
|
15 |
scATAC.data <- readRDS("scATAC.data.pro.rds") |
|
|
16 |
DefaultAssay(scATAC.data) <- "Peaks" |
|
|
17 |
|
|
|
18 |
## loading packages |
|
|
19 |
library(ChIPseeker) |
|
|
20 |
library(TxDb.Hsapiens.UCSC.hg38.knownGene) |
|
|
21 |
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene |
|
|
22 |
library(clusterProfiler) |
|
|
23 |
library(org.Hs.eg.db) |
|
|
24 |
|
|
|
25 |
#####################################All Peak Annotation |
|
|
26 |
peakSet <- GetAssayData(scATAC.data, slot = "data") |
|
|
27 |
All.gr <- StringToGRanges(rownames(peakSet)) |
|
|
28 |
#TSS (transcription start site) region, by default TSS is defined from -3kb to +3kb |
|
|
29 |
All.peakAnno <- annotatePeak(All.gr, tssRegion=c(-3000, 3000), TxDb = txdb, annoDb="org.Hs.eg.db") |
|
|
30 |
saveRDS(All.peakAnno, file = "4.Peak/peak.annotation.ChIPseeker.rds") |
|
|
31 |
peakAnno.info <- All.peakAnno@anno |
|
|
32 |
peakAnno.info$annotation <- gsub(" .*", "", peakAnno.info$annotation) |
|
|
33 |
peakAnno.info$annotation <- gsub("'", "_UTR", peakAnno.info$annotation) |
|
|
34 |
peak.info <- data.frame(peaks = rownames(Motifs(scATAC.data)@data), |
|
|
35 |
originType = All.peakAnno@anno$annotation, |
|
|
36 |
peakType = peakAnno.info$annotation, |
|
|
37 |
distanceToTSS = peakAnno.info$distanceToTSS, |
|
|
38 |
SYMBOL = peakAnno.info$SYMBOL) |
|
|
39 |
saveRDS(peak.info, file = "4.Peak/peak.annotation.simple.ChIPseeker.rds") |
|
|
40 |
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000) |
|
|
41 |
tagMatrix <- getTagMatrix(All.gr, windows=promoter) |
|
|
42 |
|
|
|
43 |
pdf("4.Peak/Annotation/All.peakAnnotation.pdf") |
|
|
44 |
plotAnnoPie(All.peakAnno) |
|
|
45 |
plotAnnoBar(All.peakAnno) |
|
|
46 |
vennpie(All.peakAnno) #some annotation overlap |
|
|
47 |
upsetplot(All.peakAnno, vennpie=TRUE) |
|
|
48 |
#Visualize distribution of TF-binding loci relative to TSS |
|
|
49 |
plotDistToTSS(All.peakAnno, title="Distribution of transcription factor-binding loci\nrelative to TSS") |
|
|
50 |
tagHeatmap(tagMatrix, xlim=c(-3000, 3000), color="#4DBBD5") |
|
|
51 |
plotAvgProf(tagMatrix, xlim=c(-3000, 3000), xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency") |
|
|
52 |
plotAvgProf(tagMatrix, xlim=c(-3000, 3000), conf = 0.95, resample = 1000) |
|
|
53 |
dev.off() |
|
|
54 |
|
|
|
55 |
#######################################annotated the cell-type specific peaks |
|
|
56 |
cellType.sig.pos.DARs <- readRDS("4.Peak/cellType.sig.pos.DARs.rds") |
|
|
57 |
cellType.sig.pos.gr <- StringToGRanges(unique(cellType.sig.pos.DARs$genomicRegion)) # 8797 DAR peaks |
|
|
58 |
DAR.Anno <- annotatePeak(cellType.sig.pos.gr, tssRegion=c(-3000, 3000), TxDb = txdb, annoDb="org.Hs.eg.db") |
|
|
59 |
|
|
|
60 |
idents <- unique(cellType.sig.pos.DARs$cellType) |
|
|
61 |
cellType.gr.list <- lapply(idents, function(x){ |
|
|
62 |
idx <- which(cellType.sig.pos.DARs$cellType == x) |
|
|
63 |
gr <- StringToGRanges(cellType.sig.pos.DARs$genomicRegion[idx]) |
|
|
64 |
}) |
|
|
65 |
names(cellType.gr.list) <- idents |
|
|
66 |
|
|
|
67 |
#annotated DAR for each cell type |
|
|
68 |
DAR.anno.list <- lapply(cellType.gr.list, function(x){ |
|
|
69 |
DAR.cell <- annotatePeak(x, tssRegion=c(-3000, 3000), TxDb = txdb, annoDb="org.Hs.eg.db") |
|
|
70 |
return(DAR.cell) |
|
|
71 |
}) |
|
|
72 |
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000) |
|
|
73 |
tagMatrix <- getTagMatrix(cellType.sig.pos.gr, windows=promoter) |
|
|
74 |
|
|
|
75 |
pdf("4.Peak/Annotation/cellType.DAR.annotation.pdf") |
|
|
76 |
tagHeatmap(tagMatrix, xlim=c(-3000, 3000), color="#4DBBD5") |
|
|
77 |
plotAvgProf(tagMatrix, xlim=c(-3000, 3000), xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency") |
|
|
78 |
plotAvgProf(tagMatrix, xlim=c(-3000, 3000), conf = 0.95, resample = 1000) |
|
|
79 |
plotAnnoPie(DAR.Anno) |
|
|
80 |
plotAnnoBar(DAR.Anno) |
|
|
81 |
vennpie(DAR.Anno) #some annotation overlap |
|
|
82 |
upsetplot(DAR.Anno, vennpie=TRUE) |
|
|
83 |
|
|
|
84 |
#Visualize distribution of TF-binding loci relative to TSS |
|
|
85 |
plotDistToTSS(DAR.Anno, title="Distribution of transcription factor-binding loci\nrelative to TSS") |
|
|
86 |
|
|
|
87 |
plotAnnoBar(DAR.anno.list) |
|
|
88 |
plotDistToTSS(DAR.anno.list) |
|
|
89 |
dev.off() |
|
|
90 |
|
|
|
91 |
############################# write DARs to bed files |
|
|
92 |
# write all peaks to file to serve as background set: |
|
|
93 |
DefaultAssay(scATAC.data) <- "Peaks" |
|
|
94 |
peak_ranges <- StringToGRanges(rownames(scATAC.data)) %>% sort |
|
|
95 |
write.table(as.data.frame(peak_ranges)[,1:3], file='4.Peak/bed/allPeaks.bed', row.names=F, col.names=F, sep='\t', quote=F) |
|
|
96 |
|
|
|
97 |
########### each cell type |
|
|
98 |
All.DARs <- readRDS("4.Peak/cellType.all.DARs.rds") # 224807 region |
|
|
99 |
idents <- levels(All.DARs$cellType) |
|
|
100 |
res < sapply(idents, function(x){ |
|
|
101 |
cat("writing ", x, "\n") |
|
|
102 |
DARs.up <- All.DARs %>% dplyr::filter(cellType == x & avg_log2FC >= 0.25 & p_val_adj < 0.05) |
|
|
103 |
DARs.down <- All.DARs %>% dplyr::filter(cellType == x & avg_log2FC < -0.25 & p_val_adj < 0.05) |
|
|
104 |
|
|
|
105 |
if(nrow(DARs.up) >= 25){ |
|
|
106 |
DARs.up.info <- peakAnno.info.frame[match(DARs.up$genomicRegion, peakAnno.info.frame$id),] |
|
|
107 |
# distal |
|
|
108 |
distal.peak.up <- DARs.up.info[which(DARs.up.info$annotation == "Distal"),] |
|
|
109 |
write.table(distal.peak.up[,1:3], file = paste0('4.Peak/bed/', x,'_up_distalPeaks.bed'), row.names=F, col.names=F, sep='\t', quote=F) |
|
|
110 |
cat("Distal Up: ", nrow(distal.peak.up), "\n") |
|
|
111 |
|
|
|
112 |
# proximal |
|
|
113 |
proximal.peak.up <- DARs.up.info[which(DARs.up.info$annotation != "Distal"),] |
|
|
114 |
write.table(proximal.peak.up[,1:3], file = paste0('4.Peak/bed/', x,'_up_proximalPeaks.bed'), row.names=F, col.names=F, sep='\t', quote=F) |
|
|
115 |
cat("proximal Up: ", nrow(proximal.peak.up), "\n") |
|
|
116 |
} |
|
|
117 |
|
|
|
118 |
if(nrow(DARs.down) >= 25){ |
|
|
119 |
DARs.down.info <- peakAnno.info.frame[match(DARs.down$genomicRegion, peakAnno.info.frame$id),] |
|
|
120 |
# distal |
|
|
121 |
distal.peak.down <- DARs.down.info[which(DARs.down.info$annotation == "Distal"),] |
|
|
122 |
write.table(distal.peak.down[,1:3], file = paste0('4.Peak/bed/', x,'_down_distalPeaks.bed'), row.names=F, col.names=F, sep='\t', quote=F) |
|
|
123 |
cat("Distal Down: ", nrow(distal.peak.down), "\n") |
|
|
124 |
|
|
|
125 |
# proximal |
|
|
126 |
proximal.peak.down <- DARs.down.info[which(DARs.down.info$annotation != "Distal"),] |
|
|
127 |
write.table(proximal.peak.down[,1:3], file = paste0('4.Peak/bed/', x,'_down_proximalPeaks.bed'), row.names=F, col.names=F, sep='\t', quote=F) |
|
|
128 |
cat("proximal Down: ", nrow(proximal.peak.down), "\n") |
|
|
129 |
} |
|
|
130 |
return("Finish!") |
|
|
131 |
}) |
|
|
132 |
|
|
|
133 |
##rGREAT enrichment analysis |
|
|
134 |
source(file = "/home/longzhilin/Analysis_Code/PathwayEnrichment/Great.enrich.R") |
|
|
135 |
|
|
|
136 |
idents <- levels(scATAC.data) |
|
|
137 |
bg <- read.table("4.Peak/bed/allPeaks.bed", sep = "\t", stringsAsFactors = F) |
|
|
138 |
|
|
|
139 |
pdf("4.Peak/bed/GREAT.enrich.pdf") |
|
|
140 |
res <- lapply(idents, function(x){ |
|
|
141 |
|
|
|
142 |
## up regulate |
|
|
143 |
up.distal.bed <- tryCatch({ |
|
|
144 |
read.table(paste0("4.Peak/bed/", x, "_up_distalPeaks.bed"), sep = "\t", stringsAsFactors = F)}, error = function(e){return(NULL)}) |
|
|
145 |
if(!is.null(up.distal.bed)){ |
|
|
146 |
cat("Analysis: ", x, " up distal region...\n") |
|
|
147 |
up_distalPeaks_enrich <- Great.enrich(gr = up.distal.bed, bg = bg, title = paste0(x, "_up_distalPeaks"), Type = "GO Biological Process") |
|
|
148 |
}else{ |
|
|
149 |
up_distalPeaks_enrich <- NULL |
|
|
150 |
} |
|
|
151 |
up.proximal.bed <- tryCatch({ |
|
|
152 |
read.table(paste0("4.Peak/bed/", x, "_up_proximalPeaks.bed"), sep = "\t", stringsAsFactors = F)}, error = function(e){return(NULL)}) |
|
|
153 |
if(!is.null(up.proximal.bed)){ |
|
|
154 |
cat("Analysis: ", x, " up proximal region...\n") |
|
|
155 |
up_proximalPeaks_enrich <- Great.enrich(gr = up.proximal.bed, bg = bg, title = paste0(x, "_up_proximalPeaks"), Type = "GO Biological Process") |
|
|
156 |
}else{ |
|
|
157 |
up_proximalPeaks_enrich <- NULL |
|
|
158 |
} |
|
|
159 |
|
|
|
160 |
## down regulate |
|
|
161 |
down.distal.bed <- tryCatch({ |
|
|
162 |
read.table(paste0("4.Peak/bed/", x, "_down_distalPeaks.bed"), sep = "\t", stringsAsFactors = F)}, error = function(e){return(NULL)}) |
|
|
163 |
if(!is.null(down.distal.bed)){ |
|
|
164 |
cat("Analysis: ", x, " down distal region...\n") |
|
|
165 |
down_distalPeaks_enrich <- Great.enrich(gr = down.distal.bed, bg = bg, title = paste0(x, "_down_distalPeaks"), Type = "GO Biological Process") |
|
|
166 |
}else{ |
|
|
167 |
down_distalPeaks_enrich <- NULL |
|
|
168 |
} |
|
|
169 |
down.proximal.bed <- tryCatch({ |
|
|
170 |
read.table(paste0("4.Peak/bed/", x, "_down_proximalPeaks.bed"), sep = "\t", stringsAsFactors = F)}, error = function(e){return(NULL)}) |
|
|
171 |
if(!is.null(down.proximal.bed)){ |
|
|
172 |
cat("Analysis: ", x, " down proximal region...\n") |
|
|
173 |
down_proximalPeaks_enrich <- Great.enrich(gr = down.proximal.bed, bg = bg, title = paste0(x, "_down_proximalPeaks"), Type = "GO Biological Process") |
|
|
174 |
}else{ |
|
|
175 |
down_proximalPeaks_enrich <- NULL |
|
|
176 |
} |
|
|
177 |
return(list(up_distalPeaks_enrich = up_distalPeaks_enrich, up_proximalPeaks_enrich = up_proximalPeaks_enrich, |
|
|
178 |
down_distalPeaks_enrich = down_distalPeaks_enrich, down_proximalPeaks_enrich = down_proximalPeaks_enrich)) |
|
|
179 |
}) |
|
|
180 |
dev.off() |
|
|
181 |
names(res) <- idents |
|
|
182 |
saveRDS(res, file = "4.Peak/bed/GREAT.enrichment.rds") |
|
|
183 |
library(openxlsx) |
|
|
184 |
write.xlsx(res$Tumor, file = "4.Peak/bed/GREAT.Tumor.xlsx", sheetName = names(res$Tumor), rowNames = F) |
|
|
185 |
|
|
|
186 |
####plot --- tumor.up |
|
|
187 |
tumor.up.proximal <- res$Tumor$up_proximalPeaks_enrich |
|
|
188 |
tumor.up.distal <- res$Tumor$up_distalPeaks_enrich |
|
|
189 |
pdf("4.Peak/bed/GREAT.Tumor.selection.pdf", height = unit(5, "inches")) |
|
|
190 |
p <- ggbarplot(tumor.up.proximal, x = "wrap", y = "Hyper_Fold_Enrichment", fill = "grey", color = "grey", width = 0.4, ylab = "Fold Enrichment", xlab = "", orientation = "horiz", sort.val = c("asc"))+theme(legend.position="none") |
|
|
191 |
print(p) |
|
|
192 |
p <- ggbarplot(tumor.up.distal, x = "wrap", y = "Hyper_Fold_Enrichment", fill = "grey", color = "grey", width = 0.4, ylab = "Fold Enrichment", xlab = "", orientation = "horiz", sort.val = c("asc"))+theme(legend.position="none") |
|
|
193 |
print(p) |
|
|
194 |
dev.off() |