|
a |
|
b/ATAC/AnalysisPipeline/6.2.ccans.annotated.genomicLocation.R |
|
|
1 |
#' @description: this script will annotate cicero ccan with the ChIPSeeker database to determine what genomic |
|
|
2 |
# regions are linked by the connections and create circos plots |
|
|
3 |
|
|
|
4 |
library(ChIPseeker) |
|
|
5 |
library(TxDb.Hsapiens.UCSC.hg38.knownGene) |
|
|
6 |
library(openxlsx) |
|
|
7 |
library(circlize) |
|
|
8 |
library(dplyr) |
|
|
9 |
library(tibble) |
|
|
10 |
library(data.table) |
|
|
11 |
library(Signac) |
|
|
12 |
library(Seurat) |
|
|
13 |
library(stringr) |
|
|
14 |
library(RColorBrewer) |
|
|
15 |
library(cicero) |
|
|
16 |
|
|
|
17 |
source(file = "/home/longzhilin/Analysis_Code/Plot_colorPaletters.R") |
|
|
18 |
source(file = "/home/longzhilin/Analysis_Code/SingleCell/user.CoveragePlot.R") |
|
|
19 |
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC") |
|
|
20 |
|
|
|
21 |
# read in cell-type-specific CCAN and filter by coaccess threshold > 0.2 |
|
|
22 |
dar_files <- list.files("6.Co-Accessible/ccans", recursive = TRUE, pattern = "ciceroConns.*.csv$", full.names = TRUE) |
|
|
23 |
idents <- gsub(".*ciceroConns.", "", dar_files) |
|
|
24 |
idents <- gsub(".csv", "", idents) |
|
|
25 |
list.dar <- lapply(dar_files, function(file_path) { |
|
|
26 |
fread(file_path) %>% |
|
|
27 |
dplyr::filter(coaccess > 0.2) %>% |
|
|
28 |
dplyr::select(Peak1, Peak2) |
|
|
29 |
}) |
|
|
30 |
names(list.dar) <- idents |
|
|
31 |
# count cis-regulatory number of coaccess > 0.2 |
|
|
32 |
peak.num <- sapply(list.dar, function(x){return(nrow(x))}) |
|
|
33 |
names(peak.num) <- idents |
|
|
34 |
|
|
|
35 |
# convert the DAR to GRanges objects to annotate |
|
|
36 |
list.dar.peak1.gr <- lapply(seq(list.dar), function(x) { |
|
|
37 |
df <- list.dar[[x]] |
|
|
38 |
gr <- StringToGRanges(df$Peak1, sep = c("_","_")) |
|
|
39 |
return(gr) |
|
|
40 |
}) |
|
|
41 |
names(list.dar.peak1.gr) <- idents |
|
|
42 |
|
|
|
43 |
list.dar.peak2.gr <- lapply(seq(list.dar), function(x) { |
|
|
44 |
df <- list.dar[[x]] |
|
|
45 |
gr <- StringToGRanges(df$Peak2, sep = c("_","_")) |
|
|
46 |
return(gr) |
|
|
47 |
}) |
|
|
48 |
names(list.dar.peak2.gr) <- idents |
|
|
49 |
|
|
|
50 |
# annotate the list of GRanges DAR for each cell type with the peak location |
|
|
51 |
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene |
|
|
52 |
list.peak1.Anno <- lapply(list.dar.peak1.gr, annotatePeak, TxDb = txdb, |
|
|
53 |
tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE) |
|
|
54 |
saveRDS(list.peak1.Anno, file = "6.Co-Accessible/ccans/list.peak1.Anno.rds") |
|
|
55 |
list.peak1.loc <- lapply(seq(list.peak1.Anno), function(x) { |
|
|
56 |
loc <- list.peak1.Anno[[x]]@anno$annotation |
|
|
57 |
loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1] |
|
|
58 |
loc <- str_replace(loc, pattern="3'", replacement="3p_UTR") |
|
|
59 |
loc <- str_replace(loc, pattern="5'", replacement="5p_UTR") |
|
|
60 |
loc <- str_replace(loc, pattern="Distal", replacement="Intergenic") |
|
|
61 |
return(loc) |
|
|
62 |
}) |
|
|
63 |
|
|
|
64 |
list.peak2.Anno <- lapply(list.dar.peak2.gr, annotatePeak, TxDb = txdb, |
|
|
65 |
tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE) |
|
|
66 |
saveRDS(list.peak2.Anno, file = "6.Co-Accessible/ccans/list.peak2.Anno.rds") |
|
|
67 |
list.peak2.loc <- lapply(seq(list.peak2.Anno), function(x) { |
|
|
68 |
loc <- list.peak2.Anno[[x]]@anno$annotation |
|
|
69 |
loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1] |
|
|
70 |
loc <- str_replace(loc, pattern="3'", replacement="3p_UTR") |
|
|
71 |
loc <- str_replace(loc, pattern="5'", replacement="5p_UTR") |
|
|
72 |
loc <- str_replace(loc, pattern="Distal", replacement="Intergenic") |
|
|
73 |
return(loc) |
|
|
74 |
}) |
|
|
75 |
|
|
|
76 |
# create df with peak1 and peak2 locations |
|
|
77 |
list.peaks.loc.df <- lapply(seq(idents), function(x) { |
|
|
78 |
df <- cbind(list.peak1.loc[[x]], list.peak2.loc[[x]]) %>% as.data.frame() |
|
|
79 |
colnames(df) <- c("Peak1","Peak2") |
|
|
80 |
counts <- dplyr::count(df, Peak1, Peak2) %>% as.data.frame() |
|
|
81 |
}) |
|
|
82 |
names(list.peaks.loc.df) <- idents |
|
|
83 |
|
|
|
84 |
# generate circos plots of predicted chromatin-chromatin interactions and save to plots directory |
|
|
85 |
Plot_Cicero_Anno <- function(ident, list.peaks.loc.df) { |
|
|
86 |
clusterID <- ident |
|
|
87 |
toplot <- as.data.frame(list.peaks.loc.df[[ident]]) |
|
|
88 |
colnames(toplot) <- c("Peak1","Peak2","n") |
|
|
89 |
print(clusterID) |
|
|
90 |
# convert to an adjacency list with a value indicating the number of connections between |
|
|
91 |
# each of the unique genomic location pairs |
|
|
92 |
unique_combos <- !duplicated(t(apply(toplot, 1, sort))) |
|
|
93 |
toplot <- toplot[unique_combos, ] |
|
|
94 |
toplot <- toplot[order(toplot$n), ] |
|
|
95 |
# toplot$n <- log2(toplot$n) |
|
|
96 |
|
|
|
97 |
# grid.col = c('3p_UTR'="red", '5p_UTR'="black", Distal_Intergenic="blue", |
|
|
98 |
# Downstream="grey", Exon="purple", Intron="orange", Promoter="green") |
|
|
99 |
grid.col = brewer.pal(7, "Paired") |
|
|
100 |
|
|
|
101 |
circos.clear() |
|
|
102 |
pdf(paste0("6.Co-Accessible/ccans/plots/ccans.annotation.",clusterID,".pdf"), width=10, height=5) |
|
|
103 |
par(mar=c(0.5,0.5,0.5,0.5)) |
|
|
104 |
circos.par(gap.after = 5) |
|
|
105 |
chordDiagram(toplot, |
|
|
106 |
grid.col = grid.col, |
|
|
107 |
annotationTrack = "grid", |
|
|
108 |
preAllocateTracks = list(track.height = max(strwidth(unlist(dimnames(toplot)))))) |
|
|
109 |
|
|
|
110 |
# we go back to the first track and customize sector labels |
|
|
111 |
circos.track(track.index = 1, panel.fun = function(x, y) { |
|
|
112 |
circos.text(CELL_META$xcenter, |
|
|
113 |
CELL_META$ylim[1], |
|
|
114 |
CELL_META$sector.index, |
|
|
115 |
facing = "clockwise", |
|
|
116 |
niceFacing = TRUE, |
|
|
117 |
adj = c(0, 0.5)) |
|
|
118 |
}, bg.border = NA) # here set bg.border to NA is important |
|
|
119 |
dev.off() |
|
|
120 |
} |
|
|
121 |
|
|
|
122 |
# save circlize plots to file |
|
|
123 |
lapply(idents, function(ident) { |
|
|
124 |
Plot_Cicero_Anno(ident, list.peaks.loc.df=list.peaks.loc.df) |
|
|
125 |
}) |
|
|
126 |
|
|
|
127 |
##############################identify specific cis-regulatory elements |
|
|
128 |
# focus on the tumor or Macrophage cell |
|
|
129 |
cell.type <- "Tumor" |
|
|
130 |
#Computing average expression & accessibility |
|
|
131 |
scRNA <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/data.merge.pro.rds") |
|
|
132 |
DefaultAssay(scRNA) <- "RNA" |
|
|
133 |
Idents(scRNA) <- scRNA$cellType_low |
|
|
134 |
scATAC <- readRDS("scATAC.data.pro.rds") |
|
|
135 |
DefaultAssay(scATAC) <- "Peaks" |
|
|
136 |
Idents(scATAC) <- scATAC$AnnotatedcellType |
|
|
137 |
average_exp_all <- AverageExpression(scRNA, assays = "RNA") |
|
|
138 |
average_acc_all <- AverageExpression(scATAC, assays = "Peaks") |
|
|
139 |
# extract the same cell type between scRNA and scATAC |
|
|
140 |
average_exp_all$RNA <- average_exp_all$RNA[, colnames(average_acc_all$Peaks)] |
|
|
141 |
|
|
|
142 |
####require one of the peaks overlap a gene's promoter (1kb) |
|
|
143 |
peak1.anno <- list.peak1.Anno[[cell.type]]@anno |
|
|
144 |
peak2.anno <- list.peak2.Anno[[cell.type]]@anno |
|
|
145 |
conns <- fread(paste0("6.Co-Accessible/ccans/ciceroConns.", cell.type,".csv")) # 250302 |
|
|
146 |
conns <- conns %>% dplyr::filter(coaccess > 0.2) |
|
|
147 |
conns <- conns %>% mutate(peak1_type = peak1.anno$annotation, peak1_distanceToTSS = peak1.anno$distanceToTSS, peak1_nearestGene = peak1.anno$SYMBOL, |
|
|
148 |
peak2_type = peak2.anno$annotation, peak2_distanceToTSS = peak2.anno$distanceToTSS, peak2_nearestGene = peak2.anno$SYMBOL) |
|
|
149 |
bg <- na.omit(unique(c(conns$peak1_nearestGene, conns$peak2_nearestGene))) # 作为background gene:17567 |
|
|
150 |
saveRDS(bg, file = paste0("6.Co-Accessible/bg.", cell.type,".rds")) # coaccess > 0.2 |
|
|
151 |
|
|
|
152 |
#select the peaks overlap the gene's promoter in conns |
|
|
153 |
cCREs <- conns %>% dplyr::filter(str_detect(peak1_type, pattern = "Promoter \\(<=1kb\\)")) |
|
|
154 |
cCREs <- cCREs[!is.na(cCREs$peak1_nearestGene),] |
|
|
155 |
conns_list <- group_split(cCREs, peak1_nearestGene) #基于peak1_nearestGene分组,通常按字母顺序排序 |
|
|
156 |
names(conns_list) <- sort(unique(cCREs$peak1_nearestGene)) |
|
|
157 |
cTargetGenes <- names(conns_list)[names(conns_list) %in% rownames(average_exp_all$RNA)] # 11453 genes |
|
|
158 |
|
|
|
159 |
# loop over all genes to compute correlation between accessibility & expression |
|
|
160 |
df <- data.frame() |
|
|
161 |
corr_list <- lapply(cTargetGenes, function(x){ |
|
|
162 |
cat("calculated gene:", x, "\n") |
|
|
163 |
# subset by cur_gene |
|
|
164 |
cur_conns <- conns_list[[x]] |
|
|
165 |
cur_conns <- cur_conns[!(cur_conns$Peak2 %in% cur_conns$Peak1),] |
|
|
166 |
cur_conns <- subset(cur_conns, coaccess >= 0.2) |
|
|
167 |
|
|
|
168 |
# skip this gene if there are no co-accessible connections |
|
|
169 |
if(nrow(cur_conns) == 0){return(data.frame())} |
|
|
170 |
|
|
|
171 |
# get average exp and acc for this gene and peaks that are co-accessible |
|
|
172 |
cur_conns$Peak2 <- gsub("_", "-", cur_conns$Peak2) |
|
|
173 |
average_acc <- average_acc_all$Peaks[as.character(cur_conns$Peak2),, drop = F] |
|
|
174 |
|
|
|
175 |
average_exp <- average_exp_all$RNA[x,] |
|
|
176 |
# correlation between expression and accessibility: |
|
|
177 |
cor_mat <- apply(average_acc, 1, function(x){ |
|
|
178 |
correlation <- cor.test(as.numeric(average_exp), as.numeric(x), method='pearson') |
|
|
179 |
data.frame("pval"=correlation$p.value, "pcc"=correlation$estimate) |
|
|
180 |
}) |
|
|
181 |
# collapse individual correlation dfs, add info |
|
|
182 |
cor_df <- Reduce(rbind, cor_mat) |
|
|
183 |
cur_conns$pcc <- cor_df$pcc |
|
|
184 |
cur_conns$pval <- cor_df$pval |
|
|
185 |
return(cur_conns) |
|
|
186 |
}) |
|
|
187 |
|
|
|
188 |
# combine into one df and remove incomplete entries: |
|
|
189 |
df <- Reduce(rbind, corr_list) |
|
|
190 |
df <- df[!is.na(df$pcc),] |
|
|
191 |
|
|
|
192 |
# compute FDR: |
|
|
193 |
df$FDR <- p.adjust(df$pval, method='fdr') |
|
|
194 |
df <- df %>% dplyr::filter(FDR < 0.05) %>% arrange(desc(pcc)) |
|
|
195 |
df$peak2_type <- gsub(" .*", "", df$peak2_type) |
|
|
196 |
df$peak2_type <- gsub("'", "_UTR", df$peak2_type) |
|
|
197 |
|
|
|
198 |
# distance between peak and target gene |
|
|
199 |
peak1_ranges <- StringToGRanges(gsub("_", "-", df$Peak1)) |
|
|
200 |
peak2_ranges <- StringToGRanges(gsub("_", "-", df$Peak2)) |
|
|
201 |
df$distance_bp <- abs(start(peak1_ranges) - start(peak2_ranges)) |
|
|
202 |
|
|
|
203 |
# write df to file: |
|
|
204 |
write.table(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.csv"), sep = ",", quote=FALSE, row.names=F) |
|
|
205 |
saveRDS(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.rds")) |