a b/ATAC/AnalysisPipeline/6.2.ccans.annotated.genomicLocation.R
1
#' @description: this script will annotate cicero ccan with the ChIPSeeker database to determine what genomic
2
# regions are linked by the connections and create circos plots
3
4
library(ChIPseeker)
5
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
6
library(openxlsx)
7
library(circlize)
8
library(dplyr)
9
library(tibble)
10
library(data.table)
11
library(Signac)
12
library(Seurat)
13
library(stringr)
14
library(RColorBrewer)
15
library(cicero)
16
17
source(file = "/home/longzhilin/Analysis_Code/Plot_colorPaletters.R")
18
source(file = "/home/longzhilin/Analysis_Code/SingleCell/user.CoveragePlot.R")
19
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC")
20
21
# read in cell-type-specific CCAN and filter by coaccess threshold > 0.2
22
dar_files <- list.files("6.Co-Accessible/ccans", recursive = TRUE, pattern = "ciceroConns.*.csv$", full.names = TRUE)
23
idents <- gsub(".*ciceroConns.", "", dar_files)
24
idents <- gsub(".csv", "", idents)
25
list.dar <- lapply(dar_files, function(file_path) {
26
  fread(file_path) %>%
27
    dplyr::filter(coaccess > 0.2) %>%
28
    dplyr::select(Peak1, Peak2)
29
})
30
names(list.dar) <- idents
31
# count cis-regulatory number of coaccess > 0.2
32
peak.num <- sapply(list.dar, function(x){return(nrow(x))})
33
names(peak.num) <- idents
34
35
# convert the DAR to GRanges objects to annotate
36
list.dar.peak1.gr <- lapply(seq(list.dar), function(x) {
37
  df <- list.dar[[x]]
38
  gr <- StringToGRanges(df$Peak1, sep = c("_","_"))
39
  return(gr)
40
})
41
names(list.dar.peak1.gr) <- idents
42
43
list.dar.peak2.gr <- lapply(seq(list.dar), function(x) {
44
  df <- list.dar[[x]]
45
  gr <- StringToGRanges(df$Peak2, sep = c("_","_"))
46
  return(gr)
47
})
48
names(list.dar.peak2.gr) <- idents
49
50
# annotate the list of GRanges DAR for each cell type with the peak location
51
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
52
list.peak1.Anno <- lapply(list.dar.peak1.gr, annotatePeak, TxDb = txdb,
53
                        tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE)
54
saveRDS(list.peak1.Anno, file = "6.Co-Accessible/ccans/list.peak1.Anno.rds")                        
55
list.peak1.loc <- lapply(seq(list.peak1.Anno), function(x) {
56
  loc <- list.peak1.Anno[[x]]@anno$annotation 
57
  loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1]
58
  loc <- str_replace(loc, pattern="3'", replacement="3p_UTR")
59
  loc <- str_replace(loc, pattern="5'", replacement="5p_UTR")
60
  loc <- str_replace(loc, pattern="Distal", replacement="Intergenic")
61
  return(loc)
62
})
63
64
list.peak2.Anno <- lapply(list.dar.peak2.gr, annotatePeak, TxDb = txdb,
65
                          tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE)
66
saveRDS(list.peak2.Anno, file = "6.Co-Accessible/ccans/list.peak2.Anno.rds")
67
list.peak2.loc <- lapply(seq(list.peak2.Anno), function(x) {
68
  loc <- list.peak2.Anno[[x]]@anno$annotation 
69
  loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1]
70
  loc <- str_replace(loc, pattern="3'", replacement="3p_UTR")
71
  loc <- str_replace(loc, pattern="5'", replacement="5p_UTR")
72
  loc <- str_replace(loc, pattern="Distal", replacement="Intergenic")
73
  return(loc)
74
})
75
76
# create df with peak1 and peak2 locations
77
list.peaks.loc.df <- lapply(seq(idents), function(x) {
78
  df <- cbind(list.peak1.loc[[x]], list.peak2.loc[[x]]) %>% as.data.frame()
79
  colnames(df) <- c("Peak1","Peak2")
80
  counts <- dplyr::count(df, Peak1, Peak2) %>% as.data.frame()
81
})
82
names(list.peaks.loc.df) <- idents
83
84
# generate circos plots of predicted chromatin-chromatin interactions and save to plots directory
85
Plot_Cicero_Anno <- function(ident, list.peaks.loc.df) {
86
  clusterID <- ident
87
  toplot <- as.data.frame(list.peaks.loc.df[[ident]])
88
  colnames(toplot) <- c("Peak1","Peak2","n")
89
  print(clusterID)
90
  # convert to an adjacency list with a value indicating the number of connections between
91
  # each of the unique genomic location pairs
92
  unique_combos <- !duplicated(t(apply(toplot, 1, sort)))
93
  toplot <- toplot[unique_combos, ]
94
  toplot <- toplot[order(toplot$n), ]
95
  # toplot$n <- log2(toplot$n)
96
  
97
  # grid.col = c('3p_UTR'="red", '5p_UTR'="black", Distal_Intergenic="blue",
98
  # Downstream="grey", Exon="purple", Intron="orange", Promoter="green")
99
  grid.col = brewer.pal(7, "Paired")
100
101
  circos.clear()
102
  pdf(paste0("6.Co-Accessible/ccans/plots/ccans.annotation.",clusterID,".pdf"), width=10, height=5)
103
  par(mar=c(0.5,0.5,0.5,0.5))
104
  circos.par(gap.after = 5)
105
  chordDiagram(toplot,
106
             grid.col = grid.col,
107
             annotationTrack = "grid",
108
             preAllocateTracks = list(track.height = max(strwidth(unlist(dimnames(toplot))))))
109
110
  # we go back to the first track and customize sector labels
111
  circos.track(track.index = 1, panel.fun = function(x, y) {
112
    circos.text(CELL_META$xcenter,
113
                CELL_META$ylim[1],
114
                CELL_META$sector.index,
115
                facing = "clockwise",
116
                niceFacing = TRUE,
117
                adj = c(0, 0.5))
118
   }, bg.border = NA) # here set bg.border to NA is important
119
  dev.off()
120
  }
121
122
# save circlize plots to file
123
lapply(idents, function(ident) {
124
  Plot_Cicero_Anno(ident, list.peaks.loc.df=list.peaks.loc.df)
125
  })
126
127
##############################identify specific cis-regulatory elements
128
# focus on the tumor or Macrophage cell 
129
cell.type <- "Tumor"
130
#Computing average expression & accessibility
131
scRNA <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/data.merge.pro.rds")
132
DefaultAssay(scRNA) <- "RNA"
133
Idents(scRNA) <- scRNA$cellType_low
134
scATAC <- readRDS("scATAC.data.pro.rds")
135
DefaultAssay(scATAC) <- "Peaks"
136
Idents(scATAC) <- scATAC$AnnotatedcellType
137
average_exp_all <- AverageExpression(scRNA, assays = "RNA")
138
average_acc_all <- AverageExpression(scATAC, assays = "Peaks")
139
# extract the same cell type between scRNA and scATAC
140
average_exp_all$RNA <- average_exp_all$RNA[, colnames(average_acc_all$Peaks)]
141
142
####require one of the peaks overlap a gene's promoter (1kb)
143
peak1.anno <- list.peak1.Anno[[cell.type]]@anno
144
peak2.anno <- list.peak2.Anno[[cell.type]]@anno
145
conns <- fread(paste0("6.Co-Accessible/ccans/ciceroConns.", cell.type,".csv")) # 250302
146
conns <- conns %>% dplyr::filter(coaccess > 0.2)
147
conns <- conns %>% mutate(peak1_type = peak1.anno$annotation, peak1_distanceToTSS = peak1.anno$distanceToTSS, peak1_nearestGene = peak1.anno$SYMBOL,
148
                                      peak2_type = peak2.anno$annotation, peak2_distanceToTSS = peak2.anno$distanceToTSS, peak2_nearestGene = peak2.anno$SYMBOL)
149
bg <- na.omit(unique(c(conns$peak1_nearestGene, conns$peak2_nearestGene))) # 作为background gene:17567
150
saveRDS(bg, file = paste0("6.Co-Accessible/bg.", cell.type,".rds")) # coaccess > 0.2
151
152
#select the peaks overlap the gene's promoter in conns
153
cCREs <- conns %>% dplyr::filter(str_detect(peak1_type, pattern = "Promoter \\(<=1kb\\)"))
154
cCREs <- cCREs[!is.na(cCREs$peak1_nearestGene),]
155
conns_list <- group_split(cCREs, peak1_nearestGene) #基于peak1_nearestGene分组,通常按字母顺序排序
156
names(conns_list) <- sort(unique(cCREs$peak1_nearestGene)) 
157
cTargetGenes <- names(conns_list)[names(conns_list) %in% rownames(average_exp_all$RNA)] # 11453 genes
158
159
# loop over all genes to compute correlation between accessibility & expression
160
df <- data.frame()
161
corr_list <- lapply(cTargetGenes, function(x){
162
  cat("calculated gene:", x, "\n")
163
  # subset by cur_gene
164
  cur_conns <- conns_list[[x]]
165
  cur_conns <- cur_conns[!(cur_conns$Peak2 %in% cur_conns$Peak1),]
166
  cur_conns <- subset(cur_conns, coaccess >= 0.2)
167
168
  # skip this gene if there are no co-accessible connections
169
  if(nrow(cur_conns) == 0){return(data.frame())}
170
171
  # get average exp and acc for this gene and peaks that are co-accessible
172
  cur_conns$Peak2 <- gsub("_", "-", cur_conns$Peak2)
173
  average_acc <- average_acc_all$Peaks[as.character(cur_conns$Peak2),, drop = F]
174
  
175
  average_exp <- average_exp_all$RNA[x,]
176
  # correlation between expression and accessibility:
177
  cor_mat <- apply(average_acc, 1, function(x){
178
    correlation <- cor.test(as.numeric(average_exp), as.numeric(x), method='pearson')
179
    data.frame("pval"=correlation$p.value, "pcc"=correlation$estimate)
180
  })
181
  # collapse individual correlation dfs, add info
182
  cor_df <- Reduce(rbind, cor_mat)
183
  cur_conns$pcc <- cor_df$pcc
184
  cur_conns$pval <- cor_df$pval
185
  return(cur_conns)
186
})
187
188
# combine into one df and remove incomplete entries:
189
df <- Reduce(rbind, corr_list)
190
df <- df[!is.na(df$pcc),]
191
192
# compute FDR:
193
df$FDR <- p.adjust(df$pval, method='fdr')
194
df <- df %>% dplyr::filter(FDR < 0.05) %>% arrange(desc(pcc))
195
df$peak2_type <- gsub(" .*", "", df$peak2_type)
196
df$peak2_type <- gsub("'", "_UTR", df$peak2_type)
197
198
# distance between peak and target gene
199
peak1_ranges <- StringToGRanges(gsub("_", "-", df$Peak1))
200
peak2_ranges <- StringToGRanges(gsub("_", "-", df$Peak2))
201
df$distance_bp <- abs(start(peak1_ranges) - start(peak2_ranges))
202
203
# write df to file:
204
write.table(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.csv"), sep = ",", quote=FALSE, row.names=F)
205
saveRDS(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.rds"))