b/ATAC/AnalysisPipeline/6.2.ccans.annotated.genomicLocation.R
+#' @description: this script will annotate cicero ccan with the ChIPSeeker database to determine what genomic
+# regions are linked by the connections and create circos plots
+library(ChIPseeker)
+library(TxDb.Hsapiens.UCSC.hg38.knownGene)
+library(openxlsx)
+library(circlize)
+library(dplyr)
+library(tibble)
+library(data.table)
+library(Signac)
+library(Seurat)
+library(stringr)
+library(RColorBrewer)
+library(cicero)
+source(file = "/home/longzhilin/Analysis_Code/Plot_colorPaletters.R")
+source(file = "/home/longzhilin/Analysis_Code/SingleCell/user.CoveragePlot.R")
+setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC")
+# read in cell-type-specific CCAN and filter by coaccess threshold > 0.2
+dar_files <- list.files("6.Co-Accessible/ccans", recursive = TRUE, pattern = "ciceroConns.*.csv$", full.names = TRUE)
+idents <- gsub(".*ciceroConns.", "", dar_files)
+idents <- gsub(".csv", "", idents)
+list.dar <- lapply(dar_files, function(file_path) {
+  fread(file_path) %>%
+    dplyr::filter(coaccess > 0.2) %>%
+    dplyr::select(Peak1, Peak2)
+})
+names(list.dar) <- idents
+# count cis-regulatory number of coaccess > 0.2
+peak.num <- sapply(list.dar, function(x){return(nrow(x))})
+names(peak.num) <- idents
+# convert the DAR to GRanges objects to annotate
+list.dar.peak1.gr <- lapply(seq(list.dar), function(x) {
+  df <- list.dar[[x]]
+  gr <- StringToGRanges(df$Peak1, sep = c("_","_"))
+  return(gr)
+})
+names(list.dar.peak1.gr) <- idents
+list.dar.peak2.gr <- lapply(seq(list.dar), function(x) {
+  df <- list.dar[[x]]
+  gr <- StringToGRanges(df$Peak2, sep = c("_","_"))
+  return(gr)
+})
+names(list.dar.peak2.gr) <- idents
+# annotate the list of GRanges DAR for each cell type with the peak location
+txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
+list.peak1.Anno <- lapply(list.dar.peak1.gr, annotatePeak, TxDb = txdb,
+                        tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE)
+saveRDS(list.peak1.Anno, file = "6.Co-Accessible/ccans/list.peak1.Anno.rds")
+list.peak1.loc <- lapply(seq(list.peak1.Anno), function(x) {
+  loc <- list.peak1.Anno[[x]]@anno$annotation
+  loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1]
+  loc <- str_replace(loc, pattern="3'", replacement="3p_UTR")
+  loc <- str_replace(loc, pattern="5'", replacement="5p_UTR")
+  loc <- str_replace(loc, pattern="Distal", replacement="Intergenic")
+  return(loc)
+})
+list.peak2.Anno <- lapply(list.dar.peak2.gr, annotatePeak, TxDb = txdb,
+                          tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE)
+saveRDS(list.peak2.Anno, file = "6.Co-Accessible/ccans/list.peak2.Anno.rds")
+list.peak2.loc <- lapply(seq(list.peak2.Anno), function(x) {
+  loc <- list.peak2.Anno[[x]]@anno$annotation
+  loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1]
+  loc <- str_replace(loc, pattern="3'", replacement="3p_UTR")
+  loc <- str_replace(loc, pattern="5'", replacement="5p_UTR")
+  loc <- str_replace(loc, pattern="Distal", replacement="Intergenic")
+  return(loc)
+})
+# create df with peak1 and peak2 locations
+list.peaks.loc.df <- lapply(seq(idents), function(x) {
+  df <- cbind(list.peak1.loc[[x]], list.peak2.loc[[x]]) %>% as.data.frame()
+  colnames(df) <- c("Peak1","Peak2")
+  counts <- dplyr::count(df, Peak1, Peak2) %>% as.data.frame()
+})
+names(list.peaks.loc.df) <- idents
+# generate circos plots of predicted chromatin-chromatin interactions and save to plots directory
+Plot_Cicero_Anno <- function(ident, list.peaks.loc.df) {
+  clusterID <- ident
+  toplot <- as.data.frame(list.peaks.loc.df[[ident]])
+  colnames(toplot) <- c("Peak1","Peak2","n")
+  print(clusterID)
+  # convert to an adjacency list with a value indicating the number of connections between
+  # each of the unique genomic location pairs
+  unique_combos <- !duplicated(t(apply(toplot, 1, sort)))
+  toplot <- toplot[unique_combos, ]
+  toplot <- toplot[order(toplot$n), ]
+  # toplot$n <- log2(toplot$n)
+  # grid.col = c('3p_UTR'="red", '5p_UTR'="black", Distal_Intergenic="blue",
+  # Downstream="grey", Exon="purple", Intron="orange", Promoter="green")
+  grid.col = brewer.pal(7, "Paired")
+  circos.clear()
+  pdf(paste0("6.Co-Accessible/ccans/plots/ccans.annotation.",clusterID,".pdf"), width=10, height=5)
+  par(mar=c(0.5,0.5,0.5,0.5))
+  circos.par(gap.after = 5)
+  chordDiagram(toplot,
+             grid.col = grid.col,
+             annotationTrack = "grid",
+             preAllocateTracks = list(track.height = max(strwidth(unlist(dimnames(toplot))))))
+  # we go back to the first track and customize sector labels
+  circos.track(track.index = 1, panel.fun = function(x, y) {
+    circos.text(CELL_META$xcenter,
+                CELL_META$ylim[1],
+                CELL_META$sector.index,
+                facing = "clockwise",
+                niceFacing = TRUE,
+                adj = c(0, 0.5))
+   }, bg.border = NA) # here set bg.border to NA is important
+  dev.off()
+  }
+# save circlize plots to file
+lapply(idents, function(ident) {
+  Plot_Cicero_Anno(ident, list.peaks.loc.df=list.peaks.loc.df)
+  })
+##############################identify specific cis-regulatory elements
+# focus on the tumor or Macrophage cell
+cell.type <- "Tumor"
+#Computing average expression & accessibility
+scRNA <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/data.merge.pro.rds")
+DefaultAssay(scRNA) <- "RNA"
+Idents(scRNA) <- scRNA$cellType_low
+scATAC <- readRDS("scATAC.data.pro.rds")
+DefaultAssay(scATAC) <- "Peaks"
+Idents(scATAC) <- scATAC$AnnotatedcellType
+average_exp_all <- AverageExpression(scRNA, assays = "RNA")
+average_acc_all <- AverageExpression(scATAC, assays = "Peaks")
+# extract the same cell type between scRNA and scATAC
+average_exp_all$RNA <- average_exp_all$RNA[, colnames(average_acc_all$Peaks)]
+####require one of the peaks overlap a gene's promoter (1kb)
+peak1.anno <- list.peak1.Anno[[cell.type]]@anno
+peak2.anno <- list.peak2.Anno[[cell.type]]@anno
+conns <- fread(paste0("6.Co-Accessible/ccans/ciceroConns.", cell.type,".csv")) # 250302
+conns <- conns %>% dplyr::filter(coaccess > 0.2)
+conns <- conns %>% mutate(peak1_type = peak1.anno$annotation, peak1_distanceToTSS = peak1.anno$distanceToTSS, peak1_nearestGene = peak1.anno$SYMBOL,
+                                      peak2_type = peak2.anno$annotation, peak2_distanceToTSS = peak2.anno$distanceToTSS, peak2_nearestGene = peak2.anno$SYMBOL)
+bg <- na.omit(unique(c(conns$peak1_nearestGene, conns$peak2_nearestGene))) # 作为background gene：17567
+saveRDS(bg, file = paste0("6.Co-Accessible/bg.", cell.type,".rds")) # coaccess > 0.2
+#select the peaks overlap the gene's promoter in conns
+cCREs <- conns %>% dplyr::filter(str_detect(peak1_type, pattern = "Promoter \\(<=1kb\\)"))
+cCREs <- cCREs[!is.na(cCREs$peak1_nearestGene),]
+conns_list <- group_split(cCREs, peak1_nearestGene) #基于peak1_nearestGene分组，通常按字母顺序排序
+names(conns_list) <- sort(unique(cCREs$peak1_nearestGene))
+cTargetGenes <- names(conns_list)[names(conns_list) %in% rownames(average_exp_all$RNA)] # 11453 genes
+# loop over all genes to compute correlation between accessibility & expression
+df <- data.frame()
+corr_list <- lapply(cTargetGenes, function(x){
+  cat("calculated gene:", x, "\n")
+  # subset by cur_gene
+  cur_conns <- conns_list[[x]]
+  cur_conns <- cur_conns[!(cur_conns$Peak2 %in% cur_conns$Peak1),]
+  cur_conns <- subset(cur_conns, coaccess >= 0.2)
+  # skip this gene if there are no co-accessible connections
+  if(nrow(cur_conns) == 0){return(data.frame())}
+  # get average exp and acc for this gene and peaks that are co-accessible
+  cur_conns$Peak2 <- gsub("_", "-", cur_conns$Peak2)
+  average_acc <- average_acc_all$Peaks[as.character(cur_conns$Peak2),, drop = F]
+  average_exp <- average_exp_all$RNA[x,]
+  # correlation between expression and accessibility:
+  cor_mat <- apply(average_acc, 1, function(x){
+    correlation <- cor.test(as.numeric(average_exp), as.numeric(x), method='pearson')
+    data.frame("pval"=correlation$p.value, "pcc"=correlation$estimate)
+  })
+  # collapse individual correlation dfs, add info
+  cor_df <- Reduce(rbind, cor_mat)
+  cur_conns$pcc <- cor_df$pcc
+  cur_conns$pval <- cor_df$pval
+  return(cur_conns)
+})
+# combine into one df and remove incomplete entries:
+df <- Reduce(rbind, corr_list)
+df <- df[!is.na(df$pcc),]
+# compute FDR:
+df$FDR <- p.adjust(df$pval, method='fdr')
+df <- df %>% dplyr::filter(FDR < 0.05) %>% arrange(desc(pcc))
+df$peak2_type <- gsub(" .*", "", df$peak2_type)
+df$peak2_type <- gsub("'", "_UTR", df$peak2_type)
+# distance between peak and target gene
+peak1_ranges <- StringToGRanges(gsub("_", "-", df$Peak1))
+peak2_ranges <- StringToGRanges(gsub("_", "-", df$Peak2))
+df$distance_bp <- abs(start(peak1_ranges) - start(peak2_ranges))
+# write df to file:
+write.table(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.csv"), sep = ",", quote=FALSE, row.names=F)
+saveRDS(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.rds"))