RenalTumor / Git / [40a513] /ATAC/AnalysisPipeline/6.2.ccans.annotated.genomicLocation.R

Models:
AlyssaS/
RenalTumor
Downloads: 1
[40a513]: / ATAC / AnalysisPipeline / 6.2.ccans.annotated.genomicLocation.R
History
Download this file
205 lines (181 with data), 9.0 kB

#' @description: this script will annotate cicero ccan with the ChIPSeeker database to determine what genomic
# regions are linked by the connections and create circos plots

library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(openxlsx)
library(circlize)
library(dplyr)
library(tibble)
library(data.table)
library(Signac)
library(Seurat)
library(stringr)
library(RColorBrewer)
library(cicero)

source(file = "/home/longzhilin/Analysis_Code/Plot_colorPaletters.R")
source(file = "/home/longzhilin/Analysis_Code/SingleCell/user.CoveragePlot.R")
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC")

# read in cell-type-specific CCAN and filter by coaccess threshold > 0.2
dar_files <- list.files("6.Co-Accessible/ccans", recursive = TRUE, pattern = "ciceroConns.*.csv$", full.names = TRUE)
idents <- gsub(".*ciceroConns.", "", dar_files)
idents <- gsub(".csv", "", idents)
list.dar <- lapply(dar_files, function(file_path) {
  fread(file_path) %>%
    dplyr::filter(coaccess > 0.2) %>%
    dplyr::select(Peak1, Peak2)
})
names(list.dar) <- idents
# count cis-regulatory number of coaccess > 0.2
peak.num <- sapply(list.dar, function(x){return(nrow(x))})
names(peak.num) <- idents

# convert the DAR to GRanges objects to annotate
list.dar.peak1.gr <- lapply(seq(list.dar), function(x) {
  df <- list.dar[[x]]
  gr <- StringToGRanges(df$Peak1, sep = c("_","_"))
  return(gr)
})
names(list.dar.peak1.gr) <- idents

list.dar.peak2.gr <- lapply(seq(list.dar), function(x) {
  df <- list.dar[[x]]
  gr <- StringToGRanges(df$Peak2, sep = c("_","_"))
  return(gr)
})
names(list.dar.peak2.gr) <- idents

# annotate the list of GRanges DAR for each cell type with the peak location
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
list.peak1.Anno <- lapply(list.dar.peak1.gr, annotatePeak, TxDb = txdb,
                        tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE)
saveRDS(list.peak1.Anno, file = "6.Co-Accessible/ccans/list.peak1.Anno.rds")                        
list.peak1.loc <- lapply(seq(list.peak1.Anno), function(x) {
  loc <- list.peak1.Anno[[x]]@anno$annotation 
  loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1]
  loc <- str_replace(loc, pattern="3'", replacement="3p_UTR")
  loc <- str_replace(loc, pattern="5'", replacement="5p_UTR")
  loc <- str_replace(loc, pattern="Distal", replacement="Intergenic")
  return(loc)
})

list.peak2.Anno <- lapply(list.dar.peak2.gr, annotatePeak, TxDb = txdb,
                          tssRegion = c(-3000, 3000), annoDb="org.Hs.eg.db", verbose = FALSE)
saveRDS(list.peak2.Anno, file = "6.Co-Accessible/ccans/list.peak2.Anno.rds")
list.peak2.loc <- lapply(seq(list.peak2.Anno), function(x) {
  loc <- list.peak2.Anno[[x]]@anno$annotation 
  loc <- str_split(loc, pattern = " ", simplify = TRUE)[,1]
  loc <- str_replace(loc, pattern="3'", replacement="3p_UTR")
  loc <- str_replace(loc, pattern="5'", replacement="5p_UTR")
  loc <- str_replace(loc, pattern="Distal", replacement="Intergenic")
  return(loc)
})

# create df with peak1 and peak2 locations
list.peaks.loc.df <- lapply(seq(idents), function(x) {
  df <- cbind(list.peak1.loc[[x]], list.peak2.loc[[x]]) %>% as.data.frame()
  colnames(df) <- c("Peak1","Peak2")
  counts <- dplyr::count(df, Peak1, Peak2) %>% as.data.frame()
})
names(list.peaks.loc.df) <- idents

# generate circos plots of predicted chromatin-chromatin interactions and save to plots directory
Plot_Cicero_Anno <- function(ident, list.peaks.loc.df) {
  clusterID <- ident
  toplot <- as.data.frame(list.peaks.loc.df[[ident]])
  colnames(toplot) <- c("Peak1","Peak2","n")
  print(clusterID)
  # convert to an adjacency list with a value indicating the number of connections between
  # each of the unique genomic location pairs
  unique_combos <- !duplicated(t(apply(toplot, 1, sort)))
  toplot <- toplot[unique_combos, ]
  toplot <- toplot[order(toplot$n), ]
  # toplot$n <- log2(toplot$n)
  
  # grid.col = c('3p_UTR'="red", '5p_UTR'="black", Distal_Intergenic="blue",
  # Downstream="grey", Exon="purple", Intron="orange", Promoter="green")
  grid.col = brewer.pal(7, "Paired")

  circos.clear()
  pdf(paste0("6.Co-Accessible/ccans/plots/ccans.annotation.",clusterID,".pdf"), width=10, height=5)
  par(mar=c(0.5,0.5,0.5,0.5))
  circos.par(gap.after = 5)
  chordDiagram(toplot,
             grid.col = grid.col,
             annotationTrack = "grid",
             preAllocateTracks = list(track.height = max(strwidth(unlist(dimnames(toplot))))))

  # we go back to the first track and customize sector labels
  circos.track(track.index = 1, panel.fun = function(x, y) {
    circos.text(CELL_META$xcenter,
                CELL_META$ylim[1],
                CELL_META$sector.index,
                facing = "clockwise",
                niceFacing = TRUE,
                adj = c(0, 0.5))
   }, bg.border = NA) # here set bg.border to NA is important
  dev.off()
  }

# save circlize plots to file
lapply(idents, function(ident) {
  Plot_Cicero_Anno(ident, list.peaks.loc.df=list.peaks.loc.df)
  })

##############################identify specific cis-regulatory elements
# focus on the tumor or Macrophage cell 
cell.type <- "Tumor"
#Computing average expression & accessibility
scRNA <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/data.merge.pro.rds")
DefaultAssay(scRNA) <- "RNA"
Idents(scRNA) <- scRNA$cellType_low
scATAC <- readRDS("scATAC.data.pro.rds")
DefaultAssay(scATAC) <- "Peaks"
Idents(scATAC) <- scATAC$AnnotatedcellType
average_exp_all <- AverageExpression(scRNA, assays = "RNA")
average_acc_all <- AverageExpression(scATAC, assays = "Peaks")
# extract the same cell type between scRNA and scATAC
average_exp_all$RNA <- average_exp_all$RNA[, colnames(average_acc_all$Peaks)]

####require one of the peaks overlap a gene's promoter (1kb)
peak1.anno <- list.peak1.Anno[[cell.type]]@anno
peak2.anno <- list.peak2.Anno[[cell.type]]@anno
conns <- fread(paste0("6.Co-Accessible/ccans/ciceroConns.", cell.type,".csv")) # 250302
conns <- conns %>% dplyr::filter(coaccess > 0.2)
conns <- conns %>% mutate(peak1_type = peak1.anno$annotation, peak1_distanceToTSS = peak1.anno$distanceToTSS, peak1_nearestGene = peak1.anno$SYMBOL,
                                      peak2_type = peak2.anno$annotation, peak2_distanceToTSS = peak2.anno$distanceToTSS, peak2_nearestGene = peak2.anno$SYMBOL)
bg <- na.omit(unique(c(conns$peak1_nearestGene, conns$peak2_nearestGene))) # 作为background gene：17567
saveRDS(bg, file = paste0("6.Co-Accessible/bg.", cell.type,".rds")) # coaccess > 0.2

#select the peaks overlap the gene's promoter in conns
cCREs <- conns %>% dplyr::filter(str_detect(peak1_type, pattern = "Promoter \\(<=1kb\\)"))
cCREs <- cCREs[!is.na(cCREs$peak1_nearestGene),]
conns_list <- group_split(cCREs, peak1_nearestGene) #基于peak1_nearestGene分组，通常按字母顺序排序
names(conns_list) <- sort(unique(cCREs$peak1_nearestGene)) 
cTargetGenes <- names(conns_list)[names(conns_list) %in% rownames(average_exp_all$RNA)] # 11453 genes

# loop over all genes to compute correlation between accessibility & expression
df <- data.frame()
corr_list <- lapply(cTargetGenes, function(x){
  cat("calculated gene:", x, "\n")
  # subset by cur_gene
  cur_conns <- conns_list[[x]]
  cur_conns <- cur_conns[!(cur_conns$Peak2 %in% cur_conns$Peak1),]
  cur_conns <- subset(cur_conns, coaccess >= 0.2)

  # skip this gene if there are no co-accessible connections
  if(nrow(cur_conns) == 0){return(data.frame())}

  # get average exp and acc for this gene and peaks that are co-accessible
  cur_conns$Peak2 <- gsub("_", "-", cur_conns$Peak2)
  average_acc <- average_acc_all$Peaks[as.character(cur_conns$Peak2),, drop = F]
  
  average_exp <- average_exp_all$RNA[x,]
  # correlation between expression and accessibility:
  cor_mat <- apply(average_acc, 1, function(x){
    correlation <- cor.test(as.numeric(average_exp), as.numeric(x), method='pearson')
    data.frame("pval"=correlation$p.value, "pcc"=correlation$estimate)
  })
  # collapse individual correlation dfs, add info
  cor_df <- Reduce(rbind, cor_mat)
  cur_conns$pcc <- cor_df$pcc
  cur_conns$pval <- cor_df$pval
  return(cur_conns)
})

# combine into one df and remove incomplete entries:
df <- Reduce(rbind, corr_list)
df <- df[!is.na(df$pcc),]

# compute FDR:
df$FDR <- p.adjust(df$pval, method='fdr')
df <- df %>% dplyr::filter(FDR < 0.05) %>% arrange(desc(pcc))
df$peak2_type <- gsub(" .*", "", df$peak2_type)
df$peak2_type <- gsub("'", "_UTR", df$peak2_type)

# distance between peak and target gene
peak1_ranges <- StringToGRanges(gsub("_", "-", df$Peak1))
peak2_ranges <- StringToGRanges(gsub("_", "-", df$Peak2))
df$distance_bp <- abs(start(peak1_ranges) - start(peak2_ranges))

# write df to file:
write.table(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.csv"), sep = ",", quote=FALSE, row.names=F)
saveRDS(df, file = paste0("6.Co-Accessible/", cell.type,"_peak_gene_correlation.rds"))