# RenalTumor / Git / [40a513] /ATAC/AnalysisPipeline/4.1.callPeak&DAR.R
#
# Models:
# AlyssaS/
# RenalTumor
# Downloads: 1
# [40a513]: / ATAC / AnalysisPipeline / 4.1.callPeak&DAR.R
# History
# Download this file
# 279 lines (255 with data), 10.9 kB

#' @description peak calling

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v86) #---GRCh38 (hg38)
library(ggplot2)
library(patchwork)
set.seed(101)
library(GenomicRanges)
library(ggpubr)
library(tibble)
library(dplyr)
library(ComplexHeatmap)
library(circlize)
library(openxlsx)

# Work from the scATAC analysis directory; every relative path used below
# ("scATAC.data.rds", "4.Peak/...") is resolved against it.
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC")

# Load the saved object, use the annotated cell types as identities and make
# "ATAC" the default assay.
scATAC.data <- readRDS("scATAC.data.rds")
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
DefaultAssay(scATAC.data) <- "ATAC"

#### To call peaks on each annotated cell type, we can use the group.by argument
# NOTE(review): outdir points to DataAnalysis-20210721 while the working
# directory set above is DataAnalysis-20210803 -- confirm this is intentional.
peaks <- CallPeaks(
  object = scATAC.data,
  group.by = "AnnotatedcellType",
  macs2.path = "/home/longzhilin/miniconda3/envs/SingleCell/bin/macs2",
  outdir = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210721/scATAC/4.Peak"
)
# Store the called peaks (path relative to the working directory).
saveRDS(peaks, "4.Peak/cellType.peak.rds") 

library(future)
# plan("multiprocess") is deprecated in the future package and defunct in
# recent releases. It resolved to forked ("multicore") processing where that is
# supported and to "multisession" otherwise, so select the backend the same way.
plan(if (supportsMulticore()) "multicore" else "multisession", workers = 10)
# Raise the size limit for globals exported to the workers.
options(future.globals.maxSize = 100000 * 1024^2)

# Remove peaks on nonstandard chromosomes and in genomic blacklist regions
peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")
peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_hg38_unified, invert = TRUE)

# Quantify counts in each peak
macs2_counts <- FeatureMatrix(
  fragments = Fragments(scATAC.data), # from cellranger fragment result
  features = peaks,
  cells = colnames(scATAC.data)
)

# Gene annotation from the Ensembl database, relabelled to UCSC style / hg38,
# and stored on disk.
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"
genome(annotation) <- "hg38"
saveRDS(annotation, file = "4.Peak/annotation.rds")

# Create a new assay using the MACS2 peak set and add it to the Seurat object
scATAC.data[["Peaks"]] <- CreateChromatinAssay(
  counts = macs2_counts,
  fragments = Fragments(scATAC.data),
  annotation = annotation,
  genome = "hg38"
)
DefaultAssay(scATAC.data) <- "Peaks"
scATAC.data <- RunTFIDF(scATAC.data)

# Gene activity is computed on the default ("Peaks") assay set just above.
gene.activities <- GeneActivity(scATAC.data)
# Add the gene activity matrix to the Seurat object as a new assay and normalize it
scATAC.data[["Macs2ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
scATAC.data <- NormalizeData(
  object = scATAC.data,
  assay = "Macs2ACTIVITY",
  normalization.method = "LogNormalize",
  scale.factor = median(scATAC.data$nCount_Macs2ACTIVITY)
)
saveRDS(scATAC.data, "scATAC.data.pro.rds") #### macs2 calling

# CA9 locus on the MACS2-based "Peaks" assay: coverage track plus tile plot.
# Plots are printed explicitly: top-level auto-printing does not happen when
# the script is run through source(), which would leave pages out of the PDF.
pdf("4.Peak/CA9.macs2.pdf")
macs2_coverage_plot <- CoveragePlot(
  object = scATAC.data,
  region = "CA9",
  assay = "Peaks",
  features = "CA9",
  ranges.title = "MACS2",
  expression.assay = "Macs2ACTIVITY",
  annotation = TRUE,
  peaks = FALSE,
  links = FALSE
)
print(macs2_coverage_plot)
tile_plot <- TilePlot(
  object = scATAC.data,
  region = "CA9"
)
print(tile_plot)
dev.off()

# Same locus on the original "ATAC" assay, with peaks and links shown.
pdf("4.Peak/CA9.pdf")
atac_coverage_plot <- CoveragePlot(
  object = scATAC.data,
  assay = "ATAC",
  expression.assay = "ACTIVITY",
  region = "CA9",
  features = "CA9",
  annotation = TRUE,
  peaks = TRUE,
  links = TRUE
)
print(atac_coverage_plot)
dev.off()

### Coverage plots of marker genes: shared inputs
# Provides FindRegion(), which is used by the plotting code further down.
source("/home/longzhilin/Analysis_Code/SingleCell/FindRegion.R")
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

# Promoter windows of 1 kb on either side, taken from the hg38 knownGene TxDb.
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
promoter <- getPromoters(TxDb = txdb, upstream = 1000, downstream = 1000)

# Cell types in display order (the vector is reversed before use as levels).
plot.cellType <- rev(c(
  "CD4+ T cell", "Treg", "CD8+ T cell", "NK/NKT cell", "B cell",
  "Macrophage", "Monocyte", "Mast cell",
  "Endothelium (VCAM1+)", "Endothelium (VCAM1-)", "Mesangial cell",
  "Tumor"
))
scATAC.data$AnnotatedcellType <- factor(
  scATAC.data$AnnotatedcellType,
  levels = plot.cellType
)
DefaultAssay(scATAC.data) <- "Peaks"
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType

# Marker table from the scRNA-seq annotation step.
cell.type.markers <- read.table(
  file = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/2.Cluster/AnnotateCellType/cellMarker.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = "\t"
)
# All marker genes except entry 19 ...
genes <- cell.type.markers$Gene[-19]

# ... which is then replaced by this hand-picked panel for plotting.
genes <- c(
  "CD8A", "CD4", "GNLY", "MS4A1", "CD163",
  "S100A12", "TPSAB1", "PECAM1", "PDGFRB", "CA9"
)
# Draw one coverage plot per marker gene into a PDF and return the regions.
#
# For every gene the plotted window (gene body extended by 1 kb on both sides)
# is looked up with FindRegion(); promoters overlapping that window are
# highlighted in the plot. When nothing overlaps, region.highlight is NULL,
# which is equivalent to not highlighting anything.
#
# @param object     Seurat object holding the "Peaks" assay.
# @param genes      Character vector of gene names to plot.
# @param promoter   GRanges of promoter regions used for highlighting.
# @param file       Path of the PDF to write.
# @param show.peaks Logical, forwarded to CoveragePlot(peaks = ).
# @param ...        Further arguments for pdf(), e.g. width/height in inches.
# @return Named list (one element per gene) with the FindRegion() results.
plotMarkerCoverage <- function(object, genes, promoter, file, show.peaks, ...) {
  pdf(file, ...)
  # Close the device even when a gene fails, so no PDF handle is left open.
  on.exit(dev.off(), add = TRUE)

  regions <- lapply(genes, function(gene) {
    cat(gene, "...\n")
    region <- FindRegion(
      object = object,
      region = gene,
      assay = "Peaks",
      extend.upstream = 1000,
      extend.downstream = 1000
    )
    hits <- findOverlaps(region, promoter)
    highlight <- if (length(hits) > 0) promoter[subjectHits(hits)] else NULL
    p <- CoveragePlot(
      object = object,
      region = gene,
      ranges.title = "MACS2",
      links = FALSE,
      peaks = show.peaks,
      extend.upstream = 1000,
      extend.downstream = 1000,
      region.highlight = highlight
    )
    print(p)
    region
  })
  names(regions) <- genes
  regions
}

# pdf() expects plain numbers (inches) for width/height, not grid unit objects.
# Version with the peak track, default page width.
res <- plotMarkerCoverage(
  object = scATAC.data,
  genes = genes,
  promoter = promoter,
  file = "2.Cluster/AnnotateCellType/cellType.coverage.plot.origin2.pdf",
  show.peaks = TRUE,
  height = 3
)

# Compact 3 x 3 inch version without the peak track.
res <- plotMarkerCoverage(
  object = scATAC.data,
  genes = genes,
  promoter = promoter,
  file = "2.Cluster/AnnotateCellType/cellType.coverage.plot2.pdf",
  show.peaks = FALSE,
  width = 3,
  height = 3
)

############################# identify differentially accessible chromatin regions between cell types
DefaultAssay(scATAC.data) <- "Peaks"
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
idents <- as.character(levels(scATAC.data))
cellType.DARs <- FindAllMarkers(scATAC.data,
                                test.use = "LR",
                                logfc.threshold = 0,
                                min.pct = 0.05, # often necessary to lower the min.pct threshold
                                latent.vars = "peak_region_fragments")

# Rename the cluster / feature columns by name instead of by position, so the
# step does not depend on the column order of the FindAllMarkers() output.
stopifnot(all(c("cluster", "gene") %in% names(cellType.DARs)))
names(cellType.DARs)[match(c("cluster", "gene"), names(cellType.DARs))] <-
  c("cellType", "genomicRegion")

# Find the closest feature to each region. The row names of a FindAllMarkers()
# result are made unique with numeric suffixes ("chr1-1-2.1") when a peak is
# reported for several clusters, so they are not valid region strings; use the
# feature column instead. Each distinct region is annotated once and the result
# is aligned back to the DAR table through `query_region`, which also keeps the
# rows in step if a region has no nearest feature (those rows become NA).
cf <- ClosestFeature(scATAC.data, regions = unique(cellType.DARs$genomicRegion))
cf <- cf[match(cellType.DARs$genomicRegion, cf$query_region), ]
cellType.DARs <- cbind(cellType.DARs,
                       gene = cf$gene_name,
                       gene_biotype = cf$gene_biotype,
                       type = cf$type,
                       distance = cf$distance)

# One table per cell type: more accessible regions first (largest fold change
# on top), followed by less accessible regions (most negative on top).
saveFormat <- lapply(idents, function(x) {
  DARs <- cellType.DARs[which(cellType.DARs$cellType == x), ]
  DARs.up <- DARs %>% dplyr::filter(avg_log2FC > 0) %>% dplyr::arrange(dplyr::desc(avg_log2FC))
  DARs.down <- DARs %>% dplyr::filter(avg_log2FC < 0) %>% dplyr::arrange(avg_log2FC)
  rbind(DARs.up, DARs.down)
})
names(saveFormat) <- idents
write.xlsx(saveFormat, file = "4.Peak/celltype.all.DARs.xlsx", sheetName = idents, rowNames = FALSE)
saveRDS(cellType.DARs, file = "4.Peak/cellType.all.DARs.rds")

# Significant, more-accessible regions only:
# avg_log2FC >= 0.25 and adjusted p-value < 0.05, largest fold change first.
cellType.sig.pos.DARs <- cellType.DARs %>%
  filter(avg_log2FC >= 0.25 & p_val_adj < 0.05) %>%
  arrange(desc(avg_log2FC)) # 31925 peaks

# Split into one sorted table per cell type for the workbook sheets.
saveFormat <- lapply(idents, function(one.cellType) {
  rows <- which(cellType.sig.pos.DARs$cellType == one.cellType)
  cellType.sig.pos.DARs[rows, ] %>% arrange(desc(avg_log2FC))
})
names(saveFormat) <- idents

write.xlsx(
  saveFormat,
  file = "4.Peak/celltype.sig.pos.DARs.xlsx",
  sheetName = idents,
  rowNames = FALSE
)
saveRDS(cellType.sig.pos.DARs, file = "4.Peak/cellType.sig.pos.DARs.rds")

# Plot: heatmap of the differentially accessible chromatin regions
# dplyr::select is spelled out because AnnotationDbi, which is attached through
# the annotation packages, exports a select() generic as well.
sig.region <- cellType.sig.pos.DARs %>%
  dplyr::select(genomicRegion) %>%
  dplyr::distinct()

# Average signal of each significant peak within each cell type
sig.region.mean <- AverageExpression(scATAC.data, features = sig.region$genomicRegion, assays = "Peaks")
# Cell types become rows; scale() then z-scores each peak (column) across them.
sig.region.mean.scale <- scale(t(sig.region.mean$Peaks))

pdf("4.Peak/cellType.sig.pos.DAR.pdf")
dar.heatmap <- Heatmap(
  sig.region.mean.scale,
  name = "z-score",
  show_column_dend = FALSE,
  show_row_dend = FALSE,
  show_column_names = FALSE,
  row_names_gp = gpar(fontsize = 10),
  width = unit(10, "cm"),
  height = unit(8, "cm")
)
# Draw explicitly: relying on auto-printing leaves the PDF empty when the
# script is run through source().
draw(dar.heatmap)
dev.off()

## Plot: number of significant DARs per cell type, coloured by lineage
cellType.sig.pos.DARs.ratio <- as.data.frame(table(cellType.sig.pos.DARs$cellType))

# Assign the lineage by cell-type name. Row positions in the table follow the
# factor level order of the cell types, so hard-coded row indices silently
# mislabel cell types whenever that order changes.
lineage.of.cellType <- c(
  "CD4+ T cell" = "Lymphoid",
  "Treg" = "Lymphoid",
  "CD8+ T cell" = "Lymphoid",
  "NK/NKT cell" = "Lymphoid",
  "B cell" = "Lymphoid",
  "Macrophage" = "Myeloid",
  "Monocyte" = "Myeloid",
  "Mast cell" = "Myeloid",
  "Tumor" = "Tumor",
  "Endothelium (VCAM1+)" = "Other",
  "Endothelium (VCAM1-)" = "Other",
  "Mesangial cell" = "Other"
)
cellType.lineage <- unname(lineage.of.cellType[as.character(cellType.sig.pos.DARs.ratio$Var1)])
unmapped <- is.na(cellType.lineage)
if (any(unmapped)) {
  warning("No lineage defined for: ",
          paste(cellType.sig.pos.DARs.ratio$Var1[unmapped], collapse = ", "),
          "; labelled as 'Other'.", call. = FALSE)
  cellType.lineage[unmapped] <- "Other"
}
cellType.sig.pos.DARs.ratio$Type <- factor(cellType.lineage, levels = c("Lymphoid", "Myeloid", "Tumor", "Other"))

# One colour per lineage, in the order of the factor levels above.
lineage.palette <- c("#00A087", "#4DBBD5", "#E64B35", "#3C5488")

pdf("4.Peak/cellType.sig.pos.DAR.ratio.pdf")
# sort.by.groups = FALSE: bars are sorted by value only, not within lineage.
dar.barplot <- ggbarplot(cellType.sig.pos.DARs.ratio, x = "Var1", y = "Freq", fill = "Type", color = "Type",
                         sort.by.groups = FALSE, sort.val = "desc", palette = lineage.palette,
                         label = TRUE, xlab = "", ylab = "Number of DAR") + rotate_x_text(60)
# Print explicitly so the page is also written when run through source().
print(dar.barplot)
dev.off()

############################# cell type differentially accessible chromatin and genes
## Load DEGs
scRNA.DEGs <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/2.Cluster/AnnotateCellType/cellType.sig.pos.DEGs.rds")
scRNA.DEGs <- scRNA.DEGs %>% dplyr::filter(avg_log2FC >= 0.25 & p_val_adj < 0.05)
compared.idents <- as.character(levels(scATAC.data))

# Genes shared between scRNA-seq DEGs and the genes closest to scATAC-seq DARs
# of the same cell type. lapply() keeps the result a list in every case;
# sapply() would collapse it to a matrix or vector whenever all cell types
# happen to have the same number of overlapping genes, which breaks the
# by-name `[[` lookups done on this object afterwards.
overlap.gene.list <- lapply(compared.idents, function(x) {
  DEGs <- scRNA.DEGs$gene[which(scRNA.DEGs$cluster == x)]
  DARs <- unique(cellType.sig.pos.DARs$gene[which(cellType.sig.pos.DARs$cellType == x)])
  intersect(DEGs, DARs)
})
names(overlap.gene.list) <- compared.idents
saveRDS(overlap.gene.list, file = "4.Peak/DEG.DAR.overlap.genes.rds")

# Calculate, per cell type, how many DEGs have a nearby DAR and how many DARs
# lie closest to a DEG (counts and proportions).
stopifnot(is.list(overlap.gene.list))
ratio.columns <- c("DEGs", "DEGs with DARs", "Prop DEGs with DARs",
                   "DARs", "DARs near DEGs", "Prop DARs near DEGs")
# vapply() fixes the shape of the result (six numbers per cell type), so the
# transpose below always yields a cell type x metric matrix.
overlap.ratio <- vapply(names(overlap.gene.list), function(x) {
  shared.genes <- overlap.gene.list[[x]]

  # scRNA-seq
  n.DEGs <- length(which(scRNA.DEGs$cluster == x))
  n.DEGs.with.DARs <- length(shared.genes)

  # scATAC-seq
  idx <- which(cellType.sig.pos.DARs$cellType == x)
  n.DARs <- length(idx)
  n.DARs.near.DEGs <- length(which(cellType.sig.pos.DARs$gene[idx] %in% shared.genes))

  # A proportion is NaN (0/0) when a cell type has no DEGs or no DARs.
  c(n.DEGs, n.DEGs.with.DARs, n.DEGs.with.DARs / n.DEGs,
    n.DARs, n.DARs.near.DEGs, n.DARs.near.DEGs / n.DARs)
}, numeric(length(ratio.columns)))
overlap.ratio <- t(overlap.ratio)
colnames(overlap.ratio) <- ratio.columns

# Min / max / mean / sd of every column across cell types. Undefined (NaN)
# proportions are skipped so that one empty cell type does not turn the whole
# summary into NaN.
res <- apply(overlap.ratio, 2, function(x) {
  c(min(x, na.rm = TRUE), max(x, na.rm = TRUE), mean(x, na.rm = TRUE), sd(x, na.rm = TRUE))
})
rownames(res) <- c("min", "max", "mean", "sd")
write.xlsx(list(overlap.ratio, res), file = "4.Peak/overlap.ratio.xlsx", sheetName = c("overlap of DEGs and DARs", "Statistics"), rowNames = TRUE)