#' @description peak calling
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v86) #---GRCh38 (hg38)
library(ggplot2)
library(patchwork)
set.seed(101)
library(GenomicRanges)
library(ggpubr)
library(tibble)
library(dplyr)
library(ComplexHeatmap)
library(circlize)
library(openxlsx)
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC")
scATAC.data <- readRDS("scATAC.data.rds")
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
DefaultAssay(scATAC.data) <- "ATAC"
####To call peaks on each annotated cell type, we can use the group.by argument
peaks <- CallPeaks(
object = scATAC.data,
group.by = "AnnotatedcellType",
macs2.path = "/home/longzhilin/miniconda3/envs/SingleCell/bin/macs2",
outdir = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210721/scATAC/4.Peak"
)
saveRDS(peaks, "4.Peak/cellType.peak.rds")
library(future)
plan("multiprocess", workers = 10)
options(future.globals.maxSize = 100000 * 1024^2)
# remove peaks on nonstandard chromosomes and in genomic blacklist regions
peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")
peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_hg38_unified, invert = TRUE)
# quantify counts in each peak
macs2_counts <- FeatureMatrix(
fragments = Fragments(scATAC.data), # from cellranger fragment result
features = peaks,
cells = colnames(scATAC.data)
)
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"
genome(annotation) <- "hg38"
saveRDS(annotation, file = "4.Peak/annotation.rds")
# create a new assay using the MACS2 peak set and add it to the Seurat object
scATAC.data[["Peaks"]] <- CreateChromatinAssay(
counts = macs2_counts,
fragments = Fragments(scATAC.data),
annotation = annotation,
genome = "hg38"
)
DefaultAssay(scATAC.data) <- "Peaks"
scATAC.data <- RunTFIDF(scATAC.data)
gene.activities <- GeneActivity(scATAC.data)
# add the gene activity matrix to the Seurat object as a new assay and normalize it
scATAC.data[['Macs2ACTIVITY']] <- CreateAssayObject(counts = gene.activities)
scATAC.data <- NormalizeData(
object = scATAC.data,
assay = 'Macs2ACTIVITY',
normalization.method = 'LogNormalize',
scale.factor = median(scATAC.data$nCount_Macs2ACTIVITY)
)
saveRDS(scATAC.data, "scATAC.data.pro.rds") #### macs2 calling
pdf("4.Peak/CA9.macs2.pdf")
CoveragePlot(
object = scATAC.data,
region = "CA9",
assay = "Peaks",
features = "CA9",
ranges.title = "MACS2",
expression.assay = "Macs2ACTIVITY",
annotation = TRUE,
peaks = F,
links = F
)
tile_plot <- TilePlot(
object = scATAC.data,
region = "CA9"
)
print(tile_plot)
dev.off()
pdf("4.Peak/CA9.pdf")
CoveragePlot(
object = scATAC.data,
assay = "ATAC",
expression.assay = "ACTIVITY",
region = "CA9",
features = "CA9",
annotation = TRUE,
peaks = TRUE,
links = TRUE
)
dev.off()
###coverage plot of marker genes
source("/home/longzhilin/Analysis_Code/SingleCell/FindRegion.R")
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
promoter <- getPromoters(TxDb=txdb, upstream=1000, downstream=1000)
plot.cellType <- rev(c("CD4+ T cell", "Treg", "CD8+ T cell", "NK/NKT cell", "B cell", "Macrophage", "Monocyte", "Mast cell", "Endothelium (VCAM1+)", "Endothelium (VCAM1-)", "Mesangial cell", "Tumor"))
scATAC.data$AnnotatedcellType <- factor(scATAC.data$AnnotatedcellType, levels = plot.cellType)
DefaultAssay(scATAC.data) <- "Peaks"
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
cell.type.markers <- read.table(file = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/2.Cluster/AnnotateCellType/cellMarker.txt", header = T, stringsAsFactors = F, sep = "\t")
genes <- cell.type.markers$Gene
genes <- genes[-19]
genes <- c("CD8A", "CD4", "GNLY", "MS4A1", "CD163", "S100A12", "TPSAB1", "PECAM1", "PDGFRB", "CA9")
pdf("2.Cluster/AnnotateCellType/cellType.coverage.plot.origin2.pdf", height = unit(3, "inches"))
res <- sapply(genes, function(x){
cat(x, "...\n")
regions <- FindRegion(object = scATAC.data, region = x, assay = "Peaks", extend.upstream = 1000, extend.downstream = 1000)
idx <- data.frame(findOverlaps(regions, promoter))
if(nrow(idx)>0){
p <- CoveragePlot(
object = scATAC.data,
region = x,
ranges.title = "MACS2",
links = F,
peaks = T,
extend.upstream = 1000,
extend.downstream = 1000,
region.highlight = promoter[idx[,2],])
}else{
p <- CoveragePlot(
object = scATAC.data,
region = x,
ranges.title = "MACS2",
links = F,
extend.upstream = 1000,
extend.downstream = 1000,
peaks = T)
}
print(p)
return(regions)
})
dev.off()
pdf("2.Cluster/AnnotateCellType/cellType.coverage.plot2.pdf", width = unit(3, "inches"), height = unit(3, "inches"))
res <- sapply(genes, function(x){
cat(x, "...\n")
regions <- FindRegion(object = scATAC.data, region = x, assay = "Peaks", extend.upstream = 1000, extend.downstream = 1000)
idx <- data.frame(findOverlaps(regions, promoter))
if(nrow(idx)>0){
p <- CoveragePlot(
object = scATAC.data,
region = x,
ranges.title = "MACS2",
links = F,
peaks = F,
extend.upstream = 1000,
extend.downstream = 1000,
region.highlight = promoter[idx[,2],])
}else{
p <- CoveragePlot(
object = scATAC.data,
region = x,
ranges.title = "MACS2",
links = F,
extend.upstream = 1000,
extend.downstream = 1000,
peaks = F)
}
print(p)
return(regions)
})
dev.off()
############################# identify differentially accessible chromatin regions between celltypes
DefaultAssay(scATAC.data) <- "Peaks"
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
idents <- as.character(levels(scATAC.data))
cellType.DARs <- FindAllMarkers(scATAC.data,
test.use = 'LR',
logfc.threshold=0,
min.pct = 0.05, # often necessary to lower the min.pct threshold
latent.vars = "peak_region_fragments")
cf <- ClosestFeature(scATAC.data, regions = rownames(cellType.DARs)) # Find the closest feature to a given set of genomic regions
cellType.DARs <- cbind(cellType.DARs, gene=cf$gene_name, gene_biotype = cf$gene_biotype, type = cf$type, distance=cf$distance)
colnames(cellType.DARs)[6:7] <- c("cellType", "genomicRegion")
saveFormat <- lapply(idents, function(x){
index <- which(cellType.DARs$cellType == x)
DARs <- cellType.DARs[index,]
DARs.up <- DARs %>% filter(avg_log2FC>0) %>% arrange(desc(avg_log2FC))
DARs.down <- DARs %>% filter(avg_log2FC<0) %>% arrange(avg_log2FC)
DARs <- rbind(DARs.up, DARs.down)
return(DARs)
})
write.xlsx(saveFormat, file = "4.Peak/celltype.all.DARs.xlsx", sheetName = idents, rowNames = F)
saveRDS(cellType.DARs, file = "4.Peak/cellType.all.DARs.rds")
#require logfc.threshold >= 0.25 & p_val_adj < 0.05
cellType.sig.pos.DARs <- cellType.DARs %>% filter(avg_log2FC >=0.25 & p_val_adj < 0.05) %>% arrange(desc(avg_log2FC)) # 31925 peaks
saveFormat <- lapply(idents, function(x){
index <- which(cellType.sig.pos.DARs$cellType == x)
DARs <- cellType.sig.pos.DARs[index,]
DARs <- DARs %>% arrange(desc(avg_log2FC))
return(DARs)
})
names(saveFormat) <- idents
write.xlsx(saveFormat, file = "4.Peak/celltype.sig.pos.DARs.xlsx", sheetName = idents, rowNames = F)
saveRDS(cellType.sig.pos.DARs, file = "4.Peak/cellType.sig.pos.DARs.rds")
#plot--- differentially accessible chromatin regions heatmap
sig.region <- cellType.sig.pos.DARs %>% select(genomicRegion) %>% distinct()
#average fragment of each peak in each cell type
sig.region.mean <- AverageExpression(scATAC.data, features = sig.region$genomicRegion, assays = "Peaks")
sig.region.mean.scale <- scale(t(sig.region.mean$Peaks))
pdf("4.Peak/cellType.sig.pos.DAR.pdf")
Heatmap(sig.region.mean.scale, name = "z-score", show_column_dend = F, show_row_dend = F, show_column_names = F, row_names_gp = gpar(fontsize = 10), width = unit(10, "cm"), height = unit(8, "cm"))
dev.off()
##plot---DAR distribution
cellType.sig.pos.DARs.ratio <- as.data.frame(table(cellType.sig.pos.DARs$cellType))
cellType.sig.pos.DARs.ratio$Type <- rep("Lymphoid", nrow(cellType.sig.pos.DARs.ratio))
cellType.sig.pos.DARs.ratio$Type[c(4, 6, 12)] <- "Myeloid"
cellType.sig.pos.DARs.ratio$Type[c(3, 7, 8)] <- "Other"
cellType.sig.pos.DARs.ratio$Type[c(9)] <- "Tumor"
cellType.sig.pos.DARs.ratio$Type <- factor(cellType.sig.pos.DARs.ratio$Type, levels = c("Lymphoid", "Myeloid", "Tumor", "Other"))
pdf("4.Peak/cellType.sig.pos.DAR.ratio.pdf")
ggbarplot(cellType.sig.pos.DARs.ratio, x="Var1", y="Freq", fill = "Type", color = "Type",
sort.by.groups=FALSE, sort.val = "desc", palette = colors <- c("#00A087", "#4DBBD5", "#E64B35", "#3C5488"),#不按组排序
label = T, xlab = "", ylab = "Number of DAR") + rotate_x_text(60)
dev.off()
############################# cell tpye differentially accessible chromatin and genes
## load DEGs
scRNA.DEGs <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/2.Cluster/AnnotateCellType/cellType.sig.pos.DEGs.rds")
scRNA.DEGs <- scRNA.DEGs %>% filter(avg_log2FC >= 0.25 & p_val_adj < 0.05)
compared.idents <- as.character(levels(scATAC.data))
# calculate the intersection gene between scRNA-seq and scATAC-seq in same cell type
overlap.gene.list <- sapply(compared.idents, function(x){
idx <- which(scRNA.DEGs$cluster == x)
DEGs <- scRNA.DEGs$gene[idx]
idx <- which(cellType.sig.pos.DARs$cellType == x)
DARs <- unique(cellType.sig.pos.DARs$gene[idx])
overlap <- intersect(DEGs, DARs)
return(overlap)
})
saveRDS(overlap.gene.list, file = "4.Peak/DEG.DAR.overlap.genes.rds")
# calculate the differential ration
overlap.ratio <- sapply(names(overlap.gene.list), function(x){
genes <- overlap.gene.list[[x]]
#scRNA-seq
idx <- which(scRNA.DEGs$cluster == x)
DEGs <- length(scRNA.DEGs$gene[idx])
DEGs.with.DARs <- length(genes)
Prop.DEGs.with.DARs <- DEGs.with.DARs/DEGs
#scATAC-seq
idx <- which(cellType.sig.pos.DARs$cellType == x)
DARs <- length(cellType.sig.pos.DARs$genomicRegion[idx])
DARs.near.DEGs <- length(which(cellType.sig.pos.DARs$gene[idx] %in% genes))
Prop.DARs.near.DEGs <- DARs.near.DEGs/DARs
return(c(DEGs, DEGs.with.DARs, Prop.DEGs.with.DARs, DARs, DARs.near.DEGs, Prop.DARs.near.DEGs))
})
overlap.ratio <- t(overlap.ratio)
colnames(overlap.ratio) <- c("DEGs", "DEGs with DARs", "Prop DEGs with DARs", "DARs", "DARs near DEGs", "Prop DARs near DEGs")
#calculate the min max mean sd
res <- apply(overlap.ratio, 2, function(x){
return(c(min(x), max(x), mean(x), sd(x)))
})
rownames(res) <- c("min", "max", "mean", "sd")
write.xlsx(list(overlap.ratio, res), file = "4.Peak/overlap.ratio.xlsx", sheetName = c("overlap of DEGs and DARs", "Statistics"), rowNames = T)