a b/ATAC/AnalysisPipeline/4.1.callPeak&DAR.R
1
#' @description peak calling
2
3
library(Signac)
4
library(Seurat)
5
library(GenomeInfoDb)
6
library(EnsDb.Hsapiens.v86) #---GRCh38 (hg38)
7
library(ggplot2)
8
library(patchwork)
9
set.seed(101)
10
library(GenomicRanges)
11
library(ggpubr)
12
library(tibble)
13
library(dplyr)
14
library(ComplexHeatmap)
15
library(circlize)
16
library(openxlsx)
17
18
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC")
19
20
scATAC.data <- readRDS("scATAC.data.rds")
21
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
22
DefaultAssay(scATAC.data) <- "ATAC"
23
24
####To call peaks on each annotated cell type, we can use the group.by argument
25
peaks <- CallPeaks(
26
  object = scATAC.data,
27
  group.by = "AnnotatedcellType",
28
  macs2.path = "/home/longzhilin/miniconda3/envs/SingleCell/bin/macs2",
29
  outdir = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210721/scATAC/4.Peak"
30
)
31
saveRDS(peaks, "4.Peak/cellType.peak.rds") 
32
33
library(future)
34
plan("multiprocess", workers = 10)
35
options(future.globals.maxSize = 100000 * 1024^2) 
36
# remove peaks on nonstandard chromosomes and in genomic blacklist regions
37
peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")
38
peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_hg38_unified, invert = TRUE)
39
# quantify counts in each peak
40
macs2_counts <- FeatureMatrix(
41
  fragments = Fragments(scATAC.data), # from cellranger fragment result
42
  features = peaks,
43
  cells = colnames(scATAC.data)
44
)
45
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
46
seqlevelsStyle(annotation) <- "UCSC"
47
genome(annotation) <- "hg38"
48
saveRDS(annotation, file = "4.Peak/annotation.rds")
49
# create a new assay using the MACS2 peak set and add it to the Seurat object
50
scATAC.data[["Peaks"]] <- CreateChromatinAssay(
51
  counts = macs2_counts,
52
  fragments = Fragments(scATAC.data),
53
  annotation = annotation,
54
  genome = "hg38"
55
)
56
DefaultAssay(scATAC.data) <- "Peaks"
57
scATAC.data <- RunTFIDF(scATAC.data)
58
gene.activities <- GeneActivity(scATAC.data)
59
# add the gene activity matrix to the Seurat object as a new assay and normalize it
60
scATAC.data[['Macs2ACTIVITY']] <- CreateAssayObject(counts = gene.activities)
61
scATAC.data <- NormalizeData(
62
  object = scATAC.data,
63
  assay = 'Macs2ACTIVITY',
64
  normalization.method = 'LogNormalize',
65
  scale.factor = median(scATAC.data$nCount_Macs2ACTIVITY)
66
)
67
saveRDS(scATAC.data, "scATAC.data.pro.rds") #### macs2 calling
68
69
pdf("4.Peak/CA9.macs2.pdf")
70
CoveragePlot(
71
  object = scATAC.data,
72
  region = "CA9",
73
  assay = "Peaks",
74
  features = "CA9",
75
  ranges.title = "MACS2",
76
  expression.assay = "Macs2ACTIVITY",
77
  annotation = TRUE,
78
  peaks = F,
79
  links = F
80
)
81
tile_plot <- TilePlot(
82
  object = scATAC.data,
83
  region = "CA9"
84
)
85
print(tile_plot)
86
dev.off()
87
88
pdf("4.Peak/CA9.pdf")
89
CoveragePlot(
90
  object = scATAC.data,
91
  assay = "ATAC",
92
  expression.assay = "ACTIVITY",
93
  region = "CA9",
94
  features = "CA9",
95
  annotation = TRUE,
96
  peaks = TRUE,
97
  links = TRUE
98
)
99
dev.off()
100
101
###coverage plot of marker genes
102
source("/home/longzhilin/Analysis_Code/SingleCell/FindRegion.R")
103
library(ChIPseeker)
104
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
105
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
106
promoter <- getPromoters(TxDb=txdb, upstream=1000, downstream=1000)
107
plot.cellType <- rev(c("CD4+ T cell", "Treg", "CD8+ T cell", "NK/NKT cell", "B cell", "Macrophage", "Monocyte", "Mast cell", "Endothelium (VCAM1+)", "Endothelium (VCAM1-)", "Mesangial cell", "Tumor"))
108
scATAC.data$AnnotatedcellType <- factor(scATAC.data$AnnotatedcellType, levels = plot.cellType)
109
DefaultAssay(scATAC.data) <- "Peaks"
110
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
111
112
cell.type.markers <- read.table(file = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/2.Cluster/AnnotateCellType/cellMarker.txt", header = T, stringsAsFactors = F, sep = "\t")
113
genes <- cell.type.markers$Gene
114
genes <- genes[-19]
115
116
genes <- c("CD8A", "CD4", "GNLY", "MS4A1", "CD163", "S100A12", "TPSAB1", "PECAM1", "PDGFRB", "CA9")
117
pdf("2.Cluster/AnnotateCellType/cellType.coverage.plot.origin2.pdf", height = unit(3, "inches"))
118
res <- sapply(genes, function(x){
119
  cat(x, "...\n")
120
  regions <- FindRegion(object = scATAC.data, region = x, assay = "Peaks", extend.upstream = 1000, extend.downstream = 1000)
121
  idx <- data.frame(findOverlaps(regions, promoter))
122
  if(nrow(idx)>0){
123
    p <- CoveragePlot(
124
      object = scATAC.data,
125
      region = x,
126
      ranges.title = "MACS2",
127
      links = F,
128
      peaks = T,
129
      extend.upstream = 1000,
130
      extend.downstream = 1000,
131
      region.highlight = promoter[idx[,2],])
132
  }else{
133
    p <- CoveragePlot(
134
      object = scATAC.data,
135
      region = x,
136
      ranges.title = "MACS2",
137
      links = F,
138
      extend.upstream = 1000,
139
      extend.downstream = 1000,
140
      peaks = T)
141
  }
142
  print(p)
143
  return(regions)
144
})
145
dev.off()
146
147
pdf("2.Cluster/AnnotateCellType/cellType.coverage.plot2.pdf", width = unit(3, "inches"), height = unit(3, "inches"))
148
res <- sapply(genes, function(x){
149
  cat(x, "...\n")
150
  regions <- FindRegion(object = scATAC.data, region = x, assay = "Peaks", extend.upstream = 1000, extend.downstream = 1000)
151
  idx <- data.frame(findOverlaps(regions, promoter))
152
  if(nrow(idx)>0){
153
    p <- CoveragePlot(
154
      object = scATAC.data,
155
      region = x,
156
      ranges.title = "MACS2",
157
      links = F,
158
      peaks = F,
159
      extend.upstream = 1000,
160
      extend.downstream = 1000,
161
      region.highlight = promoter[idx[,2],])
162
  }else{
163
    p <- CoveragePlot(
164
      object = scATAC.data,
165
      region = x,
166
      ranges.title = "MACS2",
167
      links = F,
168
      extend.upstream = 1000,
169
      extend.downstream = 1000,
170
      peaks = F)
171
  }
172
  print(p)
173
  return(regions)
174
})
175
dev.off()
176
177
############################# identify differentially accessible chromatin regions between celltypes
178
DefaultAssay(scATAC.data) <- "Peaks"
179
Idents(scATAC.data) <- scATAC.data$AnnotatedcellType
180
idents <- as.character(levels(scATAC.data))
181
cellType.DARs <- FindAllMarkers(scATAC.data, 
182
                                test.use = 'LR',
183
                                logfc.threshold=0, 
184
                                min.pct = 0.05, # often necessary to lower the min.pct threshold
185
                                latent.vars = "peak_region_fragments")
186
cf <- ClosestFeature(scATAC.data, regions = rownames(cellType.DARs)) # Find the closest feature to a given set of genomic regions
187
cellType.DARs <- cbind(cellType.DARs, gene=cf$gene_name, gene_biotype = cf$gene_biotype, type = cf$type, distance=cf$distance)
188
colnames(cellType.DARs)[6:7] <- c("cellType", "genomicRegion")
189
saveFormat <- lapply(idents, function(x){
190
  index <- which(cellType.DARs$cellType == x)
191
  DARs <- cellType.DARs[index,]
192
  DARs.up <- DARs %>% filter(avg_log2FC>0) %>% arrange(desc(avg_log2FC))
193
  DARs.down <- DARs %>% filter(avg_log2FC<0) %>% arrange(avg_log2FC)
194
  DARs <- rbind(DARs.up, DARs.down)
195
  return(DARs)
196
})
197
write.xlsx(saveFormat, file = "4.Peak/celltype.all.DARs.xlsx", sheetName = idents, rowNames = F)
198
saveRDS(cellType.DARs, file = "4.Peak/cellType.all.DARs.rds")
199
200
#require logfc.threshold >= 0.25 & p_val_adj < 0.05
201
cellType.sig.pos.DARs <- cellType.DARs %>% filter(avg_log2FC >=0.25 & p_val_adj < 0.05) %>% arrange(desc(avg_log2FC)) # 31925 peaks
202
saveFormat <- lapply(idents, function(x){
203
  index <- which(cellType.sig.pos.DARs$cellType == x)
204
  DARs <- cellType.sig.pos.DARs[index,]
205
  DARs <- DARs %>% arrange(desc(avg_log2FC))
206
  return(DARs)
207
})
208
names(saveFormat) <- idents
209
write.xlsx(saveFormat, file = "4.Peak/celltype.sig.pos.DARs.xlsx", sheetName = idents, rowNames = F)
210
saveRDS(cellType.sig.pos.DARs, file = "4.Peak/cellType.sig.pos.DARs.rds")
211
212
#plot--- differentially accessible chromatin regions heatmap
213
sig.region <- cellType.sig.pos.DARs %>% select(genomicRegion) %>% distinct() 
214
#average fragment of each peak in each cell type
215
sig.region.mean <- AverageExpression(scATAC.data, features = sig.region$genomicRegion, assays = "Peaks")
216
sig.region.mean.scale <- scale(t(sig.region.mean$Peaks))
217
pdf("4.Peak/cellType.sig.pos.DAR.pdf")
218
Heatmap(sig.region.mean.scale, name = "z-score", show_column_dend = F, show_row_dend = F, show_column_names = F, row_names_gp = gpar(fontsize = 10), width = unit(10, "cm"), height = unit(8, "cm"))
219
dev.off()
220
221
##plot---DAR distribution
222
cellType.sig.pos.DARs.ratio <- as.data.frame(table(cellType.sig.pos.DARs$cellType))
223
cellType.sig.pos.DARs.ratio$Type <- rep("Lymphoid", nrow(cellType.sig.pos.DARs.ratio))
224
cellType.sig.pos.DARs.ratio$Type[c(4, 6, 12)] <- "Myeloid"
225
cellType.sig.pos.DARs.ratio$Type[c(3, 7, 8)] <- "Other"
226
cellType.sig.pos.DARs.ratio$Type[c(9)] <- "Tumor"
227
cellType.sig.pos.DARs.ratio$Type <- factor(cellType.sig.pos.DARs.ratio$Type, levels = c("Lymphoid", "Myeloid", "Tumor", "Other"))
228
pdf("4.Peak/cellType.sig.pos.DAR.ratio.pdf")
229
ggbarplot(cellType.sig.pos.DARs.ratio, x="Var1", y="Freq", fill = "Type", color = "Type",
230
          sort.by.groups=FALSE, sort.val = "desc", palette = colors <- c("#00A087", "#4DBBD5", "#E64B35", "#3C5488"),#不按组排序
231
          label = T, xlab = "", ylab = "Number of DAR") + rotate_x_text(60)
232
dev.off()
233
234
############################# cell tpye differentially accessible chromatin and genes
235
## load DEGs
236
scRNA.DEGs <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/2.Cluster/AnnotateCellType/cellType.sig.pos.DEGs.rds")
237
scRNA.DEGs <- scRNA.DEGs %>% filter(avg_log2FC >= 0.25 & p_val_adj < 0.05)
238
compared.idents <- as.character(levels(scATAC.data))
239
240
# calculate the intersection gene between scRNA-seq and scATAC-seq in same cell type
241
overlap.gene.list <- sapply(compared.idents, function(x){
242
  idx <- which(scRNA.DEGs$cluster == x)
243
  DEGs <- scRNA.DEGs$gene[idx]
244
245
  idx <- which(cellType.sig.pos.DARs$cellType == x)
246
  DARs <- unique(cellType.sig.pos.DARs$gene[idx])
247
248
  overlap <- intersect(DEGs, DARs)
249
  return(overlap)
250
})
251
saveRDS(overlap.gene.list, file = "4.Peak/DEG.DAR.overlap.genes.rds")
252
253
# calculate the differential ration
254
overlap.ratio <- sapply(names(overlap.gene.list), function(x){
255
  genes <- overlap.gene.list[[x]]
256
257
  #scRNA-seq
258
  idx <- which(scRNA.DEGs$cluster == x)
259
  DEGs <- length(scRNA.DEGs$gene[idx])
260
  DEGs.with.DARs <- length(genes)
261
  Prop.DEGs.with.DARs <- DEGs.with.DARs/DEGs
262
263
  #scATAC-seq
264
  idx <- which(cellType.sig.pos.DARs$cellType == x)
265
  DARs <- length(cellType.sig.pos.DARs$genomicRegion[idx])
266
  DARs.near.DEGs <- length(which(cellType.sig.pos.DARs$gene[idx] %in% genes))
267
  Prop.DARs.near.DEGs <- DARs.near.DEGs/DARs
268
  return(c(DEGs, DEGs.with.DARs, Prop.DEGs.with.DARs, DARs, DARs.near.DEGs, Prop.DARs.near.DEGs))
269
})
270
overlap.ratio <- t(overlap.ratio)
271
colnames(overlap.ratio) <- c("DEGs", "DEGs with DARs", "Prop DEGs with DARs", "DARs", "DARs near DEGs", "Prop DARs near DEGs")
272
273
#calculate the min max mean sd
274
res <- apply(overlap.ratio, 2, function(x){
275
  return(c(min(x), max(x), mean(x), sd(x)))
276
})
277
rownames(res) <- c("min", "max", "mean", "sd")
278
write.xlsx(list(overlap.ratio, res), file = "4.Peak/overlap.ratio.xlsx", sheetName = c("overlap of DEGs and DARs", "Statistics"), rowNames = T)