Switch to unified view

a b/a_PreprocessPipeline/bulkATACpre.R
1
2
# MESSAGE -----------------------------------------------------------------
3
# 
4
# author: Yulin Lyu
5
# email: lvyulin@pku.edu.cn
6
# 
7
# require: R whatever
8
# 
9
# ---
10
11
# 0. Load packages --------------------------------------------------------
12
13
library(jsonlite)
14
library(tidyverse)
15
library(magrittr)
16
library(glue)
17
18
setwd("/mnt/d/proj/github/Protocols-4pub")
19
source("a_PreprocessPipeline/utils.R")
20
21
settingList <- fromJSON("a_PreprocessPipeline/bulkATACpre_conf_X.json")
22
cmdConf <- fromJSON("a_PreprocessPipeline/cmd_conf_X.json")
23
24
setwd(settingList$workDir)
25
26
# A. Rename ---------------------------------------------------------------
27
28
setwd("0_fastq")
29
30
list.files(".")
31
32
from_file <- list.files(".", "")
33
to_file <- gsub("", "", from_file)
34
35
file.rename(from_file, to_file)
36
37
setwd("..")
38
39
# B. Load data ------------------------------------------------------------
40
41
file_name <- list.files("0_fastq", ".gz")
42
file_name <- list.files("2_trim", ".gz") # after qc
43
44
R1 <- grep("_1\\.", file_name, value = T)
45
R2 <- grep("_2\\.", file_name, value = T)
46
47
sample_name <- gsub("_1\\..*", "", R1)
48
49
# 1. QC for raw data ------------------------------------------------------
50
51
dir.create("code")
52
53
qc_dir <- "1_qc"
54
qc_dir %>% dir.create()
55
qc_dir %>% str_c("code/", .) %>% dir.create()
56
57
qc <- settingList$fastqcDir
58
59
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 0_fastq/{file_name}")
60
cat(qc_cmd[1])
61
62
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
63
# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
64
65
# 2. Trim -----------------------------------------------------------------
66
67
trim_dir <- "2_trim"
68
trim_dir %>% dir.create()
69
trim_dir %>% str_c("code/", .) %>% dir.create()
70
dir.create(".2_untrim")
71
72
trim <- settingList$trimmomaticDir
73
adapter <- settingList$trimAdapter
74
75
trim_cmd <- glue(
76
  "java -jar {trim} PE -threads 48 \\
77
  0_fastq/{R1} 0_fastq/{R2} \\
78
  2_trim/{sample_name}_1.trim.fastq.gz \\
79
  .2_untrim/{sample_name}_1.untrim.fastq.gz \\
80
  2_trim/{sample_name}_2.trim.fastq.gz \\
81
  .2_untrim/{sample_name}_2.untrim.fastq.gz \\
82
  ILLUMINACLIP:{adapter}:2:30:7:1:true \\
83
  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > \\
84
  2_trim/{sample_name}.trim.log 2>&1"
85
)
86
cat(trim_cmd[1])
87
88
setCMD(trim_cmd, str_c("code/", trim_dir), 6)
89
# multiqc -o 2_trim -f -n trim 2_trim/*.log
90
91
# 3. QC for trimmed data --------------------------------------------------
92
93
# reload trimmed data first, or
94
file_name <- glue("{rep(sample_name, each = 2)}_{rep(1:2, length(sample_name))}.trim.fastq.gz")
95
R1 <- grep("_1\\.", file_name, value = T)
96
R2 <- grep("_2\\.", file_name, value = T)
97
98
qc_dir <- "3_qc"
99
qc_dir %>% dir.create()
100
qc_dir %>% str_c("code/", .) %>% dir.create()
101
102
qc <- settingList$fastqcDir
103
104
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 {trim_dir}/{file_name}")
105
cat(qc_cmd[1])
106
107
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
108
# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
109
110
# 4. Map ------------------------------------------------------------------
111
112
map_dir <- "4_map"
113
map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
114
115
bt2 <- settingList$bt2Dir
116
ref <- settingList$mapRef
117
118
map_cmd <- glue(
119
  "{bt2} -p 48 --very-sensitive -X 2000 -x {ref} \\
120
  -1 {trim_dir}/{R1} -2 {trim_dir}/{R2} \\
121
  -S {map_dir}/{sample_name}.sam > \\
122
  {map_dir}/{sample_name}.log 2>&1"
123
)
124
cat(map_cmd[1])
125
126
setCMD(map_cmd, str_c("code/", map_dir), 6)
127
# multiqc -o 4_map -f -n map 4_map/*.log
128
129
# 5. Sort -----------------------------------------------------------------
130
131
sort_dir <- "5_sort"
132
sort_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
133
134
samt <- settingList$samtDir
135
136
sort_cmd <- glue(
137
  "{samt} view -@ 24 -bS {map_dir}/{sample_name}.sam | \\
138
  {samt} sort -@ 24 > \\
139
  {sort_dir}/{sample_name}.bam"
140
)
141
cat(sort_cmd[1])
142
143
setCMD(sort_cmd, str_c("code/", sort_dir), 6)
144
145
# 6. Dedup ----------------------------------------------------------------
146
147
dedup_dir <- "6_dedup"
148
dedup_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
149
150
picard <- settingList$picardDir
151
152
dedup_cmd <- glue(
153
  "java -Xms16g -Xmx256g -jar {picard} MarkDuplicates \\
154
  I={sort_dir}/{sample_name}.bam \\
155
  O={dedup_dir}/{sample_name}.dedup.bam \\
156
  M={dedup_dir}/{sample_name}.dedup.txt \\
157
  REMOVE_DUPLICATES=true > \\
158
  {dedup_dir}/{sample_name}.dedup.log 2>&1"
159
)
160
cat(dedup_cmd[1])
161
162
setCMD(dedup_cmd, str_c("code/", dedup_dir), 6)
163
164
# 7. Filter ---------------------------------------------------------------
165
166
fil_dir <- "7_filter"
167
fil_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
168
169
samt <- settingList$samtDir
170
171
fil_cmd <- glue(
172
  "{samt} view -@ 20 -h -f 2 -q 30 {dedup_dir}/{sample_name}.dedup.bam | \\
173
  egrep -v '\\bMT|\\bGL|\\bJH' | \\
174
  {samt} sort -@ 20 -O bam > \\
175
  {fil_dir}/{sample_name}.filter.bam"
176
)
177
cat(fil_cmd[1])
178
179
setCMD(fil_cmd, str_c("code/", fil_dir), 6)
180
181
# 8. Insert ---------------------------------------------------------------
182
183
ins_dir <- "8_insert"
184
ins_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
185
186
picard <- settingList$picardDir
187
188
ins_cmd <- glue(
189
  "java -Xms16g -Xmx256g -jar {picard} CollectInsertSizeMetrics \\
190
  I={fil_dir}/{sample_name}.filter.bam \\
191
  O={ins_dir}/{sample_name}.txt \\
192
  H={ins_dir}/{sample_name}.pdf > \\
193
  {ins_dir}/{sample_name}.log 2>&1"
194
)
195
cat(ins_cmd[1])
196
197
setCMD(ins_cmd, str_c("code/", ins_dir), 6)
198
199
# 9. Shift ----------------------------------------------------------------
200
201
shift_dir <- "9_shift"
202
shift_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
203
204
bedt <- settingList$bedtDir
205
206
shift_cmd <- glue(
207
  "{bedt} bamtobed -i {fil_dir}/{sample_name}.filter.bam | \\
208
  {awk_cmd} \\
209
  > {shift_dir}/{sample_name}.shift.bed",
210
  awk_cmd = "awk -F '\\t' 'BEGIN {OFS = FS}{ if ($6 == \"+\") {$2 = $2 + 4} else if ($6 == \"-\") {$3 = $3 - 5} print $0}'"
211
)
212
cat(shift_cmd[1])
213
214
setCMD(shift_cmd, str_c("code/", shift_dir), 6)
215
216
# 10. Peak ----------------------------------------------------------------
217
218
macs2 <- settingList$macs2Dir
219
220
peak_dir <- "10_peak"
221
peak_dir %>% dir.create()
222
223
org <- "hs"
224
org <- "mm"
225
226
peak_cmd <- glue("
227
  {macs2} callpeak -t {shift_dir}/{sample_name}.shift.bed \\
228
  -g {org} --nomodel --shift -100 --extsize 200 --keep-dup all -n {sample_name} \\
229
  -B --outdir {peak_dir} > {peak_dir}/{sample_name}.log 2>&1 &")
230
cat(peak_cmd[1])
231
232
group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
233
234
group_peak_cmd <- glue(
235
  "{macs2} callpeak -t {peak_group} \\
236
  -g {org} --nomodel --shift -100 --extsize 200 --keep-dup all --SPMR -n group_{group} \\
237
  -B --outdir {peak_dir} > {peak_dir}/group_{group}.log 2>&1 &",
238
  peak_group = map_chr(group_name, ~ paste(glue("{shift_dir}/{.x}.shift.bed"), collapse = " ")),
239
  group = names(group_name))
240
cat(group_peak_cmd[1])
241
242
write.table(c("#!/bin/bash\n", peak_cmd), glue("code/{peak_dir}.sh"), quote = F, row.names = F, col.names = F)
243
write.table(c("#!/bin/bash\n", group_peak_cmd), glue("code/{peak_dir}_group.sh"), quote = F, row.names = F, col.names = F)
244
245
# * * 2.11. Bigwig --------------------------------------------------------
246
247
bw_dir <- "11_bw"
248
bw_dir %>% dir.create()
249
250
dt <- settingList$dtDir
251
bdg2bw <- settingList$bdg2bwDir
252
ref_len <- settingList$refLen
253
254
index_cmd <- glue("{st} index {fil_dir}/{sample_name}.filter.bam &")
255
cat(index_cmd[1])
256
257
bw_cmd <- glue("
258
  {dt}bamCoverage --normalizeUsing RPKM -of bigwig -b {fil_dir}/{sample_name}.filter.bam \\
259
  -o {bw_dir}/{sample_name}.bw &")
260
cat(bw_cmd[1])
261
262
group_bw_cmd <- glue(
263
  "{bdg2bw} {peak_dir}/group_{group}_treat_pileup.bdg {ref_len} {bw_dir}/{group}.bw &",
264
  group = names(group_name))
265
cat(group_bw_cmd[1])
266
267
sample_name <- list.files(bw_dir, "-[0-9].bw") %>% str_replace(".bw$", "")
268
group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
269
270
cor_cmd <- glue(
271
  "{dt}multiBigwigSummary bins -p 20 -b {bws} -o {bw_dir}/bw_cor.npz &",
272
  bws = str_c(glue("{bw_dir}/{sample_name}.bw"), collapse = " \\\n"))
273
cat(cor_cmd)
274
275
group_cor_cmd <- glue(
276
  "{dt}multiBigwigSummary bins -p 20 -b {group_bws} -o {bw_dir}/group_bw_cor.npz &",
277
  group_bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
278
cat(group_cor_cmd)
279
280
corplot_cmd <- glue(
281
  "{dt}plotCorrelation -in {bw_dir}/bw_cor.npz -c pearson -p heatmap --plotNumbers -o {bw_dir}/cor_heatmap.pdf --colorMap RdBu_r &")
282
cat(corplot_cmd)
283
284
group_corplot_cmd <- glue(
285
  "{dt}plotCorrelation -in {bw_dir}/group_bw_cor.npz -c pearson -p heatmap --plotNumbers -o {bw_dir}/group_cor_heatmap.pdf --colorMap RdBu_r &")
286
cat(group_corplot_cmd)
287
288
write.table(c("#!/bin/bash\n", index_cmd), glue("code/{bw_dir}_index.sh"), quote = F, row.names = F, col.names = F)
289
write.table(c("#!/bin/bash\n", bw_cmd), glue("code/{bw_dir}.sh"), quote = F, row.names = F, col.names = F)
290
write.table(c("#!/bin/bash\n", group_bw_cmd), glue("code/{bw_dir}_group.sh"), quote = F, row.names = F, col.names = F)
291
292
write.table(c("#!/bin/bash\n", cor_cmd), glue("code/{bw_dir}_cor.sh"), quote = F, row.names = F, col.names = F)
293
write.table(c("#!/bin/bash\n", group_cor_cmd), glue("code/{bw_dir}_cor_group.sh"), quote = F, row.names = F, col.names = F)
294
write.table(c("#!/bin/bash\n", corplot_cmd), glue("code/{bw_dir}_plot_cor.sh"), quote = F, row.names = F, col.names = F)
295
write.table(c("#!/bin/bash\n", group_corplot_cmd), glue("code/{bw_dir}_plot_cor_group.sh"), quote = F, row.names = F, col.names = F)
296
297
# * * 2.12. IDR -----------------------------------------------------------
298
299
idr_dir <- "12_idr"
300
idr_dir %>% dir.create()
301
302
idr <- settingList$idrDir
303
304
idr_cmd <- map(group_name, ~ as_tibble(combn(.x, 2)) %>% map(~ paste(glue("{peak_dir}/{.x}_peaks.narrowPeak"), collapse = " "))) %>% 
305
  imap(~ glue("{idr} --samples {.x} --peak-list {peak_dir}/group_{.y}_peaks.narrowPeak \\
306
              --input-file-type narrowPeak --output-file {idr_dir}/{.y}")) %>% 
307
  map(~ imap(.x, ~ glue("{.x}-{.y}.idr.narrowPeak &"))) %>% unlist() %>% unname()
308
cat(idr_cmd[1])
309
310
write.table(c("#!/bin/bash\n", idr_cmd), glue("code/{idr_dir}.sh"), quote = F, row.names = F, col.names = F)
311
312
group_index <- map(group_name, ~ choose(length(.x), 2)) %>% imap(~ rep(.y, .x)) %>% unlist() %>% unname()
313
idr_file <- list.files(idr_dir, "idr", full.names = T)
314
idr_data <- map(idr_file, ~ read_delim(.x, "\t", col_names = F))
315
idr_peak <- map(idr_data, ~ .x[.x[[5]] > 540, ]) %>%
316
  map(~ str_c(.x[[1]], .x[[2]], .x[[3]], sep = "_")) %>%
317
  tapply(group_index, c) %>%
318
  map(~ unlist(.x) %>% unique)
319
320
group_file <- list.files(peak_dir, "^group.*Peak", full.names = T)
321
group_data <- map(group_file, ~ read.table(.x, sep = "\t", stringsAsFactors = F))
322
group_peak <- map(group_data, ~ str_c(.x[[1]], .x[[2]], .x[[3]], sep = "_"))
323
324
groupData <- pmap(list(idr_peak, group_peak, group_data), function(a, b, c) {c[b %in% a, ]})
325
326
iwalk(group_data, ~ write.table(.x, glue("{idr_dir}/{.y}.narrowPeak"), col.names = F, row.names = F, quote = F, sep = "\t"))
327
328
# * * 2.13. Motif ---------------------------------------------------------
329
330
motif_dir <- "13_motif"
331
motif_dir %>% dir.create()
332
region_file <- list.files(motif_dir, "bed")
333
334
homer <- settingList$homerDir
335
336
org <- "hg19"
337
org <- "mm10"
338
339
motif_cmd <- glue(
340
  "{homer}findMotifsGenome.pl {motif_dir}/{region_file} {org} {motif_dir}/{rigion_name} \\
341
  -size 200 -len 8,10,12 > {region_file}/{region_name}.log 2>&1 &",
342
  region_name = str_replace(region_file, ".bed", ""))
343
cat(motif_cmd[1])
344
345
write.table(c("#!/bin/bash\n", motif_cmd), glue("code/{motif_dir}.sh"), quote = F, row.names = F, col.names = F)
346
347
# * * 2.14. Heat ----------------------------------------------------------
348
349
heat_dir <- "14_heat"
350
heat_dir %>% dir.create()
351
352
sample_name <- list.files(bw_dir, "-[123].bw") %>% str_replace(".bw$", "")
353
group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
354
355
# for global
356
mat_cmd <- glue(
357
  "{dt}computeMatrix reference-point -a 2000 -b 2000 -p 20 -R {heat_dir}/hg19geneTSS.bed -S \\
358
  {bws} -o {heat_dir}/TSS_mtx.gz &",
359
  bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
360
cat(mat_cmd)
361
362
heat_cmd <- glue(
363
  "{dt}plotHeatmap -m {heat_dir}/TSS_mtx.gz --colorMap RdBu_r -o {heat_dir}/TSS_heatmap.pdf \\
364
  --outFileNameMatrix {heat_dir}/TSS_value.txt --outFileSortedRegions {heat_dir}/TSS_region.bed &")
365
cat(heat_cmd)
366
367
peak_group <- list.files(heat_dir, "peakGroup", full.names = T)
368
369
# for peaks
370
mat_cmd <- glue(
371
  "{dt}computeMatrix scale-regions -a 1000 -b 1000 -p 20 -R {peak} -S \\
372
  {bws} -o {heat_dir}/peak_mtx.gz &",
373
  peak = str_c(peak_group, collapse = " "),
374
  bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
375
cat(mat_cmd)
376
377
heat_cmd <- glue(
378
  "{dt}plotHeatmap -m {heat_dir}/peak_mtx.gz --colorMap RdBu_r -o {heat_dir}/peak_heatmap.pdf \\
379
  --outFileNameMatrix {heat_dir}/peak_value.txt --outFileSortedRegions {heat_dir}/peak_region.bed &")
380
cat(heat_cmd)
381
382
write.table(c("#!/bin/bash\n", mat_cmd), glue("code/{heat_dir}_mtx.sh"), quote = F, row.names = F, col.names = F)
383
write.table(c("#!/bin/bash\n", heat_cmd), glue("code/{heat_dir}.sh"), quote = F, row.names = F, col.names = F)
384
385
# * * 2.15. Anno ----------------------------------------------------------
386
387
anno_dir <- "15_anno"
388
anno_dir %>% dir.create()
389
390
known_motif <- settingList$knownMotif
391
392
peak_file <- list.files(anno_dir, ".homer")
393
394
anno_cmd <- glue(
395
  "{homer}annotatePeaks.pl {anno_dir}/{peak_file} {org} -m {known_motif} -mscore > {anno_dir}/{peak_name}.txt &",
396
  peak_name = str_remove(peak_file, ".homer")
397
)
398
cat(anno_cmd)
399
400
write.table(c("#!/bin/bash\n", anno_cmd), glue("code/{anno_dir}.sh"), quote = F, row.names = F, col.names = F)
401