Switch to unified view

a b/a_PreprocessPipeline/ChIPpre.R
1
2
# MESSAGE -----------------------------------------------------------------
3
#
4
# author: Yulin Lyu
5
# email: lvyulin@pku.edu.cn
6
#
7
# require: R whatever
8
#
9
# ---
10
11
# * 0. Setting ------------------------------------------------------------
12
13
settingList <- list(
14
  workDir = ".",
15
  trimmomaticDir = "app/Trimmomatic-0.39",
16
  trimAdapter = "TruSeq3-PE-2.fa",
17
  mapRef = "fasta/genome",
18
  picardDir = "app/picard.jar",
19
  bdg2bwDir = "app/bedGraphToBigWig",
20
  refLen = "fasta/genome.fa.fai"
21
)
22
23
# * 1. Load packages ------------------------------------------------------
24
25
setwd(settingList$workDir)
26
27
library(tidyverse)
28
library(magrittr)
29
library(glue)
30
31
# * 2. Preprocess ---------------------------------------------------------
32
33
setwd("0_fastq")
34
35
list.files(".")
36
37
from_file <- list.files(".", "")
38
to_file <- gsub("", "", from_file)
39
40
file.rename(from_file, to_file)
41
42
setwd("..")
43
44
# * * 2.0. Load data ------------------------------------------------------
45
46
file_name <- list.files("0_fastq", ".gz")
47
file_name <- list.files("2_trim", ".gz") # after qc
48
49
R1 <- grep("_1\\.", file_name, value = T)
50
R2 <- grep("_2\\.", file_name, value = T)
51
52
sample_name <- gsub("_1\\..*", "", R1)
53
54
# * * 2.1. QC for raw data ------------------------------------------------
55
56
dir.create("code")
57
58
qc_dir <- "1_qc"
59
qc_dir %>% dir.create()
60
61
qc_cmd <- glue("fastqc -o {qc_dir} -t 8 0_fastq/{file_name} &")
62
cat(qc_cmd[1])
63
64
write.table(c("#!/bin/bash\n", qc_cmd), glue("code/{qc_dir}.sh"), quote = F, row.names = F, col.names = F)
65
# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
66
67
# * * 2.2. Trim -----------------------------------------------------------
68
69
dir.create("2_trim")
70
dir.create(".2_untrim")
71
72
trim <- settingList$trimmomaticDir
73
adapter <- settingList$trimAdapter
74
75
trim_cmd <- glue(
76
  "java -jar {trim}/trimmomatic-0.39.jar PE -threads 10 \\
77
  0_fastq/{R1} 0_fastq/{R2} \\
78
  2_trim/{sample_name}_1.trim.fastq.gz \\
79
  .2_untrim/{sample_name}_1.untrim.fastq.gz \\
80
  2_trim/{sample_name}_2.trim.fastq.gz \\
81
  .2_untrim/{sample_name}_2.untrim.fastq.gz \\
82
  ILLUMINACLIP:{trim}/adapters/{adapter}:2:30:7:1:true \\
83
  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > 2_trim/{sample_name}.trim.log 2>&1")
84
cat(trim_cmd[1])
85
86
trim_dir <- "2_trim"
87
trim_dir %>% str_c("code/", .) %>% dir.create()
88
89
setCMD(trim_cmd, str_c("code/", trim_dir), 1, F)
90
91
# * * 2.3. QC for trimmed data --------------------------------------------
92
93
# reload trimmed data first
94
95
qc_dir <- "3_qc"
96
qc_dir %>% dir.create()
97
98
qc_cmd <- glue("fastqc -o {qc_dir} -t 8 {trim_dir}/{file_name} &")
99
cat(qc_cmd[1])
100
101
write.table(c("#!/bin/bash\n", qc_cmd), glue("code/{qc_dir}.sh"), quote = F, row.names = F, col.names = F)
102
# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
103
104
# * * 2.4. Map ------------------------------------------------------------
105
106
ref <- settingList$mapRef
107
108
map_dir <- "4_map"
109
map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
110
111
map_cmd <- glue(
112
  "bowtie2 -p 20 --very-sensitive -X 2000 -x {ref} \\
113
  -1 {trim_dir}/{R1} -2 {trim_dir}/{R2} \\
114
  -S {map_dir}/{sample_name}.sam > {map_dir}/{sample_name}.log 2>&1")
115
cat(map_cmd[1])
116
117
setCMD(map_cmd, str_c("code/", map_dir), 1, T)
118
119
# * * 2.5. Sort -----------------------------------------------------------
120
121
sort_dir <- "5_sort"
122
sort_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
123
124
sort_cmd <- glue(
125
  "samtools view -@ 10 -bS {map_dir}/{sample_name}.sam | \\
126
  samtools sort -@ 10 > {sort_dir}/{sample_name}.bam")
127
cat(sort_cmd[1])
128
129
setCMD(sort_cmd, str_c("code/", sort_dir), 1, F)
130
131
# * * 2.6. Dedup ----------------------------------------------------------
132
133
picard <- settingList$picardDir
134
135
dedup_dir <- "6_dedup"
136
dedup_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
137
138
dedup_cmd <- glue(
139
  "java -Xms2g -Xmx8g -XX:ParallelGCThreads=8 -jar {picard} MarkDuplicates \\
140
  I={sort_dir}/{sample_name}.bam \\
141
  O={dedup_dir}/{sample_name}.dedup.bam \\
142
  M={dedup_dir}/{sample_name}.dedup.txt \\
143
  REMOVE_DUPLICATES=true")
144
cat(dedup_cmd[1])
145
146
setCMD(dedup_cmd, str_c("code/", dedup_dir), 1, F)
147
148
# * * 2.7. Filter ---------------------------------------------------------
149
150
fil_dir <- "7_filter"
151
fil_dir %>% dir.create()
152
153
filter_cmd <- glue(
154
  "samtools view -h -f 2 -q 30 {dedup_dir}/{sample_name}.dedup.bam | \\
155
  egrep -v 'MT|GL|JH' | samtools sort -@ 4 -O bam > {fil_dir}/{sample_name}.filter.bam &")
156
cat(filter_cmd[1])
157
158
write.table(c("#!/bin/bash\n", filter_cmd), glue("code/{fil_dir}.sh"), quote = F, row.names = F, col.names = F)
159
160
# * * 2.8. Finger ---------------------------------------------------------
161
162
fin_dir <- "8_finger"
163
fin_dir %>% dir.create()
164
165
index_cmd <- glue("samtools index {fil_dir}/{sample_name}.filter.bam {fil_dir}/{sample_name}.filter.bam.bai &")
166
cat(index_cmd)
167
168
write.table(c("#!/bin/bash\n", index_cmd), "code/8_index.sh", quote = F, row.names = F, col.names = F)
169
170
finger_group <- tapply(sample_name, gsub(".*-", "", sample_name), function(x) glue("{fil_dir}/{x}.filter.bam"))
171
finger_group %<>% map(~ paste(.x, collapse = " "))
172
173
finger_cmd <- glue("
174
  plotFingerprint -b {finger_group} -o {fin_dir}/{names(finger_group)}.finger.pdf &")
175
cat(finger_cmd[1])
176
177
write.table(c("#!/bin/bash\n", finger_cmd), glue("code/{fin_dir}.sh"), quote = F, row.names = F, col.names = F)
178
179
# * * 2.9. Peak -----------------------------------------------------------
180
181
peak_dir <- "9_peak"
182
peak_dir %>% dir.create()
183
184
input_sample <- grep("in", sample_name, value = T)
185
chip_sample <- grep("in", sample_name, value = T, invert = T)
186
input_sample %<>% rep(each = 3)
187
188
org <- "hs"
189
190
peak_cmd <- glue(
191
  "macs2 callpeak -t {fil_dir}/{chip_sample}.filter.bam -c {fil_dir}/{input_sample}.filter.bam \\
192
  -f BAMPE -g {org} --broad --keep-dup all -n {chip_sample} \\
193
  --outdir {peak_dir} > {peak_dir}/{chip_sample}.log 2>&1 &")
194
cat(peak_cmd[1])
195
196
write.table(c("#!/bin/bash\n", peak_cmd), glue("code/{peak_dir}.sh"), quote = F, row.names = F, col.names = F)
197
198
# * * 2.10. Bigwig --------------------------------------------------------
199
200
bw_dir <- "10_bw"
201
bw_dir %>% dir.create()
202
203
bw_cmd <- glue("
204
  bamCoverage --normalizeUsing RPKM -of bigwig \\
205
  -b {fil_dir}/{sample_name}.filter.bam \\
206
  -o {bw_dir}/{sample_name}.bw > {bw_dir}/{sample_name}.log 2>&1 &")
207
cat(bw_cmd[1])
208
209
bw_comp_cmd <- glue(
210
  "bamCompare -p 2 -of bigwig --operation ratio \\
211
  -b1 {fil_dir}/{chip_sample}.filter.bam \\
212
  -b2 {fil_dir}/{input_sample}.filter.bam \\
213
  -o {bw_dir}/{chip_sample}.comp.bw > {bw_dir}/{chip_sample}.comp.log 2>&1 &")
214
cat(bw_comp_cmd[1])
215
216
write.table(c("#!/bin/bash\n", bw_cmd), glue("code/{bw_dir}.sh"), quote = F, row.names = F, col.names = F)
217
write.table(c("#!/bin/bash\n", bw_comp_cmd), glue("code/{bw_dir}_comp.sh"), quote = F, row.names = F, col.names = F)
218
219
# * * 2.11. Heat ----------------------------------------------------------
220
221
chip_type <- chip_sample %>% str_replace(".*-", "")
222
bw_group <- tapply(chip_sample, chip_type, c) %>%
223
  map(~ glue("{bw_dir}/{.x}.comp.bw")) %>%
224
  map(~ str_c(.x, collapse = " "))
225
226
heat_dir <- "11_heat"
227
heat_dir %>% dir.create()
228
229
bed_file <- "all.bed"
230
out_prefix <- str_c("all_", names(bw_group))
231
232
mtx_cmd <- glue(
233
  "computeMatrix scale-regions -p 6 -R {heat_dir}/{bed_file} \\
234
  -S {bw_group} -o {heat_dir}/{out_prefix}_mtx.gz &"
235
)
236
cat(mtx_cmd[1])
237
238
heat_cmd <- glue(
239
  "plotHeatmap -m {heat_dir}/{out_prefix}_mtx.gz --colorMap RdBu_r \\
240
  -o {heat_dir}/{out_prefix}_heatmap.pdf \\
241
  --outFileNameMatrix {heat_dir}/{out_prefix}_value.txt \\
242
  --outFileSortedRegions {heat_dir}/{out_prefix}_region.bed &"
243
)
244
cat(heat_cmd[1])
245
246
write.table(c("#!/bin/bash\n", mtx_cmd), glue("code/{heat_dir}_mtx.sh"), quote = F, row.names = F, col.names = F)
247
write.table(c("#!/bin/bash\n", heat_cmd), glue("code/{heat_dir}.sh"), quote = F, row.names = F, col.names = F)
248
249
# * 3. Function -----------------------------------------------------------
250
251
setCMD <- function(cmd, dir = ".", sepN = 1, clu = F) {
252
  cmd %>% tapply(seq_along(.) %% sepN, c) %>% imap(~ {
253
    ifelse(clu, glue(
254
      "#!/bin/bash
255
      #SBATCH -J batch{.y}
256
      #SBATCH -o batch{.y}.%j.out
257
      #SBATCH -e batch{.y}.%j.err
258
      #SBATCH -p cn-long
259
      #SBATCH -N 1
260
      #SBATCH --ntasks-per-node=20
261
      #SBATCH --no-requeue
262
      #SBATCH -A hkdeng_g1
263
      #SBATCH --qos=hkdengcnl
264
      export PATH=/gpfs1/hkdeng_pkuhpc/lvyl/app/anaconda3/envs/lvyl/bin:$PATH"),
265
      "#!/bin/bash") %>%
266
      c(.x)}) %T>%
267
    iwalk(~ write.table(.x, glue("{dir}/batch{.y}.sh"), quote = F, row.names = F, col.names = F)) %>%
268
    names() %>% map_chr(~ glue("{head} {dir}/batch{.x}.sh {tail}",
269
                               head = ifelse(clu, "pkubatch", "sh"),
270
                               tail = ifelse(clu, "; sleep 1", "&"))) %>%
271
    c("#!/bin/bash", .) %>% as_tibble() %>%
272
    write_delim(glue("{dir}/submit.sh"), "\n", col_names = F)
273
}