Switch to unified view

a b/a_PreprocessPipeline/WGBSpre.R
1
2
# MESSAGE -----------------------------------------------------------------
3
# 
4
# author: Yulin Lyu
5
# email: lvyulin@pku.edu.cn
6
# 
7
# require: R whatever
8
# 
9
# ---
10
11
# * 0. Setting ------------------------------------------------------------
12
13
settingList <- list(
14
  workDir = ".",
15
  mapRef = "fasta",
16
  lambdaRef = "ref/lambda",
17
  BSfilter = "BSfilter.R",
18
  bdg2bwDir = "app/bedGraphToBigWig",
19
  refLen = "fasta/genome.fa.fai"
20
)
21
22
# * 1. Load packages ------------------------------------------------------
23
24
setwd(settingList$workDir)
25
26
library(tidyverse)
27
library(magrittr)
28
library(glue)
29
30
# * 2. Preprocess ---------------------------------------------------------
31
32
setwd("0_fastq")
33
34
list.files(".")
35
36
from_file <- list.files(".", "")
37
to_file <- gsub("", "", from_file)
38
39
file.rename(from_file, to_file)
40
41
setwd("..")
42
43
# * * 2.0. Load data ------------------------------------------------------
44
45
file_name <- list.files("0_fastq", ".gz")
46
47
R1 <- grep("_1\\.", file_name, value = T)
48
R2 <- grep("_2\\.", file_name, value = T)
49
50
sample_name <- gsub("_1\\..*", "", R1)
51
52
# * * 2.1. QC for raw data ------------------------------------------------
53
54
dir.create("code")
55
56
qc_dir <- "1_qc"
57
qc_dir %>% dir.create()
58
59
qc_cmd <- glue("fastqc -o {qc_dir} -t 8 0_fastq/{file_name} &")
60
cat(qc_cmd[1])
61
62
write.table(c("#!/bin/bash\n", qc_cmd), glue("code/{qc_dir}.sh"), quote = F, row.names = F, col.names = F)
63
# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
64
65
# * * 2.2. Map ------------------------------------------------------------
66
67
ref <- settingList$mapRef
68
69
file_dir <- "0_fastq"
70
map_dir <- "2_map"
71
map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
72
73
map_cmd <- glue(
74
  "bismark --multicore 5 -X 700 --dovetail -p 2 --un --ambiguous --genome {ref} \\
75
  {file_dir}/{R1} {file_dir}/{R2} \\
76
  {map_dir} > {map_dir}/{sample_name}.log")
77
cat(map_cmd[1])
78
79
setCMD(map_cmd, str_c("code/", map_dir), 1, T)
80
# multiqc -o 2_map -f -n map 2_map/*.txt
81
82
# * * 2.3. Dedup ----------------------------------------------------------
83
84
dedup_dir <- "3_dedup"
85
dedup_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
86
87
dedup_cmd <- glue(
88
  "deduplicate_bismark -p -o {sample_name} --output_dir {dedup_dir} \\
89
  {map_dir}/{sample_name}_1_bismark_bt2_pe.bam")
90
cat(dedup_cmd[1])
91
92
setCMD(dedup_cmd, str_c("code/", dedup_dir), 1, T)
93
94
# * * 2.4. Meth -----------------------------------------------------------
95
96
meth_dir <- "4_meth"
97
meth_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
98
99
meth_cmd <- glue(
100
  "bismark_methylation_extractor -p --no_overlap --ignore 5 --ignore_r2 5 \\
101
  --multicore 7 --bedGraph --cytosine_report --zero_based --CX --buffer_size 50% \\
102
  --genome_folder {ref} -o {meth_dir} {dedup_dir}/{sample_name}.deduplicated.bam")
103
cat(meth_cmd[1])
104
105
setCMD(meth_cmd, str_c("code/", meth_dir), 1, T)
106
107
# * * 2.5. Lambda ---------------------------------------------------------
108
109
lambda_ref <- settingList$lambdaRef
110
111
lambda_dir <- "5_lambda"
112
lambda_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
113
114
lambda_cmd <- glue(
115
  "bismark --multicore 5 -X 700 --dovetail -p 2 --un --ambiguous --genome {lambda_ref} \\
116
  {file_dir}/{R1} {file_dir}/{R2} \\
117
  {lambda_dir} > {lambda_dir}/{sample_name}.log
118
  
119
  deduplicate_bismark -p -o {sample_name} --output_dir {lambda_dir} \\
120
  {lambda_dir}/{sample_name}_1_bismark_bt2_pe.bam
121
  
122
  bismark_methylation_extractor -p --no_overlap --ignore 5 --ignore_r2 5 \\
123
  --multicore 7 --bedGraph --cytosine_report --zero_based --CX --buffer_size 50% \\
124
  --genome_folder {lambda_ref} -o {lambda_dir} {lambda_dir}/{sample_name}.deduplicated.bam")
125
126
setCMD(lambda_cmd, str_c("code/", lambda_dir), 1, T)
127
128
# * * 2.6. Extract --------------------------------------------------------
129
130
extract_dir <- "6_extract"
131
extract_dir %>% dir.create()
132
133
extract_cmd <- glue(
134
  "{awk_cmd} {meth_dir}/{sample_name}.deduplicated.CX_report.txt > {extract_dir}/{sample_name}.cg.txt &",
135
  awk_cmd = "awk -F '\\t' '{if($4 + $5 >= 5 && $6 == \"CG\"){print $0}}'")
136
cat(extract_cmd[1])
137
138
write.table(c("#!/bin/bash\n", extract_cmd), glue("code/{extract_dir}.sh"), quote = F, row.names = F, col.names = F)
139
140
# * * 2.7. Filter ---------------------------------------------------------
141
142
p_files <- list.files(lambda_dir, full.names = T)
143
p_text <- map(p_files, read_delim, delim = "\n")
144
m <- p_text %>% map(~ .x[[1]][13:15]) %>% map(~ str_replace(.x, ".*\\t", "")) %>% map(~ sum(as.numeric(.x)))
145
u <- p_text %>% map(~ .x[[1]][16:18]) %>% map(~ str_replace(.x, ".*\\t", "")) %>% map(~ sum(as.numeric(.x)))
146
p <- map2_dbl(m, u, ~ {.x / (.x + .y)})
147
148
filter_dir <- "7_filter"
149
filter_dir %>% dir.create()
150
151
BSfilter <- settingList$BSfilter
152
153
filter_cmd <- glue(
154
  "Rscript {BSfilter} -i {extract_dir}/{sample_name}.cg.txt \\
155
  -d {filter_dir}/{sample_name}.dss.txt \\
156
  -b {filter_dir}/{sample_name}.bdg \\
157
  -p {p} > {filter_dir}/{sample_name}.log 2>&1 &")
158
cat(filter_cmd[1])
159
160
write.table(c("#!/bin/bash\n", filter_cmd), glue("code/{filter_dir}.sh"), quote = F, row.names = F, col.names = F)
161
162
# * * 2.8 Bigwig ----------------------------------------------------------
163
164
bw_dir <- "8_bw"
165
bw_dir %>% dir.create()
166
167
bdg2bw <- settingList$bdg2bwDir
168
ref_len <- settingList$refLen
169
170
bw_cmd <- glue("{bdg2bw} {filter_dir}/{sample_name}.bdg {ref_len} {bw_dir}/{sample_name}.bw &")
171
cat(bw_cmd[1])
172
173
write.table(c("#!/bin/bash\n", bw_cmd), glue("code/{bw_dir}.sh"), quote = F, row.names = F, col.names = F)
174
175
# * 3. Function -----------------------------------------------------------
176
177
setCMD <- function(cmd, dir = ".", sepN = 1, clu = F) {
178
  cmd %>% tapply(seq_along(.) %% sepN, c) %>% imap(~ {
179
    ifelse(clu, glue(
180
      "#!/bin/bash
181
      #SBATCH -J batch{.y}
182
      #SBATCH -o batch{.y}.%j.out
183
      #SBATCH -e batch{.y}.%j.err
184
      #SBATCH -p cn-long
185
      #SBATCH -N 1
186
      #SBATCH --ntasks-per-node=20
187
      #SBATCH --no-requeue
188
      #SBATCH -A hkdeng_g1
189
      #SBATCH --qos=hkdengcnl
190
      #export PATH=/gpfs1/hkdeng_pkuhpc/lvyl/app/Bismark-0.22.3:$PATH
191
      export PATH=/gpfs1/hkdeng_pkuhpc/lvyl/app/anaconda3/envs/lvyl/bin:$PATH"),
192
      "#!/bin/bash") %>%
193
      c(.x)}) %T>%
194
    iwalk(~ write.table(.x, glue("{dir}/batch{.y}.sh"), quote = F, row.names = F, col.names = F)) %>%
195
    names() %>% map_chr(~ glue("{head} {dir}/batch{.x}.sh {tail}",
196
                               head = ifelse(clu, "pkubatch", "sh"),
197
                               tail = ifelse(clu, "; sleep 1", "&"))) %>%
198
    c("#!/bin/bash", .) %>% as_tibble() %>%
199
    write_delim(glue("{dir}/submit.sh"), "\n", col_names = F)
200
}