# MESSAGE -----------------------------------------------------------------
#
# author: Yulin Lyu
# email: lvyulin@pku.edu.cn
#
# require: R whatever
#
# ---
# * 0. Setting ------------------------------------------------------------
settingList <- list(
workDir = ".",
trimmomaticDir = "app/Trimmomatic-0.39",
trimAdapter = "TruSeq3-PE-2.fa",
mapRef = "fasta/genome",
picardDir = "app/picard.jar",
bdg2bwDir = "app/bedGraphToBigWig",
refLen = "fasta/genome.fa.fai"
)
# * 1. Load packages ------------------------------------------------------
setwd(settingList$workDir)
library(tidyverse)
library(magrittr)
library(glue)
# * 2. Preprocess ---------------------------------------------------------
setwd("0_fastq")
list.files(".")
from_file <- list.files(".", "")
to_file <- gsub("", "", from_file)
file.rename(from_file, to_file)
setwd("..")
# * * 2.0. Load data ------------------------------------------------------
file_name <- list.files("0_fastq", ".gz")
file_name <- list.files("2_trim", ".gz") # after qc
R1 <- grep("_1\\.", file_name, value = T)
R2 <- grep("_2\\.", file_name, value = T)
sample_name <- gsub("_1\\..*", "", R1)
# * * 2.1. QC for raw data ------------------------------------------------
dir.create("code")
qc_dir <- "1_qc"
qc_dir %>% dir.create()
qc_cmd <- glue("fastqc -o {qc_dir} -t 8 0_fastq/{file_name} &")
cat(qc_cmd[1])
write.table(c("#!/bin/bash\n", qc_cmd), glue("code/{qc_dir}.sh"), quote = F, row.names = F, col.names = F)
# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
# * * 2.2. Trim -----------------------------------------------------------
dir.create("2_trim")
dir.create(".2_untrim")
trim <- settingList$trimmomaticDir
adapter <- settingList$trimAdapter
trim_cmd <- glue(
"java -jar {trim}/trimmomatic-0.39.jar PE -threads 10 \\
0_fastq/{R1} 0_fastq/{R2} \\
2_trim/{sample_name}_1.trim.fastq.gz \\
.2_untrim/{sample_name}_1.untrim.fastq.gz \\
2_trim/{sample_name}_2.trim.fastq.gz \\
.2_untrim/{sample_name}_2.untrim.fastq.gz \\
ILLUMINACLIP:{trim}/adapters/{adapter}:2:30:7:1:true \\
LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > 2_trim/{sample_name}.trim.log 2>&1")
cat(trim_cmd[1])
trim_dir <- "2_trim"
trim_dir %>% str_c("code/", .) %>% dir.create()
setCMD(trim_cmd, str_c("code/", trim_dir), 1, F)
# * * 2.3. QC for trimmed data --------------------------------------------
# reload trimmed data first
qc_dir <- "3_qc"
qc_dir %>% dir.create()
qc_cmd <- glue("fastqc -o {qc_dir} -t 8 {trim_dir}/{file_name} &")
cat(qc_cmd[1])
write.table(c("#!/bin/bash\n", qc_cmd), glue("code/{qc_dir}.sh"), quote = F, row.names = F, col.names = F)
# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
# * * 2.4. Map ------------------------------------------------------------
ref <- settingList$mapRef
map_dir <- "4_map"
map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
map_cmd <- glue(
"bowtie2 -p 20 --very-sensitive -X 2000 -x {ref} \\
-1 {trim_dir}/{R1} -2 {trim_dir}/{R2} \\
-S {map_dir}/{sample_name}.sam > {map_dir}/{sample_name}.log 2>&1")
cat(map_cmd[1])
setCMD(map_cmd, str_c("code/", map_dir), 1, T)
# * * 2.5. Sort -----------------------------------------------------------
sort_dir <- "5_sort"
sort_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
sort_cmd <- glue(
"samtools view -@ 10 -bS {map_dir}/{sample_name}.sam | \\
samtools sort -@ 10 > {sort_dir}/{sample_name}.bam")
cat(sort_cmd[1])
setCMD(sort_cmd, str_c("code/", sort_dir), 1, F)
# * * 2.6. Dedup ----------------------------------------------------------
picard <- settingList$picardDir
dedup_dir <- "6_dedup"
dedup_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
dedup_cmd <- glue(
"java -Xms2g -Xmx8g -XX:ParallelGCThreads=8 -jar {picard} MarkDuplicates \\
I={sort_dir}/{sample_name}.bam \\
O={dedup_dir}/{sample_name}.dedup.bam \\
M={dedup_dir}/{sample_name}.dedup.txt \\
REMOVE_DUPLICATES=true")
cat(dedup_cmd[1])
setCMD(dedup_cmd, str_c("code/", dedup_dir), 1, F)
# * * 2.7. Filter ---------------------------------------------------------
fil_dir <- "7_filter"
fil_dir %>% dir.create()
filter_cmd <- glue(
"samtools view -h -f 2 -q 30 {dedup_dir}/{sample_name}.dedup.bam | \\
egrep -v 'MT|GL|JH' | samtools sort -@ 4 -O bam > {fil_dir}/{sample_name}.filter.bam &")
cat(filter_cmd[1])
write.table(c("#!/bin/bash\n", filter_cmd), glue("code/{fil_dir}.sh"), quote = F, row.names = F, col.names = F)
# * * 2.8. Finger ---------------------------------------------------------
fin_dir <- "8_finger"
fin_dir %>% dir.create()
index_cmd <- glue("samtools index {fil_dir}/{sample_name}.filter.bam {fil_dir}/{sample_name}.filter.bam.bai &")
cat(index_cmd)
write.table(c("#!/bin/bash\n", index_cmd), "code/8_index.sh", quote = F, row.names = F, col.names = F)
finger_group <- tapply(sample_name, gsub(".*-", "", sample_name), function(x) glue("{fil_dir}/{x}.filter.bam"))
finger_group %<>% map(~ paste(.x, collapse = " "))
finger_cmd <- glue("
plotFingerprint -b {finger_group} -o {fin_dir}/{names(finger_group)}.finger.pdf &")
cat(finger_cmd[1])
write.table(c("#!/bin/bash\n", finger_cmd), glue("code/{fin_dir}.sh"), quote = F, row.names = F, col.names = F)
# * * 2.9. Peak -----------------------------------------------------------
peak_dir <- "9_peak"
peak_dir %>% dir.create()
input_sample <- grep("in", sample_name, value = T)
chip_sample <- grep("in", sample_name, value = T, invert = T)
input_sample %<>% rep(each = 3)
org <- "hs"
peak_cmd <- glue(
"macs2 callpeak -t {fil_dir}/{chip_sample}.filter.bam -c {fil_dir}/{input_sample}.filter.bam \\
-f BAMPE -g {org} --broad --keep-dup all -n {chip_sample} \\
--outdir {peak_dir} > {peak_dir}/{chip_sample}.log 2>&1 &")
cat(peak_cmd[1])
write.table(c("#!/bin/bash\n", peak_cmd), glue("code/{peak_dir}.sh"), quote = F, row.names = F, col.names = F)
# * * 2.10. Bigwig --------------------------------------------------------
bw_dir <- "10_bw"
bw_dir %>% dir.create()
bw_cmd <- glue("
bamCoverage --normalizeUsing RPKM -of bigwig \\
-b {fil_dir}/{sample_name}.filter.bam \\
-o {bw_dir}/{sample_name}.bw > {bw_dir}/{sample_name}.log 2>&1 &")
cat(bw_cmd[1])
bw_comp_cmd <- glue(
"bamCompare -p 2 -of bigwig --operation ratio \\
-b1 {fil_dir}/{chip_sample}.filter.bam \\
-b2 {fil_dir}/{input_sample}.filter.bam \\
-o {bw_dir}/{chip_sample}.comp.bw > {bw_dir}/{chip_sample}.comp.log 2>&1 &")
cat(bw_comp_cmd[1])
write.table(c("#!/bin/bash\n", bw_cmd), glue("code/{bw_dir}.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", bw_comp_cmd), glue("code/{bw_dir}_comp.sh"), quote = F, row.names = F, col.names = F)
# * * 2.11. Heat ----------------------------------------------------------
chip_type <- chip_sample %>% str_replace(".*-", "")
bw_group <- tapply(chip_sample, chip_type, c) %>%
map(~ glue("{bw_dir}/{.x}.comp.bw")) %>%
map(~ str_c(.x, collapse = " "))
heat_dir <- "11_heat"
heat_dir %>% dir.create()
bed_file <- "all.bed"
out_prefix <- str_c("all_", names(bw_group))
mtx_cmd <- glue(
"computeMatrix scale-regions -p 6 -R {heat_dir}/{bed_file} \\
-S {bw_group} -o {heat_dir}/{out_prefix}_mtx.gz &"
)
cat(mtx_cmd[1])
heat_cmd <- glue(
"plotHeatmap -m {heat_dir}/{out_prefix}_mtx.gz --colorMap RdBu_r \\
-o {heat_dir}/{out_prefix}_heatmap.pdf \\
--outFileNameMatrix {heat_dir}/{out_prefix}_value.txt \\
--outFileSortedRegions {heat_dir}/{out_prefix}_region.bed &"
)
cat(heat_cmd[1])
write.table(c("#!/bin/bash\n", mtx_cmd), glue("code/{heat_dir}_mtx.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", heat_cmd), glue("code/{heat_dir}.sh"), quote = F, row.names = F, col.names = F)
# * 3. Function -----------------------------------------------------------
setCMD <- function(cmd, dir = ".", sepN = 1, clu = F) {
cmd %>% tapply(seq_along(.) %% sepN, c) %>% imap(~ {
ifelse(clu, glue(
"#!/bin/bash
#SBATCH -J batch{.y}
#SBATCH -o batch{.y}.%j.out
#SBATCH -e batch{.y}.%j.err
#SBATCH -p cn-long
#SBATCH -N 1
#SBATCH --ntasks-per-node=20
#SBATCH --no-requeue
#SBATCH -A hkdeng_g1
#SBATCH --qos=hkdengcnl
export PATH=/gpfs1/hkdeng_pkuhpc/lvyl/app/anaconda3/envs/lvyl/bin:$PATH"),
"#!/bin/bash") %>%
c(.x)}) %T>%
iwalk(~ write.table(.x, glue("{dir}/batch{.y}.sh"), quote = F, row.names = F, col.names = F)) %>%
names() %>% map_chr(~ glue("{head} {dir}/batch{.x}.sh {tail}",
head = ifelse(clu, "pkubatch", "sh"),
tail = ifelse(clu, "; sleep 1", "&"))) %>%
c("#!/bin/bash", .) %>% as_tibble() %>%
write_delim(glue("{dir}/submit.sh"), "\n", col_names = F)
}