# MESSAGE -----------------------------------------------------------------
#
# author: Yulin Lyu
# email: lvyulin@pku.edu.cn
#
# require: R whatever
#
# ---
# 0. Load packages --------------------------------------------------------
library(jsonlite)
library(tidyverse)
library(magrittr)
library(glue)
setwd("/mnt/d/proj/github/Protocols-4pub")
source("a_PreprocessPipeline/utils.R")
settingList <- fromJSON("a_PreprocessPipeline/bulkATACpre_conf_X.json")
cmdConf <- fromJSON("a_PreprocessPipeline/cmd_conf_X.json")
setwd(settingList$workDir)
# A. Rename ---------------------------------------------------------------
setwd("0_fastq")
list.files(".")
from_file <- list.files(".", "")
to_file <- gsub("", "", from_file)
file.rename(from_file, to_file)
setwd("..")
# B. Load data ------------------------------------------------------------
file_name <- list.files("0_fastq", ".gz")
file_name <- list.files("2_trim", ".gz") # after qc
R1 <- grep("_1\\.", file_name, value = T)
R2 <- grep("_2\\.", file_name, value = T)
sample_name <- gsub("_1\\..*", "", R1)
# 1. QC for raw data ------------------------------------------------------
dir.create("code")
qc_dir <- "1_qc"
qc_dir %>% dir.create()
qc_dir %>% str_c("code/", .) %>% dir.create()
qc <- settingList$fastqcDir
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 0_fastq/{file_name}")
cat(qc_cmd[1])
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
# 2. Trim -----------------------------------------------------------------
trim_dir <- "2_trim"
trim_dir %>% dir.create()
trim_dir %>% str_c("code/", .) %>% dir.create()
dir.create(".2_untrim")
trim <- settingList$trimmomaticDir
adapter <- settingList$trimAdapter
trim_cmd <- glue(
"java -jar {trim} PE -threads 48 \\
0_fastq/{R1} 0_fastq/{R2} \\
2_trim/{sample_name}_1.trim.fastq.gz \\
.2_untrim/{sample_name}_1.untrim.fastq.gz \\
2_trim/{sample_name}_2.trim.fastq.gz \\
.2_untrim/{sample_name}_2.untrim.fastq.gz \\
ILLUMINACLIP:{adapter}:2:30:7:1:true \\
LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > \\
2_trim/{sample_name}.trim.log 2>&1"
)
cat(trim_cmd[1])
setCMD(trim_cmd, str_c("code/", trim_dir), 6)
# multiqc -o 2_trim -f -n trim 2_trim/*.log
# 3. QC for trimmed data --------------------------------------------------
# reload trimmed data first, or
file_name <- glue("{rep(sample_name, each = 2)}_{rep(1:2, length(sample_name))}.trim.fastq.gz")
R1 <- grep("_1\\.", file_name, value = T)
R2 <- grep("_2\\.", file_name, value = T)
qc_dir <- "3_qc"
qc_dir %>% dir.create()
qc_dir %>% str_c("code/", .) %>% dir.create()
qc <- settingList$fastqcDir
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 {trim_dir}/{file_name}")
cat(qc_cmd[1])
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
# 4. Map ------------------------------------------------------------------
map_dir <- "4_map"
map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
bt2 <- settingList$bt2Dir
ref <- settingList$mapRef
map_cmd <- glue(
"{bt2} -p 48 --very-sensitive -X 2000 -x {ref} \\
-1 {trim_dir}/{R1} -2 {trim_dir}/{R2} \\
-S {map_dir}/{sample_name}.sam > \\
{map_dir}/{sample_name}.log 2>&1"
)
cat(map_cmd[1])
setCMD(map_cmd, str_c("code/", map_dir), 6)
# multiqc -o 4_map -f -n map 4_map/*.log
# 5. Sort -----------------------------------------------------------------
sort_dir <- "5_sort"
sort_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
samt <- settingList$samtDir
sort_cmd <- glue(
"{samt} view -@ 24 -bS {map_dir}/{sample_name}.sam | \\
{samt} sort -@ 24 > \\
{sort_dir}/{sample_name}.bam"
)
cat(sort_cmd[1])
setCMD(sort_cmd, str_c("code/", sort_dir), 6)
# 6. Dedup ----------------------------------------------------------------
dedup_dir <- "6_dedup"
dedup_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
picard <- settingList$picardDir
dedup_cmd <- glue(
"java -Xms16g -Xmx256g -jar {picard} MarkDuplicates \\
I={sort_dir}/{sample_name}.bam \\
O={dedup_dir}/{sample_name}.dedup.bam \\
M={dedup_dir}/{sample_name}.dedup.txt \\
REMOVE_DUPLICATES=true > \\
{dedup_dir}/{sample_name}.dedup.log 2>&1"
)
cat(dedup_cmd[1])
setCMD(dedup_cmd, str_c("code/", dedup_dir), 6)
# 7. Filter ---------------------------------------------------------------
fil_dir <- "7_filter"
fil_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
samt <- settingList$samtDir
fil_cmd <- glue(
"{samt} view -@ 20 -h -f 2 -q 30 {dedup_dir}/{sample_name}.dedup.bam | \\
egrep -v '\\bMT|\\bGL|\\bJH' | \\
{samt} sort -@ 20 -O bam > \\
{fil_dir}/{sample_name}.filter.bam"
)
cat(fil_cmd[1])
setCMD(fil_cmd, str_c("code/", fil_dir), 6)
# 8. Insert ---------------------------------------------------------------
ins_dir <- "8_insert"
ins_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
picard <- settingList$picardDir
ins_cmd <- glue(
"java -Xms16g -Xmx256g -jar {picard} CollectInsertSizeMetrics \\
I={fil_dir}/{sample_name}.filter.bam \\
O={ins_dir}/{sample_name}.txt \\
H={ins_dir}/{sample_name}.pdf > \\
{ins_dir}/{sample_name}.log 2>&1"
)
cat(ins_cmd[1])
setCMD(ins_cmd, str_c("code/", ins_dir), 6)
# 9. Shift ----------------------------------------------------------------
shift_dir <- "9_shift"
shift_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
bedt <- settingList$bedtDir
shift_cmd <- glue(
"{bedt} bamtobed -i {fil_dir}/{sample_name}.filter.bam | \\
{awk_cmd} \\
> {shift_dir}/{sample_name}.shift.bed",
awk_cmd = "awk -F '\\t' 'BEGIN {OFS = FS}{ if ($6 == \"+\") {$2 = $2 + 4} else if ($6 == \"-\") {$3 = $3 - 5} print $0}'"
)
cat(shift_cmd[1])
setCMD(shift_cmd, str_c("code/", shift_dir), 6)
# 10. Peak ----------------------------------------------------------------
macs2 <- settingList$macs2Dir
peak_dir <- "10_peak"
peak_dir %>% dir.create()
org <- "hs"
org <- "mm"
peak_cmd <- glue("
{macs2} callpeak -t {shift_dir}/{sample_name}.shift.bed \\
-g {org} --nomodel --shift -100 --extsize 200 --keep-dup all -n {sample_name} \\
-B --outdir {peak_dir} > {peak_dir}/{sample_name}.log 2>&1 &")
cat(peak_cmd[1])
group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
group_peak_cmd <- glue(
"{macs2} callpeak -t {peak_group} \\
-g {org} --nomodel --shift -100 --extsize 200 --keep-dup all --SPMR -n group_{group} \\
-B --outdir {peak_dir} > {peak_dir}/group_{group}.log 2>&1 &",
peak_group = map_chr(group_name, ~ paste(glue("{shift_dir}/{.x}.shift.bed"), collapse = " ")),
group = names(group_name))
cat(group_peak_cmd[1])
write.table(c("#!/bin/bash\n", peak_cmd), glue("code/{peak_dir}.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", group_peak_cmd), glue("code/{peak_dir}_group.sh"), quote = F, row.names = F, col.names = F)
# * * 2.11. Bigwig --------------------------------------------------------
bw_dir <- "11_bw"
bw_dir %>% dir.create()
dt <- settingList$dtDir
bdg2bw <- settingList$bdg2bwDir
ref_len <- settingList$refLen
index_cmd <- glue("{st} index {fil_dir}/{sample_name}.filter.bam &")
cat(index_cmd[1])
bw_cmd <- glue("
{dt}bamCoverage --normalizeUsing RPKM -of bigwig -b {fil_dir}/{sample_name}.filter.bam \\
-o {bw_dir}/{sample_name}.bw &")
cat(bw_cmd[1])
group_bw_cmd <- glue(
"{bdg2bw} {peak_dir}/group_{group}_treat_pileup.bdg {ref_len} {bw_dir}/{group}.bw &",
group = names(group_name))
cat(group_bw_cmd[1])
sample_name <- list.files(bw_dir, "-[0-9].bw") %>% str_replace(".bw$", "")
group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
cor_cmd <- glue(
"{dt}multiBigwigSummary bins -p 20 -b {bws} -o {bw_dir}/bw_cor.npz &",
bws = str_c(glue("{bw_dir}/{sample_name}.bw"), collapse = " \\\n"))
cat(cor_cmd)
group_cor_cmd <- glue(
"{dt}multiBigwigSummary bins -p 20 -b {group_bws} -o {bw_dir}/group_bw_cor.npz &",
group_bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
cat(group_cor_cmd)
corplot_cmd <- glue(
"{dt}plotCorrelation -in {bw_dir}/bw_cor.npz -c pearson -p heatmap --plotNumbers -o {bw_dir}/cor_heatmap.pdf --colorMap RdBu_r &")
cat(corplot_cmd)
group_corplot_cmd <- glue(
"{dt}plotCorrelation -in {bw_dir}/group_bw_cor.npz -c pearson -p heatmap --plotNumbers -o {bw_dir}/group_cor_heatmap.pdf --colorMap RdBu_r &")
cat(group_corplot_cmd)
write.table(c("#!/bin/bash\n", index_cmd), glue("code/{bw_dir}_index.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", bw_cmd), glue("code/{bw_dir}.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", group_bw_cmd), glue("code/{bw_dir}_group.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", cor_cmd), glue("code/{bw_dir}_cor.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", group_cor_cmd), glue("code/{bw_dir}_cor_group.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", corplot_cmd), glue("code/{bw_dir}_plot_cor.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", group_corplot_cmd), glue("code/{bw_dir}_plot_cor_group.sh"), quote = F, row.names = F, col.names = F)
# * * 2.12. IDR -----------------------------------------------------------
idr_dir <- "12_idr"
idr_dir %>% dir.create()
idr <- settingList$idrDir
idr_cmd <- map(group_name, ~ as_tibble(combn(.x, 2)) %>% map(~ paste(glue("{peak_dir}/{.x}_peaks.narrowPeak"), collapse = " "))) %>%
imap(~ glue("{idr} --samples {.x} --peak-list {peak_dir}/group_{.y}_peaks.narrowPeak \\
--input-file-type narrowPeak --output-file {idr_dir}/{.y}")) %>%
map(~ imap(.x, ~ glue("{.x}-{.y}.idr.narrowPeak &"))) %>% unlist() %>% unname()
cat(idr_cmd[1])
write.table(c("#!/bin/bash\n", idr_cmd), glue("code/{idr_dir}.sh"), quote = F, row.names = F, col.names = F)
group_index <- map(group_name, ~ choose(length(.x), 2)) %>% imap(~ rep(.y, .x)) %>% unlist() %>% unname()
idr_file <- list.files(idr_dir, "idr", full.names = T)
idr_data <- map(idr_file, ~ read_delim(.x, "\t", col_names = F))
idr_peak <- map(idr_data, ~ .x[.x[[5]] > 540, ]) %>%
map(~ str_c(.x[[1]], .x[[2]], .x[[3]], sep = "_")) %>%
tapply(group_index, c) %>%
map(~ unlist(.x) %>% unique)
group_file <- list.files(peak_dir, "^group.*Peak", full.names = T)
group_data <- map(group_file, ~ read.table(.x, sep = "\t", stringsAsFactors = F))
group_peak <- map(group_data, ~ str_c(.x[[1]], .x[[2]], .x[[3]], sep = "_"))
groupData <- pmap(list(idr_peak, group_peak, group_data), function(a, b, c) {c[b %in% a, ]})
iwalk(group_data, ~ write.table(.x, glue("{idr_dir}/{.y}.narrowPeak"), col.names = F, row.names = F, quote = F, sep = "\t"))
# * * 2.13. Motif ---------------------------------------------------------
motif_dir <- "13_motif"
motif_dir %>% dir.create()
region_file <- list.files(motif_dir, "bed")
homer <- settingList$homerDir
org <- "hg19"
org <- "mm10"
motif_cmd <- glue(
"{homer}findMotifsGenome.pl {motif_dir}/{region_file} {org} {motif_dir}/{rigion_name} \\
-size 200 -len 8,10,12 > {region_file}/{region_name}.log 2>&1 &",
region_name = str_replace(region_file, ".bed", ""))
cat(motif_cmd[1])
write.table(c("#!/bin/bash\n", motif_cmd), glue("code/{motif_dir}.sh"), quote = F, row.names = F, col.names = F)
# * * 2.14. Heat ----------------------------------------------------------
heat_dir <- "14_heat"
heat_dir %>% dir.create()
sample_name <- list.files(bw_dir, "-[123].bw") %>% str_replace(".bw$", "")
group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
# for global
mat_cmd <- glue(
"{dt}computeMatrix reference-point -a 2000 -b 2000 -p 20 -R {heat_dir}/hg19geneTSS.bed -S \\
{bws} -o {heat_dir}/TSS_mtx.gz &",
bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
cat(mat_cmd)
heat_cmd <- glue(
"{dt}plotHeatmap -m {heat_dir}/TSS_mtx.gz --colorMap RdBu_r -o {heat_dir}/TSS_heatmap.pdf \\
--outFileNameMatrix {heat_dir}/TSS_value.txt --outFileSortedRegions {heat_dir}/TSS_region.bed &")
cat(heat_cmd)
peak_group <- list.files(heat_dir, "peakGroup", full.names = T)
# for peaks
mat_cmd <- glue(
"{dt}computeMatrix scale-regions -a 1000 -b 1000 -p 20 -R {peak} -S \\
{bws} -o {heat_dir}/peak_mtx.gz &",
peak = str_c(peak_group, collapse = " "),
bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
cat(mat_cmd)
heat_cmd <- glue(
"{dt}plotHeatmap -m {heat_dir}/peak_mtx.gz --colorMap RdBu_r -o {heat_dir}/peak_heatmap.pdf \\
--outFileNameMatrix {heat_dir}/peak_value.txt --outFileSortedRegions {heat_dir}/peak_region.bed &")
cat(heat_cmd)
write.table(c("#!/bin/bash\n", mat_cmd), glue("code/{heat_dir}_mtx.sh"), quote = F, row.names = F, col.names = F)
write.table(c("#!/bin/bash\n", heat_cmd), glue("code/{heat_dir}.sh"), quote = F, row.names = F, col.names = F)
# * * 2.15. Anno ----------------------------------------------------------
anno_dir <- "15_anno"
anno_dir %>% dir.create()
known_motif <- settingList$knownMotif
peak_file <- list.files(anno_dir, ".homer")
anno_cmd <- glue(
"{homer}annotatePeaks.pl {anno_dir}/{peak_file} {org} -m {known_motif} -mscore > {anno_dir}/{peak_name}.txt &",
peak_name = str_remove(peak_file, ".homer")
)
cat(anno_cmd)
write.table(c("#!/bin/bash\n", anno_cmd), glue("code/{anno_dir}.sh"), quote = F, row.names = F, col.names = F)