--- a
+++ b/a_PreprocessPipeline/bulkATACpre.R
@@ -0,0 +1,401 @@
+
+# MESSAGE -----------------------------------------------------------------
+# 
+# author: Yulin Lyu
+# email: lvyulin@pku.edu.cn
+# 
+# require: R whatever
+# 
+# ---
+
+# 0. Load packages --------------------------------------------------------
+
+library(jsonlite)
+library(tidyverse)
+library(magrittr)
+library(glue)
+
+setwd("/mnt/d/proj/github/Protocols-4pub")
+source("a_PreprocessPipeline/utils.R")
+
+settingList <- fromJSON("a_PreprocessPipeline/bulkATACpre_conf_X.json")
+cmdConf <- fromJSON("a_PreprocessPipeline/cmd_conf_X.json")
+
+setwd(settingList$workDir)
+
+# A. Rename ---------------------------------------------------------------
+
+setwd("0_fastq")
+
+list.files(".")
+
+from_file <- list.files(".", "")
+to_file <- gsub("", "", from_file)
+
+file.rename(from_file, to_file)
+
+setwd("..")
+
+# B. Load data ------------------------------------------------------------
+
+file_name <- list.files("0_fastq", ".gz")
+file_name <- list.files("2_trim", ".gz") # after qc
+
+R1 <- grep("_1\\.", file_name, value = T)
+R2 <- grep("_2\\.", file_name, value = T)
+
+sample_name <- gsub("_1\\..*", "", R1)
+
+# 1. QC for raw data ------------------------------------------------------
+
+dir.create("code")
+
+qc_dir <- "1_qc"
+qc_dir %>% dir.create()
+qc_dir %>% str_c("code/", .) %>% dir.create()
+
+qc <- settingList$fastqcDir
+
+qc_cmd <- glue("{qc} -o {qc_dir} -t 48 0_fastq/{file_name}")
+cat(qc_cmd[1])
+
+setCMD(qc_cmd, str_c("code/", qc_dir), 6)
+# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
+
+# 2. Trim -----------------------------------------------------------------
+
+trim_dir <- "2_trim"
+trim_dir %>% dir.create()
+trim_dir %>% str_c("code/", .) %>% dir.create()
+dir.create(".2_untrim")
+
+trim <- settingList$trimmomaticDir
+adapter <- settingList$trimAdapter
+
+trim_cmd <- glue(
+  "java -jar {trim} PE -threads 48 \\
+  0_fastq/{R1} 0_fastq/{R2} \\
+  2_trim/{sample_name}_1.trim.fastq.gz \\
+  .2_untrim/{sample_name}_1.untrim.fastq.gz \\
+  2_trim/{sample_name}_2.trim.fastq.gz \\
+  .2_untrim/{sample_name}_2.untrim.fastq.gz \\
+  ILLUMINACLIP:{adapter}:2:30:7:1:true \\
+  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > \\
+  2_trim/{sample_name}.trim.log 2>&1"
+)
+cat(trim_cmd[1])
+
+setCMD(trim_cmd, str_c("code/", trim_dir), 6)
+# multiqc -o 2_trim -f -n trim 2_trim/*.log
+
+# 3. QC for trimmed data --------------------------------------------------
+
+# reload trimmed data first, or
+file_name <- glue("{rep(sample_name, each = 2)}_{rep(1:2, length(sample_name))}.trim.fastq.gz")
+R1 <- grep("_1\\.", file_name, value = T)
+R2 <- grep("_2\\.", file_name, value = T)
+
+qc_dir <- "3_qc"
+qc_dir %>% dir.create()
+qc_dir %>% str_c("code/", .) %>% dir.create()
+
+qc <- settingList$fastqcDir
+
+qc_cmd <- glue("{qc} -o {qc_dir} -t 48 {trim_dir}/{file_name}")
+cat(qc_cmd[1])
+
+setCMD(qc_cmd, str_c("code/", qc_dir), 6)
+# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
+
+# 4. Map ------------------------------------------------------------------
+
+map_dir <- "4_map"
+map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+bt2 <- settingList$bt2Dir
+ref <- settingList$mapRef
+
+map_cmd <- glue(
+  "{bt2} -p 48 --very-sensitive -X 2000 -x {ref} \\
+  -1 {trim_dir}/{R1} -2 {trim_dir}/{R2} \\
+  -S {map_dir}/{sample_name}.sam > \\
+  {map_dir}/{sample_name}.log 2>&1"
+)
+cat(map_cmd[1])
+
+setCMD(map_cmd, str_c("code/", map_dir), 6)
+# multiqc -o 4_map -f -n map 4_map/*.log
+
+# 5. Sort -----------------------------------------------------------------
+
+sort_dir <- "5_sort"
+sort_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+samt <- settingList$samtDir
+
+sort_cmd <- glue(
+  "{samt} view -@ 24 -bS {map_dir}/{sample_name}.sam | \\
+  {samt} sort -@ 24 > \\
+  {sort_dir}/{sample_name}.bam"
+)
+cat(sort_cmd[1])
+
+setCMD(sort_cmd, str_c("code/", sort_dir), 6)
+
+# 6. Dedup ----------------------------------------------------------------
+
+dedup_dir <- "6_dedup"
+dedup_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+picard <- settingList$picardDir
+
+dedup_cmd <- glue(
+  "java -Xms16g -Xmx256g -jar {picard} MarkDuplicates \\
+  I={sort_dir}/{sample_name}.bam \\
+  O={dedup_dir}/{sample_name}.dedup.bam \\
+  M={dedup_dir}/{sample_name}.dedup.txt \\
+  REMOVE_DUPLICATES=true > \\
+  {dedup_dir}/{sample_name}.dedup.log 2>&1"
+)
+cat(dedup_cmd[1])
+
+setCMD(dedup_cmd, str_c("code/", dedup_dir), 6)
+
+# 7. Filter ---------------------------------------------------------------
+
+fil_dir <- "7_filter"
+fil_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+samt <- settingList$samtDir
+
+fil_cmd <- glue(
+  "{samt} view -@ 20 -h -f 2 -q 30 {dedup_dir}/{sample_name}.dedup.bam | \\
+  egrep -v '\\bMT|\\bGL|\\bJH' | \\
+  {samt} sort -@ 20 -O bam > \\
+  {fil_dir}/{sample_name}.filter.bam"
+)
+cat(fil_cmd[1])
+
+setCMD(fil_cmd, str_c("code/", fil_dir), 6)
+
+# 8. Insert ---------------------------------------------------------------
+
+ins_dir <- "8_insert"
+ins_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+picard <- settingList$picardDir
+
+ins_cmd <- glue(
+  "java -Xms16g -Xmx256g -jar {picard} CollectInsertSizeMetrics \\
+  I={fil_dir}/{sample_name}.filter.bam \\
+  O={ins_dir}/{sample_name}.txt \\
+  H={ins_dir}/{sample_name}.pdf > \\
+  {ins_dir}/{sample_name}.log 2>&1"
+)
+cat(ins_cmd[1])
+
+setCMD(ins_cmd, str_c("code/", ins_dir), 6)
+
+# 9. Shift ----------------------------------------------------------------
+
+shift_dir <- "9_shift"
+shift_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+bedt <- settingList$bedtDir
+
+shift_cmd <- glue(
+  "{bedt} bamtobed -i {fil_dir}/{sample_name}.filter.bam | \\
+  {awk_cmd} \\
+  > {shift_dir}/{sample_name}.shift.bed",
+  awk_cmd = "awk -F '\\t' 'BEGIN {OFS = FS}{ if ($6 == \"+\") {$2 = $2 + 4} else if ($6 == \"-\") {$3 = $3 - 5} print $0}'"
+)
+cat(shift_cmd[1])
+
+setCMD(shift_cmd, str_c("code/", shift_dir), 6)
+
+# 10. Peak ----------------------------------------------------------------
+
+macs2 <- settingList$macs2Dir
+
+peak_dir <- "10_peak"
+peak_dir %>% dir.create()
+
+org <- "hs"
+org <- "mm"
+
+peak_cmd <- glue("
+  {macs2} callpeak -t {shift_dir}/{sample_name}.shift.bed \\
+  -g {org} --nomodel --shift -100 --extsize 200 --keep-dup all -n {sample_name} \\
+  -B --outdir {peak_dir} > {peak_dir}/{sample_name}.log 2>&1 &")
+cat(peak_cmd[1])
+
+group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
+
+group_peak_cmd <- glue(
+  "{macs2} callpeak -t {peak_group} \\
+  -g {org} --nomodel --shift -100 --extsize 200 --keep-dup all --SPMR -n group_{group} \\
+  -B --outdir {peak_dir} > {peak_dir}/group_{group}.log 2>&1 &",
+  peak_group = map_chr(group_name, ~ paste(glue("{shift_dir}/{.x}.shift.bed"), collapse = " ")),
+  group = names(group_name))
+cat(group_peak_cmd[1])
+
+write.table(c("#!/bin/bash\n", peak_cmd), glue("code/{peak_dir}.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", group_peak_cmd), glue("code/{peak_dir}_group.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.11. Bigwig --------------------------------------------------------
+
+bw_dir <- "11_bw"
+bw_dir %>% dir.create()
+
+dt <- settingList$dtDir
+bdg2bw <- settingList$bdg2bwDir
+ref_len <- settingList$refLen
+
+index_cmd <- glue("{st} index {fil_dir}/{sample_name}.filter.bam &")
+cat(index_cmd[1])
+
+bw_cmd <- glue("
+  {dt}bamCoverage --normalizeUsing RPKM -of bigwig -b {fil_dir}/{sample_name}.filter.bam \\
+  -o {bw_dir}/{sample_name}.bw &")
+cat(bw_cmd[1])
+
+group_bw_cmd <- glue(
+  "{bdg2bw} {peak_dir}/group_{group}_treat_pileup.bdg {ref_len} {bw_dir}/{group}.bw &",
+  group = names(group_name))
+cat(group_bw_cmd[1])
+
+sample_name <- list.files(bw_dir, "-[0-9].bw") %>% str_replace(".bw$", "")
+group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
+
+cor_cmd <- glue(
+  "{dt}multiBigwigSummary bins -p 20 -b {bws} -o {bw_dir}/bw_cor.npz &",
+  bws = str_c(glue("{bw_dir}/{sample_name}.bw"), collapse = " \\\n"))
+cat(cor_cmd)
+
+group_cor_cmd <- glue(
+  "{dt}multiBigwigSummary bins -p 20 -b {group_bws} -o {bw_dir}/group_bw_cor.npz &",
+  group_bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
+cat(group_cor_cmd)
+
+corplot_cmd <- glue(
+  "{dt}plotCorrelation -in {bw_dir}/bw_cor.npz -c pearson -p heatmap --plotNumbers -o {bw_dir}/cor_heatmap.pdf --colorMap RdBu_r &")
+cat(corplot_cmd)
+
+group_corplot_cmd <- glue(
+  "{dt}plotCorrelation -in {bw_dir}/group_bw_cor.npz -c pearson -p heatmap --plotNumbers -o {bw_dir}/group_cor_heatmap.pdf --colorMap RdBu_r &")
+cat(group_corplot_cmd)
+
+write.table(c("#!/bin/bash\n", index_cmd), glue("code/{bw_dir}_index.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", bw_cmd), glue("code/{bw_dir}.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", group_bw_cmd), glue("code/{bw_dir}_group.sh"), quote = F, row.names = F, col.names = F)
+
+write.table(c("#!/bin/bash\n", cor_cmd), glue("code/{bw_dir}_cor.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", group_cor_cmd), glue("code/{bw_dir}_cor_group.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", corplot_cmd), glue("code/{bw_dir}_plot_cor.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", group_corplot_cmd), glue("code/{bw_dir}_plot_cor_group.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.12. IDR -----------------------------------------------------------
+
+idr_dir <- "12_idr"
+idr_dir %>% dir.create()
+
+idr <- settingList$idrDir
+
+idr_cmd <- map(group_name, ~ as_tibble(combn(.x, 2)) %>% map(~ paste(glue("{peak_dir}/{.x}_peaks.narrowPeak"), collapse = " "))) %>% 
+  imap(~ glue("{idr} --samples {.x} --peak-list {peak_dir}/group_{.y}_peaks.narrowPeak \\
+              --input-file-type narrowPeak --output-file {idr_dir}/{.y}")) %>% 
+  map(~ imap(.x, ~ glue("{.x}-{.y}.idr.narrowPeak &"))) %>% unlist() %>% unname()
+cat(idr_cmd[1])
+
+write.table(c("#!/bin/bash\n", idr_cmd), glue("code/{idr_dir}.sh"), quote = F, row.names = F, col.names = F)
+
+group_index <- map(group_name, ~ choose(length(.x), 2)) %>% imap(~ rep(.y, .x)) %>% unlist() %>% unname()
+idr_file <- list.files(idr_dir, "idr", full.names = T)
+idr_data <- map(idr_file, ~ read_delim(.x, "\t", col_names = F))
+idr_peak <- map(idr_data, ~ .x[.x[[5]] > 540, ]) %>%
+  map(~ str_c(.x[[1]], .x[[2]], .x[[3]], sep = "_")) %>%
+  tapply(group_index, c) %>%
+  map(~ unlist(.x) %>% unique)
+
+group_file <- list.files(peak_dir, "^group.*Peak", full.names = T)
+group_data <- map(group_file, ~ read.table(.x, sep = "\t", stringsAsFactors = F))
+group_peak <- map(group_data, ~ str_c(.x[[1]], .x[[2]], .x[[3]], sep = "_"))
+
+groupData <- pmap(list(idr_peak, group_peak, group_data), function(a, b, c) {c[b %in% a, ]})
+
+iwalk(group_data, ~ write.table(.x, glue("{idr_dir}/{.y}.narrowPeak"), col.names = F, row.names = F, quote = F, sep = "\t"))
+
+# * * 2.13. Motif ---------------------------------------------------------
+
+motif_dir <- "13_motif"
+motif_dir %>% dir.create()
+region_file <- list.files(motif_dir, "bed")
+
+homer <- settingList$homerDir
+
+org <- "hg19"
+org <- "mm10"
+
+motif_cmd <- glue(
+  "{homer}findMotifsGenome.pl {motif_dir}/{region_file} {org} {motif_dir}/{rigion_name} \\
+  -size 200 -len 8,10,12 > {region_file}/{region_name}.log 2>&1 &",
+  region_name = str_replace(region_file, ".bed", ""))
+cat(motif_cmd[1])
+
+write.table(c("#!/bin/bash\n", motif_cmd), glue("code/{motif_dir}.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.14. Heat ----------------------------------------------------------
+
+heat_dir <- "14_heat"
+heat_dir %>% dir.create()
+
+sample_name <- list.files(bw_dir, "-[123].bw") %>% str_replace(".bw$", "")
+group_name <- tapply(sample_name, str_sub(sample_name, 1, -3), c)
+
+# for global
+mat_cmd <- glue(
+  "{dt}computeMatrix reference-point -a 2000 -b 2000 -p 20 -R {heat_dir}/hg19geneTSS.bed -S \\
+  {bws} -o {heat_dir}/TSS_mtx.gz &",
+  bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
+cat(mat_cmd)
+
+heat_cmd <- glue(
+  "{dt}plotHeatmap -m {heat_dir}/TSS_mtx.gz --colorMap RdBu_r -o {heat_dir}/TSS_heatmap.pdf \\
+  --outFileNameMatrix {heat_dir}/TSS_value.txt --outFileSortedRegions {heat_dir}/TSS_region.bed &")
+cat(heat_cmd)
+
+peak_group <- list.files(heat_dir, "peakGroup", full.names = T)
+
+# for peaks
+mat_cmd <- glue(
+  "{dt}computeMatrix scale-regions -a 1000 -b 1000 -p 20 -R {peak} -S \\
+  {bws} -o {heat_dir}/peak_mtx.gz &",
+  peak = str_c(peak_group, collapse = " "),
+  bws = str_c(glue("{bw_dir}/{names(group_name)}.bw"), collapse = " \\\n"))
+cat(mat_cmd)
+
+heat_cmd <- glue(
+  "{dt}plotHeatmap -m {heat_dir}/peak_mtx.gz --colorMap RdBu_r -o {heat_dir}/peak_heatmap.pdf \\
+  --outFileNameMatrix {heat_dir}/peak_value.txt --outFileSortedRegions {heat_dir}/peak_region.bed &")
+cat(heat_cmd)
+
+write.table(c("#!/bin/bash\n", mat_cmd), glue("code/{heat_dir}_mtx.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", heat_cmd), glue("code/{heat_dir}.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.15. Anno ----------------------------------------------------------
+
+anno_dir <- "15_anno"
+anno_dir %>% dir.create()
+
+known_motif <- settingList$knownMotif
+
+peak_file <- list.files(anno_dir, ".homer")
+
+anno_cmd <- glue(
+  "{homer}annotatePeaks.pl {anno_dir}/{peak_file} {org} -m {known_motif} -mscore > {anno_dir}/{peak_name}.txt &",
+  peak_name = str_remove(peak_file, ".homer")
+)
+cat(anno_cmd)
+
+write.table(c("#!/bin/bash\n", anno_cmd), glue("code/{anno_dir}.sh"), quote = F, row.names = F, col.names = F)
+