Switch to side-by-side view

--- a
+++ b/a_PreprocessPipeline/ChIPpre.R
@@ -0,0 +1,273 @@
+
+# MESSAGE -----------------------------------------------------------------
+#
+# author: Yulin Lyu
+# email: lvyulin@pku.edu.cn
+#
+# require: R whatever
+#
+# ---
+
+# * 0. Setting ------------------------------------------------------------
+
+settingList <- list(
+  workDir = ".",
+  trimmomaticDir = "app/Trimmomatic-0.39",
+  trimAdapter = "TruSeq3-PE-2.fa",
+  mapRef = "fasta/genome",
+  picardDir = "app/picard.jar",
+  bdg2bwDir = "app/bedGraphToBigWig",
+  refLen = "fasta/genome.fa.fai"
+)
+
+# * 1. Load packages ------------------------------------------------------
+
+setwd(settingList$workDir)
+
+library(tidyverse)
+library(magrittr)
+library(glue)
+
+# * 2. Preprocess ---------------------------------------------------------
+
+setwd("0_fastq")
+
+list.files(".")
+
+from_file <- list.files(".", "")
+to_file <- gsub("", "", from_file)
+
+file.rename(from_file, to_file)
+
+setwd("..")
+
+# * * 2.0. Load data ------------------------------------------------------
+
+file_name <- list.files("0_fastq", ".gz")
+file_name <- list.files("2_trim", ".gz") # after qc
+
+R1 <- grep("_1\\.", file_name, value = T)
+R2 <- grep("_2\\.", file_name, value = T)
+
+sample_name <- gsub("_1\\..*", "", R1)
+
+# * * 2.1. QC for raw data ------------------------------------------------
+
+dir.create("code")
+
+qc_dir <- "1_qc"
+qc_dir %>% dir.create()
+
+qc_cmd <- glue("fastqc -o {qc_dir} -t 8 0_fastq/{file_name} &")
+cat(qc_cmd[1])
+
+write.table(c("#!/bin/bash\n", qc_cmd), glue("code/{qc_dir}.sh"), quote = F, row.names = F, col.names = F)
+# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
+
+# * * 2.2. Trim -----------------------------------------------------------
+
+dir.create("2_trim")
+dir.create(".2_untrim")
+
+trim <- settingList$trimmomaticDir
+adapter <- settingList$trimAdapter
+
+trim_cmd <- glue(
+  "java -jar {trim}/trimmomatic-0.39.jar PE -threads 10 \\
+  0_fastq/{R1} 0_fastq/{R2} \\
+  2_trim/{sample_name}_1.trim.fastq.gz \\
+  .2_untrim/{sample_name}_1.untrim.fastq.gz \\
+  2_trim/{sample_name}_2.trim.fastq.gz \\
+  .2_untrim/{sample_name}_2.untrim.fastq.gz \\
+  ILLUMINACLIP:{trim}/adapters/{adapter}:2:30:7:1:true \\
+  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > 2_trim/{sample_name}.trim.log 2>&1")
+cat(trim_cmd[1])
+
+trim_dir <- "2_trim"
+trim_dir %>% str_c("code/", .) %>% dir.create()
+
+setCMD(trim_cmd, str_c("code/", trim_dir), 1, F)
+
+# * * 2.3. QC for trimmed data --------------------------------------------
+
+# reload trimmed data first
+
+qc_dir <- "3_qc"
+qc_dir %>% dir.create()
+
+qc_cmd <- glue("fastqc -o {qc_dir} -t 8 {trim_dir}/{file_name} &")
+cat(qc_cmd[1])
+
+write.table(c("#!/bin/bash\n", qc_cmd), glue("code/{qc_dir}.sh"), quote = F, row.names = F, col.names = F)
+# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
+
+# * * 2.4. Map ------------------------------------------------------------
+
+ref <- settingList$mapRef
+
+map_dir <- "4_map"
+map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+map_cmd <- glue(
+  "bowtie2 -p 20 --very-sensitive -X 2000 -x {ref} \\
+  -1 {trim_dir}/{R1} -2 {trim_dir}/{R2} \\
+  -S {map_dir}/{sample_name}.sam > {map_dir}/{sample_name}.log 2>&1")
+cat(map_cmd[1])
+
+setCMD(map_cmd, str_c("code/", map_dir), 1, T)
+
+# * * 2.5. Sort -----------------------------------------------------------
+
+sort_dir <- "5_sort"
+sort_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+sort_cmd <- glue(
+  "samtools view -@ 10 -bS {map_dir}/{sample_name}.sam | \\
+  samtools sort -@ 10 > {sort_dir}/{sample_name}.bam")
+cat(sort_cmd[1])
+
+setCMD(sort_cmd, str_c("code/", sort_dir), 1, F)
+
+# * * 2.6. Dedup ----------------------------------------------------------
+
+picard <- settingList$picardDir
+
+dedup_dir <- "6_dedup"
+dedup_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
+
+dedup_cmd <- glue(
+  "java -Xms2g -Xmx8g -XX:ParallelGCThreads=8 -jar {picard} MarkDuplicates \\
+  I={sort_dir}/{sample_name}.bam \\
+  O={dedup_dir}/{sample_name}.dedup.bam \\
+  M={dedup_dir}/{sample_name}.dedup.txt \\
+  REMOVE_DUPLICATES=true")
+cat(dedup_cmd[1])
+
+setCMD(dedup_cmd, str_c("code/", dedup_dir), 1, F)
+
+# * * 2.7. Filter ---------------------------------------------------------
+
+fil_dir <- "7_filter"
+fil_dir %>% dir.create()
+
+filter_cmd <- glue(
+  "samtools view -h -f 2 -q 30 {dedup_dir}/{sample_name}.dedup.bam | \\
+  egrep -v 'MT|GL|JH' | samtools sort -@ 4 -O bam > {fil_dir}/{sample_name}.filter.bam &")
+cat(filter_cmd[1])
+
+write.table(c("#!/bin/bash\n", filter_cmd), glue("code/{fil_dir}.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.8. Finger ---------------------------------------------------------
+
+fin_dir <- "8_finger"
+fin_dir %>% dir.create()
+
+index_cmd <- glue("samtools index {fil_dir}/{sample_name}.filter.bam {fil_dir}/{sample_name}.filter.bam.bai &")
+cat(index_cmd)
+
+write.table(c("#!/bin/bash\n", index_cmd), "code/8_index.sh", quote = F, row.names = F, col.names = F)
+
+finger_group <- tapply(sample_name, gsub(".*-", "", sample_name), function(x) glue("{fil_dir}/{x}.filter.bam"))
+finger_group %<>% map(~ paste(.x, collapse = " "))
+
+finger_cmd <- glue("
+  plotFingerprint -b {finger_group} -o {fin_dir}/{names(finger_group)}.finger.pdf &")
+cat(finger_cmd[1])
+
+write.table(c("#!/bin/bash\n", finger_cmd), glue("code/{fin_dir}.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.9. Peak -----------------------------------------------------------
+
+peak_dir <- "9_peak"
+peak_dir %>% dir.create()
+
+input_sample <- grep("in", sample_name, value = T)
+chip_sample <- grep("in", sample_name, value = T, invert = T)
+input_sample %<>% rep(each = 3)
+
+org <- "hs"
+
+peak_cmd <- glue(
+  "macs2 callpeak -t {fil_dir}/{chip_sample}.filter.bam -c {fil_dir}/{input_sample}.filter.bam \\
+  -f BAMPE -g {org} --broad --keep-dup all -n {chip_sample} \\
+  --outdir {peak_dir} > {peak_dir}/{chip_sample}.log 2>&1 &")
+cat(peak_cmd[1])
+
+write.table(c("#!/bin/bash\n", peak_cmd), glue("code/{peak_dir}.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.10. Bigwig --------------------------------------------------------
+
+bw_dir <- "10_bw"
+bw_dir %>% dir.create()
+
+bw_cmd <- glue("
+  bamCoverage --normalizeUsing RPKM -of bigwig \\
+  -b {fil_dir}/{sample_name}.filter.bam \\
+  -o {bw_dir}/{sample_name}.bw > {bw_dir}/{sample_name}.log 2>&1 &")
+cat(bw_cmd[1])
+
+bw_comp_cmd <- glue(
+  "bamCompare -p 2 -of bigwig --operation ratio \\
+  -b1 {fil_dir}/{chip_sample}.filter.bam \\
+  -b2 {fil_dir}/{input_sample}.filter.bam \\
+  -o {bw_dir}/{chip_sample}.comp.bw > {bw_dir}/{chip_sample}.comp.log 2>&1 &")
+cat(bw_comp_cmd[1])
+
+write.table(c("#!/bin/bash\n", bw_cmd), glue("code/{bw_dir}.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", bw_comp_cmd), glue("code/{bw_dir}_comp.sh"), quote = F, row.names = F, col.names = F)
+
+# * * 2.11. Heat ----------------------------------------------------------
+
+chip_type <- chip_sample %>% str_replace(".*-", "")
+bw_group <- tapply(chip_sample, chip_type, c) %>%
+  map(~ glue("{bw_dir}/{.x}.comp.bw")) %>%
+  map(~ str_c(.x, collapse = " "))
+
+heat_dir <- "11_heat"
+heat_dir %>% dir.create()
+
+bed_file <- "all.bed"
+out_prefix <- str_c("all_", names(bw_group))
+
+mtx_cmd <- glue(
+  "computeMatrix scale-regions -p 6 -R {heat_dir}/{bed_file} \\
+  -S {bw_group} -o {heat_dir}/{out_prefix}_mtx.gz &"
+)
+cat(mtx_cmd[1])
+
+heat_cmd <- glue(
+  "plotHeatmap -m {heat_dir}/{out_prefix}_mtx.gz --colorMap RdBu_r \\
+  -o {heat_dir}/{out_prefix}_heatmap.pdf \\
+  --outFileNameMatrix {heat_dir}/{out_prefix}_value.txt \\
+  --outFileSortedRegions {heat_dir}/{out_prefix}_region.bed &"
+)
+cat(heat_cmd[1])
+
+write.table(c("#!/bin/bash\n", mtx_cmd), glue("code/{heat_dir}_mtx.sh"), quote = F, row.names = F, col.names = F)
+write.table(c("#!/bin/bash\n", heat_cmd), glue("code/{heat_dir}.sh"), quote = F, row.names = F, col.names = F)
+
+# * 3. Function -----------------------------------------------------------
+
+setCMD <- function(cmd, dir = ".", sepN = 1, clu = F) {
+  cmd %>% tapply(seq_along(.) %% sepN, c) %>% imap(~ {
+    ifelse(clu, glue(
+      "#!/bin/bash
+      #SBATCH -J batch{.y}
+      #SBATCH -o batch{.y}.%j.out
+      #SBATCH -e batch{.y}.%j.err
+      #SBATCH -p cn-long
+      #SBATCH -N 1
+      #SBATCH --ntasks-per-node=20
+      #SBATCH --no-requeue
+      #SBATCH -A hkdeng_g1
+      #SBATCH --qos=hkdengcnl
+      export PATH=/gpfs1/hkdeng_pkuhpc/lvyl/app/anaconda3/envs/lvyl/bin:$PATH"),
+      "#!/bin/bash") %>%
+      c(.x)}) %T>%
+    iwalk(~ write.table(.x, glue("{dir}/batch{.y}.sh"), quote = F, row.names = F, col.names = F)) %>%
+    names() %>% map_chr(~ glue("{head} {dir}/batch{.x}.sh {tail}",
+                               head = ifelse(clu, "pkubatch", "sh"),
+                               tail = ifelse(clu, "; sleep 1", "&"))) %>%
+    c("#!/bin/bash", .) %>% as_tibble() %>%
+    write_delim(glue("{dir}/submit.sh"), "\n", col_names = F)
+}