# MESSAGE -----------------------------------------------------------------
#
# author: Yulin Lyu
# email: lvyulin@pku.edu.cn
#
# require: R whatever
#
# ---
# 0. Load packages --------------------------------------------------------
library(jsonlite)
library(tidyverse)
library(magrittr)
library(glue)
setwd("/mnt/d/proj/github/Protocols-4pub")
source("a_PreprocessPipeline/utils.R")
settingList <- fromJSON("a_PreprocessPipeline/bulkRNApre_conf_X.json")
cmdConf <- fromJSON("a_PreprocessPipeline/cmd_conf_X.json")
setwd(settingList$workDir)
# A. Rename ---------------------------------------------------------------
setwd("0_fastq")
list.files(".")
from_file <- list.files(".", "")
to_file <- gsub("", "", from_file)
file.rename(from_file, to_file)
setwd("..")
# B. Load data ------------------------------------------------------------
file_name <- list.files("0_fastq", ".gz")
file_name <- list.files("2_trim", ".gz") # after qc
R1 <- grep("_1\\.", file_name, value = T)
R2 <- grep("_2\\.", file_name, value = T)
sample_name <- gsub("_1\\..*", "", R1)
# 1. QC for raw data ------------------------------------------------------
dir.create("code")
qc_dir <- "1_qc"
qc_dir %>% dir.create()
qc_dir %>% str_c("code/", .) %>% dir.create()
qc <- settingList$fastqcDir
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 0_fastq/{file_name}")
cat(qc_cmd[1])
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
# 2. Trim -----------------------------------------------------------------
trim_dir <- "2_trim"
trim_dir %>% dir.create()
trim_dir %>% str_c("code/", .) %>% dir.create()
dir.create(".2_untrim")
trim <- settingList$trimmomaticDir
adapter <- settingList$trimAdapter
trim_cmd <- glue(
"java -jar {trim} PE -threads 48 \\
0_fastq/{R1} 0_fastq/{R2} \\
2_trim/{sample_name}_1.trim.fastq.gz \\
.2_untrim/{sample_name}_1.untrim.fastq.gz \\
2_trim/{sample_name}_2.trim.fastq.gz \\
.2_untrim/{sample_name}_2.untrim.fastq.gz \\
ILLUMINACLIP:{adapter}:2:30:7:1:true \\
LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > \\
2_trim/{sample_name}.trim.log 2>&1"
)
cat(trim_cmd[1])
setCMD(trim_cmd, str_c("code/", trim_dir), 6)
# multiqc -o 2_trim -f -n trim 2_trim/*.log
# 3. QC for trimmed data --------------------------------------------------
# reload trimmed data first, or
file_name <- glue("{rep(sample_name, each = 2)}_{rep(1:2, length(sample_name))}.trim.fastq.gz")
R1 <- grep("_1\\.", file_name, value = T)
R2 <- grep("_2\\.", file_name, value = T)
qc_dir <- "3_qc"
qc_dir %>% dir.create()
qc_dir %>% str_c("code/", .) %>% dir.create()
qc <- settingList$fastqcDir
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 {trim_dir}/{file_name}")
cat(qc_cmd[1])
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
# 4. Map ------------------------------------------------------------------
map_dir <- "4_map"
map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
star <- settingList$starDir
ref <- settingList$mapRef
map_cmd <- glue(
"{star} --genomeDir {ref} \\
--runThreadN 48 \\
--outFileNamePrefix {map_dir}/{sample_name} \\
--readFilesIn {trim_dir}/{R1} {trim_dir}/{R2} \\
--readFilesCommand zcat \\
--outSAMtype BAM Unsorted \\
--outSAMstrandField intronMotif \\
--outFilterIntronMotifs RemoveNoncanonical"
)
cat(map_cmd[1])
setCMD(map_cmd, str_c("code/", map_dir), 6)
# multiqc -o 4_map -f -n map 4_map/*.out
# 5. Infer (optional) -----------------------------------------------------
infer_dir <- "5_infer"
infer_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
infer_path <- settingList$inferDir
gene_bed <- settingList$inferGene
infer_cmd <- glue(
"{infer_path} -r {gene_bed} \\
-i {map_dir}/{sample_name}Aligned.out.bam > \\
{infer_dir}/{sample_name}.infer.txt 2>&1"
)
cat(infer_cmd[1])
setCMD(infer_cmd, str_c("code/", infer_dir), 6)
# 6. Count ----------------------------------------------------------------
count_dir <- "6_count"
count_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
count_path <- settingList$countDir
gtf <- settingList$countAnno
count_cmd <- glue(
"{count_path} -s 0 -p -T 48 \\
-g gene_id --extraAttributes gene_name \\
-a {gtf} \\
-o {count_dir}/counts.txt {samples} > \\
{count_dir}/count.log 2>&1",
samples = str_c(map_dir, "/", sample_name, "Aligned.out.bam \\\n", collapse = "")
)
cat(count_cmd)
setCMD(count_cmd, str_c("code/", count_dir), 1)
# multiqc -o 6_count -f -n count 6_count