--- a +++ b/a_PreprocessPipeline/bulkRNApre.R @@ -0,0 +1,170 @@ + +# MESSAGE ----------------------------------------------------------------- +# +# author: Yulin Lyu +# email: lvyulin@pku.edu.cn +# +# require: R whatever +# +# --- + +# 0. Load packages -------------------------------------------------------- + +library(jsonlite) +library(tidyverse) +library(magrittr) +library(glue) + +setwd("/mnt/d/proj/github/Protocols-4pub") +source("a_PreprocessPipeline/utils.R") + +settingList <- fromJSON("a_PreprocessPipeline/bulkRNApre_conf_X.json") +cmdConf <- fromJSON("a_PreprocessPipeline/cmd_conf_X.json") + +setwd(settingList$workDir) + +# A. Rename --------------------------------------------------------------- + +setwd("0_fastq") + +list.files(".") + +from_file <- list.files(".", "") +to_file <- gsub("", "", from_file) + +file.rename(from_file, to_file) + +setwd("..") + +# B. Load data ------------------------------------------------------------ + +file_name <- list.files("0_fastq", ".gz") +file_name <- list.files("2_trim", ".gz") # after qc + +R1 <- grep("_1\\.", file_name, value = T) +R2 <- grep("_2\\.", file_name, value = T) + +sample_name <- gsub("_1\\..*", "", R1) + +# 1. QC for raw data ------------------------------------------------------ + +dir.create("code") + +qc_dir <- "1_qc" +qc_dir %>% dir.create() +qc_dir %>% str_c("code/", .) %>% dir.create() + +qc <- settingList$fastqcDir + +qc_cmd <- glue("{qc} -o {qc_dir} -t 48 0_fastq/{file_name}") +cat(qc_cmd[1]) + +setCMD(qc_cmd, str_c("code/", qc_dir), 6) +# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip + +# 2. Trim ----------------------------------------------------------------- + +trim_dir <- "2_trim" +trim_dir %>% dir.create() +trim_dir %>% str_c("code/", .) %>% dir.create() +dir.create(".2_untrim") + +trim <- settingList$trimmomaticDir +adapter <- settingList$trimAdapter + +trim_cmd <- glue( + "java -jar {trim} PE -threads 48 \\ + 0_fastq/{R1} 0_fastq/{R2} \\ + 2_trim/{sample_name}_1.trim.fastq.gz \\ + .2_untrim/{sample_name}_1.untrim.fastq.gz \\ + 2_trim/{sample_name}_2.trim.fastq.gz \\ + .2_untrim/{sample_name}_2.untrim.fastq.gz \\ + ILLUMINACLIP:{adapter}:2:30:7:1:true \\ + LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > \\ + 2_trim/{sample_name}.trim.log 2>&1" +) +cat(trim_cmd[1]) + +setCMD(trim_cmd, str_c("code/", trim_dir), 6) +# multiqc -o 2_trim -f -n trim 2_trim/*.log + +# 3. QC for trimmed data -------------------------------------------------- + +# reload trimmed data first, or +file_name <- glue("{rep(sample_name, each = 2)}_{rep(1:2, length(sample_name))}.trim.fastq.gz") +R1 <- grep("_1\\.", file_name, value = T) +R2 <- grep("_2\\.", file_name, value = T) + +qc_dir <- "3_qc" +qc_dir %>% dir.create() +qc_dir %>% str_c("code/", .) %>% dir.create() + +qc <- settingList$fastqcDir + +qc_cmd <- glue("{qc} -o {qc_dir} -t 48 {trim_dir}/{file_name}") +cat(qc_cmd[1]) + +setCMD(qc_cmd, str_c("code/", qc_dir), 6) +# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip + +# 4. Map ------------------------------------------------------------------ + +map_dir <- "4_map" +map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create() + +star <- settingList$starDir +ref <- settingList$mapRef + +map_cmd <- glue( + "{star} --genomeDir {ref} \\ + --runThreadN 48 \\ + --outFileNamePrefix {map_dir}/{sample_name} \\ + --readFilesIn {trim_dir}/{R1} {trim_dir}/{R2} \\ + --readFilesCommand zcat \\ + --outSAMtype BAM Unsorted \\ + --outSAMstrandField intronMotif \\ + --outFilterIntronMotifs RemoveNoncanonical" +) +cat(map_cmd[1]) + +setCMD(map_cmd, str_c("code/", map_dir), 6) +# multiqc -o 4_map -f -n map 4_map/*.out + +# 5. Infer (optional) ----------------------------------------------------- + +infer_dir <- "5_infer" +infer_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create() + +infer_path <- settingList$inferDir +gene_bed <- settingList$inferGene + +infer_cmd <- glue( + "{infer_path} -r {gene_bed} \\ + -i {map_dir}/{sample_name}Aligned.out.bam > \\ + {infer_dir}/{sample_name}.infer.txt 2>&1" +) +cat(infer_cmd[1]) + +setCMD(infer_cmd, str_c("code/", infer_dir), 6) + +# 6. Count ---------------------------------------------------------------- + +count_dir <- "6_count" +count_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create() + +count_path <- settingList$countDir +gtf <- settingList$countAnno + +count_cmd <- glue( + "{count_path} -s 0 -p -T 48 \\ + -g gene_id --extraAttributes gene_name \\ + -a {gtf} \\ + -o {count_dir}/counts.txt {samples} > \\ + {count_dir}/count.log 2>&1", + samples = str_c(map_dir, "/", sample_name, "Aligned.out.bam \\\n", collapse = "") +) +cat(count_cmd) + +setCMD(count_cmd, str_c("code/", count_dir), 1) +# multiqc -o 6_count -f -n count 6_count +