Switch to unified view

a b/a_PreprocessPipeline/bulkRNApre.R
1
2
# MESSAGE -----------------------------------------------------------------
3
# 
4
# author: Yulin Lyu
5
# email: lvyulin@pku.edu.cn
6
# 
7
# require: R whatever
8
# 
9
# ---
10
11
# 0. Load packages --------------------------------------------------------
12
13
library(jsonlite)
14
library(tidyverse)
15
library(magrittr)
16
library(glue)
17
18
setwd("/mnt/d/proj/github/Protocols-4pub")
19
source("a_PreprocessPipeline/utils.R")
20
21
settingList <- fromJSON("a_PreprocessPipeline/bulkRNApre_conf_X.json")
22
cmdConf <- fromJSON("a_PreprocessPipeline/cmd_conf_X.json")
23
24
setwd(settingList$workDir)
25
26
# A. Rename ---------------------------------------------------------------
27
28
setwd("0_fastq")
29
30
list.files(".")
31
32
from_file <- list.files(".", "")
33
to_file <- gsub("", "", from_file)
34
35
file.rename(from_file, to_file)
36
37
setwd("..")
38
39
# B. Load data ------------------------------------------------------------
40
41
file_name <- list.files("0_fastq", ".gz")
42
file_name <- list.files("2_trim", ".gz") # after qc
43
44
R1 <- grep("_1\\.", file_name, value = T)
45
R2 <- grep("_2\\.", file_name, value = T)
46
47
sample_name <- gsub("_1\\..*", "", R1)
48
49
# 1. QC for raw data ------------------------------------------------------
50
51
dir.create("code")
52
53
qc_dir <- "1_qc"
54
qc_dir %>% dir.create()
55
qc_dir %>% str_c("code/", .) %>% dir.create()
56
57
qc <- settingList$fastqcDir
58
59
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 0_fastq/{file_name}")
60
cat(qc_cmd[1])
61
62
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
63
# multiqc -o 1_qc -f -n qc.raw 1_qc/*.zip
64
65
# 2. Trim -----------------------------------------------------------------
66
67
trim_dir <- "2_trim"
68
trim_dir %>% dir.create()
69
trim_dir %>% str_c("code/", .) %>% dir.create()
70
dir.create(".2_untrim")
71
72
trim <- settingList$trimmomaticDir
73
adapter <- settingList$trimAdapter
74
75
trim_cmd <- glue(
76
  "java -jar {trim} PE -threads 48 \\
77
  0_fastq/{R1} 0_fastq/{R2} \\
78
  2_trim/{sample_name}_1.trim.fastq.gz \\
79
  .2_untrim/{sample_name}_1.untrim.fastq.gz \\
80
  2_trim/{sample_name}_2.trim.fastq.gz \\
81
  .2_untrim/{sample_name}_2.untrim.fastq.gz \\
82
  ILLUMINACLIP:{adapter}:2:30:7:1:true \\
83
  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > \\
84
  2_trim/{sample_name}.trim.log 2>&1"
85
)
86
cat(trim_cmd[1])
87
88
setCMD(trim_cmd, str_c("code/", trim_dir), 6)
89
# multiqc -o 2_trim -f -n trim 2_trim/*.log
90
91
# 3. QC for trimmed data --------------------------------------------------
92
93
# reload trimmed data first, or
94
file_name <- glue("{rep(sample_name, each = 2)}_{rep(1:2, length(sample_name))}.trim.fastq.gz")
95
R1 <- grep("_1\\.", file_name, value = T)
96
R2 <- grep("_2\\.", file_name, value = T)
97
98
qc_dir <- "3_qc"
99
qc_dir %>% dir.create()
100
qc_dir %>% str_c("code/", .) %>% dir.create()
101
102
qc <- settingList$fastqcDir
103
104
qc_cmd <- glue("{qc} -o {qc_dir} -t 48 {trim_dir}/{file_name}")
105
cat(qc_cmd[1])
106
107
setCMD(qc_cmd, str_c("code/", qc_dir), 6)
108
# multiqc -o 3_qc -f -n qc.trim 3_qc/*.zip
109
110
# 4. Map ------------------------------------------------------------------
111
112
map_dir <- "4_map"
113
map_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
114
115
star <- settingList$starDir
116
ref <- settingList$mapRef
117
118
map_cmd <- glue(
119
  "{star} --genomeDir {ref} \\
120
  --runThreadN 48 \\
121
  --outFileNamePrefix {map_dir}/{sample_name} \\
122
  --readFilesIn {trim_dir}/{R1} {trim_dir}/{R2} \\
123
  --readFilesCommand zcat \\
124
  --outSAMtype BAM Unsorted \\
125
  --outSAMstrandField intronMotif \\
126
  --outFilterIntronMotifs RemoveNoncanonical"
127
)
128
cat(map_cmd[1])
129
130
setCMD(map_cmd, str_c("code/", map_dir), 6)
131
# multiqc -o 4_map -f -n map 4_map/*.out
132
133
# 5. Infer (optional) -----------------------------------------------------
134
135
infer_dir <- "5_infer"
136
infer_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
137
138
infer_path <- settingList$inferDir
139
gene_bed <- settingList$inferGene
140
141
infer_cmd <- glue(
142
  "{infer_path} -r {gene_bed} \\
143
  -i {map_dir}/{sample_name}Aligned.out.bam > \\
144
  {infer_dir}/{sample_name}.infer.txt 2>&1"
145
)
146
cat(infer_cmd[1])
147
148
setCMD(infer_cmd, str_c("code/", infer_dir), 6)
149
150
# 6. Count ----------------------------------------------------------------
151
152
count_dir <- "6_count"
153
count_dir %T>% dir.create() %>% str_c("code/", .) %>% dir.create()
154
155
count_path <- settingList$countDir
156
gtf <- settingList$countAnno
157
158
count_cmd <- glue(
159
  "{count_path} -s 0 -p -T 48 \\
160
  -g gene_id --extraAttributes gene_name \\
161
  -a {gtf} \\
162
  -o {count_dir}/counts.txt {samples} > \\
163
  {count_dir}/count.log 2>&1",
164
  samples = str_c(map_dir, "/", sample_name, "Aligned.out.bam \\\n", collapse = "")
165
)
166
cat(count_cmd)
167
168
setCMD(count_cmd, str_c("code/", count_dir), 1)
169
# multiqc -o 6_count -f -n count 6_count
170