|
a |
|
b/exseek/snakefiles/quality_control.snakemake |
|
|
1 |
shell.prefix('set -x;') |
|
|
2 |
include: 'common.snakemake' |
|
|
3 |
|
|
|
4 |
import os |
|
|
5 |
|
|
|
6 |
def get_all_inputs(wildcards): |
|
|
7 |
if config['paired_end']: |
|
|
8 |
available_inputs = dict( |
|
|
9 |
prinseq=expand('{output_dir}/prinseq/{sample_id}.html', |
|
|
10 |
output_dir=output_dir, sample_id=sample_ids) |
|
|
11 |
) |
|
|
12 |
else: |
|
|
13 |
available_inputs = dict( |
|
|
14 |
fastqc=expand('{output_dir}/fastqc/{sample_id}_fastqc.zip', |
|
|
15 |
output_dir=output_dir, sample_id=sample_ids), |
|
|
16 |
summarize_fastqc=expand('{output_dir}/summary/fastqc.txt', |
|
|
17 |
output_dir=output_dir), |
|
|
18 |
summarize_fastqc_html=expand('{output_dir}/summary/fastqc.html', |
|
|
19 |
output_dir=output_dir), |
|
|
20 |
clean=expand('{output_dir}/unmapped/{sample_id}/clean.fa.gz', |
|
|
21 |
output_dir=output_dir, sample_id=sample_ids) |
|
|
22 |
) |
|
|
23 |
enabled_inputs = list(available_inputs.keys()) |
|
|
24 |
inputs = [] |
|
|
25 |
for key, l in available_inputs.items(): |
|
|
26 |
if key in enabled_inputs: |
|
|
27 |
inputs += l |
|
|
28 |
return inputs |
|
|
29 |
|
|
|
30 |
rule all: |
|
|
31 |
input: |
|
|
32 |
get_all_inputs |
|
|
33 |
|
|
|
34 |
def get_input_fastq_se(wildcards): |
|
|
35 |
fastq = expand(data_dir + '/fastq/{sample_id}.fastq', sample_id=wildcards.sample_id) |
|
|
36 |
fastq_gz = expand(data_dir + '/fastq/{sample_id}.fastq.gz', sample_id=wildcards.sample_id) |
|
|
37 |
|
|
|
38 |
if all(os.path.exists(f) for f in fastq_gz): |
|
|
39 |
return fastq_gz |
|
|
40 |
else: |
|
|
41 |
return fastq |
|
|
42 |
|
|
|
43 |
rule cutadapt_se: |
|
|
44 |
input: |
|
|
45 |
auto_gzip_input(data_dir + '/fastq/{sample_id}.fastq') |
|
|
46 |
output: |
|
|
47 |
trimmed='{output_dir}/cutadapt/{sample_id}.fastq.gz' |
|
|
48 |
params: |
|
|
49 |
adaptor=config['adaptor'], |
|
|
50 |
min_read_length=config['min_read_length'], |
|
|
51 |
min_base_quality=config['min_base_quality'] |
|
|
52 |
log: |
|
|
53 |
'{output_dir}/log/cutadapt/{sample_id}' |
|
|
54 |
threads: |
|
|
55 |
config['threads_compress'] |
|
|
56 |
shell: |
|
|
57 |
'''cutadapt -a {params.adaptor} -m {params.min_read_length} --trim-n -q {params.min_base_quality} \ |
|
|
58 |
-o >(pigz -c -p {threads} > {output.trimmed}) {input} > {log} 2>&1 |
|
|
59 |
''' |
|
|
60 |
|
|
|
61 |
rule fastq_to_fasta_se: |
|
|
62 |
input: |
|
|
63 |
'{output_dir}/cutadapt/{sample_id}.fastq.gz' |
|
|
64 |
output: |
|
|
65 |
'{output_dir}/unmapped/{sample_id}/clean.fa.gz' |
|
|
66 |
threads: |
|
|
67 |
config['threads_compress'] |
|
|
68 |
shell: |
|
|
69 |
'''pigz -d -c -p {threads} {input} | fastq_to_fasta -r | pigz -p {threads} -c > {output} |
|
|
70 |
''' |
|
|
71 |
|
|
|
72 |
rule fastq_to_fasta_pe: |
|
|
73 |
input: |
|
|
74 |
'{output_dir}/cutadapt/{sample_id}_{pair_index}.fastq.gz' |
|
|
75 |
output: |
|
|
76 |
'{output_dir}/unmapped/{sample_id}/clean_{pair_index}.fa.gz' |
|
|
77 |
threads: |
|
|
78 |
config['threads_compress'] |
|
|
79 |
wildcard_constraints: |
|
|
80 |
pair_index='[12]' |
|
|
81 |
shell: |
|
|
82 |
'''pigz -p {threads} -d -c {input} \ |
|
|
83 |
| fastq_to_fasta -r -n \ |
|
|
84 |
| pigz -p {threads} -c > {output} |
|
|
85 |
''' |
|
|
86 |
|
|
|
87 |
rule cutadapt_pe: |
|
|
88 |
input: |
|
|
89 |
fastq1=auto_gzip_input(data_dir + '/fastq/{sample_id}_1.fastq'), |
|
|
90 |
fastq2=auto_gzip_input(data_dir + '/fastq/{sample_id}_2.fastq') |
|
|
91 |
output: |
|
|
92 |
fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz', |
|
|
93 |
fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz' |
|
|
94 |
threads: |
|
|
95 |
config['threads'] |
|
|
96 |
params: |
|
|
97 |
quality_5p=config['min_base_quality_5p'], |
|
|
98 |
quality_3p=config['min_base_quality_3p'], |
|
|
99 |
adaptor1=lambda wildcards: '-a ' + config['adaptor1'] if len(config['adaptor1']) > 0 else '', |
|
|
100 |
adaptor2=lambda wildcards: '-A ' + config['adaptor2'] if len(config['adaptor2']) > 0 else '', |
|
|
101 |
miniL=config['min_read_length'], |
|
|
102 |
quality_base=config['quality_base'] |
|
|
103 |
log: |
|
|
104 |
'{output_dir}/log/cutadapt/{sample_id}' |
|
|
105 |
shell: |
|
|
106 |
'''cutadapt -j {threads} -q {params.quality_5p},{params.quality_3p} \ |
|
|
107 |
{params.adaptor1} {params.adaptor2} --trim-n -m {params.miniL} \ |
|
|
108 |
-o >(gzip -c > {output.fastq1}) -p >(gzip -c > {output.fastq2}) \ |
|
|
109 |
{input.fastq1} {input.fastq2} > {log} 2>&1 |
|
|
110 |
''' |
|
|
111 |
|
|
|
112 |
rule fastqc: |
|
|
113 |
input: |
|
|
114 |
data_dir + '/fastq/{sample_id}.fastq' |
|
|
115 |
output: |
|
|
116 |
html='{output_dir}/fastqc/{sample_id}_fastqc.html', |
|
|
117 |
zip='{output_dir}/fastqc/{sample_id}_fastqc.zip' |
|
|
118 |
params: |
|
|
119 |
output_prefix='{output_dir}/fastqc/', |
|
|
120 |
temp_dir=config['temp_dir'] |
|
|
121 |
log: |
|
|
122 |
'{output_dir}/log/fastqc/{sample_id}' |
|
|
123 |
shell: |
|
|
124 |
'''fastqc -q -o {params.output_prefix} -d {params.temp_dir} {input} |
|
|
125 |
''' |
|
|
126 |
|
|
|
127 |
def parse_fastqc_data(fp): |
|
|
128 |
section = None |
|
|
129 |
qc_status = OrderedDict() |
|
|
130 |
basic_statistics = OrderedDict() |
|
|
131 |
for line in fp: |
|
|
132 |
line = str(line, encoding='utf-8') |
|
|
133 |
line = line.strip() |
|
|
134 |
if line.startswith('>>'): |
|
|
135 |
if line == '>>END_MODULE': |
|
|
136 |
continue |
|
|
137 |
section, status = line[2:].split('\t') |
|
|
138 |
qc_status[section] = status |
|
|
139 |
else: |
|
|
140 |
if section == 'Basic Statistics': |
|
|
141 |
key, val = line.split('\t') |
|
|
142 |
basic_statistics[key] = val |
|
|
143 |
for key, val in qc_status.items(): |
|
|
144 |
basic_statistics[key] = val |
|
|
145 |
return basic_statistics |
|
|
146 |
|
|
|
147 |
rule summarize_fastqc: |
|
|
148 |
input: |
|
|
149 |
zip=lambda wildcards: expand('{output_dir}/fastqc/{sample_id}_fastqc.zip', |
|
|
150 |
output_dir=wildcards.output_dir, sample_id=sample_ids) |
|
|
151 |
output: |
|
|
152 |
'{output_dir}/summary/fastqc.txt' |
|
|
153 |
run: |
|
|
154 |
import pandas as pd |
|
|
155 |
from zipfile import ZipFile |
|
|
156 |
import os |
|
|
157 |
from collections import OrderedDict |
|
|
158 |
|
|
|
159 |
summary = OrderedDict() |
|
|
160 |
columns = None |
|
|
161 |
for filename in input.zip: |
|
|
162 |
sample_id = os.path.splitext(os.path.basename(filename))[0][:-7] |
|
|
163 |
with ZipFile(filename, 'r') as zf: |
|
|
164 |
with zf.open(sample_id + '_fastqc/fastqc_data.txt', 'r') as f: |
|
|
165 |
summary[sample_id] = parse_fastqc_data(f) |
|
|
166 |
if columns is None: |
|
|
167 |
columns = list(summary[sample_id].keys()) |
|
|
168 |
summary = pd.DataFrame.from_records(summary) |
|
|
169 |
summary = summary.T |
|
|
170 |
summary = summary.reindex(columns=columns) |
|
|
171 |
summary.index.name = 'sample_id' |
|
|
172 |
summary.to_csv(output[0], sep='\t', index=True, header=True) |
|
|
173 |
|
|
|
174 |
rule summarize_fastqc_ipynb: |
|
|
175 |
input: |
|
|
176 |
'{output_dir}/summary/fastqc.txt' |
|
|
177 |
output: |
|
|
178 |
ipynb='{output_dir}/summary/fastqc.ipynb' |
|
|
179 |
run: |
|
|
180 |
import nbformat |
|
|
181 |
from nbformat.v4 import new_notebook, new_code_cell, new_markdown_cell |
|
|
182 |
nb = new_notebook() |
|
|
183 |
nb['cells'].append(new_code_cell(r"""import pandas as pd |
|
|
184 |
import numpy as np""")) |
|
|
185 |
|
|
|
186 |
nb['cells'].append(new_code_cell(r"""summary = pd.read_table('fastqc.txt', sep='\t')""")) |
|
|
187 |
nb['cells'].append(new_code_cell(r"""summary.set_index('sample_id', inplace=True, drop=False) |
|
|
188 |
qc_status = summary.iloc[:, 9:].copy() |
|
|
189 |
qc_status.fillna('NA') |
|
|
190 |
qc_status = qc_status.astype('str') |
|
|
191 |
sample_ids = qc_status.index.values |
|
|
192 |
sections = qc_status.columns.values |
|
|
193 |
def style_func(val): |
|
|
194 |
status, row, col = val.split('|') |
|
|
195 |
row, col = int(row), int(col) |
|
|
196 |
color = {'pass': 'green', 'fail': 'red', 'warn': 'orange'}.get(status, 'gray') |
|
|
197 |
return '<a href="../fastqc/{sample_id}_fastqc.html#M{section}" style="color: {color}">{status}</a>'.format( |
|
|
198 |
sample_id=sample_ids[row], color=color, status=status, section=col) |
|
|
199 |
|
|
|
200 |
pd.DataFrame(qc_status.values \ |
|
|
201 |
+ '|' + np.arange(qc_status.shape[0]).astype('str')[:, np.newaxis] \ |
|
|
202 |
+ '|' + np.arange(qc_status.shape[1]).astype('str')[np.newaxis, :], |
|
|
203 |
index=qc_status.index, columns=qc_status.columns) \ |
|
|
204 |
.style.format(style_func)""")) |
|
|
205 |
nb['metadata']['kernel_spec'] = { |
|
|
206 |
'display_name': 'Python 3', |
|
|
207 |
'language': 'python', |
|
|
208 |
'name': 'python3' |
|
|
209 |
} |
|
|
210 |
nb['metadata']['laugnage_info'] = { |
|
|
211 |
'codemirror_mode': { |
|
|
212 |
'name': 'ipython', |
|
|
213 |
'version': 3 |
|
|
214 |
}, |
|
|
215 |
'name': 'python', |
|
|
216 |
'file_extension': '.py', |
|
|
217 |
'mimetype': 'text/x-python', |
|
|
218 |
'nbconvert_exporter': 'python', |
|
|
219 |
'version': '3.6' |
|
|
220 |
} |
|
|
221 |
nbformat.write(nb, output.ipynb) |
|
|
222 |
|
|
|
223 |
rule summarize_fastqc_html: |
|
|
224 |
input: |
|
|
225 |
'{output_dir}/summary/fastqc.ipynb' |
|
|
226 |
output: |
|
|
227 |
'{output_dir}/summary/fastqc.html' |
|
|
228 |
shell: |
|
|
229 |
'''jupyter nbconvert --execute --to html \ |
|
|
230 |
--HTMLExporter.exclude_code_cell=False \ |
|
|
231 |
--HTMLExporter.exclude_input_prompt=True \ |
|
|
232 |
--HTMLExporter.exclude_output_prompt=True \ |
|
|
233 |
{input} |
|
|
234 |
''' |
|
|
235 |
|
|
|
236 |
rule prinseq_se: |
|
|
237 |
input: |
|
|
238 |
fastq='{output_dir}/cutadapt/{sample_id}.fastq.gz' |
|
|
239 |
output: |
|
|
240 |
graph_data='{output_dir}/prinseq/{sample_id}.gd', |
|
|
241 |
html='{output_dir}/prinseq/{sample_id}.html' |
|
|
242 |
shell: |
|
|
243 |
'''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose -fastq <(zcat {input.fastq}) \ |
|
|
244 |
-ns_max_n 0 -graph_data {output.graph_data} -out_good null -out_bad null |
|
|
245 |
perl {tools_dir}/prinseq/prinseq-graphs.pl -i {output.graph_data} -html_all -o {output.html} |
|
|
246 |
''' |
|
|
247 |
|
|
|
248 |
rule prinseq_pe: |
|
|
249 |
input: |
|
|
250 |
fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz', |
|
|
251 |
fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz' |
|
|
252 |
output: |
|
|
253 |
graph_data='{output_dir}/prinseq/{sample_id}.gd', |
|
|
254 |
html='{output_dir}/prinseq/{sample_id}.html' |
|
|
255 |
shell: |
|
|
256 |
'''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose -fastq {input.fastq1} -fastq2 {input.fastq2} \ |
|
|
257 |
-ns_max_n 0 -graph_data {output.graph_data} -out_good null -out_bad null |
|
|
258 |
perl {tools_dir}/prinseq/prinseq-graphs.pl -i {output.graph_data} -html_all -o {output.html} |
|
|
259 |
''' |