Switch to unified view

a b/exseek/snakefiles/quality_control.snakemake
1
shell.prefix('set -x;')
2
include: 'common.snakemake'
3
4
import os
5
6
def get_all_inputs(wildcards):
7
    if config['paired_end']:
8
        available_inputs = dict(
9
            prinseq=expand('{output_dir}/prinseq/{sample_id}.html',
10
                output_dir=output_dir, sample_id=sample_ids)
11
        )
12
    else:
13
        available_inputs = dict(
14
            fastqc=expand('{output_dir}/fastqc/{sample_id}_fastqc.zip',
15
                output_dir=output_dir, sample_id=sample_ids),
16
            summarize_fastqc=expand('{output_dir}/summary/fastqc.txt',
17
                output_dir=output_dir),
18
            summarize_fastqc_html=expand('{output_dir}/summary/fastqc.html',
19
                output_dir=output_dir),
20
            clean=expand('{output_dir}/unmapped/{sample_id}/clean.fa.gz',
21
                output_dir=output_dir, sample_id=sample_ids)
22
        )
23
    enabled_inputs = list(available_inputs.keys())
24
    inputs = []
25
    for key, l in available_inputs.items():
26
        if key in enabled_inputs:
27
            inputs += l
28
    return inputs
29
30
rule all:
31
    input:
32
        get_all_inputs
33
34
def get_input_fastq_se(wildcards):
35
    fastq = expand(data_dir + '/fastq/{sample_id}.fastq', sample_id=wildcards.sample_id)
36
    fastq_gz = expand(data_dir + '/fastq/{sample_id}.fastq.gz', sample_id=wildcards.sample_id)
37
38
    if all(os.path.exists(f) for f in fastq_gz):
39
        return fastq_gz
40
    else:
41
        return fastq
42
43
rule cutadapt_se:
44
    input:
45
        auto_gzip_input(data_dir + '/fastq/{sample_id}.fastq')
46
    output:
47
        trimmed='{output_dir}/cutadapt/{sample_id}.fastq.gz'
48
    params:
49
        adaptor=config['adaptor'],
50
        min_read_length=config['min_read_length'],
51
        min_base_quality=config['min_base_quality']
52
    log:
53
        '{output_dir}/log/cutadapt/{sample_id}'
54
    threads:
55
        config['threads_compress']
56
    shell:
57
        '''cutadapt -a {params.adaptor} -m {params.min_read_length} --trim-n -q {params.min_base_quality} \
58
            -o >(pigz -c -p {threads} > {output.trimmed}) {input} > {log} 2>&1
59
        '''
60
61
rule fastq_to_fasta_se:
62
    input:
63
        '{output_dir}/cutadapt/{sample_id}.fastq.gz'
64
    output:
65
        '{output_dir}/unmapped/{sample_id}/clean.fa.gz'
66
    threads:
67
        config['threads_compress']
68
    shell:
69
        '''pigz -d -c -p {threads} {input} | fastq_to_fasta -r | pigz -p {threads} -c > {output}
70
        '''
71
72
rule fastq_to_fasta_pe:
73
    input:
74
        '{output_dir}/cutadapt/{sample_id}_{pair_index}.fastq.gz'
75
    output:
76
        '{output_dir}/unmapped/{sample_id}/clean_{pair_index}.fa.gz'
77
    threads:
78
        config['threads_compress']
79
    wildcard_constraints:
80
        pair_index='[12]'
81
    shell:
82
        '''pigz -p {threads} -d -c {input} \
83
            | fastq_to_fasta -r -n \
84
            | pigz -p {threads} -c > {output}
85
        '''
86
87
rule cutadapt_pe:
88
    input:
89
        fastq1=auto_gzip_input(data_dir + '/fastq/{sample_id}_1.fastq'),
90
        fastq2=auto_gzip_input(data_dir + '/fastq/{sample_id}_2.fastq')
91
    output:
92
        fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz',
93
        fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz'
94
    threads:
95
        config['threads']
96
    params:
97
        quality_5p=config['min_base_quality_5p'],
98
        quality_3p=config['min_base_quality_3p'],
99
        adaptor1=lambda wildcards: '-a ' + config['adaptor1'] if len(config['adaptor1']) > 0 else '',
100
        adaptor2=lambda wildcards: '-A ' + config['adaptor2'] if len(config['adaptor2']) > 0 else '',
101
        miniL=config['min_read_length'],
102
        quality_base=config['quality_base']
103
    log:
104
        '{output_dir}/log/cutadapt/{sample_id}'
105
    shell:
106
        '''cutadapt -j {threads} -q {params.quality_5p},{params.quality_3p} \
107
            {params.adaptor1} {params.adaptor2} --trim-n -m {params.miniL} \
108
            -o >(gzip -c > {output.fastq1}) -p >(gzip -c > {output.fastq2}) \
109
            {input.fastq1} {input.fastq2} > {log} 2>&1
110
        '''
111
112
rule fastqc:
113
    input:
114
        data_dir + '/fastq/{sample_id}.fastq'
115
    output:
116
        html='{output_dir}/fastqc/{sample_id}_fastqc.html',
117
        zip='{output_dir}/fastqc/{sample_id}_fastqc.zip'
118
    params:
119
        output_prefix='{output_dir}/fastqc/',
120
        temp_dir=config['temp_dir']
121
    log:
122
        '{output_dir}/log/fastqc/{sample_id}'
123
    shell:
124
        '''fastqc -q -o {params.output_prefix} -d {params.temp_dir} {input}
125
        '''
126
127
def parse_fastqc_data(fp):
128
    section = None
129
    qc_status = OrderedDict()
130
    basic_statistics = OrderedDict()
131
    for line in fp:
132
        line = str(line, encoding='utf-8')
133
        line = line.strip()
134
        if line.startswith('>>'):
135
            if line == '>>END_MODULE':
136
                continue
137
            section, status = line[2:].split('\t')
138
            qc_status[section] = status
139
        else:
140
            if section == 'Basic Statistics':
141
                key, val = line.split('\t')
142
                basic_statistics[key] = val
143
    for key, val in qc_status.items():
144
        basic_statistics[key] = val
145
    return basic_statistics
146
        
147
rule summarize_fastqc:
148
    input:
149
        zip=lambda wildcards: expand('{output_dir}/fastqc/{sample_id}_fastqc.zip',
150
            output_dir=wildcards.output_dir, sample_id=sample_ids)
151
    output:
152
        '{output_dir}/summary/fastqc.txt'
153
    run:
154
        import pandas as pd
155
        from zipfile import ZipFile
156
        import os
157
        from collections import OrderedDict
158
159
        summary = OrderedDict()
160
        columns = None
161
        for filename in input.zip:
162
            sample_id = os.path.splitext(os.path.basename(filename))[0][:-7]
163
            with ZipFile(filename, 'r') as zf:
164
                with zf.open(sample_id + '_fastqc/fastqc_data.txt', 'r') as f:
165
                    summary[sample_id] = parse_fastqc_data(f)
166
                    if columns is None:
167
                        columns = list(summary[sample_id].keys())
168
        summary = pd.DataFrame.from_records(summary)
169
        summary = summary.T
170
        summary = summary.reindex(columns=columns)
171
        summary.index.name = 'sample_id'
172
        summary.to_csv(output[0], sep='\t', index=True, header=True) 
173
174
rule summarize_fastqc_ipynb:
175
    input:
176
        '{output_dir}/summary/fastqc.txt'
177
    output:
178
        ipynb='{output_dir}/summary/fastqc.ipynb'
179
    run:
180
        import nbformat
181
        from nbformat.v4 import new_notebook, new_code_cell, new_markdown_cell
182
        nb = new_notebook()
183
        nb['cells'].append(new_code_cell(r"""import pandas as pd
184
import numpy as np"""))
185
186
        nb['cells'].append(new_code_cell(r"""summary = pd.read_table('fastqc.txt', sep='\t')"""))
187
        nb['cells'].append(new_code_cell(r"""summary.set_index('sample_id', inplace=True, drop=False)
188
qc_status = summary.iloc[:, 9:].copy()
189
qc_status.fillna('NA')
190
qc_status = qc_status.astype('str')
191
sample_ids = qc_status.index.values
192
sections = qc_status.columns.values
193
def style_func(val):
194
    status, row, col = val.split('|')
195
    row, col = int(row), int(col)
196
    color = {'pass': 'green', 'fail': 'red', 'warn': 'orange'}.get(status, 'gray')
197
    return '<a href="../fastqc/{sample_id}_fastqc.html#M{section}" style="color: {color}">{status}</a>'.format(
198
        sample_id=sample_ids[row], color=color, status=status, section=col)
199
200
pd.DataFrame(qc_status.values \
201
             + '|' + np.arange(qc_status.shape[0]).astype('str')[:, np.newaxis] \
202
             + '|' + np.arange(qc_status.shape[1]).astype('str')[np.newaxis, :],
203
             index=qc_status.index, columns=qc_status.columns) \
204
    .style.format(style_func)"""))
205
        nb['metadata']['kernel_spec'] = {
206
            'display_name': 'Python 3',
207
            'language': 'python', 
208
            'name': 'python3'
209
        }
210
        nb['metadata']['laugnage_info'] = {
211
            'codemirror_mode': {
212
                'name': 'ipython',
213
                'version': 3
214
            },
215
            'name': 'python', 
216
            'file_extension': '.py', 
217
            'mimetype': 'text/x-python',
218
            'nbconvert_exporter': 'python',
219
            'version': '3.6'
220
        }
221
        nbformat.write(nb, output.ipynb)
222
223
rule summarize_fastqc_html:
224
    input:
225
        '{output_dir}/summary/fastqc.ipynb'
226
    output:
227
        '{output_dir}/summary/fastqc.html'
228
    shell:
229
        '''jupyter nbconvert --execute --to html \
230
            --HTMLExporter.exclude_code_cell=False \
231
            --HTMLExporter.exclude_input_prompt=True \
232
            --HTMLExporter.exclude_output_prompt=True \
233
            {input}
234
        '''
235
236
rule prinseq_se:
237
    input:
238
        fastq='{output_dir}/cutadapt/{sample_id}.fastq.gz'
239
    output:
240
        graph_data='{output_dir}/prinseq/{sample_id}.gd',
241
        html='{output_dir}/prinseq/{sample_id}.html'
242
    shell:
243
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose -fastq <(zcat {input.fastq}) \
244
            -ns_max_n 0 -graph_data {output.graph_data} -out_good null -out_bad null
245
        perl {tools_dir}/prinseq/prinseq-graphs.pl -i {output.graph_data} -html_all -o {output.html}
246
        '''
247
248
rule prinseq_pe:
249
    input:
250
        fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz',
251
        fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz'
252
    output:
253
        graph_data='{output_dir}/prinseq/{sample_id}.gd',
254
        html='{output_dir}/prinseq/{sample_id}.html'
255
    shell:
256
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose -fastq {input.fastq1} -fastq2 {input.fastq2} \
257
            -ns_max_n 0 -graph_data {output.graph_data} -out_good null -out_bad null
258
        perl {tools_dir}/prinseq/prinseq-graphs.pl -i {output.graph_data} -html_all -o {output.html}
259
        '''