# exseek/snakefiles/quality_control_pe.snakemake
# Quality-control workflow for paired-end small-RNA sequencing data.
include: 'common.snakemake'
def get_all_inputs(wildcards):
    '''Collect every final target of the paired-end QC workflow.

    Builds one list of paths per output group (per-mate FastQC reports and
    the MultiQC summary) and concatenates them for ``rule all``.
    ``output_dir`` and ``sample_ids`` come from common.snakemake.
    '''
    available_inputs = dict(
        fastqc=expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.html', 
            output_dir=output_dir, sample_id=sample_ids, mate_index=[1, 2]),
        multiqc=expand('{output_dir}/summary/fastqc.html', output_dir=output_dir)
    )
    # The original code filtered keys against a list of all keys, which always
    # passed; every group is enabled.  The dict structure is kept so groups
    # can be toggled (e.g. from config) later.
    inputs = []
    for input_list in available_inputs.values():
        inputs.extend(input_list)
    return inputs
# Default target: request every file produced by this workflow.
rule all:
    input:
        get_all_inputs
# Run FastQC on each mate (read 1 / read 2) of every paired-end sample.
rule fastqc_pe:
    input:
        # auto_gzip_input (from common.snakemake) presumably resolves the
        # plain or .gz variant of the FASTQ file — confirm against common.
        auto_gzip_input(data_dir + '/fastq/{sample_id}_{mate_index}.fastq')
    output:
        html='{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.html',
        zip='{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip'
    params:
        # FastQC -o is the output directory; file names are derived from input.
        output_prefix='{output_dir}/fastqc/',
        temp_dir=config['temp_dir']
    log:
        '{output_dir}/log/fastqc/{sample_id}_{mate_index}'
    wildcard_constraints:
        # restrict mate_index so it cannot swallow part of sample_id
        mate_index='[12]'
    shell:
        '''fastqc -q -o {params.output_prefix} -d {params.temp_dir} {input} > {log} 2>&1
        '''
# Aggregate all per-sample FastQC zip reports into a single MultiQC report.
rule multiqc_pe:
    input:
        fastqc=expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip',
            output_dir=output_dir, sample_id=sample_ids, mate_index=[1, 2])
    output:
        html='{output_dir}/summary/fastqc.html',
        # NOTE(review): assumes multiqc -n <report>.html writes its data
        # directory as '<report>_data' (here 'fastqc_data') — verify with the
        # installed multiqc version.
        data=directory('{output_dir}/summary/fastqc_data')
    params:
        fastqc_dir='{output_dir}/fastqc'
    wildcard_constraints:
        mate_index='[12]'
    shell:
        '''multiqc -m fastqc -n {output.html} {params.fastqc_dir}
        '''
# Extract the metrics from every FastQC zip archive into one TSV table
# with one row per (sample, mate).
rule summarize_fastqc_pe:
    input:
        zip=lambda wildcards: expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip',
            output_dir=wildcards.output_dir, sample_id=sample_ids, mate_index=['1', '2'])
    output:
        '{output_dir}/summary/fastqc.txt'
    wildcard_constraints:
        # NOTE(review): 'fastqc_step' is not used in any path of this rule —
        # looks like a leftover constraint; confirm before removing.
        fastqc_step='fastqc.*',
        mate_index='[12]'
    run:
        import pandas as pd
        from zipfile import ZipFile
        import os
        from collections import OrderedDict

        summary = OrderedDict()
        columns = None
        for filename in input.zip:
            # basename minus '.zip' (splitext) minus the 7-char '_fastqc'
            # suffix leaves '<sample_id>_<mate_index>'.
            sample_id = os.path.splitext(os.path.basename(filename))[0][:-7]
            with ZipFile(filename, 'r') as zf:
                with zf.open(sample_id + '_fastqc/fastqc_data.txt', 'r') as f:
                    # parse_fastqc_data is presumably provided by
                    # common.snakemake and returns an ordered metric mapping.
                    summary[sample_id] = parse_fastqc_data(f)
                    # Lock the column order to the first archive parsed.
                    if columns is None:
                        columns = list(summary[sample_id].keys())
        summary = pd.DataFrame.from_records(summary)
        # Samples arrive as columns; transpose to one row per sample.
        summary = summary.T
        summary = summary.reindex(columns=columns)
        summary.index.name = 'sample_id'
        summary.to_csv(output[0], sep='\t', index=True, header=True)
# Render the FastQC summary table through a Jupyter notebook template
# and export it to HTML.
rule summarize_fastqc_jupyter_pe:
    input:
        fastqc='{output_dir}/summary/fastqc.txt',
        jupyter=package_dir + '/templates/fastqc.ipynb'
    output:
        jupyter='{output_dir}/summary/fastqc_jupyter.ipynb',
        html='{output_dir}/summary/fastqc_jupyter.html'
    wildcard_constraints:
        # NOTE(review): 'fastqc_step' is not used in this rule's paths —
        # looks like a leftover; confirm before removing.
        fastqc_step='fastqc.*'
    run:
        # nbconvert_command is presumably defined in common.snakemake.
        shell(nbconvert_command)
# Compute prinseq-lite QC graph data on the adapter-trimmed paired reads.
rule prinseq_pe:
    input:
        fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz',
        fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz'
    output:
        '{output_dir}/prinseq/{sample_id}.gd'
    # Process substitution <(...) requires bash as the shell executor.
    # -out_good/-out_bad null: only the -graph_data file is produced,
    # no filtered reads are written.
    shell:
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose \
            -fastq  <({bin_dir}/auto_uncompress {input.fastq1}) \
            -fastq2 <({bin_dir}/auto_uncompress {input.fastq2}) \
            -ns_max_n 0 -graph_data {output} -out_good null -out_bad null
        
        '''
# Generate the HTML QC graphs from the prinseq graph-data file.
rule prinseq_graph_pe:
    input:
        '{output_dir}/prinseq/{sample_id}.gd'
    output:
        '{output_dir}/prinseq/{sample_id}.html'
    # NOTE(review): prinseq-graphs.pl -o is an output prefix and may append
    # '.html' itself, which would produce '<sample_id>.html.html' and leave
    # the declared output missing — verify against the prinseq version used.
    shell:
        '''perl {tools_dir}/prinseq/prinseq-graphs.pl -i {input} -html_all -o {output}
        '''
# Parse the per-sample cutadapt logs into one tab-separated summary table.
rule summarize_cutadapt_pe:
    input:
        lambda wildcards: expand('{output_dir}/log/cutadapt/{sample_id}',
            output_dir=wildcards.output_dir, sample_id=sample_ids)
    output:
        '{output_dir}/summary/cutadapt.txt'
    run:
        # 'import os' was missing here although os.path.basename is used
        # below (the summarize_fastqc_pe run block imports it explicitly,
        # so run blocks need their own imports).
        import os
        import re

        import pandas as pd

        # One regex per statistic in the cutadapt paired-end report; each
        # pattern has exactly one named group, named after its output column.
        patterns = [
            re.compile(r'Total read pairs processed:\s*(?P<total_pairs>[0-9,]+)'),
            re.compile(r'Read 1 with adapter:\s*(?P<read1_with_adapters>[0-9,]+)'),
            re.compile(r'Read 2 with adapter:\s*(?P<read2_with_adapters>[0-9,]+)'),
            re.compile(r'Pairs that were too short:\s*(?P<pairs_too_short>[0-9,]+)'),
            re.compile(r'Pairs written \(passing filters\):\s*(?P<pairs_kept>[0-9,]+)'),
            re.compile(r'Total basepairs processed:\s*(?P<total_bp>[0-9,]+) bp'),
            re.compile(r'Quality-trimmed:\s*(?P<bp_quality_trimmed>[0-9,]+) bp'),
            re.compile(r'Total written \(filtered\):\s*(?P<bp_kept>[0-9,]+) bp'),
            re.compile(r'Read 1:\s*(?P<bp_read1>[0-9,]+) bp'),
            re.compile(r'Read 2:\s*(?P<bp_read2>[0-9,]+) bp')
        ]

        columns = ['sample_id', 'total_pairs', 
            'read1_with_adapters', 'read2_with_adapters', 
            'pairs_too_short', 'pairs_kept',
            'total_bp', 'total_bp_read1', 'total_bp_read2',
            'bp_quality_trimmed', 'bp_quality_trimmed_read1', 'bp_quality_trimmed_read2',
            'bp_kept', 'bp_kept_read1', 'bp_kept_read2'
        ]
        summary = []
        for filename in input:
            # The log file is named after the sample it belongs to.
            sample_id = os.path.basename(filename)
            record = {'sample_id': sample_id}
            with open(filename, 'r') as fin:
                # Tracks which section (total bp / quality-trimmed / written)
                # the indented "Read 1:" / "Read 2:" lines belong to.
                section = None
                for line in fin:
                    line = line.strip()
                    for pat in patterns:
                        m = pat.search(line)
                        if m is None:
                            continue
                        d = m.groupdict()
                        key = list(d.keys())[0]
                        # Strip thousands separators; values stay strings so
                        # missing columns come out as NA rather than floats.
                        d[key] = ''.join(d[key].split(','))
                        if key in ('total_bp', 'bp_quality_trimmed', 'bp_kept'):
                            section = key
                            record[key] = d[key]
                        elif key in ('bp_read1', 'bp_read2'):
                            # e.g. 'total_bp' + '_read1' -> 'total_bp_read1'
                            record[section + key[2:]] = d[key]
                        else:
                            record[key] = d[key]
            summary.append(record)
        summary = pd.DataFrame.from_records(summary)
        summary = summary.reindex(columns=columns)
        summary.to_csv(output[0], sep='\t', na_rep='NA', index=False, header=True)
# Render the cutadapt summary table through a Jupyter notebook template
# and export it to HTML.
rule summarize_cutadapt_jupyter_pe:
    input:
        summary='{output_dir}/summary/cutadapt.txt',
        jupyter=package_dir + '/templates/summarize_cutadapt_pe.ipynb'
    output:
        jupyter='{output_dir}/summary/cutadapt.ipynb',
        html='{output_dir}/summary/cutadapt.html'
    run:
        # nbconvert_command is presumably defined in common.snakemake.
        shell(nbconvert_command)