# exseek/snakefiles/quality_control_se.snakemake
#
# Quality-control rules (FastQC, MultiQC, PRINSEQ).
1
include: 'common.snakemake'
2
3
import os
4
5
def get_all_inputs(wildcards):
    """Return the list of target files requested by ``rule all``.

    The targets are grouped by the step that produces them; the file lists of
    all enabled groups are concatenated in declaration order.

    Parameters
    ----------
    wildcards
        Argument supplied by Snakemake to input functions. It is not used
        here: every path is built from the module-level ``output_dir`` and
        ``sample_ids``.

    Returns
    -------
    list
        Paths of all enabled target files.
    """
    available_inputs = dict(
        fastqc=expand('{output_dir}/fastqc/{sample_id}_fastqc.zip',
            output_dir=output_dir, sample_id=sample_ids),
        summarize_fastqc=expand('{output_dir}/summary/fastqc.txt',
            output_dir=output_dir),
        summarize_fastqc_html=expand('{output_dir}/summary/fastqc.html',
            output_dir=output_dir)
    )
    # Every group is enabled at the moment; restrict this list to request
    # only a subset of the targets.
    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, paths in available_inputs.items():
        if key in enabled_inputs:
            inputs.extend(paths)
    return inputs
20
21
# Aggregate target: its only input is the file list built by get_all_inputs,
# so requesting this rule requests every enabled quality-control output.
rule all:
    input:
        get_all_inputs
24
25
# Run FastQC on the FASTQ file of one sample.
rule fastqc:
    input:
        # auto_gzip_input is defined outside this file; presumably it resolves
        # to either the plain or the gzip-compressed FASTQ -- TODO confirm.
        auto_gzip_input(data_dir + '/fastq/{sample_id}.fastq')
    output:
        html='{output_dir}/fastqc/{sample_id}_fastqc.html',
        zip='{output_dir}/fastqc/{sample_id}_fastqc.zip'
    params:
        # Directory handed to "fastqc -o"; both outputs above live in it.
        output_prefix='{output_dir}/fastqc/',
        # Directory handed to "fastqc -d" for FastQC's temporary files.
        temp_dir=config['temp_dir']
    log:
        '{output_dir}/log/fastqc/{sample_id}'
    shell:
        # -q: quiet mode; stdout and stderr are both captured in the log file.
        '''fastqc -q -o {params.output_prefix} -d {params.temp_dir} {input} > {log} 2>&1
        '''
39
40
# Collect the fastqc_data.txt of every sample into one tab-separated table
# with one row per sample.
rule summarize_fastqc:
    input:
        # One FastQC archive per sample, located under the requested
        # output_dir / fastqc_step wildcards.
        zip=lambda wildcards: expand('{output_dir}/{fastqc_step}/{sample_id}_fastqc.zip',
            output_dir=wildcards.output_dir, fastqc_step=wildcards.fastqc_step, sample_id=sample_ids)
    output:
        '{output_dir}/summary/{fastqc_step}.txt'
    wildcard_constraints:
        # Only step names starting with "fastqc" are handled by this rule.
        fastqc_step='fastqc.*'
    run:
        import pandas as pd
        from zipfile import ZipFile
        import os
        from collections import OrderedDict

        summary = OrderedDict()
        # Field order of the first parsed report; used to order the columns.
        columns = None
        for filename in input.zip:
            # "<sample_id>_fastqc.zip" -> "<sample_id>": drop the extension,
            # then the "_fastqc" suffix.
            sample_id = os.path.splitext(os.path.basename(filename))[0][:-len('_fastqc')]
            with ZipFile(filename, 'r') as zf:
                # The archive stores the report under "<sample_id>_fastqc/".
                with zf.open(sample_id + '_fastqc/fastqc_data.txt', 'r') as f:
                    # parse_fastqc_data is defined outside this file; its
                    # result is used as a mapping of field name -> value.
                    summary[sample_id] = parse_fastqc_data(f)
                    if columns is None:
                        columns = list(summary[sample_id].keys())
        summary = pd.DataFrame.from_records(summary)
        # Transpose so that samples become the rows, then restore the field
        # order recorded from the first report.
        summary = summary.T
        summary = summary.reindex(columns=columns)
        summary.index.name = 'sample_id'
        summary.to_csv(output[0], sep='\t', index=True, header=True)

# Disabled rule, kept as a bare string literal so that it is not registered:
# it would render the summary table through a Jupyter notebook template.
# nbconvert_command is not defined in this file.
"""
rule summarize_fastqc_jupyter:
    input:
        fastqc='{output_dir}/summary/{fastqc_step}.txt',
        jupyter=root_dir + '/templates/fastqc.ipynb'
    output:
        jupyter='{output_dir}/summary/{fastqc_step}.ipynb',
        html='{output_dir}/summary/{fastqc_step}.html'
    wildcard_constraints:
        fastqc_step='fastqc.*'
    run:
        shell(nbconvert_command)
"""
82
83
# Aggregate the per-sample FastQC archives into a single MultiQC HTML report.
rule multiqc:
    input:
        # Resolved from the output's own output_dir wildcard (the same pattern
        # summarize_fastqc uses) so that the required archives are always the
        # ones inside params.fastqc_dir, whichever output_dir is requested.
        fastqc=lambda wildcards: expand('{output_dir}/fastqc/{sample_id}_fastqc.zip',
            output_dir=wildcards.output_dir, sample_id=sample_ids)
    output:
        html='{output_dir}/summary/fastqc.html',
        # Data directory expected to be written by MultiQC next to the report
        # (report name with a "_data" suffix).
        data=directory('{output_dir}/summary/fastqc_data')
    params:
        # Directory scanned by MultiQC for FastQC results.
        fastqc_dir='{output_dir}/fastqc'
    shell:
        # -m fastqc: run only the FastQC module; -n: report file name.
        '''multiqc -m fastqc -n {output.html} {params.fastqc_dir}
        '''
95
96
97
# Compute PRINSEQ graph data for the adapter-trimmed reads of one sample.
rule prinseq_clean_se:
    input:
        fastq='{output_dir}/cutadapt/{sample_id}.fastq.gz'
    output:
        graph_data='{output_dir}/prinseq_clean/{sample_id}.gd'
    shell:
        # The gzip-compressed reads are decompressed on the fly through bash
        # process substitution. "-out_good null -out_bad null" suppresses the
        # filtered read files, so only the graph data file is produced.
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose -fastq <(zcat {input.fastq}) \
            -ns_max_n 0 -graph_data {output.graph_data} -out_good null -out_bad null
        '''
106
107
# Render the PRINSEQ graph data of one sample as an HTML report.
rule prinseq_graph_clean_se:
    input:
        '{output_dir}/prinseq_clean/{sample_id}.gd'
    output:
        '{output_dir}/prinseq_clean/{sample_id}.html'
    params:
        # prinseq-graphs.pl attaches the file extension to the -o value
        # itself, so pass the output path without ".html"; passing {output}
        # directly would yield "<sample_id>.html.html".
        output_prefix='{output_dir}/prinseq_clean/{sample_id}'
    shell:
        '''perl {tools_dir}/prinseq/prinseq-graphs.pl -i {input} -html_all -o {params.output_prefix}
        '''