# exseek/snakefiles/cutadapt_se.snakemake
# Print every shell command before it runs (bash `set -x`) to aid debugging.
shell.prefix('set -x;')
# Shared definitions — presumably provides output_dir, sample_ids, data_dir,
# auto_gzip_input, get_cutadapt_adapter_args and nbconvert_command used below
# (TODO confirm against common.snakemake).
include: 'common.snakemake'

import os
def get_all_inputs(wildcards):
    """Return the full list of target files requested by ``rule all``.

    Each key of ``available_inputs`` names one group of pipeline outputs
    (trimmed reads, cleaned FASTA, and the cutadapt summary report),
    expanded over all samples in ``sample_ids``.
    """
    available_inputs = dict(
        cutadapt=expand('{output_dir}/cutadapt/{sample_id}.fastq.gz',
            output_dir=output_dir, sample_id=sample_ids),
        clean=expand('{output_dir}/unmapped/{sample_id}/clean.fa.gz',
            output_dir=output_dir, sample_id=sample_ids),
        summarize_cutadapt=expand('{output_dir}/summary/cutadapt.html',
            output_dir=output_dir)
    )
    # FIX: the old code built enabled_inputs = list(available_inputs.keys())
    # and then kept only keys found in that list — a no-op filter that always
    # selected every group. Flatten all groups directly instead.
    inputs = []
    for group in available_inputs.values():
        inputs += group
    return inputs
22
# Default target: request every pipeline output for all samples
# (see get_all_inputs above for the list of files).
rule all:
    input:
        get_all_inputs
# Trim adapters and low-quality bases from single-end FASTQ reads,
# writing a gzip-compressed trimmed FASTQ plus a cutadapt report log.
rule cutadapt_se:
    input:
        auto_gzip_input(data_dir + '/fastq/{sample_id}.fastq')
    output:
        trimmed='{output_dir}/cutadapt/{sample_id}.fastq.gz'
    params:
        min_read_length=config['min_read_length'],
        min_base_quality=config['min_base_quality'],
        # Fixed-length 5'/3' trimming is only applied when configured (> 0);
        # cutadapt's -u trims from the 5' end for positive values and from
        # the 3' end for negative values.
        trim_5p=lambda wildcards: '-u {}'.format(config['trim_5p']) if config['trim_5p'] > 0 else '',
        trim_3p=lambda wildcards: '-u -{}'.format(config['trim_3p']) if config['trim_3p'] > 0 else '',
        adaptor=lambda wildcards: get_cutadapt_adapter_args(wildcards, config['adaptor'], '-a'),
        adaptor_5p=lambda wildcards: get_cutadapt_adapter_args(wildcards, config['adaptor_5p'], '-g'),
        umi_length=config['umi_length']  # NOTE(review): not referenced in the commands below — confirm intent
    log:
        '{output_dir}/log/cutadapt/{sample_id}'
    threads: 2
    run:
        if config['trim_after_adapter']:
            # Two-pass mode: remove adapters first, then apply fixed-length
            # trimming to the adapter-free reads.
            # BUG FIX: the first cutadapt previously used `2>&1`, merging its
            # stderr report into the FASTQ stream piped to the second cutadapt
            # and corrupting the data. Send each pass's report to the log
            # instead, and read pass 2 explicitly from stdin ('-').
            shell('''cutadapt {params.adaptor} {params.adaptor_5p} \
                -m {params.min_read_length} --trim-n -q {params.min_base_quality} {input} 2> {log} \
                | cutadapt {params.trim_5p} {params.trim_3p} \
                -o >(pigz -c -p {threads} > {output.trimmed}) - >> {log} 2>&1
                ''')
        else:
            # Single pass: fixed-length trimming is applied together with
            # adapter removal (cutadapt applies -u before adapter search).
            shell('''cutadapt {params.adaptor} {params.adaptor_5p} {params.trim_5p} {params.trim_3p} \
                -m {params.min_read_length} --trim-n -q {params.min_base_quality} \
                -o >(pigz -c -p {threads} > {output.trimmed}) {input} > {log} 2>&1
                ''')
# Parse each per-sample cutadapt report log and aggregate the read and
# basepair statistics into a single tab-separated summary table.
rule summarize_cutadapt_se:
    input:
        lambda wildcards: expand('{output_dir}/log/cutadapt/{sample_id}',
            output_dir=wildcards.output_dir, sample_id=sample_ids)
    output:
        '{output_dir}/summary/cutadapt.txt'
    run:
        import pandas as pd
        
        def parse_number(s):
            # Convert a cutadapt-formatted count such as '1,234,567' to int.
            return int(''.join(s.split(',')))

        columns = ['sample_id', 'total_reads', 'reads_with_adapters', 'reads_too_short', 'reads_kept',
            'total_bp', 'bp_quality_trimmed', 'bp_kept']
        summary = []
        for filename in input:
            # Log files are named after the sample id (no extension).
            sample_id = os.path.basename(filename)
            record = {'sample_id': sample_id}
            with open(filename, 'r') as fin:
                for line in fin:
                    line = line.strip()
                    # The negative indices below skip trailing tokens of the
                    # report lines (e.g. a '(xx.x%)' percentage and/or 'bp'),
                    # so only the raw count is parsed — positions assume the
                    # standard cutadapt report format.
                    if line.startswith('Total reads processed:'):
                        record['total_reads'] = parse_number(line.split()[-1])
                    elif line.startswith('Reads with adapters:'):
                        record['reads_with_adapters'] = parse_number(line.split()[-2])
                    elif line.startswith('Reads that were too short:'):
                        record['reads_too_short'] = parse_number(line.split()[-2])
                    elif line.startswith('Reads written (passing filters):'):
                        record['reads_kept'] = parse_number(line.split()[-2])
                    elif line.startswith('Total basepairs processed:'):
                        record['total_bp'] = parse_number(line.split()[-2])
                    elif line.startswith('Quality-trimmed:'):
                        record['bp_quality_trimmed'] = parse_number(line.split()[-3])
                    elif line.startswith('Total written (filtered):'):
                        record['bp_kept'] = parse_number(line.split()[-3])
            summary.append(record)
        summary = pd.DataFrame.from_records(summary)
        # reindex enforces a stable column order and leaves NaN for any stat
        # missing from a log; to_csv renders those as 'NA'.
        summary = summary.reindex(columns=columns)
        summary.to_csv(output[0], sep='\t', na_rep='NA', index=False, header=True)
# Render the cutadapt summary table through the notebook template into an
# executed notebook and an HTML report. nbconvert_command is defined
# elsewhere (presumably common.snakemake) — it is expected to consume
# input.summary/input.jupyter and produce output.jupyter/output.html.
rule summarize_cutadapt_jupyter_se:
    input:
        summary='{output_dir}/summary/cutadapt.txt',
        jupyter=package_dir + '/templates/summarize_cutadapt_se.ipynb'
    output:
        jupyter='{output_dir}/summary/cutadapt.ipynb',
        html='{output_dir}/summary/cutadapt.html'
    run:
        shell(nbconvert_command)
# Convert the trimmed FASTQ to a gzip-compressed FASTA ("clean" reads).
# The -r flag presumably renames reads to sequential numeric ids
# (FASTX-Toolkit behavior) — TODO confirm the installed fastq_to_fasta.
rule fastq_to_fasta_se:
    input:
        '{output_dir}/cutadapt/{sample_id}.fastq.gz'
    output:
        '{output_dir}/unmapped/{sample_id}/clean.fa.gz'
    threads:
        config['threads_compress']
    shell:
        '''pigz -d -c -p {threads} {input} | fastq_to_fasta -r | pigz -p {threads} -c > {output}
        '''