# exseek/snakefiles/quality_control_pe.snakemake

include: 'common.snakemake'

def get_all_inputs(wildcards):
    # Candidate target files for the "all" rule
    available_inputs = dict(
        fastqc=expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.html',
            output_dir=output_dir, sample_id=sample_ids, mate_index=[1, 2]),
        multiqc=expand('{output_dir}/summary/fastqc.html', output_dir=output_dir)
    )
    # All input groups are enabled here; the filter below is a hook for
    # disabling individual groups selectively.
    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs

rule all:
    input:
        get_all_inputs

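# Run FastQC on each mate of every sample. auto_gzip_input is assumed to be a
# helper from common.snakemake that resolves to the plain or the gzipped
# FASTQ file, whichever exists.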
rule fastqc_pe:
    input:
        auto_gzip_input(data_dir + '/fastq/{sample_id}_{mate_index}.fastq')
    output:
        html='{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.html',
        zip='{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip'
    params:
        output_prefix='{output_dir}/fastqc/',
        temp_dir=config['temp_dir']
    log:
        '{output_dir}/log/fastqc/{sample_id}_{mate_index}'
    wildcard_constraints:
        mate_index='[12]'
    shell:
        '''fastqc -q -o {params.output_prefix} -d {params.temp_dir} {input} > {log} 2>&1
        '''

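# Aggregate the per-sample FastQC zip archives into a single MultiQC report;
# MultiQC writes its parsed data to a fastqc_data directory next to the report.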
rule multiqc_pe:
    input:
        fastqc=expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip',
            output_dir=output_dir, sample_id=sample_ids, mate_index=[1, 2])
    output:
        html='{output_dir}/summary/fastqc.html',
        data=directory('{output_dir}/summary/fastqc_data')
    params:
        fastqc_dir='{output_dir}/fastqc'
    shell:
        '''multiqc -m fastqc -n {output.html} {params.fastqc_dir}
        '''

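# Collect the metrics from every FastQC zip archive into one tab-separated
# table with one row per FASTQ file (sample and mate).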
rule summarize_fastqc_pe:
    input:
        zip=lambda wildcards: expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip',
            output_dir=wildcards.output_dir, sample_id=sample_ids, mate_index=['1', '2'])
    output:
        '{output_dir}/summary/fastqc.txt'
    run:
        import pandas as pd
        from zipfile import ZipFile
        import os
        from collections import OrderedDict

        summary = OrderedDict()
        columns = None
        for filename in input.zip:
            # Strip the ".zip" extension and the "_fastqc" suffix to recover
            # the {sample_id}_{mate_index} stem
            sample_id = os.path.splitext(os.path.basename(filename))[0][:-7]
            with ZipFile(filename, 'r') as zf:
                with zf.open(sample_id + '_fastqc/fastqc_data.txt', 'r') as f:
                    # parse_fastqc_data is assumed to be defined in common.snakemake
                    summary[sample_id] = parse_fastqc_data(f)
                    # Keep the metric order from the first report for the output columns
                    if columns is None:
                        columns = list(summary[sample_id].keys())
        # from_records maps each sample to a column; transpose so that each
        # row is one FASTQ file, then restore the original metric order
        summary = pd.DataFrame.from_records(summary)
        summary = summary.T
        summary = summary.reindex(columns=columns)
        summary.index.name = 'sample_id'
        summary.to_csv(output[0], sep='\t', index=True, header=True)

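# Render the FastQC summary table through the bundled Jupyter notebook
# template. nbconvert_command is assumed to be defined in common.snakemake.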
rule summarize_fastqc_jupyter_pe:
    input:
        fastqc='{output_dir}/summary/fastqc.txt',
        jupyter=package_dir + '/templates/fastqc.ipynb'
    output:
        jupyter='{output_dir}/summary/fastqc_jupyter.ipynb',
        html='{output_dir}/summary/fastqc_jupyter.html'
    run:
        shell(nbconvert_command)

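# Compute prinseq graph data from the adapter-trimmed reads. No reads are
# written out (-out_good/-out_bad null); only the graph data file is kept.
# auto_uncompress is assumed to be a helper script that streams the FASTQ
# files uncompressed into the process substitutions.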
rule prinseq_pe:
    input:
        fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz',
        fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz'
    output:
        '{output_dir}/prinseq/{sample_id}.gd'
    shell:
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose \
            -fastq  <({bin_dir}/auto_uncompress {input.fastq1}) \
            -fastq2 <({bin_dir}/auto_uncompress {input.fastq2}) \
            -ns_max_n 0 -graph_data {output} -out_good null -out_bad null
        '''

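# Render the prinseq graph data into an HTML report.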
rule prinseq_graph_pe:
    input:
        '{output_dir}/prinseq/{sample_id}.gd'
    output:
        '{output_dir}/prinseq/{sample_id}.html'
    shell:
        '''perl {tools_dir}/prinseq/prinseq-graphs.pl -i {input} -html_all -o {output}
        '''

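# Parse the per-sample cutadapt log files into one tab-separated table of
# read-pair and base-pair statistics.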
rule summarize_cutadapt_pe:
    input:
        lambda wildcards: expand('{output_dir}/log/cutadapt/{sample_id}',
            output_dir=wildcards.output_dir, sample_id=sample_ids)
    output:
        '{output_dir}/summary/cutadapt.txt'
    run:
        import os
        import re
        import pandas as pd

        # Each pattern captures one summary field of cutadapt's paired-end
        # report; the name of its single named group doubles as the column name.
        patterns = [
            re.compile(r'Total read pairs processed:\s*(?P<total_pairs>[0-9,]+)'),
            re.compile(r'Read 1 with adapter:\s*(?P<read1_with_adapters>[0-9,]+)'),
            re.compile(r'Read 2 with adapter:\s*(?P<read2_with_adapters>[0-9,]+)'),
            re.compile(r'Pairs that were too short:\s*(?P<pairs_too_short>[0-9,]+)'),
            re.compile(r'Pairs written \(passing filters\):\s*(?P<pairs_kept>[0-9,]+)'),
            re.compile(r'Total basepairs processed:\s*(?P<total_bp>[0-9,]+) bp'),
            re.compile(r'Quality-trimmed:\s*(?P<bp_quality_trimmed>[0-9,]+) bp'),
            re.compile(r'Total written \(filtered\):\s*(?P<bp_kept>[0-9,]+) bp'),
            re.compile(r'Read 1:\s*(?P<bp_read1>[0-9,]+) bp'),
            re.compile(r'Read 2:\s*(?P<bp_read2>[0-9,]+) bp')
        ]

        def parse_number(s):
            # cutadapt formats counts with thousands separators, e.g. "1,234,567"
            return int(s.replace(',', ''))

        columns = ['sample_id', 'total_pairs', 
            'read1_with_adapters', 'read2_with_adapters', 
            'pairs_too_short', 'pairs_kept',
            'total_bp', 'total_bp_read1', 'total_bp_read2',
            'bp_quality_trimmed', 'bp_quality_trimmed_read1', 'bp_quality_trimmed_read2',
            'bp_kept', 'bp_kept_read1', 'bp_kept_read2'
        ]
        summary = []
        for filename in input:
            # Log files are named after the sample
            sample_id = os.path.basename(filename)
            record = {'sample_id': sample_id}
            with open(filename, 'r') as fin:
                # Track which base-pair section ("total", "quality-trimmed" or
                # "written") the indented "Read 1/Read 2" lines belong to
                section = None
                for line in fin:
                    line = line.strip()
                    for pat in patterns:
                        m = pat.search(line)
                        if m is None:
                            continue
                        d = m.groupdict()
                        key = list(d.keys())[0]
                        value = parse_number(d[key])
                        if key in ('total_bp', 'bp_quality_trimmed', 'bp_kept'):
                            # Section header; per-read counts follow
                            section = key
                            record[key] = value
                        elif key in ('bp_read1', 'bp_read2'):
                            # e.g. 'bp_read1' under 'total_bp' -> 'total_bp_read1'
                            record[section + key[2:]] = value
                        else:
                            record[key] = value
            summary.append(record)
        summary = pd.DataFrame.from_records(summary)
        summary = summary.reindex(columns=columns)
        summary.to_csv(output[0], sep='\t', na_rep='NA', index=False, header=True)

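# Render the cutadapt summary table through the bundled notebook template,
# again via nbconvert_command from common.snakemake.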
rule summarize_cutadapt_jupyter_pe:
    input:
        summary='{output_dir}/summary/cutadapt.txt',
        jupyter=package_dir + '/templates/summarize_cutadapt_pe.ipynb'
    output:
        jupyter='{output_dir}/summary/cutadapt.ipynb',
        html='{output_dir}/summary/cutadapt.html'
    run:
        shell(nbconvert_command)