# exseek/snakefiles/quality_control.snakemake
# Quality-control workflow (259 lines, 9.5 kB): adapter trimming (cutadapt),
# FastQC reports and summaries, FASTQ-to-FASTA cleaning, and PRINSEQ QC.

# Echo every shell command (set -x) so failed jobs are easy to debug from logs.
shell.prefix('set -x;')
# Shared definitions — config, data_dir, output_dir, sample_ids, tools_dir and
# auto_gzip_input are presumably provided by common.snakemake (verify there).
include: 'common.snakemake'

import os

def get_all_inputs(wildcards):
    '''Collect every quality-control target file for ``rule all``.

    Paired-end data requests per-sample prinseq HTML reports; single-end
    data requests per-sample FastQC zips, the combined FastQC summary
    table and HTML, and cleaned FASTA files.
    '''
    if config['paired_end']:
        available_inputs = dict(
            prinseq=expand('{output_dir}/prinseq/{sample_id}.html',
                output_dir=output_dir, sample_id=sample_ids)
        )
    else:
        available_inputs = dict(
            fastqc=expand('{output_dir}/fastqc/{sample_id}_fastqc.zip',
                output_dir=output_dir, sample_id=sample_ids),
            summarize_fastqc=expand('{output_dir}/summary/fastqc.txt',
                output_dir=output_dir),
            summarize_fastqc_html=expand('{output_dir}/summary/fastqc.html',
                output_dir=output_dir),
            clean=expand('{output_dir}/unmapped/{sample_id}/clean.fa.gz',
                output_dir=output_dir, sample_id=sample_ids)
        )
    # The original filtered available_inputs against a list of its own keys,
    # which always passed every group — flatten all target lists directly.
    inputs = []
    for targets in available_inputs.values():
        inputs += targets
    return inputs

# Default target: request every QC deliverable appropriate for the
# configured library layout (see get_all_inputs above).
rule all:
    input:
        get_all_inputs

def get_input_fastq_se(wildcards):
    '''Resolve the single-end FASTQ input for one sample.

    Prefers the gzipped files when every ``.fastq.gz`` exists on disk;
    otherwise falls back to the uncompressed ``.fastq`` paths.
    '''
    pattern = data_dir + '/fastq/{sample_id}.fastq'
    plain = expand(pattern, sample_id=wildcards.sample_id)
    gzipped = expand(pattern + '.gz', sample_id=wildcards.sample_id)
    missing_gz = [path for path in gzipped if not os.path.exists(path)]
    return plain if missing_gz else gzipped

# Trim the 3' adapter, trailing Ns and low-quality bases from single-end
# reads; output is recompressed on the fly via a process substitution
# feeding pigz (threads only parallelize the compression step).
rule cutadapt_se:
    input:
        auto_gzip_input(data_dir + '/fastq/{sample_id}.fastq')
    output:
        trimmed='{output_dir}/cutadapt/{sample_id}.fastq.gz'
    params:
        adaptor=config['adaptor'],  # -a: 3' adapter sequence
        min_read_length=config['min_read_length'],  # -m: drop shorter reads
        min_base_quality=config['min_base_quality']  # -q: quality cutoff
    log:
        '{output_dir}/log/cutadapt/{sample_id}'
    threads:
        config['threads_compress']
    shell:
        '''cutadapt -a {params.adaptor} -m {params.min_read_length} --trim-n -q {params.min_base_quality} \
            -o >(pigz -c -p {threads} > {output.trimmed}) {input} > {log} 2>&1
        '''

# Convert adapter-trimmed single-end reads to a gzipped "clean" FASTA file.
# NOTE(review): fastq_to_fasta is from FASTX-Toolkit; -r presumably renames
# read identifiers to sequential numbers — confirm with the installed version.
rule fastq_to_fasta_se:
    input:
        '{output_dir}/cutadapt/{sample_id}.fastq.gz'
    output:
        '{output_dir}/unmapped/{sample_id}/clean.fa.gz'
    threads:
        config['threads_compress']
    shell:
        '''pigz -d -c -p {threads} {input} | fastq_to_fasta -r | pigz -p {threads} -c > {output}
        '''

# Paired-end variant of the FASTQ-to-FASTA conversion; one mate per job,
# pair_index constrained to 1 or 2 by wildcard_constraints.
# NOTE(review): unlike the SE rule this adds -n, which presumably keeps
# reads containing N bases — confirm against FASTX-Toolkit docs.
rule fastq_to_fasta_pe:
    input:
        '{output_dir}/cutadapt/{sample_id}_{pair_index}.fastq.gz'
    output:
        '{output_dir}/unmapped/{sample_id}/clean_{pair_index}.fa.gz'
    threads:
        config['threads_compress']
    wildcard_constraints:
        pair_index='[12]'
    shell:
        '''pigz -p {threads} -d -c {input} \
            | fastq_to_fasta -r -n \
            | pigz -p {threads} -c > {output}
        '''

# Trim adapters, trailing Ns and low-quality ends from both mates in one
# cutadapt run; each output stream is gzipped via a process substitution.
rule cutadapt_pe:
    input:
        fastq1=auto_gzip_input(data_dir + '/fastq/{sample_id}_1.fastq'),
        fastq2=auto_gzip_input(data_dir + '/fastq/{sample_id}_2.fastq')
    output:
        fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz',
        fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz'
    threads:
        config['threads']
    params:
        quality_5p=config['min_base_quality_5p'],  # first value of -q
        quality_3p=config['min_base_quality_3p'],  # second value of -q
        # -a/-A flags are omitted entirely when the configured adapter is empty
        adaptor1=lambda wildcards: '-a ' + config['adaptor1'] if len(config['adaptor1']) > 0 else '',
        adaptor2=lambda wildcards: '-A ' + config['adaptor2'] if len(config['adaptor2']) > 0 else '',
        miniL=config['min_read_length'],  # -m: drop read pairs shorter than this
        quality_base=config['quality_base']  # NOTE(review): declared but unused in the shell command below
    log:
        '{output_dir}/log/cutadapt/{sample_id}'
    shell:
        '''cutadapt -j {threads} -q {params.quality_5p},{params.quality_3p} \
            {params.adaptor1} {params.adaptor2} --trim-n -m {params.miniL} \
            -o >(gzip -c > {output.fastq1}) -p >(gzip -c > {output.fastq2}) \
            {input.fastq1} {input.fastq2} > {log} 2>&1
        '''

# Run FastQC on the raw single-end FASTQ of one sample.
# Fix: the rule declares a log file but the command never wrote to it; the
# redirection below matches the logging style of the cutadapt rules.
rule fastqc:
    input:
        # NOTE(review): raw .fastq only, unlike other rules that use
        # auto_gzip_input — confirm raw inputs are always uncompressed here.
        data_dir + '/fastq/{sample_id}.fastq'
    output:
        html='{output_dir}/fastqc/{sample_id}_fastqc.html',
        zip='{output_dir}/fastqc/{sample_id}_fastqc.zip'
    params:
        output_prefix='{output_dir}/fastqc/',  # -o: report output directory
        temp_dir=config['temp_dir']  # -d: scratch space for FastQC
    log:
        '{output_dir}/log/fastqc/{sample_id}'
    shell:
        '''fastqc -q -o {params.output_prefix} -d {params.temp_dir} {input} > {log} 2>&1
        '''

def parse_fastqc_data(fp):
    '''Parse a ``fastqc_data.txt`` stream into an ordered mapping.

    Parameters
    ----------
    fp: iterable of bytes lines (e.g. a file opened from ``ZipFile.open``).

    Returns
    -------
    OrderedDict with the key/value rows of the 'Basic Statistics' module
    followed by the pass/warn/fail status of every module.
    '''
    # Import locally: the import inside the summarize_fastqc run block is
    # function-scoped there and does not reach this module-level function.
    from collections import OrderedDict

    section = None
    qc_status = OrderedDict()
    basic_statistics = OrderedDict()
    for line in fp:
        line = str(line, encoding='utf-8').strip()
        if line.startswith('>>'):
            if line == '>>END_MODULE':
                # Leave the module we just finished so stray lines between
                # modules are not misattributed to it.
                section = None
                continue
            # Module header: '>>Section name<TAB>status'
            section, status = line[2:].split('\t', 1)
            qc_status[section] = status
        elif section == 'Basic Statistics':
            # Data row: 'key<TAB>value'; maxsplit guards against tabs in value.
            key, val = line.split('\t', 1)
            basic_statistics[key] = val
    # Append the per-module QC statuses after the basic statistics fields.
    for key, val in qc_status.items():
        basic_statistics[key] = val
    return basic_statistics
        
# Aggregate per-sample FastQC results into one tab-separated table:
# one row per sample, Basic Statistics fields followed by module statuses.
rule summarize_fastqc:
    input:
        zip=lambda wildcards: expand('{output_dir}/fastqc/{sample_id}_fastqc.zip',
            output_dir=wildcards.output_dir, sample_id=sample_ids)
    output:
        '{output_dir}/summary/fastqc.txt'
    run:
        import pandas as pd
        from zipfile import ZipFile
        import os
        # NOTE(review): this import is local to the run block; confirm the
        # module-level parse_fastqc_data has OrderedDict in scope when called.
        from collections import OrderedDict

        summary = OrderedDict()
        columns = None
        for filename in input.zip:
            # Strip the trailing '_fastqc' (7 chars) from the zip basename.
            sample_id = os.path.splitext(os.path.basename(filename))[0][:-7]
            with ZipFile(filename, 'r') as zf:
                with zf.open(sample_id + '_fastqc/fastqc_data.txt', 'r') as f:
                    summary[sample_id] = parse_fastqc_data(f)
                    if columns is None:
                        # Column order is taken from the first sample.
                        columns = list(summary[sample_id].keys())
        summary = pd.DataFrame.from_records(summary)
        summary = summary.T  # transpose so samples become rows
        summary = summary.reindex(columns=columns)
        summary.index.name = 'sample_id'
        summary.to_csv(output[0], sep='\t', index=True, header=True) 

# Build a Jupyter notebook that renders the FastQC summary table as a
# colored pass/warn/fail grid with links into the per-sample HTML reports.
# Fix: metadata keys were wrong — 'kernel_spec' should be 'kernelspec' and
# 'laugnage_info' (typo) should be 'language_info' per the nbformat spec;
# otherwise kernel/language detection in downstream tooling fails.
rule summarize_fastqc_ipynb:
    input:
        '{output_dir}/summary/fastqc.txt'
    output:
        ipynb='{output_dir}/summary/fastqc.ipynb'
    run:
        import nbformat
        from nbformat.v4 import new_notebook, new_code_cell, new_markdown_cell
        nb = new_notebook()
        nb['cells'].append(new_code_cell(r"""import pandas as pd
import numpy as np"""))

        nb['cells'].append(new_code_cell(r"""summary = pd.read_table('fastqc.txt', sep='\t')"""))
        nb['cells'].append(new_code_cell(r"""summary.set_index('sample_id', inplace=True, drop=False)
qc_status = summary.iloc[:, 9:].copy()
qc_status.fillna('NA')
qc_status = qc_status.astype('str')
sample_ids = qc_status.index.values
sections = qc_status.columns.values
def style_func(val):
    status, row, col = val.split('|')
    row, col = int(row), int(col)
    color = {'pass': 'green', 'fail': 'red', 'warn': 'orange'}.get(status, 'gray')
    return '<a href="../fastqc/{sample_id}_fastqc.html#M{section}" style="color: {color}">{status}</a>'.format(
        sample_id=sample_ids[row], color=color, status=status, section=col)

pd.DataFrame(qc_status.values \
             + '|' + np.arange(qc_status.shape[0]).astype('str')[:, np.newaxis] \
             + '|' + np.arange(qc_status.shape[1]).astype('str')[np.newaxis, :],
             index=qc_status.index, columns=qc_status.columns) \
    .style.format(style_func)"""))
        # Standard nbformat metadata keys: 'kernelspec' and 'language_info'.
        nb['metadata']['kernelspec'] = {
            'display_name': 'Python 3',
            'language': 'python', 
            'name': 'python3'
        }
        nb['metadata']['language_info'] = {
            'codemirror_mode': {
                'name': 'ipython',
                'version': 3
            },
            'name': 'python', 
            'file_extension': '.py', 
            'mimetype': 'text/x-python',
            'nbconvert_exporter': 'python',
            'version': '3.6'
        }
        nbformat.write(nb, output.ipynb)

# Execute the summary notebook and render it to HTML. nbconvert writes
# fastqc.html next to fastqc.ipynb, which matches the declared output path.
rule summarize_fastqc_html:
    input:
        '{output_dir}/summary/fastqc.ipynb'
    output:
        '{output_dir}/summary/fastqc.html'
    shell:
        '''jupyter nbconvert --execute --to html \
            --HTMLExporter.exclude_code_cell=False \
            --HTMLExporter.exclude_input_prompt=True \
            --HTMLExporter.exclude_output_prompt=True \
            {input}
        '''

# Generate PRINSEQ graph data and an HTML QC report for single-end trimmed
# reads. Input is decompressed through a process substitution, presumably
# because prinseq-lite.pl cannot read gzipped FASTQ directly — confirm.
# QC only: -out_good/-out_bad null discard the filtered sequence output.
rule prinseq_se:
    input:
        fastq='{output_dir}/cutadapt/{sample_id}.fastq.gz'
    output:
        graph_data='{output_dir}/prinseq/{sample_id}.gd',
        html='{output_dir}/prinseq/{sample_id}.html'
    shell:
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose -fastq <(zcat {input.fastq}) \
            -ns_max_n 0 -graph_data {output.graph_data} -out_good null -out_bad null
        perl {tools_dir}/prinseq/prinseq-graphs.pl -i {output.graph_data} -html_all -o {output.html}
        '''

# Generate PRINSEQ graph data and an HTML QC report for paired-end trimmed
# reads. QC only: -out_good/-out_bad null discard the filtered sequences.
# Fix: the gzipped FASTQs were passed to prinseq-lite.pl directly, while the
# single-end rule decompresses via <(zcat ...); apply the same pattern here
# so prinseq receives plain FASTQ for both mates.
rule prinseq_pe:
    input:
        fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz',
        fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz'
    output:
        graph_data='{output_dir}/prinseq/{sample_id}.gd',
        html='{output_dir}/prinseq/{sample_id}.html'
    shell:
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose -fastq <(zcat {input.fastq1}) -fastq2 <(zcat {input.fastq2}) \
            -ns_max_n 0 -graph_data {output.graph_data} -out_good null -out_bad null
        perl {tools_dir}/prinseq/prinseq-graphs.pl -i {output.graph_data} -html_all -o {output.html}
        '''