include: 'common.snakemake'
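
# common.snakemake is expected to provide the shared globals used below:
# output_dir, data_dir, sample_ids, config, tools_dir, bin_dir, package_dir,
# the auto_gzip_input() and parse_fastqc_data() helpers, and nbconvert_command.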
def get_all_inputs(wildcards):
    available_inputs = dict(
        fastqc=expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.html',
            output_dir=output_dir, sample_id=sample_ids, mate_index=[1, 2]),
        multiqc=expand('{output_dir}/summary/fastqc.html', output_dir=output_dir)
    )
    # All input groups are currently enabled; restrict this list (e.g. from
    # config) to disable individual outputs.
    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs
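
# Default target: every report produced by this pipeline.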
rule all:
    input:
        get_all_inputs
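
# Run FastQC on each mate of every paired-end sample. auto_gzip_input()
# (from common.snakemake) presumably resolves the .fastq path to its
# .fastq.gz counterpart when only the compressed file exists.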
rule fastqc_pe:
    input:
        auto_gzip_input(data_dir + '/fastq/{sample_id}_{mate_index}.fastq')
    output:
        html='{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.html',
        zip='{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip'
    params:
        output_prefix='{output_dir}/fastqc/',
        temp_dir=config['temp_dir']
    log:
        '{output_dir}/log/fastqc/{sample_id}_{mate_index}'
    wildcard_constraints:
        mate_index='[12]'
    shell:
        '''fastqc -q -o {params.output_prefix} -d {params.temp_dir} {input} > {log} 2>&1
        '''
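
# Aggregate all per-sample FastQC reports into a single MultiQC report.
# MultiQC writes its parsed tables to a sibling directory named after the
# report (here fastqc_data), which is declared as a directory() output.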
rule multiqc_pe:
    input:
        fastqc=expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip',
            output_dir=output_dir, sample_id=sample_ids, mate_index=[1, 2])
    output:
        html='{output_dir}/summary/fastqc.html',
        data=directory('{output_dir}/summary/fastqc_data')
    params:
        fastqc_dir='{output_dir}/fastqc'
    shell:
        '''multiqc -m fastqc -n {output.html} {params.fastqc_dir}
        '''
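
# Parse fastqc_data.txt inside each FastQC zip into one row per sample,
# producing a tab-separated sample x metric table.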
rule summarize_fastqc_pe:
    input:
        zip=lambda wildcards: expand('{output_dir}/fastqc/{sample_id}_{mate_index}_fastqc.zip',
            output_dir=wildcards.output_dir, sample_id=sample_ids, mate_index=['1', '2'])
    output:
        '{output_dir}/summary/fastqc.txt'
    run:
        import os
        from collections import OrderedDict
        from zipfile import ZipFile
        import pandas as pd

        summary = OrderedDict()
        columns = None
        for filename in input.zip:
            # Strip the trailing '_fastqc' from e.g. 'sampleA_1_fastqc.zip'
            sample_id = os.path.splitext(os.path.basename(filename))[0][:-7]
            with ZipFile(filename, 'r') as zf:
                with zf.open(sample_id + '_fastqc/fastqc_data.txt', 'r') as f:
                    summary[sample_id] = parse_fastqc_data(f)
                    if columns is None:
                        columns = list(summary[sample_id].keys())
        summary = pd.DataFrame.from_records(summary)
        summary = summary.T
        summary = summary.reindex(columns=columns)
        summary.index.name = 'sample_id'
        summary.to_csv(output[0], sep='\t', index=True, header=True)
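
# parse_fastqc_data() comes from common.snakemake. As a rough illustration of
# what it is assumed to do, the hypothetical sketch below reads the byte
# stream of a fastqc_data.txt, keeps the Basic Statistics fields and the
# pass/warn/fail status of each module, and returns them as an OrderedDict.
# It is not used by any rule here.
def _parse_fastqc_data_sketch(f):
    from collections import OrderedDict
    record = OrderedDict()
    module = None
    for line in f:
        line = line.decode('utf-8').rstrip('\n')  # zf.open() yields bytes
        if line.startswith('>>END_MODULE'):
            module = None
        elif line.startswith('>>'):
            # Module header: '>>Module Name<tab>pass|warn|fail'
            module, status = line[2:].split('\t')
            record[module] = status
        elif module == 'Basic Statistics' and not line.startswith('#'):
            key, value = line.split('\t')
            record[key] = value
    return record

# Render the FastQC summary table through the bundled notebook template.
# nbconvert_command (from common.snakemake) is expected to parameterize and
# execute the template with jupyter-nbconvert.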
rule summarize_fastqc_jupyter_pe:
    input:
        fastqc='{output_dir}/summary/fastqc.txt',
        jupyter=package_dir + '/templates/fastqc.ipynb'
    output:
        jupyter='{output_dir}/summary/fastqc_jupyter.ipynb',
        html='{output_dir}/summary/fastqc_jupyter.html'
    run:
        shell(nbconvert_command)
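
# Screen the cutadapt-trimmed reads with PRINSEQ. {bin_dir}/auto_uncompress
# presumably streams the (possibly gzipped) FASTQ to stdout, so both mates
# are fed through bash process substitution without writing temporary files;
# -out_good/-out_bad null discards the reads and keeps only the graph data.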
rule prinseq_pe:
    input:
        fastq1='{output_dir}/cutadapt/{sample_id}_1.fastq.gz',
        fastq2='{output_dir}/cutadapt/{sample_id}_2.fastq.gz'
    output:
        '{output_dir}/prinseq/{sample_id}.gd'
    shell:
        '''perl {tools_dir}/prinseq/prinseq-lite.pl -verbose \
            -fastq <({bin_dir}/auto_uncompress {input.fastq1}) \
            -fastq2 <({bin_dir}/auto_uncompress {input.fastq2}) \
            -ns_max_n 0 -graph_data {output} -out_good null -out_bad null
        '''
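
# Render the PRINSEQ graph data as an HTML report.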
rule prinseq_graph_pe:
    input:
        '{output_dir}/prinseq/{sample_id}.gd'
    output:
        '{output_dir}/prinseq/{sample_id}.html'
    shell:
        '''perl {tools_dir}/prinseq/prinseq-graphs.pl -i {input} -html_all -o {output}
        '''
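
# Scrape the per-sample cutadapt logs into one table. A paired-end cutadapt
# report contains lines of roughly this shape (numbers are illustrative):
#
#   Total read pairs processed:          1,000,000
#     Read 1 with adapter:                 800,000 (80.0%)
#     Read 2 with adapter:                 790,000 (79.0%)
#   Pairs that were too short:              10,000 (1.0%)
#   Pairs written (passing filters):       990,000 (99.0%)
#   Total basepairs processed: 150,000,000 bp
#     Read 1:  75,000,000 bp
#     Read 2:  75,000,000 bp
#   Quality-trimmed:             1,050,000 bp (0.7%)
#   Total written (filtered): 148,000,000 bp (98.7%)
#
# The indented 'Read 1:'/'Read 2:' byte counts are ambiguous on their own, so
# the parser tracks which section ('total_bp', 'bp_quality_trimmed',
# 'bp_kept') it is in and prefixes the keys accordingly.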
rule summarize_cutadapt_pe:
    input:
        lambda wildcards: expand('{output_dir}/log/cutadapt/{sample_id}',
            output_dir=wildcards.output_dir, sample_id=sample_ids)
    output:
        '{output_dir}/summary/cutadapt.txt'
    run:
        import os
        import re
        import pandas as pd

        patterns = [
            re.compile(r'Total read pairs processed:\s*(?P<total_pairs>[0-9,]+)'),
            re.compile(r'Read 1 with adapter:\s*(?P<read1_with_adapters>[0-9,]+)'),
            re.compile(r'Read 2 with adapter:\s*(?P<read2_with_adapters>[0-9,]+)'),
            re.compile(r'Pairs that were too short:\s*(?P<pairs_too_short>[0-9,]+)'),
            re.compile(r'Pairs written \(passing filters\):\s*(?P<pairs_kept>[0-9,]+)'),
            re.compile(r'Total basepairs processed:\s*(?P<total_bp>[0-9,]+) bp'),
            re.compile(r'Quality-trimmed:\s*(?P<bp_quality_trimmed>[0-9,]+) bp'),
            re.compile(r'Total written \(filtered\):\s*(?P<bp_kept>[0-9,]+) bp'),
            re.compile(r'Read 1:\s*(?P<bp_read1>[0-9,]+) bp'),
            re.compile(r'Read 2:\s*(?P<bp_read2>[0-9,]+) bp')
        ]

        def parse_number(s):
            # Convert a comma-grouped count such as '1,234,567' to an int
            return int(''.join(s.split(',')))

        columns = ['sample_id', 'total_pairs',
            'read1_with_adapters', 'read2_with_adapters',
            'pairs_too_short', 'pairs_kept',
            'total_bp', 'total_bp_read1', 'total_bp_read2',
            'bp_quality_trimmed', 'bp_quality_trimmed_read1', 'bp_quality_trimmed_read2',
            'bp_kept', 'bp_kept_read1', 'bp_kept_read2'
        ]
        summary = []
        for filename in input:
            # Log files are named after the sample
            sample_id = os.path.basename(filename)
            record = {'sample_id': sample_id}
            with open(filename, 'r') as fin:
                section = None
                for line in fin:
                    line = line.strip()
                    for pat in patterns:
                        m = pat.search(line)
                        if m is None:
                            continue
                        # Each pattern has exactly one named group
                        key, value = next(iter(m.groupdict().items()))
                        value = parse_number(value)
                        if key in ('total_bp', 'bp_quality_trimmed', 'bp_kept'):
                            # Section header: the 'Read 1:'/'Read 2:' lines
                            # that follow belong to this section
                            section = key
                            record[key] = value
                        elif key in ('bp_read1', 'bp_read2'):
                            # e.g. 'bp_read1' under 'total_bp' -> 'total_bp_read1'
                            record[section + key[2:]] = value
                        else:
                            record[key] = value
            summary.append(record)
        summary = pd.DataFrame.from_records(summary)
        summary = summary.reindex(columns=columns)
        summary.to_csv(output[0], sep='\t', na_rep='NA', index=False, header=True)
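
# Render the cutadapt summary table through the bundled notebook template.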
rule summarize_cutadapt_jupyter_pe:
    input:
        summary='{output_dir}/summary/cutadapt.txt',
        jupyter=package_dir + '/templates/summarize_cutadapt_pe.ipynb'
    output:
        jupyter='{output_dir}/summary/cutadapt.ipynb',
        html='{output_dir}/summary/cutadapt.html'
    run:
        shell(nbconvert_command)