include: 'common.snakemake'
from collections import OrderedDict
data_dir = config['data_dir']
output_dir = config['output_dir']
rna_types = config['rna_types']
adaptor = config['adaptor']
min_read_length = config['min_read_length']
genome_dir = config['genome_dir']
max_read_length = config['max_read_length']
min_base_quality = config['min_base_quality']
temp_dir = config['temp_dir']
def get_all_inputs(wildcards):
    available_inputs = dict(
        summarize_read_counts=expand('{output_dir}/summary/read_counts.txt',
            output_dir=output_dir),
        mapped_read_length=expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
            output_dir=output_dir, sample_id=sample_ids),
        summarize_read_counts_jupyter=expand('{output_dir}/summary/read_counts.html',
            output_dir=output_dir),
        unmapped_other=expand('{output_dir}/unmapped/{sample_id}/other.fa.gz',
            output_dir=output_dir, sample_id=sample_ids),
        tbam=expand('{output_dir}/tbam/{sample_id}/{rna_type}.bam',
            output_dir=output_dir, sample_id=sample_ids, rna_type=rna_types),
        map_genome=expand('{output_dir}/gbam/{sample_id}/genome.bam',
            output_dir=output_dir, sample_id=sample_ids)
    )
    # All target groups are enabled by default; remove keys from this list to
    # skip parts of the workflow.
    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs
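# For example (hypothetical edit, not part of the workflow), to skip the HTML
# report you could drop its key before the filtering loop above:
#   enabled_inputs.remove('summarize_read_counts_jupyter')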
rule all:
    input:
        get_all_inputs
rule fastq_to_fasta_se:
    input:
        auto_gzip_input('{output_dir}/cutadapt/{sample_id}.fastq')
    output:
        '{output_dir}/unmapped/{sample_id}/clean.fa.gz'
    threads:
        config['threads_compress']
    shell:
        '''{bin_dir}/auto_uncompress {input} \
        | fastq_to_fasta -r -n \
        | pigz -p {threads} -c > {output}
        '''
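# fastq_to_fasta (FASTX-Toolkit): -r renames reads to sequential numbers and
# -n keeps reads containing N bases. A rough equivalent for a plain
# 4-line-record FASTQ, as a sketch only (not part of the pipeline):
#   awk 'NR%4==1{printf(">%d\n", ++i)} NR%4==2{print}' input.fastq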
# Mapping statistics
rule read_counts_raw:
    input:
        data_dir + '/fastq/{sample_id}.fastq'
    output:
        '{output_dir}/stats/read_counts_raw/{sample_id}'
    shell:
        '''wc -l < {input} | awk '{{print int($0/4)}}' > {output}
        '''
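# A FASTQ record spans 4 lines (header, sequence, '+', qualities), so the raw
# read count is the line count divided by 4.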
rule read_counts_mapped:
    input:
        # Reads assigned to 'other' live in the genome BAM directory (gbam);
        # all RNA-type-specific alignments live in the transcriptome BAM
        # directory (tbam).
        lambda wildcards: '{output_dir}/{bam_type}/{sample_id}/{rna_type}.bam'.format(
            output_dir=wildcards.output_dir,
            bam_type='gbam' if wildcards.rna_type == 'other' else 'tbam',
            sample_id=wildcards.sample_id, rna_type=wildcards.rna_type
        )
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}'
    wildcard_constraints:
        # Negative lookaheads exclude the categories counted by
        # count_genomic_reads and the genome BAM itself.
        rna_type='(?!promoter$)(?!enhancer$)(?!intron$)(?!repeats$)(?!genome).*'
    shell:
        '''bamtools count -in {input} > {output}
        '''
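# A quick sanity check of the rna_type constraint above, runnable in a Python
# shell (sample values are hypothetical):
#   import re
#   pat = re.compile(r'(?!promoter$)(?!enhancer$)(?!intron$)(?!repeats$)(?!genome).*')
#   assert pat.fullmatch('miRNA') is not None
#   assert pat.fullmatch('promoter') is None and pat.fullmatch('genome') is None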
rule read_counts_unmapped:
    input:
        '{output_dir}/unmapped/{sample_id}/{rna_type}.fa.gz'
    output:
        '{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}'
    threads:
        config['threads_compress']
    shell:
        '''pigz -p {threads} -d -c {input} | wc -l | awk '{{print int($0/2)}}' > {output}
        '''
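# The unmapped FASTA is unwrapped (one header line plus one sequence line per
# read), so dividing the decompressed line count by 2 gives the read count.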
rule summarize_read_counts:
    input:
        mapped=lambda wildcards: expand('{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=sample_ids,
            rna_type=config['rna_types'] + ['other', 'promoter', 'enhancer', 'intron', 'repeats']),
        unmapped=lambda wildcards: expand('{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=sample_ids, rna_type=['clean', 'other'])
    output:
        '{output_dir}/summary/read_counts.txt'
    run:
        import os
        import pandas as pd
        from collections import OrderedDict

        records = OrderedDict()
        for sample_id in sample_ids:
            records[sample_id] = {}
        for filename in input.mapped:
            sample_id, rna_type = filename.split(os.path.sep)[-2:]
            with open(filename, 'r') as f:
                records[sample_id][rna_type + '.mapped'] = int(f.read().strip())
        for filename in input.unmapped:
            sample_id, rna_type = filename.split(os.path.sep)[-2:]
            with open(filename, 'r') as f:
                records[sample_id][rna_type + '.unmapped'] = int(f.read().strip())
        # Build the table from the dict of dicts: outer keys (sample ids)
        # become columns, inner keys (read categories) become the index.
        records = pd.DataFrame(records)
        records.columns.name = 'sample_id'
        records.index.name = 'reads_type'
        records.to_csv(output[0], sep='\t', header=True, index=True)
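# Resulting read_counts.txt layout (tab-separated; numbers are illustrative
# placeholders, not real output):
#   reads_type      sample_1  sample_2
#   miRNA.mapped    123456    98765
#   other.mapped    2345      3456
#   clean.unmapped  456789    345678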
rule summarize_read_counts_jupyter:
    input:
        read_counts='{output_dir}/summary/read_counts.txt',
        jupyter=package_dir + '/templates/summarize_read_counts_small.ipynb'
    output:
        jupyter='{output_dir}/summary/read_counts.ipynb',
        html='{output_dir}/summary/read_counts.html'
    shell:
        '''cp {input.jupyter} {output.jupyter}
        jupyter nbconvert --execute --to html \
        --HTMLExporter.exclude_code_cell=False \
        --HTMLExporter.exclude_input_prompt=True \
        --HTMLExporter.exclude_output_prompt=True \
        {output.jupyter}
        '''
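# nbconvert --execute runs the copied notebook with the notebook's directory
# as the working directory, presumably so the template can open
# read_counts.txt via a relative path; the rendered HTML is written next to
# the notebook as read_counts.html.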
rule mapped_read_length:
    input:
        '{output_dir}/tbam/{sample_id}/{rna_type}.bam'
    output:
        '{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}'
    shell:
        '''{bin_dir}/statistics.py read_length_hist --max-length 600 -i {input} -o {output}
        '''
rule merge_mapped_read_length:
    input:
        lambda wildcards: expand('{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
    output:
        '{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
    run:
        import os
        import pandas as pd
        from collections import OrderedDict

        table = OrderedDict()
        for filename in input:
            # Each per-RNA-type histogram file is named after the RNA type.
            rna_type = os.path.basename(filename)
            df = pd.read_csv(filename, sep='\t', index_col=0)
            table[rna_type] = df['query']
        table = pd.DataFrame(table)
        table.to_csv(output[0], sep='\t', header=True, index=True)
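# The merged table holds one row per read length and one column per RNA type
# (the 'query' count column taken from each per-type histogram).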
rule bam_to_bed:
    input:
        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam'
    output:
        bed='{output_dir}/gbed/{sample_id}/{rna_type}.bed.gz'
    shell:
        '''bedtools bamtobed -i {input.bam} | bedtools sort -i - | pigz -c > {output.bed}
        '''
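# bedtools bamtobed emits 6-column BED (chrom, start, end, read name, mapping
# quality as the score, strand); bedtools annotate below appends one count
# column per annotation file after these six.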
rule count_genomic_reads:
    '''Count genomic reads per annotation category, assigning each read to the
    first matching category in a fixed precedence order:
    intron > promoter > enhancer > repeats.
    '''
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        intron=genome_dir + '/bed/long_RNA.intron.bed',
        promoter=genome_dir + '/bed/promoter.merged.bed',
        enhancer=genome_dir + '/bed/enhancer.merged.bed',
        repeats=genome_dir + '/bed/rmsk.bed'
    output:
        intron='{output_dir}/stats/read_counts_mapped/{sample_id}/intron',
        promoter='{output_dir}/stats/read_counts_mapped/{sample_id}/promoter',
        enhancer='{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer',
        repeats='{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
    threads:
        1
    shell:
        r'''pigz -c -d -p {threads} {input.reads} \
        | bedtools annotate -counts -s -i - -files {input.intron} {input.promoter} {input.enhancer} {input.repeats} \
        | awk 'BEGIN{{OFS="\t";FS="\t";intron=0;promoter=0;enhancer=0;repeats=0}}
        {{
            if($7 > 0) intron++
            else if($8 > 0) promoter++
            else if($9 > 0) enhancer++
            else if($10 > 0) repeats++
        }}END{{
            print intron > "{output.intron}"
            print promoter > "{output.promoter}"
            print enhancer > "{output.enhancer}"
            print repeats > "{output.repeats}"
        }}'
        '''
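# Example of the precedence above: a read overlapping both an intron and a
# repeat (columns 7 and 10 both > 0) is counted once, as intronic, so the four
# counters sum to at most the number of input reads.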
"""
rule count_reads_intron:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/long_RNA.intron.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/intron'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
        '''
rule count_reads_promoter:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/promoter.merged.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/promoter'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
        '''
rule count_reads_enhancer:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/enhancer.merged.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
        '''
rule count_reads_rmsk:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/rmsk.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
        '''
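# Note: the intron and repeats rules intersect strand-aware (-s) while the
# promoter and enhancer rules do not, presumably because the merged
# promoter/enhancer intervals are unstranded. Also, -wa without -u reports one
# line per overlap, so a read spanning several intervals is counted more than
# once; count_genomic_reads above avoids this by counting each read once.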
rule map_circRNA:
    input:
        reads='{output_dir}/unmapped/{sample_id}/other.fa.gz',
        index=genome_dir + '/index/bowtie2/circRNA.1.bt2'
    output:
        unmapped='{output_dir}/unmapped/{sample_id}/circRNA.fa.gz',
        unmapped_aligner='{output_dir}/unmapped/{sample_id}/circRNA.aligner.fa.gz',
        bam='{output_dir}/tbam/{sample_id}/circRNA.bam',
        bam_filtered='{output_dir}/tbam/{sample_id}/circRNA.filtered.bam'
    params:
        index=genome_dir + '/index/bowtie2/circRNA'
    threads:
        config['threads_mapping']
    shell:
        '''pigz -d -c {input.reads} \
        | bowtie2 -f -p {threads} --norc --sensitive --no-unal \
            --un-gz {output.unmapped_aligner} -x {params.index} -U - -S - \
        | {bin_dir}/preprocess.py filter_circrna_reads --filtered-file >(samtools view -b -o {output.bam_filtered}) \
        | samtools view -b -o {output.bam}
        {{
            pigz -d -c {output.unmapped_aligner}
            samtools fasta {output.bam_filtered}
        }} | pigz -c > {output.unmapped}
        '''
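# The second block concatenates reads bowtie2 itself could not align with
# reads that aligned but failed the circRNA junction filter, so downstream
# rules see a single 'unmapped' FASTA. The >(...) process substitution
# requires bash as the shell executor.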
"""
rule map_genome:
    input:
        reads='{output_dir}/unmapped/{sample_id}/clean.fa.gz',
        index=genome_dir + '/genome_index/bowtie2/genome.1.bt2'
    output:
        unmapped='{output_dir}/unmapped/{sample_id}/genome.fa.gz',
        bam='{output_dir}/gbam/{sample_id}/genome.bam'
    params:
        index=genome_dir + '/genome_index/bowtie2/genome'
    threads:
        config['threads_mapping']
    shell:
        '''pigz -d -c {input.reads} \
        | bowtie2 -f -p {threads} --sensitive --no-unal \
            --un-gz {output.unmapped} -x {params.index} -U - -S - \
        | samtools view -b -o {output.bam}
        '''
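# Quick sanity check of the genome mapping output (run manually; paths are
# illustrative, not part of the workflow):
#   samtools quickcheck output/gbam/<sample_id>/genome.bam && echo OK
#   samtools flagstat output/gbam/<sample_id>/genome.bam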