[41c1e8]: / exseek / snakefiles / mapping_small.snakemake

Download this file

291 lines (265 with data), 11.0 kB

include: 'common.snakemake'

from collections import OrderedDict

data_dir = config['data_dir']
output_dir = config['output_dir']
rna_types = config['rna_types']
adaptor = config['adaptor']
min_read_length = config['min_read_length']
genome_dir = config['genome_dir']
max_read_length = config['max_read_length']
min_base_quality = config['min_base_quality']
temp_dir = config['temp_dir']

def get_all_inputs(wildcards):
    available_inputs = dict(
        summarize_read_counts=expand('{output_dir}/summary/read_counts.txt',
            output_dir=output_dir),
        mapped_read_length=expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
            output_dir=output_dir, sample_id=sample_ids),
        summarize_read_counts_jupyter=expand('{output_dir}/summary/read_counts.html',
            output_dir=output_dir),
        unmapped_other=expand('{output_dir}/unmapped/{sample_id}/other.fa.gz',
            output_dir=output_dir, sample_id=sample_ids),
        tbam=expand('{output_dir}/tbam/{sample_id}/{rna_type}.bam',
            output_dir=output_dir, sample_id=sample_ids, rna_type=config['rna_types']),
        map_genome=expand('{output_dir}/gbam/{sample_id}/genome.bam',
            output_dir=output_dir, sample_id=sample_ids)
    )
    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs

rule all:
    input:
        get_all_inputs

rule fastq_to_fasta_se:
    input:
        auto_gzip_input('{output_dir}/cutadapt/{sample_id}.fastq')
    output:
        '{output_dir}/unmapped/{sample_id}/clean.fa.gz'
    threads:
        config['threads_compress']
    shell:
        '''{bin_dir}/auto_uncompress {input} \
            | fastq_to_fasta -r -n \
            | pigz -p {threads} -c > {output}
        '''
        
# mapping statistics
rule read_counts_raw:
    input:
        data_dir + '/fastq/{sample_id}.fastq'
    output:
        '{output_dir}/stats/read_counts_raw/{sample_id}'
    shell:
        '''wc -l < {input} | awk '{{print int($0/4)}}' > {output}
        '''

rule read_counts_mapped:
    input:
        lambda wildcards: '{output_dir}/{bam_type}/{sample_id}/{rna_type}.bam'.format(
            output_dir=wildcards.output_dir, 
            bam_type='gbam' if wildcards.rna_type == 'other' else 'tbam',
            sample_id=wildcards.sample_id, rna_type=wildcards.rna_type
        )
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}'
    wildcard_constraints:
        rna_type='(?!promoter$)(?!enhancer$)(?!intron$)(?!repeats$)(?!genome).*'
    shell:
        '''bamtools count -in {input} > {output}
        '''

rule read_counts_unmapped:
    input:
        '{output_dir}/unmapped/{sample_id}/{rna_type}.fa.gz'
    output:
        '{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}'
    threads:
        config['threads_compress']
    shell:
        '''pigz -p {threads} -d -c {input} | wc -l | awk '{{print int($0/2)}}' > {output}
        '''

rule summarize_read_counts:
    input:
        mapped=lambda wildcards: expand('{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=sample_ids, 
            rna_type=config['rna_types'] + ['other', 'promoter', 'enhancer', 'intron', 'repeats']),
        unmapped=lambda wildcards: expand('{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=sample_ids, rna_type=['clean', 'other'])
    output:
        '{output_dir}/summary/read_counts.txt'
    run:
        import pandas as pd
        import os
        from collections import OrderedDict
    
        records = OrderedDict()
        for sample_id in sample_ids:
            records[sample_id] = {}
        for filename in input.mapped:
            sample_id, rna_type = filename.split(os.path.sep)[-2:]
            with open(filename, 'r') as f:
                records[sample_id][rna_type + '.mapped'] = int(f.read().strip())
        for filename in input.unmapped:
            sample_id, rna_type = filename.split(os.path.sep)[-2:]
            with open(filename, 'r') as f:
                records[sample_id][rna_type + '.unmapped'] = int(f.read().strip())
        records = pd.DataFrame.from_records(records)
        records.columns.name = 'sample_id'
        records.columns.name = 'item'
        records.index.name = 'reads_type'
        records.to_csv(output[0], sep='\t', header=True, index=True)

rule summarize_read_counts_jupyter:
    input:
        read_counts='{output_dir}/summary/read_counts.txt',
        jupyter=package_dir + '/templates/summarize_read_counts_small.ipynb'
    output:
        jupyter='{output_dir}/summary/read_counts.ipynb',
        html='{output_dir}/summary/read_counts.html'
    shell:
        '''cp {input.jupyter} {output.jupyter}
        jupyter nbconvert --execute --to html \
            --HTMLExporter.exclude_code_cell=False \
            --HTMLExporter.exclude_input_prompt=True \
            --HTMLExporter.exclude_output_prompt=True \
            {output.jupyter}
        '''

rule mapped_read_length:
    input:
        '{output_dir}/tbam/{sample_id}/{rna_type}.bam'
    output:
        '{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}'
    shell:
        '''{bin_dir}/statistics.py read_length_hist --max-length 600 -i {input} -o {output}
        '''

rule merge_mapped_read_length:
    input:
        lambda wildcards: expand('{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
    output:
        '{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
    run:
        import pandas as pd
        from collections import OrderedDict

        table = OrderedDict()
        for filename in input:
            rna_type = os.path.basename(filename)
            df = pd.read_table(filename, index_col=0)
            table[rna_type] = df['query']
        table = pd.DataFrame(table)
        table.to_csv(output[0], sep='\t', header=True, index=True)

rule bam_to_bed:
    input:
        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam'
    output:
        bed='{output_dir}/gbed/{sample_id}/{rna_type}.bed.gz'
    shell:
        '''bedtools bamtobed -i {input} | bedtools sort | pigz -c > {output}
        '''

rule count_genomic_reads:
    '''Count genomic reads with a defined order
    '''
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        intron=genome_dir + '/bed/long_RNA.intron.bed',
        promoter=genome_dir + '/bed/promoter.merged.bed',
        enhancer=genome_dir + '/bed/enhancer.merged.bed',
        repeats=genome_dir + '/bed/rmsk.bed'
    output:
        intron='{output_dir}/stats/read_counts_mapped/{sample_id}/intron',
        promoter='{output_dir}/stats/read_counts_mapped/{sample_id}/promoter',
        enhancer='{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer',
        repeats='{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
    threads:
        1
    shell:
        r'''pigz -c -d -p {threads} {input.reads} \
            | bedtools annotate -counts -s -i - -files {input.intron} {input.promoter} {input.enhancer} {input.repeats} \
            | awk 'BEGIN{{OFS="\t";FS="\t";intron=0;promoter=0;enhancer=0;repeats=0}}
            {{
                if($7 > 0) intron++
                else if($8 > 0) promoter++
                else if($9 > 0) enhancer++
                else if($10 > 0) repeats++
            }}END{{
                print intron > "{output.intron}"
                print promoter > "{output.promoter}"
                print enhancer > "{output.enhancer}"
                print repeats > "{output.repeats}"
            }}'
        '''

"""
rule count_reads_intron:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/long_RNA.intron.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/intron'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
        '''

rule count_reads_promoter:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/promoter.merged.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/promoter'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
        '''

rule count_reads_enhancer:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/enhancer.merged.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
        '''

rule count_reads_rmsk:
    input:
        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
        bed=genome_dir + '/bed/rmsk.bed'
    output:
        '{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
    shell:
        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
        '''

rule map_circRNA:
    input:
        reads='{output_dir}/unmapped/{sample_id}/other.fa.gz',
        index=genome_dir + '/index/bowtie2/circRNA.1.bt2'
    output:
        unmapped='{output_dir}/unmapped/{sample_id}/circRNA.fa.gz',
        unmapped_aligner='{output_dir}/unmapped/{sample_id}/circRNA.aligner.fa.gz',
        bam='{output_dir}/tbam/{sample_id}/circRNA.bam',
        bam_filtered = '{output_dir}/tbam/{sample_id}/circRNA.filtered.bam'
    params:
        index=genome_dir + '/index/bowtie2/circRNA'
    threads: 
        config['threads_mapping']
    shell:
        '''pigz -d -c {input.reads} \
        | bowtie2 -f -p {threads} --norc --sensitive --no-unal \
            --un-gz {output.unmapped_aligner} -x {params.index} - -S - \
        | {bin_dir}/preprocess.py filter_circrna_reads --filtered-file >(samtools view -b -o {output.bam_filtered}) \
        | samtools view -b -o {output.bam}

        {{
            pigz -d -c {output.unmapped_aligner}
            samtools fasta {output.bam_filtered}
        }} | pigz -c > {output.unmapped}
        '''
"""

rule map_genome:
    input:
        reads='{output_dir}/unmapped/{sample_id}/clean.fa.gz',
        index=genome_dir + '/genome_index/bowtie2/genome.1.bt2'
    output:
        unmapped='{output_dir}/unmapped/{sample_id}/genome.fa.gz',
        bam='{output_dir}/gbam/{sample_id}/genome.bam'
    params:
        index=genome_dir + '/genome_index/bowtie2/genome'
    shell:
        '''pigz -d -c {input.reads} \
        | bowtie2 -f -p {threads} --sensitive --no-unal \
            --un-gz {output.unmapped} -x {params.index} - -S - \
        | samtools view -b -o {output.bam}
        '''