--- /dev/null
+++ b/exseek/snakefiles/mapping_small.snakemake
@@ -0,0 +1,290 @@
+include: 'common.snakemake'
+
+from collections import OrderedDict
+
+data_dir = config['data_dir']
+output_dir = config['output_dir']
+rna_types = config['rna_types']
+adaptor = config['adaptor']
+min_read_length = config['min_read_length']
+genome_dir = config['genome_dir']
+max_read_length = config['max_read_length']
+min_base_quality = config['min_base_quality']
+temp_dir = config['temp_dir']
+
+def get_all_inputs(wildcards):
+    available_inputs = dict(
+        summarize_read_counts=expand('{output_dir}/summary/read_counts.txt',
+            output_dir=output_dir),
+        mapped_read_length=expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
+            output_dir=output_dir, sample_id=sample_ids),
+        summarize_read_counts_jupyter=expand('{output_dir}/summary/read_counts.html',
+            output_dir=output_dir),
+        unmapped_other=expand('{output_dir}/unmapped/{sample_id}/other.fa.gz',
+            output_dir=output_dir, sample_id=sample_ids),
+        tbam=expand('{output_dir}/tbam/{sample_id}/{rna_type}.bam',
+            output_dir=output_dir, sample_id=sample_ids, rna_type=config['rna_types']),
+        map_genome=expand('{output_dir}/gbam/{sample_id}/genome.bam',
+            output_dir=output_dir, sample_id=sample_ids)
+    )
+    # all input groups are enabled by default; remove keys from this list to
+    # disable individual groups
+    enabled_inputs = list(available_inputs.keys())
+    inputs = []
+    for key, l in available_inputs.items():
+        if key in enabled_inputs:
+            inputs += l
+    return inputs
+
+rule all:
+    input:
+        get_all_inputs
+
+rule fastq_to_fasta_se:
+    input:
+        auto_gzip_input('{output_dir}/cutadapt/{sample_id}.fastq')
+    output:
+        '{output_dir}/unmapped/{sample_id}/clean.fa.gz'
+    threads:
+        config['threads_compress']
+    shell:
+        '''{bin_dir}/auto_uncompress {input} \
+        | fastq_to_fasta -r -n \
+        | pigz -p {threads} -c > {output}
+        '''
+
+# mapping statistics
+rule read_counts_raw:
+    input:
+        data_dir + '/fastq/{sample_id}.fastq'
+    output:
+        '{output_dir}/stats/read_counts_raw/{sample_id}'
+    shell:
+        '''wc -l < {input} | awk '{{print int($0/4)}}' > {output}
+        '''
+
+rule read_counts_mapped:
+    input:
+        lambda wildcards: '{output_dir}/{bam_type}/{sample_id}/{rna_type}.bam'.format(
+            output_dir=wildcards.output_dir,
+            bam_type='gbam' if wildcards.rna_type == 'other' else 'tbam',
+            sample_id=wildcards.sample_id, rna_type=wildcards.rna_type
+        )
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}'
+    wildcard_constraints:
+        rna_type='(?!promoter$)(?!enhancer$)(?!intron$)(?!repeats$)(?!genome).*'
+    shell:
+        '''bamtools count -in {input} > {output}
+        '''
+
+rule read_counts_unmapped:
+    input:
+        '{output_dir}/unmapped/{sample_id}/{rna_type}.fa.gz'
+    output:
+        '{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}'
+    threads:
+        config['threads_compress']
+    shell:
+        '''pigz -p {threads} -d -c {input} | wc -l | awk '{{print int($0/2)}}' > {output}
+        '''
+
+rule summarize_read_counts:
+    input:
+        mapped=lambda wildcards: expand('{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}',
+            output_dir=wildcards.output_dir, sample_id=sample_ids,
+            rna_type=config['rna_types'] + ['other', 'promoter', 'enhancer', 'intron', 'repeats']),
+        unmapped=lambda wildcards: expand('{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}',
+            output_dir=wildcards.output_dir, sample_id=sample_ids, rna_type=['clean', 'other'])
+    output:
+        '{output_dir}/summary/read_counts.txt'
+    run:
+        import pandas as pd
+        import os
+        from collections import OrderedDict
+
+        records = OrderedDict()
+        for sample_id in sample_ids:
+            records[sample_id] = {}
+        for filename in input.mapped:
+            sample_id, rna_type = filename.split(os.path.sep)[-2:]
+            with open(filename, 'r') as f:
+                records[sample_id][rna_type + '.mapped'] = int(f.read().strip())
+        for filename in input.unmapped:
+            sample_id, rna_type = filename.split(os.path.sep)[-2:]
+            with open(filename, 'r') as f:
+                records[sample_id][rna_type + '.unmapped'] = int(f.read().strip())
+        # build a table with one column per sample and one row per read category
+        records = pd.DataFrame(records)
+        records.columns.name = 'sample_id'
+        records.index.name = 'reads_type'
+        records.to_csv(output[0], sep='\t', header=True, index=True)
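+
+# Illustration only (not part of the workflow): a minimal sketch of how the
+# summary written above can be consumed downstream. The path is an assumed
+# example; rows are categories such as 'miRNA.mapped', columns are samples.
+#
+#   import pandas as pd
+#   counts = pd.read_table('output/summary/read_counts.txt', index_col=0)
+#   mapped = counts.loc[counts.index.str.endswith('.mapped')]
+#   # per-sample fraction of each category among mapped reads
+#   fractions = mapped / mapped.sum(axis=0)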
+
+rule summarize_read_counts_jupyter:
+    input:
+        read_counts='{output_dir}/summary/read_counts.txt',
+        jupyter=package_dir + '/templates/summarize_read_counts_small.ipynb'
+    output:
+        jupyter='{output_dir}/summary/read_counts.ipynb',
+        html='{output_dir}/summary/read_counts.html'
+    shell:
+        '''cp {input.jupyter} {output.jupyter}
+        jupyter nbconvert --execute --to html \
+            --HTMLExporter.exclude_code_cell=False \
+            --HTMLExporter.exclude_input_prompt=True \
+            --HTMLExporter.exclude_output_prompt=True \
+            {output.jupyter}
+        '''
+
+rule mapped_read_length:
+    input:
+        '{output_dir}/tbam/{sample_id}/{rna_type}.bam'
+    output:
+        '{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}'
+    shell:
+        '''{bin_dir}/statistics.py read_length_hist --max-length 600 -i {input} -o {output}
+        '''
+
+rule merge_mapped_read_length:
+    input:
+        lambda wildcards: expand('{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}',
+            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
+    output:
+        '{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
+    run:
+        import os
+        import pandas as pd
+        from collections import OrderedDict
+
+        table = OrderedDict()
+        for filename in input:
+            # each per-RNA-type input file is named after its RNA type
+            rna_type = os.path.basename(filename)
+            df = pd.read_table(filename, index_col=0)
+            table[rna_type] = df['query']
+        table = pd.DataFrame(table)
+        table.to_csv(output[0], sep='\t', header=True, index=True)
+
+rule bam_to_bed:
+    input:
+        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam'
+    output:
+        bed='{output_dir}/gbed/{sample_id}/{rna_type}.bed.gz'
+    shell:
+        '''bedtools bamtobed -i {input.bam} | bedtools sort -i - | pigz -c > {output.bed}
+        '''
+
+rule count_genomic_reads:
+    '''Count genome-mapped reads by feature, assigning each read to a single
+    category in a fixed priority order (intron > promoter > enhancer > repeats)
+    '''
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        intron=genome_dir + '/bed/long_RNA.intron.bed',
+        promoter=genome_dir + '/bed/promoter.merged.bed',
+        enhancer=genome_dir + '/bed/enhancer.merged.bed',
+        repeats=genome_dir + '/bed/rmsk.bed'
+    output:
+        intron='{output_dir}/stats/read_counts_mapped/{sample_id}/intron',
+        promoter='{output_dir}/stats/read_counts_mapped/{sample_id}/promoter',
+        enhancer='{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer',
+        repeats='{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
+    threads:
+        1
+    shell:
+        r'''pigz -c -d -p {threads} {input.reads} \
+        | bedtools annotate -counts -s -i - -files {input.intron} {input.promoter} {input.enhancer} {input.repeats} \
+        | awk 'BEGIN{{OFS="\t";FS="\t";intron=0;promoter=0;enhancer=0;repeats=0}}
+        {{
+            if($7 > 0) intron++
+            else if($8 > 0) promoter++
+            else if($9 > 0) enhancer++
+            else if($10 > 0) repeats++
+        }}END{{
+            print intron > "{output.intron}"
+            print promoter > "{output.promoter}"
+            print enhancer > "{output.enhancer}"
+            print repeats > "{output.repeats}"
+        }}'
+        '''
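+
+# In the awk filter above, columns 7-10 carry the per-file overlap counts
+# appended by `bedtools annotate -counts`, in the order the annotation files
+# are passed to `-files`, so each read is assigned to exactly one category by
+# priority. A rough Python equivalent of that rule, for illustration only:
+#
+#   def assign_category(intron, promoter, enhancer, repeats):
+#       for name, count in (('intron', intron), ('promoter', promoter),
+#                           ('enhancer', enhancer), ('repeats', repeats)):
+#           if count > 0:
+#               return name
+#       return None  # overlaps none of the annotations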
+
+"""
+rule count_reads_intron:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/long_RNA.intron.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/intron'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule count_reads_promoter:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/promoter.merged.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/promoter'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule count_reads_enhancer:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/enhancer.merged.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule count_reads_rmsk:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/rmsk.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule map_circRNA:
+    input:
+        reads='{output_dir}/unmapped/{sample_id}/other.fa.gz',
+        index=genome_dir + '/index/bowtie2/circRNA.1.bt2'
+    output:
+        unmapped='{output_dir}/unmapped/{sample_id}/circRNA.fa.gz',
+        unmapped_aligner='{output_dir}/unmapped/{sample_id}/circRNA.aligner.fa.gz',
+        bam='{output_dir}/tbam/{sample_id}/circRNA.bam',
+        bam_filtered='{output_dir}/tbam/{sample_id}/circRNA.filtered.bam'
+    params:
+        index=genome_dir + '/index/bowtie2/circRNA'
+    threads:
+        config['threads_mapping']
+    shell:
+        '''pigz -d -c {input.reads} \
+        | bowtie2 -f -p {threads} --norc --sensitive --no-unal \
+            --un-gz {output.unmapped_aligner} -x {params.index} - -S - \
+        | {bin_dir}/preprocess.py filter_circrna_reads --filtered-file >(samtools view -b -o {output.bam_filtered}) \
+        | samtools view -b -o {output.bam}
+
+        {{
+        pigz -d -c {output.unmapped_aligner}
+        samtools fasta {output.bam_filtered}
+        }} | pigz -c > {output.unmapped}
+        '''
+"""
+
+rule map_genome:
+    input:
+        reads='{output_dir}/unmapped/{sample_id}/clean.fa.gz',
+        index=genome_dir + '/genome_index/bowtie2/genome.1.bt2'
+    output:
+        unmapped='{output_dir}/unmapped/{sample_id}/genome.fa.gz',
+        bam='{output_dir}/gbam/{sample_id}/genome.bam'
+    params:
+        index=genome_dir + '/genome_index/bowtie2/genome'
+    threads:
+        config['threads_mapping']
+    shell:
+        '''pigz -d -c {input.reads} \
+        | bowtie2 -f -p {threads} --sensitive --no-unal \
+            --un-gz {output.unmapped} -x {params.index} - -S - \
+        | samtools view -b -o {output.bam}
+        '''
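
As a quick usage sketch (illustration only): assuming a Snakemake config file that defines the keys read at the top of this file (data_dir, output_dir, rna_types, genome_dir, threads_mapping, ...) plus the sample_ids and helpers provided by common.snakemake, the workflow can be invoked directly with Snakemake; the config file name below is an assumed example, and in practice these snakefiles are typically driven by the package's own command-line wrapper:

    snakemake --snakefile exseek/snakefiles/mapping_small.snakemake \
        --configfile config.yaml --cores 8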