[41c1e8]: / exseek / snakefiles / mapping_long_pe.snakemake

Download this file

290 lines (268 with data), 11.7 kB

include: 'common.snakemake'

star_map_steps = ['spikein', 'univec', 'rRNA', 'genome', 'circRNA']
map_steps = list(star_map_steps)
if config['remove_duplicates_long']:
    map_steps += ['genome_rmdup', 'circRNA_rmdup']

def get_all_inputs(wildcards):
    available_inputs = dict(
        map_rRNA_paired=expand('{output_dir}/bam/{sample_id}/{map_step}.bam',
            output_dir=output_dir, sample_id=sample_ids, map_step=map_steps),
        summarize_read_length=expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
            output_dir=output_dir, sample_id=sample_ids),
        summarize_insert_size=expand('{output_dir}/stats/mapped_insert_size_by_sample/{sample_id}',
            output_dir=output_dir, sample_id=sample_ids),
        summarize_mapping_star=expand('{output_dir}/summary/mapping_star.html',
            output_dir=output_dir)
    )
    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs


rule all:
    input:
        get_all_inputs

rule rename_fastq_pe:
    input:
        auto_gzip_input('{output_dir}/cutadapt/{sample_id}_{mate_index}.fastq')
    output:
        '{output_dir}/unmapped/{sample_id}/clean_{mate_index}.fastq.gz'
    threads: 
        1
    wildcard_constraints:
        mate_index='[12]'
    shell:
        r'''{bin_dir}/auto_uncompress {input} \
            | awk 'NR%4==1{{printf "@%012d\n", int(NR/4);next}} NR%4==3{{printf "+\n";next}} {{print}}' \
            | pigz -c -p {threads} > {output}
        '''

map_command_pe = '''STAR --genomeDir {params.index} \
            --readFilesIn {input.reads1} {input.reads2} \
            --runThreadN {threads} \
            --outFileNamePrefix {params.output_prefix} \
            --outSAMtype BAM Unsorted \
            --outReadsUnmapped Fastx \
            --readFilesCommand gzip -d -c \
            --outSAMmultNmax 1 \
            --seedPerWindowNmax {params.seedPerWindowNmax}
        mv {params.output_prefix}Aligned.out.bam {output.bam}
        gzip -c {params.output_prefix}Unmapped.out.mate1 > {output.unmapped1}
        gzip -c {params.output_prefix}Unmapped.out.mate2 > {output.unmapped2}
        rm -f {params.output_prefix}Unmapped.out.mate1 {params.output_prefix}Unmapped.out.mate2
        '''

rule map_spikein_pe:
    input:
        reads1='{output_dir}/unmapped/{sample_id}/clean_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/clean_2.fastq.gz',
        index=config.get('spikein_index', genome_dir + '/index/star/spikein_long') + '/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/spikein.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/spikein_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/spikein_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/spikein/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/spikein/',
        index=config.get('spikein_index', genome_dir + '/index/star/spikein_long/'),
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule map_univec_pe:
    input:
        reads1='{output_dir}/unmapped/{sample_id}/spikein_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/spikein_2.fastq.gz',
        index=genome_dir + '/index/star/univec/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/univec.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/univec_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/univec_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/univec/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/univec/',
        index=genome_dir + '/index/star/univec',
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule map_rRNA_pe:
    input:
        reads1='{output_dir}/unmapped/{sample_id}/univec_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/univec_2.fastq.gz',
        index=genome_dir + '/index/star/rRNA/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/rRNA.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/rRNA_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/rRNA_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/rRNA/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/rRNA/',
        index=genome_dir + '/index/star/rRNA',
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule map_genome_pe:
    input:
        reads1='{output_dir}/unmapped/{sample_id}/rRNA_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/rRNA_2.fastq.gz',
        index=genome_dir + '/index/star/genome_long_rna/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/genome.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/genome_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/genome_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/genome/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/genome/',
        index=genome_dir + '/index/star/genome_long_rna',
        seedPerWindowNmax=50
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)
    
rule map_circRNA_pe:
    input:
        reads1='{output_dir}/unmapped/{sample_id}/genome_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/genome_2.fastq.gz',
        index=genome_dir + '/index/star/circRNA/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/circRNA.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/circRNA_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/circRNA_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/circRNA/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/circRNA/',
        index=genome_dir + '/index/star/circRNA',
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule sort_bam_by_name:
    input:
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/bam_sorted_by_name/{sample_id}/{map_step}.bam'
    params:
        temp_dir=config['temp_dir']
    shell:
        '''samtools sort -n -T {params.temp_dir} -o {output} {input}
        '''

rule remove_duplicates:
    input:
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        bam='{output_dir}/bam/{sample_id}/{map_step}_rmdup.bam',
        metrics='{output_dir}/log/{map_step}_rmdup/{sample_id}'
    wildcard_constraints:
        map_step='(rRNA)|(genome)|(circRNA)'
    shell:
        '''picard MarkDuplicates REMOVE_DUPLICATES=true \
            ASSUME_SORT_ORDER=queryname \
            I={input.bam} \
            O={output.bam} \
            M={output.metrics} \
            READ_NAME_REGEX=null
        '''

rule samtools_stats:
    '''Statistics for mapped reads
    '''
    input:
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
    shell:
        '''samtools stats {input} > {output}
        '''

rule parse_samtools_stats_pe:
    input:
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
    output:
        fragment_counts='{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
        insert_size_average='{output_dir}/stats/insert_size_average/{sample_id}/{map_step}',
        insert_size_hist='{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
        read_length_hist='{output_dir}/stats/read_length_hist/{sample_id}/{map_step}'
    wildcard_constraints:
        map_step='(?!clean).*'
    shell:
        '''awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "reads mapped and paired:") print int($3/2)}}' {input} > {output.fragment_counts}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "insert size average:") print $3}}' {input} > {output.insert_size_average}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^IS/{{print $2,$3}}' {input} > {output.insert_size_hist}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^RL/{{print $2,$3}}' {input} > {output.read_length_hist}
        '''

rule summarize_read_length:
    input:
        lambda wildcards: expand('{output_dir}/stats/read_length_hist/{sample_id}/{map_step}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, map_step=map_steps)
    output:
        '{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
    run:
        import pandas as pd

        matrix = {}
        for filename in input:
            map_step = filename.split('/')[-1]
            matrix[map_step] = pd.read_table(filename, sep='\t', index_col=0, header=None, names=['read_length', map_step]).iloc[:, 0]
        matrix = pd.DataFrame(matrix)
        matrix = matrix.loc[:, map_steps]
        matrix.fillna(0, inplace=True)
        matrix = matrix.astype('int')
        matrix.index.name = 'read_length'
        matrix.to_csv(output[0], sep='\t', header=True, index=True)

rule summarize_insert_size:
    input:
        lambda wildcards: expand('{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, map_step=map_steps)
    output:
        '{output_dir}/stats/mapped_insert_size_by_sample/{sample_id}'
    run:
        import pandas as pd

        matrix = {}
        for filename in input:
            map_step = filename.split('/')[-1]
            matrix[map_step] = pd.read_table(filename, sep='\t', index_col=0, header=None, names=['insert_size', map_step]).iloc[:, 0]
        matrix = pd.DataFrame(matrix)
        matrix = matrix.loc[:, map_steps]
        matrix.fillna(0, inplace=True)
        matrix = matrix.astype('int')
        matrix.index.name = 'insert_size'
        matrix.to_csv(output[0], sep='\t', header=True, index=True)

rule summarize_mapping_star:
    input:
        lambda wildcards: expand('{output_dir}/mapping_star/{sample_id}/{map_step}/Log.final.out',
            output_dir=wildcards.output_dir, sample_id=sample_ids, map_step=star_map_steps)
    output:
        '{output_dir}/summary/mapping_star.txt'
    run:
        import pandas as pd

        records = []
        columns = ['sample_id', 'map_step']
        for i, filename in enumerate(input):
            map_step = filename.split('/')[-2]
            sample_id = filename.split('/')[-3]
            record = {'sample_id': sample_id, 'map_step': map_step}
            with open(filename, 'r') as fin:
                for line in fin:
                    c = line.strip().split('|')
                    if len(c) == 2:
                        key, val = c[0].strip(), c[1].strip()
                        record[key] = val
                        if i == 0:
                            columns.append(key)
            records.append(record)
        summary = pd.DataFrame.from_records(records)
        summary = summary.reindex(columns=columns)
        summary.set_index('sample_id', inplace=True)
        summary.index.name = 'sample_id'
        summary.to_csv(output[0], sep='\t', header=True, na_rep='NA', index=True)

rule summarize_fragment_counts_jupyter:
    input:
        summary='{output_dir}/summary/mapping_star.txt',
        jupyter=package_dir + '/templates/summarize_mapping_star.ipynb'
    output:
        jupyter='{output_dir}/summary/mapping_star.ipynb',
        html='{output_dir}/summary/mapping_star.html'
    run:
        shell(nbconvert_command)