exSEEK / Git / [41c1e8] /exseek/snakefiles/bigwig

Models:
DanielG/
exSEEK
Downloads: 1
[41c1e8]: / exseek / snakefiles / bigwig_long.snakemake
History
Download this file
142 lines (131 with data), 6.5 kB

include: 'common.snakemake'

import re

if config['remove_duplicates_long']:
    map_steps = ['genome_rmdup', 'circRNA_rmdup', 'rRNA_rmdup']
else:
    map_steps = ['genome', 'circRNA', 'rRNA']

def get_all_inputs(wildcards):
    available_inputs = dict()
    for map_step in map_steps:
        if map_step.startswith('genome'):
            strands = ['+', '-']
        else:
            strands = '+'
        available_inputs[map_step] = expand('{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig',
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands)
        available_inputs[map_step + '_normalized'] = expand('{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig',
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands)
        available_inputs[map_step + '_bam'] = expand('{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam',
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step)
        available_inputs[map_step + '_bai'] = expand('{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam.bai',
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step)
        if map_step.startswith('circRNA'):
            available_inputs[map_step + '_bigwig_pseudo_genome'] = expand('{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bigWig',
                output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands)
            available_inputs[map_step + '_bam_pseudo_genome'] = expand('{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam',
                output_dir=output_dir, sample_id=sample_ids, map_step=map_step)
    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs

rule all:
    input:
        get_all_inputs

rule sort_bam_by_coord:
    input:
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam'
    params:
        temp_dir=config['temp_dir']
    shell:
        '''samtools sort -T {params.temp_dir} -o {output} {input}
        '''

rule index_bam_by_coord:
    input:
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam.bai'
    shell:
        '''samtools index {input}
        '''

rule genomecov:
    input:
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/bedgraph/{sample_id}.{map_step}.{strand}.bedGraph'
    params:
        #pc='-pc' if config['paired_end'] else '',
        pc='',
        temp_dir=config['temp_dir']
    wildcard_constraints:
        strand='[+-]'
    shell:
        '''bedtools genomecov -strand {wildcards.strand} -bg {params.pc} -split -ibam {input} \
            | LC_COLLATE=C sort -T {params.temp_dir} -k1,1 -k2,2n > {output}
        '''

rule bedgraph_to_bigwig:
    input:
        bedgraph='{output_dir}/bedgraph/{sample_id}.{map_step}.{strand}.bedGraph',
        chrom_sizes=lambda wildcards: genome_dir + '/chrom_sizes/' + re.sub('_rmdup$', '', wildcards.map_step)
    output:
        '{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig'
    shell:
        '''bedGraphToBigWig {input.bedgraph} {input.chrom_sizes} {output}
        '''

rule normalize_bigwig:
    input:
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
        bigwig='{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig',
        chrom_sizes=lambda wildcards: genome_dir + '/chrom_sizes/' + re.sub('_rmdup$', '', wildcards.map_step)
    output:
        bigwig='{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig',
        bedgraph=temp('{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bedGraph')
    wildcard_constraints:
        strand='[+-]'
    shell:
        r'''read_depth=`bamtools count -in {input.bam}`
        bigWigToBedGraph {input.bigwig} stdout \
            | awk -v d="$read_depth" 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,1000000.0*2*$4/d}}' > {output.bedgraph}
        bedGraphToBigWig {output.bedgraph} {input.chrom_sizes} {output.bigwig}
        '''

rule map_bigwig_to_pseudo_genome:
    input:
        bigwig='{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig',
        annotation=lambda wildcards: expand(genome_dir + '/bed/pseudo_genome.{gene_type}.bed', 
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)),
        chrom_sizes=lambda wildcards: expand(genome_dir + '/chrom_sizes/pseudo_genome.{gene_type}',
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step))
    output:
        bigwig='{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bigWig',
        bedgraph=temp('{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bedGraph')
    wildcard_constraints:
        map_step='(circRNA)|(circRNA_rmdup)'
    shell:
        r'''bigWigToBedGraph {input.bigwig} stdout \
            | awk 'BEGIN{{OFS="\t";FS="\t"}} FNR==NR{{chrom[$4]=$1;start[$4]=$2;next}} {{if($1 in chrom) print chrom[$1],start[$1]+$2,start[$1]+$3,$4}}' \
                {input.annotation} - > {output.bedgraph}
            bedGraphToBigWig {output.bedgraph} {input.chrom_sizes} {output.bigwig}
        '''

rule map_bam_to_pseudo_genome:
    input:
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
        bed=lambda wildcards: expand(genome_dir + '/bed/pseudo_genome.{gene_type}.bed', 
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)),
        chrom_sizes=lambda wildcards: expand(genome_dir + '/chrom_sizes/pseudo_genome.{gene_type}',
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step))
    output:
        bam='{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam',
        unsorted_bam=temp('{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.unsorted.bam'),
        bai='{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam.bai'
    wildcard_constraints:
        map_step='(circRNA)|(circRNA_rmdup)'
    shell:
        '''{bin_dir}/preprocess.py map_bam_to_pseudo_genome -i {input.bam} \
            --bed {input.bed} --chrom-sizes {input.chrom_sizes} \
            -o {output.unsorted_bam}
        samtools sort {output.unsorted_bam} > {output.bam}
        samtools index {output.bam}
        '''