a b/exseek/snakefiles/bigwig_long.snakemake
1
include: 'common.snakemake'
2
3
import re
4
5
if config['remove_duplicates_long']:
6
    map_steps = ['genome_rmdup', 'circRNA_rmdup', 'rRNA_rmdup']
7
else:
8
    map_steps = ['genome', 'circRNA', 'rRNA']
9
10
def get_all_inputs(wildcards):
11
    available_inputs = dict()
12
    for map_step in map_steps:
13
        if map_step.startswith('genome'):
14
            strands = ['+', '-']
15
        else:
16
            strands = '+'
17
        available_inputs[map_step] = expand('{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig',
18
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands)
19
        available_inputs[map_step + '_normalized'] = expand('{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig',
20
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands)
21
        available_inputs[map_step + '_bam'] = expand('{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam',
22
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step)
23
        available_inputs[map_step + '_bai'] = expand('{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam.bai',
24
            output_dir=output_dir, sample_id=sample_ids, map_step=map_step)
25
        if map_step.startswith('circRNA'):
26
            available_inputs[map_step + '_bigwig_pseudo_genome'] = expand('{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bigWig',
27
                output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands)
28
            available_inputs[map_step + '_bam_pseudo_genome'] = expand('{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam',
29
                output_dir=output_dir, sample_id=sample_ids, map_step=map_step)
30
    enabled_inputs = list(available_inputs.keys())
31
    inputs = []
32
    for key, l in available_inputs.items():
33
        if key in enabled_inputs:
34
            inputs += l
35
    return inputs
36
37
rule all:
38
    input:
39
        get_all_inputs
40
41
rule sort_bam_by_coord:
42
    input:
43
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
44
    output:
45
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam'
46
    params:
47
        temp_dir=config['temp_dir']
48
    shell:
49
        '''samtools sort -T {params.temp_dir} -o {output} {input}
50
        '''
51
52
rule index_bam_by_coord:
53
    input:
54
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam'
55
    output:
56
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam.bai'
57
    shell:
58
        '''samtools index {input}
59
        '''
60
61
rule genomecov:
62
    input:
63
        '{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam'
64
    output:
65
        '{output_dir}/bedgraph/{sample_id}.{map_step}.{strand}.bedGraph'
66
    params:
67
        #pc='-pc' if config['paired_end'] else '',
68
        pc='',
69
        temp_dir=config['temp_dir']
70
    wildcard_constraints:
71
        strand='[+-]'
72
    shell:
73
        '''bedtools genomecov -strand {wildcards.strand} -bg {params.pc} -split -ibam {input} \
74
            | LC_COLLATE=C sort -T {params.temp_dir} -k1,1 -k2,2n > {output}
75
        '''
76
77
rule bedgraph_to_bigwig:
78
    input:
79
        bedgraph='{output_dir}/bedgraph/{sample_id}.{map_step}.{strand}.bedGraph',
80
        chrom_sizes=lambda wildcards: genome_dir + '/chrom_sizes/' + re.sub('_rmdup$', '', wildcards.map_step)
81
    output:
82
        '{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig'
83
    shell:
84
        '''bedGraphToBigWig {input.bedgraph} {input.chrom_sizes} {output}
85
        '''
86
87
rule normalize_bigwig:
88
    input:
89
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
90
        bigwig='{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig',
91
        chrom_sizes=lambda wildcards: genome_dir + '/chrom_sizes/' + re.sub('_rmdup$', '', wildcards.map_step)
92
    output:
93
        bigwig='{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig',
94
        bedgraph=temp('{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bedGraph')
95
    wildcard_constraints:
96
        strand='[+-]'
97
    shell:
98
        r'''read_depth=`bamtools count -in {input.bam}`
99
        bigWigToBedGraph {input.bigwig} stdout \
100
            | awk -v d="$read_depth" 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,1000000.0*2*$4/d}}' > {output.bedgraph}
101
        bedGraphToBigWig {output.bedgraph} {input.chrom_sizes} {output.bigwig}
102
        '''
103
104
rule map_bigwig_to_pseudo_genome:
105
    input:
106
        bigwig='{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig',
107
        annotation=lambda wildcards: expand(genome_dir + '/bed/pseudo_genome.{gene_type}.bed', 
108
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)),
109
        chrom_sizes=lambda wildcards: expand(genome_dir + '/chrom_sizes/pseudo_genome.{gene_type}',
110
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step))
111
    output:
112
        bigwig='{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bigWig',
113
        bedgraph=temp('{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bedGraph')
114
    wildcard_constraints:
115
        map_step='(circRNA)|(circRNA_rmdup)'
116
    shell:
117
        r'''bigWigToBedGraph {input.bigwig} stdout \
118
            | awk 'BEGIN{{OFS="\t";FS="\t"}} FNR==NR{{chrom[$4]=$1;start[$4]=$2;next}} {{if($1 in chrom) print chrom[$1],start[$1]+$2,start[$1]+$3,$4}}' \
119
                {input.annotation} - > {output.bedgraph}
120
            bedGraphToBigWig {output.bedgraph} {input.chrom_sizes} {output.bigwig}
121
        '''
122
123
rule map_bam_to_pseudo_genome:
124
    input:
125
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
126
        bed=lambda wildcards: expand(genome_dir + '/bed/pseudo_genome.{gene_type}.bed', 
127
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)),
128
        chrom_sizes=lambda wildcards: expand(genome_dir + '/chrom_sizes/pseudo_genome.{gene_type}',
129
            gene_type=re.sub(r'_rmdup$', '', wildcards.map_step))
130
    output:
131
        bam='{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam',
132
        unsorted_bam=temp('{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.unsorted.bam'),
133
        bai='{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam.bai'
134
    wildcard_constraints:
135
        map_step='(circRNA)|(circRNA_rmdup)'
136
    shell:
137
        '''{bin_dir}/preprocess.py map_bam_to_pseudo_genome -i {input.bam} \
138
            --bed {input.bed} --chrom-sizes {input.chrom_sizes} \
139
            -o {output.unsorted_bam}
140
        samtools sort {output.unsorted_bam} > {output.bam}
141
        samtools index {output.bam}
142
        '''