|
a |
|
b/exseek/snakefiles/bigwig_long.snakemake |
|
|
1 |
include: 'common.snakemake' |
|
|
2 |
|
|
|
3 |
import re |
|
|
4 |
|
|
|
5 |
if config['remove_duplicates_long']: |
|
|
6 |
map_steps = ['genome_rmdup', 'circRNA_rmdup', 'rRNA_rmdup'] |
|
|
7 |
else: |
|
|
8 |
map_steps = ['genome', 'circRNA', 'rRNA'] |
|
|
9 |
|
|
|
10 |
def get_all_inputs(wildcards): |
|
|
11 |
available_inputs = dict() |
|
|
12 |
for map_step in map_steps: |
|
|
13 |
if map_step.startswith('genome'): |
|
|
14 |
strands = ['+', '-'] |
|
|
15 |
else: |
|
|
16 |
strands = '+' |
|
|
17 |
available_inputs[map_step] = expand('{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig', |
|
|
18 |
output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands) |
|
|
19 |
available_inputs[map_step + '_normalized'] = expand('{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig', |
|
|
20 |
output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands) |
|
|
21 |
available_inputs[map_step + '_bam'] = expand('{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam', |
|
|
22 |
output_dir=output_dir, sample_id=sample_ids, map_step=map_step) |
|
|
23 |
available_inputs[map_step + '_bai'] = expand('{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam.bai', |
|
|
24 |
output_dir=output_dir, sample_id=sample_ids, map_step=map_step) |
|
|
25 |
if map_step.startswith('circRNA'): |
|
|
26 |
available_inputs[map_step + '_bigwig_pseudo_genome'] = expand('{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bigWig', |
|
|
27 |
output_dir=output_dir, sample_id=sample_ids, map_step=map_step, strand=strands) |
|
|
28 |
available_inputs[map_step + '_bam_pseudo_genome'] = expand('{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam', |
|
|
29 |
output_dir=output_dir, sample_id=sample_ids, map_step=map_step) |
|
|
30 |
enabled_inputs = list(available_inputs.keys()) |
|
|
31 |
inputs = [] |
|
|
32 |
for key, l in available_inputs.items(): |
|
|
33 |
if key in enabled_inputs: |
|
|
34 |
inputs += l |
|
|
35 |
return inputs |
|
|
36 |
|
|
|
37 |
rule all: |
|
|
38 |
input: |
|
|
39 |
get_all_inputs |
|
|
40 |
|
|
|
41 |
rule sort_bam_by_coord: |
|
|
42 |
input: |
|
|
43 |
'{output_dir}/bam/{sample_id}/{map_step}.bam' |
|
|
44 |
output: |
|
|
45 |
'{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam' |
|
|
46 |
params: |
|
|
47 |
temp_dir=config['temp_dir'] |
|
|
48 |
shell: |
|
|
49 |
'''samtools sort -T {params.temp_dir} -o {output} {input} |
|
|
50 |
''' |
|
|
51 |
|
|
|
52 |
rule index_bam_by_coord: |
|
|
53 |
input: |
|
|
54 |
'{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam' |
|
|
55 |
output: |
|
|
56 |
'{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam.bai' |
|
|
57 |
shell: |
|
|
58 |
'''samtools index {input} |
|
|
59 |
''' |
|
|
60 |
|
|
|
61 |
rule genomecov: |
|
|
62 |
input: |
|
|
63 |
'{output_dir}/bam_sorted_by_coord/{sample_id}/{map_step}.bam' |
|
|
64 |
output: |
|
|
65 |
'{output_dir}/bedgraph/{sample_id}.{map_step}.{strand}.bedGraph' |
|
|
66 |
params: |
|
|
67 |
#pc='-pc' if config['paired_end'] else '', |
|
|
68 |
pc='', |
|
|
69 |
temp_dir=config['temp_dir'] |
|
|
70 |
wildcard_constraints: |
|
|
71 |
strand='[+-]' |
|
|
72 |
shell: |
|
|
73 |
'''bedtools genomecov -strand {wildcards.strand} -bg {params.pc} -split -ibam {input} \ |
|
|
74 |
| LC_COLLATE=C sort -T {params.temp_dir} -k1,1 -k2,2n > {output} |
|
|
75 |
''' |
|
|
76 |
|
|
|
77 |
rule bedgraph_to_bigwig: |
|
|
78 |
input: |
|
|
79 |
bedgraph='{output_dir}/bedgraph/{sample_id}.{map_step}.{strand}.bedGraph', |
|
|
80 |
chrom_sizes=lambda wildcards: genome_dir + '/chrom_sizes/' + re.sub('_rmdup$', '', wildcards.map_step) |
|
|
81 |
output: |
|
|
82 |
'{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig' |
|
|
83 |
shell: |
|
|
84 |
'''bedGraphToBigWig {input.bedgraph} {input.chrom_sizes} {output} |
|
|
85 |
''' |
|
|
86 |
|
|
|
87 |
rule normalize_bigwig: |
|
|
88 |
input: |
|
|
89 |
bam='{output_dir}/bam/{sample_id}/{map_step}.bam', |
|
|
90 |
bigwig='{output_dir}/bigwig/{sample_id}.{map_step}.{strand}.bigWig', |
|
|
91 |
chrom_sizes=lambda wildcards: genome_dir + '/chrom_sizes/' + re.sub('_rmdup$', '', wildcards.map_step) |
|
|
92 |
output: |
|
|
93 |
bigwig='{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig', |
|
|
94 |
bedgraph=temp('{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bedGraph') |
|
|
95 |
wildcard_constraints: |
|
|
96 |
strand='[+-]' |
|
|
97 |
shell: |
|
|
98 |
r'''read_depth=`bamtools count -in {input.bam}` |
|
|
99 |
bigWigToBedGraph {input.bigwig} stdout \ |
|
|
100 |
| awk -v d="$read_depth" 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,1000000.0*2*$4/d}}' > {output.bedgraph} |
|
|
101 |
bedGraphToBigWig {output.bedgraph} {input.chrom_sizes} {output.bigwig} |
|
|
102 |
''' |
|
|
103 |
|
|
|
104 |
rule map_bigwig_to_pseudo_genome: |
|
|
105 |
input: |
|
|
106 |
bigwig='{output_dir}/bigwig_normalized/{sample_id}.{map_step}.{strand}.bigWig', |
|
|
107 |
annotation=lambda wildcards: expand(genome_dir + '/bed/pseudo_genome.{gene_type}.bed', |
|
|
108 |
gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)), |
|
|
109 |
chrom_sizes=lambda wildcards: expand(genome_dir + '/chrom_sizes/pseudo_genome.{gene_type}', |
|
|
110 |
gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)) |
|
|
111 |
output: |
|
|
112 |
bigwig='{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bigWig', |
|
|
113 |
bedgraph=temp('{output_dir}/bigwig_pseudo_genome/{sample_id}.{map_step}.{strand}.bedGraph') |
|
|
114 |
wildcard_constraints: |
|
|
115 |
map_step='(circRNA)|(circRNA_rmdup)' |
|
|
116 |
shell: |
|
|
117 |
r'''bigWigToBedGraph {input.bigwig} stdout \ |
|
|
118 |
| awk 'BEGIN{{OFS="\t";FS="\t"}} FNR==NR{{chrom[$4]=$1;start[$4]=$2;next}} {{if($1 in chrom) print chrom[$1],start[$1]+$2,start[$1]+$3,$4}}' \ |
|
|
119 |
{input.annotation} - > {output.bedgraph} |
|
|
120 |
bedGraphToBigWig {output.bedgraph} {input.chrom_sizes} {output.bigwig} |
|
|
121 |
''' |
|
|
122 |
|
|
|
123 |
rule map_bam_to_pseudo_genome: |
|
|
124 |
input: |
|
|
125 |
bam='{output_dir}/bam/{sample_id}/{map_step}.bam', |
|
|
126 |
bed=lambda wildcards: expand(genome_dir + '/bed/pseudo_genome.{gene_type}.bed', |
|
|
127 |
gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)), |
|
|
128 |
chrom_sizes=lambda wildcards: expand(genome_dir + '/chrom_sizes/pseudo_genome.{gene_type}', |
|
|
129 |
gene_type=re.sub(r'_rmdup$', '', wildcards.map_step)) |
|
|
130 |
output: |
|
|
131 |
bam='{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam', |
|
|
132 |
unsorted_bam=temp('{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.unsorted.bam'), |
|
|
133 |
bai='{output_dir}/bam_pseudo_genome/{sample_id}/{map_step}.bam.bai' |
|
|
134 |
wildcard_constraints: |
|
|
135 |
map_step='(circRNA)|(circRNA_rmdup)' |
|
|
136 |
shell: |
|
|
137 |
'''{bin_dir}/preprocess.py map_bam_to_pseudo_genome -i {input.bam} \ |
|
|
138 |
--bed {input.bed} --chrom-sizes {input.chrom_sizes} \ |
|
|
139 |
-o {output.unsorted_bam} |
|
|
140 |
samtools sort {output.unsorted_bam} > {output.bam} |
|
|
141 |
samtools index {output.bam} |
|
|
142 |
''' |