|
a |
|
b/exseek/snakefiles/build_index.snakemake |
|
|
1 |
include: 'common.snakemake' |
|
|
2 |
|
|
|
3 |
genome_dir = config['genome_dir'] |
|
|
4 |
rna_types_long = ['genome_long_rna', 'spikein_long', 'rRNA', 'circRNA', 'univec'] |
|
|
5 |
|
|
|
6 |
def get_all_inputs(wildcards): |
|
|
7 |
available_inputs = dict( |
|
|
8 |
fasta_index=expand('{genome_dir}/fasta/genome.fa.fai', genome_dir=genome_dir) |
|
|
9 |
) |
|
|
10 |
if config['small_rna']: |
|
|
11 |
available_inputs.update(dict( |
|
|
12 |
bowtie2_genome_index=expand('{genome_dir}/genome_index/bowtie2/genome.1.bt2', genome_dir=genome_dir), |
|
|
13 |
chrom_sizes=expand('{genome_dir}/chrom_sizes/transcriptome', genome_dir=genome_dir), |
|
|
14 |
chrom_sizes2=expand('{genome_dir}/chrom_sizes/transcriptome_genome', genome_dir=genome_dir), |
|
|
15 |
gtf_to_transcript_table=expand('{genome_dir}/transcript_table/{rna_type}.txt', |
|
|
16 |
genome_dir=genome_dir, rna_type=config['rna_types'] + ['all']), |
|
|
17 |
bowtie2_index=expand('{genome_dir}/index/bowtie2/{rna_type}.1.bt2', |
|
|
18 |
genome_dir=genome_dir, rna_type=config['rna_types']) |
|
|
19 |
)) |
|
|
20 |
else: |
|
|
21 |
available_inputs.update(dict( |
|
|
22 |
star_index=expand('{genome_dir}/index/star/{rna_type}/SA', |
|
|
23 |
genome_dir=genome_dir, rna_type=['genome_long_rna', 'spikein_long', 'rRNA', 'circRNA', 'univec']), |
|
|
24 |
transcript_table=expand('{genome_dir}/transcript_table/{rna_type}.txt', |
|
|
25 |
genome_dir=genome_dir, rna_type=['genome_long_rna', 'long']) |
|
|
26 |
)) |
|
|
27 |
enabled_inputs = list(available_inputs.keys()) |
|
|
28 |
inputs = [] |
|
|
29 |
for key, l in available_inputs.items(): |
|
|
30 |
if key in enabled_inputs: |
|
|
31 |
inputs += l |
|
|
32 |
return inputs |
|
|
33 |
|
|
|
34 |
rule all: |
|
|
35 |
input: |
|
|
36 |
get_all_inputs |
|
|
37 |
|
|
|
38 |
rule build_bowtie2_index: |
|
|
39 |
input: |
|
|
40 |
|
|
|
41 |
rule fasta_index: |
|
|
42 |
input: |
|
|
43 |
'{genome_dir}/fasta/{rna_type}.fa' |
|
|
44 |
output: |
|
|
45 |
'{genome_dir}/fasta/{rna_type}.fa.fai' |
|
|
46 |
shell: |
|
|
47 |
'''samtools faidx {input} |
|
|
48 |
''' |
|
|
49 |
|
|
|
50 |
rule transcript_index_bowtie2: |
|
|
51 |
input: |
|
|
52 |
'{genome_dir}/fasta/{rna_type}.fa' |
|
|
53 |
output: |
|
|
54 |
bt2_1='{genome_dir}/index/bowtie2/{rna_type}.1.bt2', |
|
|
55 |
bt2rev_1='{genome_dir}/index/bowtie2/{rna_type}.rev.1.bt2' |
|
|
56 |
params: |
|
|
57 |
output_prefix='{genome_dir}/index/bowtie2/{rna_type}' |
|
|
58 |
threads: |
|
|
59 |
config['threads_mapping'] |
|
|
60 |
shell: |
|
|
61 |
'''bowtie2-build --threads {threads} {input} {params.output_prefix} |
|
|
62 |
''' |
|
|
63 |
|
|
|
64 |
rule transcript_index_star: |
|
|
65 |
input: |
|
|
66 |
fasta='{genome_dir}/fasta/{rna_type}.fa', |
|
|
67 |
fai='{genome_dir}/fasta/{rna_type}.fa.fai' |
|
|
68 |
output: |
|
|
69 |
Genome='{genome_dir}/index/star/{rna_type}/Genome', |
|
|
70 |
SA='{genome_dir}/index/star/{rna_type}/SA' |
|
|
71 |
params: |
|
|
72 |
output_prefix='{genome_dir}/index/star/{rna_type}', |
|
|
73 |
limitGenomeGenerateRAM=config['star_genome_generate']['limitGenomeGenerateRAM'] |
|
|
74 |
wildcard_constraints: |
|
|
75 |
rna_type='(?!genome_long_rna).*' |
|
|
76 |
threads: |
|
|
77 |
config['threads_mapping'] |
|
|
78 |
run: |
|
|
79 |
from math import log2 |
|
|
80 |
# calculate star parameters |
|
|
81 |
genome_length = 0 |
|
|
82 |
n_seqs = 0 |
|
|
83 |
with open(input.fai, 'r') as f: |
|
|
84 |
for line in f: |
|
|
85 |
genome_length += int(line.split('\t')[1]) |
|
|
86 |
n_seqs += 1 |
|
|
87 |
genomeSAindexNbases = min(14, int(log2(genome_length)//2) - 1) |
|
|
88 |
genomeChrBinNbits = min(18, int(log2(genome_length/n_seqs))) |
|
|
89 |
if params.rna_type == 'rRNA': |
|
|
90 |
genomeSAindexNbases = 5 |
|
|
91 |
|
|
|
92 |
shell('''STAR --runThreadN {threads} \ |
|
|
93 |
--runMode genomeGenerate \ |
|
|
94 |
--genomeSAindexNbases {genomeSAindexNbases} \ |
|
|
95 |
--genomeDir {params.output_prefix} \ |
|
|
96 |
--genomeFastaFiles {input.fasta} \ |
|
|
97 |
--genomeChrBinNbits {genomeChrBinNbits} \ |
|
|
98 |
--limitGenomeGenerateRAM {params.limitGenomeGenerateRAM} |
|
|
99 |
''') |
|
|
100 |
|
|
|
101 |
rule genome_index_bowtie2: |
|
|
102 |
input: |
|
|
103 |
'{genome_dir}/fasta/genome.fa' |
|
|
104 |
output: |
|
|
105 |
bt2_1='{genome_dir}/genome_index/bowtie2/genome.1.bt2', |
|
|
106 |
bt2rev_1='{genome_dir}/genome_index/bowtie2/genome.rev.1.bt2' |
|
|
107 |
params: |
|
|
108 |
output_prefix='{genome_dir}/genome_index/bowtie2/genome' |
|
|
109 |
threads: |
|
|
110 |
config['threads_mapping'] |
|
|
111 |
shell: |
|
|
112 |
'''bowtie2-build --threads {threads} {input} {params.output_prefix} |
|
|
113 |
''' |
|
|
114 |
|
|
|
115 |
rule genome_index_star: |
|
|
116 |
input: |
|
|
117 |
'{genome_dir}/fasta/genome.fa' |
|
|
118 |
output: |
|
|
119 |
SA='{genome_dir}/genome_index/star/SA', |
|
|
120 |
Genome='{genome_dir}/genome_index/star/Genome' |
|
|
121 |
params: |
|
|
122 |
output_prefix='{genome_dir}/genome_index/star/', |
|
|
123 |
limitGenomeGenerateRAM=config['star_genome_generate']['limitGenomeGenerateRAM'] |
|
|
124 |
threads: |
|
|
125 |
config['threads_mapping'] |
|
|
126 |
shell: |
|
|
127 |
'''STAR --runMode genomeGenerate --runThreadN {threads} \ |
|
|
128 |
--genomeDir {params.output_prefix} --genomeFastaFiles {input} \ |
|
|
129 |
--limitGenomeGenerateRAM {params.limitGenomeGenerateRAM} |
|
|
130 |
''' |
|
|
131 |
|
|
|
132 |
rule star_index_genome_long_rna: |
|
|
133 |
input: |
|
|
134 |
fasta='{genome_dir}/fasta/genome.fa', |
|
|
135 |
gtf='{genome_dir}/gtf/long_RNA.gtf' |
|
|
136 |
output: |
|
|
137 |
SA='{genome_dir}/index/star/genome_long_rna/SA', |
|
|
138 |
Genome='{genome_dir}/index/star/Genome' |
|
|
139 |
params: |
|
|
140 |
output_prefix='{genome_dir}/index/star/genome_long_rna', |
|
|
141 |
sjdbOverhang=config['star_genome_generate']['sjdbOverhang'], |
|
|
142 |
limitGenomeGenerateRAM=config['star_genome_generate']['limitGenomeGenerateRAM'] |
|
|
143 |
threads: |
|
|
144 |
config['threads_mapping'] |
|
|
145 |
shell: |
|
|
146 |
'''STAR --runMode genomeGenerate --runThreadN {threads} \ |
|
|
147 |
--genomeDir {params.output_prefix} --genomeFastaFiles {input.fasta} \ |
|
|
148 |
--sjdbGTFfile {input.gtf} --sjdbOverhang {params.sjdbOverhang} \ |
|
|
149 |
--outFileNamePrefix {params.output_prefix} \ |
|
|
150 |
--limitGenomeGenerateRAM {params.limitGenomeGenerateRAM} |
|
|
151 |
''' |
|
|
152 |
|
|
|
153 |
rule chrom_sizes: |
|
|
154 |
"""Get transcript sizes of RNAs without gtf files from FASTA index |
|
|
155 |
""" |
|
|
156 |
input: |
|
|
157 |
'{genome_dir}/fasta/{rna_type}.fa.fai' |
|
|
158 |
output: |
|
|
159 |
'{genome_dir}/chrom_sizes/{rna_type}' |
|
|
160 |
wildcard_constraints: |
|
|
161 |
rna_type='(?!transcriptome).*' |
|
|
162 |
shell: |
|
|
163 |
'''cut -f1,2 {input} > {output} |
|
|
164 |
''' |
|
|
165 |
|
|
|
166 |
rule merge_transcript_sizes: |
|
|
167 |
input: |
|
|
168 |
lambda wildcards: expand('{genome_dir}/chrom_sizes/{rna_type}', |
|
|
169 |
genome_dir=wildcards.genome_dir, rna_type=rna_types) |
|
|
170 |
output: |
|
|
171 |
'{genome_dir}/chrom_sizes/transcriptome' |
|
|
172 |
shell: |
|
|
173 |
'''cat {input} > {output} |
|
|
174 |
''' |
|
|
175 |
|
|
|
176 |
|
|
|
177 |
rule merge_transcriptome_genome_sizes: |
|
|
178 |
input: |
|
|
179 |
'{genome_dir}/chrom_sizes/transcriptome', |
|
|
180 |
'{genome_dir}/chrom_sizes/genome' |
|
|
181 |
output: |
|
|
182 |
'{genome_dir}/chrom_sizes/transcriptome_genome' |
|
|
183 |
shell: |
|
|
184 |
'''cat {input} > {output} |
|
|
185 |
''' |
|
|
186 |
|
|
|
187 |
|
|
|
188 |
rule gtf_longest_transcript: |
|
|
189 |
input: |
|
|
190 |
'{genome_dir}/gtf_longest_transcript/{rna_type}.gtf' |
|
|
191 |
output: |
|
|
192 |
'{genome_dir}/gtf_longest_transcript/{rna_type}.gtf' |
|
|
193 |
shell: |
|
|
194 |
'''{bin_dir}/preprocess.py extract_longest_transcript -i {input} -o {output} |
|
|
195 |
''' |
|
|
196 |
|
|
|
197 |
rule fasta_index_to_transcript_table: |
|
|
198 |
input: |
|
|
199 |
'{genome_dir}/fasta/{rna_type}.fa.fai' |
|
|
200 |
output: |
|
|
201 |
'{genome_dir}/transcript_table/{rna_type}.txt' |
|
|
202 |
#wildcard_constraints: |
|
|
203 |
# rna_type='(rRNA)|(spikein)|(univec)' |
|
|
204 |
shell: |
|
|
205 |
r'''{{ |
|
|
206 |
echo -e 'chrom\tstart\tend\tname\tscore\tstrand\tgene_id\ttranscript_id\tgene_name\ttranscript_name\tgene_type\ttranscript_type\tsource' |
|
|
207 |
awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,0,$2,$1,0,"+",$1,$1,$1,$1,"{wildcards.rna_type}","{wildcards.rna_type}","miRBase"}}' {input} |
|
|
208 |
}} > {output} |
|
|
209 |
''' |
|
|
210 |
|
|
|
211 |
|
|
|
212 |
rule gtf_to_transcript_table: |
|
|
213 |
input: |
|
|
214 |
'{genome_dir}/gtf/long_RNA.gtf' |
|
|
215 |
output: |
|
|
216 |
'{genome_dir}/transcript_table/{rna_type}.txt' |
|
|
217 |
wildcard_constraints: |
|
|
218 |
rna_type='genome_long_rna' |
|
|
219 |
shell: |
|
|
220 |
'''{bin_dir}/preprocess.py gtf_to_transcript_table --feature exon \ |
|
|
221 |
--gene-type {wildcards.rna_type} \ |
|
|
222 |
--transcript-type {wildcards.rna_type} \ |
|
|
223 |
-i {input} -o {output} |
|
|
224 |
''' |
|
|
225 |
|
|
|
226 |
|
|
|
227 |
rule merge_transcript_table: |
|
|
228 |
input: |
|
|
229 |
lambda wildcards: expand('{genome_dir}/transcript_table/{rna_type}.txt', |
|
|
230 |
genome_dir=wildcards.genome_dir, rna_type=rna_types_with_gtf) |
|
|
231 |
output: |
|
|
232 |
'{genome_dir}/transcript_table/all.txt' |
|
|
233 |
shell: |
|
|
234 |
'''{{ |
|
|
235 |
echo -e 'chrom\tstart\tend\tname\tscore\tstrand\tgene_id\ttranscript_id\tgene_name\ttranscript_name\tgene_type\ttranscript_type\tsource' |
|
|
236 |
sed '1 d' {input} |
|
|
237 |
}} > {output} |
|
|
238 |
''' |
|
|
239 |
|
|
|
240 |
rule merge_transcript_table_long: |
|
|
241 |
input: |
|
|
242 |
lambda wildcards: expand('{genome_dir}/transcript_table/{rna_type}.txt', |
|
|
243 |
genome_dir=wildcards.genome_dir, rna_type=rna_types_long) |
|
|
244 |
output: |
|
|
245 |
'{genome_dir}/transcript_table/long.txt' |
|
|
246 |
shell: |
|
|
247 |
'''{{ |
|
|
248 |
echo -e 'chrom\tstart\tend\tname\tscore\tstrand\tgene_id\ttranscript_id\tgene_name\ttranscript_name\tgene_type\ttranscript_type\tsource' |
|
|
249 |
sed '1 d' {input} |
|
|
250 |
}} > {output} |
|
|
251 |
''' |