a b/exseek/snakefiles/build_index.snakemake
1
include: 'common.snakemake'
2
3
genome_dir = config['genome_dir']
4
rna_types_long = ['genome_long_rna', 'spikein_long', 'rRNA', 'circRNA', 'univec']
5
6
def get_all_inputs(wildcards):
7
    available_inputs = dict(
8
        fasta_index=expand('{genome_dir}/fasta/genome.fa.fai', genome_dir=genome_dir)
9
    )
10
    if config['small_rna']:
11
        available_inputs.update(dict(
12
            bowtie2_genome_index=expand('{genome_dir}/genome_index/bowtie2/genome.1.bt2', genome_dir=genome_dir),
13
            chrom_sizes=expand('{genome_dir}/chrom_sizes/transcriptome', genome_dir=genome_dir),
14
            chrom_sizes2=expand('{genome_dir}/chrom_sizes/transcriptome_genome', genome_dir=genome_dir),
15
            gtf_to_transcript_table=expand('{genome_dir}/transcript_table/{rna_type}.txt',
16
                genome_dir=genome_dir, rna_type=config['rna_types'] + ['all']),
17
            bowtie2_index=expand('{genome_dir}/index/bowtie2/{rna_type}.1.bt2',
18
                genome_dir=genome_dir, rna_type=config['rna_types'])
19
        ))
20
    else:
21
        available_inputs.update(dict(
22
            star_index=expand('{genome_dir}/index/star/{rna_type}/SA',
23
                genome_dir=genome_dir, rna_type=['genome_long_rna', 'spikein_long', 'rRNA', 'circRNA', 'univec']),
24
            transcript_table=expand('{genome_dir}/transcript_table/{rna_type}.txt',
25
                genome_dir=genome_dir, rna_type=['genome_long_rna', 'long'])
26
        ))
27
    enabled_inputs = list(available_inputs.keys())
28
    inputs = []
29
    for key, l in available_inputs.items():
30
        if key in enabled_inputs:
31
            inputs += l
32
    return inputs
33
34
rule all:
35
    input:
36
        get_all_inputs
37
38
rule build_bowtie2_index:
39
    input:
40
41
rule fasta_index:
42
    input:
43
        '{genome_dir}/fasta/{rna_type}.fa'
44
    output:
45
        '{genome_dir}/fasta/{rna_type}.fa.fai'
46
    shell:
47
        '''samtools faidx {input}
48
        '''
49
50
rule transcript_index_bowtie2:
51
    input:
52
        '{genome_dir}/fasta/{rna_type}.fa'
53
    output:
54
        bt2_1='{genome_dir}/index/bowtie2/{rna_type}.1.bt2',
55
        bt2rev_1='{genome_dir}/index/bowtie2/{rna_type}.rev.1.bt2'
56
    params:
57
        output_prefix='{genome_dir}/index/bowtie2/{rna_type}'
58
    threads:
59
        config['threads_mapping']
60
    shell:
61
        '''bowtie2-build --threads {threads} {input} {params.output_prefix}
62
        '''
63
64
rule transcript_index_star:
65
    input:
66
        fasta='{genome_dir}/fasta/{rna_type}.fa',
67
        fai='{genome_dir}/fasta/{rna_type}.fa.fai'
68
    output:
69
        Genome='{genome_dir}/index/star/{rna_type}/Genome',
70
        SA='{genome_dir}/index/star/{rna_type}/SA'
71
    params:
72
        output_prefix='{genome_dir}/index/star/{rna_type}',
73
        limitGenomeGenerateRAM=config['star_genome_generate']['limitGenomeGenerateRAM']
74
    wildcard_constraints:
75
        rna_type='(?!genome_long_rna).*'
76
    threads:
77
        config['threads_mapping']
78
    run:
79
        from math import log2
80
        # calculate star parameters
81
        genome_length = 0
82
        n_seqs = 0
83
        with open(input.fai, 'r') as f:
84
            for line in f:
85
                genome_length += int(line.split('\t')[1])
86
                n_seqs += 1
87
        genomeSAindexNbases = min(14, int(log2(genome_length)//2) - 1)
88
        genomeChrBinNbits = min(18, int(log2(genome_length/n_seqs)))
89
        if params.rna_type == 'rRNA':
90
            genomeSAindexNbases = 5
91
92
        shell('''STAR --runThreadN {threads} \
93
            --runMode genomeGenerate \
94
            --genomeSAindexNbases {genomeSAindexNbases} \
95
            --genomeDir {params.output_prefix} \
96
            --genomeFastaFiles {input.fasta} \
97
            --genomeChrBinNbits {genomeChrBinNbits} \
98
            --limitGenomeGenerateRAM {params.limitGenomeGenerateRAM}
99
        ''')
100
101
rule genome_index_bowtie2:
102
    input:
103
        '{genome_dir}/fasta/genome.fa'
104
    output:
105
        bt2_1='{genome_dir}/genome_index/bowtie2/genome.1.bt2',
106
        bt2rev_1='{genome_dir}/genome_index/bowtie2/genome.rev.1.bt2'
107
    params:
108
        output_prefix='{genome_dir}/genome_index/bowtie2/genome'
109
    threads:
110
        config['threads_mapping']
111
    shell:
112
        '''bowtie2-build --threads {threads} {input} {params.output_prefix}
113
        '''
114
115
rule genome_index_star:
116
    input:
117
        '{genome_dir}/fasta/genome.fa'
118
    output:
119
        SA='{genome_dir}/genome_index/star/SA',
120
        Genome='{genome_dir}/genome_index/star/Genome'
121
    params:
122
        output_prefix='{genome_dir}/genome_index/star/',
123
        limitGenomeGenerateRAM=config['star_genome_generate']['limitGenomeGenerateRAM']
124
    threads:
125
        config['threads_mapping']
126
    shell:
127
        '''STAR --runMode genomeGenerate --runThreadN {threads} \
128
            --genomeDir {params.output_prefix} --genomeFastaFiles {input} \
129
            --limitGenomeGenerateRAM {params.limitGenomeGenerateRAM}
130
        '''
131
132
rule star_index_genome_long_rna:
133
    input:
134
        fasta='{genome_dir}/fasta/genome.fa',
135
        gtf='{genome_dir}/gtf/long_RNA.gtf'
136
    output:
137
        SA='{genome_dir}/index/star/genome_long_rna/SA',
138
        Genome='{genome_dir}/index/star/Genome'
139
    params:
140
        output_prefix='{genome_dir}/index/star/genome_long_rna',
141
        sjdbOverhang=config['star_genome_generate']['sjdbOverhang'],
142
        limitGenomeGenerateRAM=config['star_genome_generate']['limitGenomeGenerateRAM']
143
    threads:
144
        config['threads_mapping']
145
    shell:
146
        '''STAR --runMode genomeGenerate --runThreadN {threads} \
147
            --genomeDir {params.output_prefix} --genomeFastaFiles {input.fasta} \
148
            --sjdbGTFfile {input.gtf} --sjdbOverhang {params.sjdbOverhang} \
149
            --outFileNamePrefix {params.output_prefix} \
150
            --limitGenomeGenerateRAM {params.limitGenomeGenerateRAM}
151
        '''
152
153
rule chrom_sizes:
154
    """Get transcript sizes of RNAs without gtf files from FASTA index
155
    """
156
    input:
157
        '{genome_dir}/fasta/{rna_type}.fa.fai'
158
    output:
159
        '{genome_dir}/chrom_sizes/{rna_type}'
160
    wildcard_constraints:
161
        rna_type='(?!transcriptome).*'
162
    shell:
163
        '''cut -f1,2 {input} > {output}
164
        '''
165
166
rule merge_transcript_sizes:
167
    input:
168
        lambda wildcards: expand('{genome_dir}/chrom_sizes/{rna_type}',
169
            genome_dir=wildcards.genome_dir, rna_type=rna_types)
170
    output:
171
        '{genome_dir}/chrom_sizes/transcriptome'
172
    shell:
173
        '''cat {input} > {output}
174
        '''
175
176
177
rule merge_transcriptome_genome_sizes:
178
    input:
179
        '{genome_dir}/chrom_sizes/transcriptome',
180
        '{genome_dir}/chrom_sizes/genome'
181
    output:
182
        '{genome_dir}/chrom_sizes/transcriptome_genome'
183
    shell:
184
        '''cat {input} > {output}
185
        '''
186
187
188
rule gtf_longest_transcript:
189
    input:
190
        '{genome_dir}/gtf_longest_transcript/{rna_type}.gtf'
191
    output:
192
        '{genome_dir}/gtf_longest_transcript/{rna_type}.gtf'
193
    shell:
194
        '''{bin_dir}/preprocess.py extract_longest_transcript -i {input} -o {output}
195
        '''
196
197
rule fasta_index_to_transcript_table:
198
    input:
199
        '{genome_dir}/fasta/{rna_type}.fa.fai'
200
    output:
201
        '{genome_dir}/transcript_table/{rna_type}.txt'
202
    #wildcard_constraints:
203
    #    rna_type='(rRNA)|(spikein)|(univec)'
204
    shell:
205
        r'''{{
206
            echo -e 'chrom\tstart\tend\tname\tscore\tstrand\tgene_id\ttranscript_id\tgene_name\ttranscript_name\tgene_type\ttranscript_type\tsource'
207
            awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,0,$2,$1,0,"+",$1,$1,$1,$1,"{wildcards.rna_type}","{wildcards.rna_type}","miRBase"}}' {input}
208
          }} > {output}
209
        '''
210
211
212
rule gtf_to_transcript_table:
213
    input:
214
        '{genome_dir}/gtf/long_RNA.gtf'
215
    output:
216
        '{genome_dir}/transcript_table/{rna_type}.txt'
217
    wildcard_constraints:
218
        rna_type='genome_long_rna'
219
    shell:
220
        '''{bin_dir}/preprocess.py gtf_to_transcript_table --feature exon \
221
            --gene-type {wildcards.rna_type} \
222
            --transcript-type {wildcards.rna_type} \
223
            -i {input} -o {output}
224
        '''
225
226
227
rule merge_transcript_table:
228
    input:
229
        lambda wildcards: expand('{genome_dir}/transcript_table/{rna_type}.txt',
230
            genome_dir=wildcards.genome_dir, rna_type=rna_types_with_gtf)
231
    output:
232
        '{genome_dir}/transcript_table/all.txt'
233
    shell:
234
        '''{{
235
           echo -e  'chrom\tstart\tend\tname\tscore\tstrand\tgene_id\ttranscript_id\tgene_name\ttranscript_name\tgene_type\ttranscript_type\tsource'
236
           sed '1 d' {input}
237
        }} > {output}
238
        '''
239
240
rule merge_transcript_table_long:
241
    input:
242
        lambda wildcards: expand('{genome_dir}/transcript_table/{rna_type}.txt',
243
            genome_dir=wildcards.genome_dir, rna_type=rna_types_long)
244
    output:
245
        '{genome_dir}/transcript_table/long.txt'
246
    shell:
247
        '''{{
248
           echo -e  'chrom\tstart\tend\tname\tscore\tstrand\tgene_id\ttranscript_id\tgene_name\ttranscript_name\tgene_type\ttranscript_type\tsource'
249
           sed '1 d' {input}
250
        }} > {output}
251
        '''