# Paired-end long-RNA mapping workflow: reads are mapped sequentially with
# STAR against spike-in, UniVec, rRNA, genome, and circRNA references; the
# unmapped reads of each step feed the next.
include: 'common.snakemake'

# Steps mapped directly by STAR (each produces a Log.final.out).
star_map_steps = ['spikein', 'univec', 'rRNA', 'genome', 'circRNA']
# All steps that produce a per-sample BAM; duplicate-removed variants are
# appended only when deduplication is enabled in the config.
map_steps = list(star_map_steps)
if config['remove_duplicates_long']:
    map_steps.extend(['genome_rmdup', 'circRNA_rmdup'])

def get_all_inputs(wildcards):
    '''Return every final target of this workflow, for rule all.

    Targets: one BAM per sample per mapping step, per-sample read-length
    and insert-size summary tables, and the STAR mapping HTML report.

    The previous implementation built a dict of target lists and then
    filtered it against ``list(available_inputs.keys())`` — a list that by
    construction contained every key, so the filter was dead code.  The
    lists are now concatenated directly, in the same order.
    '''
    inputs = []
    # Per-sample BAM for every mapping step (including *_rmdup when enabled)
    inputs += expand('{output_dir}/bam/{sample_id}/{map_step}.bam',
        output_dir=output_dir, sample_id=sample_ids, map_step=map_steps)
    # Per-sample read-length histogram table
    inputs += expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
        output_dir=output_dir, sample_id=sample_ids)
    # Per-sample insert-size histogram table
    inputs += expand('{output_dir}/stats/mapped_insert_size_by_sample/{sample_id}',
        output_dir=output_dir, sample_id=sample_ids)
    # HTML summary rendered from the STAR logs
    inputs += expand('{output_dir}/summary/mapping_star.html',
        output_dir=output_dir)
    return inputs

# Default target: request everything enumerated by get_all_inputs.
rule all:
    input:
        get_all_inputs

rule rename_fastq_pe:
    '''Rewrite FASTQ read names to zero-padded serial numbers and gzip.

    Renaming keeps mate 1 and mate 2 record names in sync (same serial for
    the same record index) and normalizes the '+' separator line.
    '''
    input:
        auto_gzip_input('{output_dir}/cutadapt/{sample_id}_{mate_index}.fastq')
    output:
        '{output_dir}/unmapped/{sample_id}/clean_{mate_index}.fastq.gz'
    threads: 1
    wildcard_constraints:
        mate_index='[12]'
    shell:
        r'''{bin_dir}/auto_uncompress {input} \
            | awk 'NR%4==1{{printf "@%012d\n", int(NR/4);next}} NR%4==3{{printf "+\n";next}} {{print}}' \
            | pigz -c -p {threads} > {output}
        '''

# Shared STAR paired-end command used by every map_*_pe rule below via
# ``shell(map_command_pe)``.  Each rule must define: input.reads1/reads2,
# output.bam/unmapped1/unmapped2, params.index, params.output_prefix and
# params.seedPerWindowNmax.  Unmapped mates are re-compressed and the STAR
# plain-text leftovers removed.
map_command_pe = '''STAR --genomeDir {params.index} \
            --readFilesIn {input.reads1} {input.reads2} \
            --runThreadN {threads} \
            --outFileNamePrefix {params.output_prefix} \
            --outSAMtype BAM Unsorted \
            --outReadsUnmapped Fastx \
            --readFilesCommand gzip -d -c \
            --outSAMmultNmax 1 \
            --seedPerWindowNmax {params.seedPerWindowNmax}
        mv {params.output_prefix}Aligned.out.bam {output.bam}
        gzip -c {params.output_prefix}Unmapped.out.mate1 > {output.unmapped1}
        gzip -c {params.output_prefix}Unmapped.out.mate2 > {output.unmapped2}
        rm -f {params.output_prefix}Unmapped.out.mate1 {params.output_prefix}Unmapped.out.mate2
        '''

rule map_spikein_pe:
    '''Map cleaned reads to the spike-in reference with STAR (first step).'''
    input:
        reads1='{output_dir}/unmapped/{sample_id}/clean_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/clean_2.fastq.gz',
        # Depend on the SA file as a proxy for the whole STAR index
        index=config.get('spikein_index', genome_dir + '/index/star/spikein_long') + '/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/spikein.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/spikein_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/spikein_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/spikein/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/spikein/',
        # Fix: default previously ended with a trailing slash
        # ('.../spikein_long/'), inconsistent with input.index and with the
        # params.index convention of the other map_*_pe rules.  Same
        # directory either way for STAR's --genomeDir.
        index=config.get('spikein_index', genome_dir + '/index/star/spikein_long'),
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule map_univec_pe:
    '''Map reads unmapped to the spike-ins against UniVec contaminants.'''
    input:
        reads1='{output_dir}/unmapped/{sample_id}/spikein_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/spikein_2.fastq.gz',
        # SA file stands in for the whole STAR index directory
        index=genome_dir + '/index/star/univec/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/univec.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/univec_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/univec_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/univec/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/univec/',
        index=genome_dir + '/index/star/univec',
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule map_rRNA_pe:
    '''Map UniVec-unmapped reads against the rRNA reference.'''
    input:
        reads1='{output_dir}/unmapped/{sample_id}/univec_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/univec_2.fastq.gz',
        # SA file stands in for the whole STAR index directory
        index=genome_dir + '/index/star/rRNA/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/rRNA.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/rRNA_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/rRNA_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/rRNA/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/rRNA/',
        index=genome_dir + '/index/star/rRNA',
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule map_genome_pe:
    '''Map rRNA-unmapped reads against the long-RNA genome index.'''
    input:
        reads1='{output_dir}/unmapped/{sample_id}/rRNA_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/rRNA_2.fastq.gz',
        # SA file stands in for the whole STAR index directory
        index=genome_dir + '/index/star/genome_long_rna/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/genome.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/genome_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/genome_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/genome/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/genome/',
        index=genome_dir + '/index/star/genome_long_rna',
        # Higher seed cap than the small references (20) for the full genome
        seedPerWindowNmax=50
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule map_circRNA_pe:
    '''Map genome-unmapped reads against the circRNA junction reference.'''
    input:
        reads1='{output_dir}/unmapped/{sample_id}/genome_1.fastq.gz',
        reads2='{output_dir}/unmapped/{sample_id}/genome_2.fastq.gz',
        # SA file stands in for the whole STAR index directory
        index=genome_dir + '/index/star/circRNA/SA'
    output:
        bam='{output_dir}/bam/{sample_id}/circRNA.bam',
        unmapped1='{output_dir}/unmapped/{sample_id}/circRNA_1.fastq.gz',
        unmapped2='{output_dir}/unmapped/{sample_id}/circRNA_2.fastq.gz',
        log='{output_dir}/mapping_star/{sample_id}/circRNA/Log.final.out'
    params:
        output_prefix='{output_dir}/mapping_star/{sample_id}/circRNA/',
        index=genome_dir + '/index/star/circRNA',
        seedPerWindowNmax=20
    threads:
        config['threads_mapping']
    run:
        shell(map_command_pe)

rule sort_bam_by_name:
    '''Sort a mapping-step BAM by read name (samtools sort -n).'''
    input:
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/bam_sorted_by_name/{sample_id}/{map_step}.bam'
    params:
        # Prefix for samtools' temporary files
        temp_dir=config['temp_dir']
    shell:
        '''samtools sort -n -T {params.temp_dir} -o {output} {input}
        '''

rule remove_duplicates:
    '''Remove PCR duplicates from a mapping-step BAM with Picard.

    READ_NAME_REGEX=null disables optical-duplicate detection (read names
    here are serial numbers from rename_fastq_pe, not tile coordinates).
    NOTE(review): the input BAM is STAR's unsorted output consumed with
    ASSUME_SORT_ORDER=queryname — presumes mate records are grouped by
    name; confirm, or route through sort_bam_by_name.
    '''
    input:
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        bam='{output_dir}/bam/{sample_id}/{map_step}_rmdup.bam',
        metrics='{output_dir}/log/{map_step}_rmdup/{sample_id}'
    wildcard_constraints:
        map_step='(rRNA)|(genome)|(circRNA)'
    shell:
        '''picard MarkDuplicates REMOVE_DUPLICATES=true \
            ASSUME_SORT_ORDER=queryname \
            I={input.bam} \
            O={output.bam} \
            M={output.metrics} \
            READ_NAME_REGEX=null
        '''

rule samtools_stats:
    '''Compute samtools stats for the mapped reads of one step.'''
    input:
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
    shell:
        '''samtools stats {input} > {output}
        '''

rule parse_samtools_stats_pe:
    '''Extract fragment counts and size/length histograms from samtools stats.

    SN lines yield the paired-fragment count (mapped-and-paired reads / 2)
    and the average insert size; IS and RL lines yield the insert-size and
    read-length histograms as two-column tables.
    '''
    input:
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
    output:
        fragment_counts='{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
        insert_size_average='{output_dir}/stats/insert_size_average/{sample_id}/{map_step}',
        insert_size_hist='{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
        read_length_hist='{output_dir}/stats/read_length_hist/{sample_id}/{map_step}'
    wildcard_constraints:
        # Do not apply to the 'clean' (pre-mapping) reads
        map_step='(?!clean).*'
    shell:
        '''awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "reads mapped and paired:") print int($3/2)}}' {input} > {output.fragment_counts}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "insert size average:") print $3}}' {input} > {output.insert_size_average}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^IS/{{print $2,$3}}' {input} > {output.insert_size_hist}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^RL/{{print $2,$3}}' {input} > {output.read_length_hist}
        '''

rule summarize_read_length:
    '''Merge per-step read-length histograms into one table per sample.'''
    input:
        lambda wildcards: expand('{output_dir}/stats/read_length_hist/{sample_id}/{map_step}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, map_step=map_steps)
    output:
        '{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
    run:
        import pandas as pd

        # One count series per mapping step, indexed by read length
        histograms = {}
        for path in input:
            step = path.split('/')[-1]
            histograms[step] = pd.read_csv(path, sep='\t', index_col=0,
                header=None, names=['read_length', step]).iloc[:, 0]
        table = pd.DataFrame(histograms)
        # Fix column order, fill lengths absent from some steps with 0
        table = table.loc[:, map_steps]
        table.fillna(0, inplace=True)
        table = table.astype('int')
        table.index.name = 'read_length'
        table.to_csv(output[0], sep='\t', header=True, index=True)

rule summarize_insert_size:
    '''Merge per-step insert-size histograms into one table per sample.'''
    input:
        lambda wildcards: expand('{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, map_step=map_steps)
    output:
        '{output_dir}/stats/mapped_insert_size_by_sample/{sample_id}'
    run:
        import pandas as pd

        # One count series per mapping step, indexed by insert size
        histograms = {}
        for path in input:
            step = path.split('/')[-1]
            histograms[step] = pd.read_csv(path, sep='\t', index_col=0,
                header=None, names=['insert_size', step]).iloc[:, 0]
        table = pd.DataFrame(histograms)
        # Fix column order, fill sizes absent from some steps with 0
        table = table.loc[:, map_steps]
        table.fillna(0, inplace=True)
        table = table.astype('int')
        table.index.name = 'insert_size'
        table.to_csv(output[0], sep='\t', header=True, index=True)

rule summarize_mapping_star:
    '''Aggregate every STAR Log.final.out into one tab-separated table.

    Each "key | value" line of a log becomes a column; column order is
    taken from the first log encountered (plus sample_id and map_step).
    '''
    input:
        lambda wildcards: expand('{output_dir}/mapping_star/{sample_id}/{map_step}/Log.final.out',
            output_dir=wildcards.output_dir, sample_id=sample_ids, map_step=star_map_steps)
    output:
        '{output_dir}/summary/mapping_star.txt'
    run:
        import pandas as pd

        records = []
        columns = ['sample_id', 'map_step']
        for file_index, path in enumerate(input):
            # Path layout: .../mapping_star/<sample_id>/<map_step>/Log.final.out
            parts = path.split('/')
            record = {'sample_id': parts[-3], 'map_step': parts[-2]}
            with open(path, 'r') as log_file:
                for line in log_file:
                    fields = line.strip().split('|')
                    if len(fields) != 2:
                        continue
                    name = fields[0].strip()
                    record[name] = fields[1].strip()
                    # Column order follows the first log file only
                    if file_index == 0:
                        columns.append(name)
            records.append(record)
        summary = pd.DataFrame.from_records(records)
        summary = summary.reindex(columns=columns)
        summary.set_index('sample_id', inplace=True)
        summary.index.name = 'sample_id'
        summary.to_csv(output[0], sep='\t', header=True, na_rep='NA', index=True)

rule summarize_fragment_counts_jupyter:
    '''Render the STAR mapping summary notebook template to HTML.'''
    input:
        summary='{output_dir}/summary/mapping_star.txt',
        jupyter=package_dir + '/templates/summarize_mapping_star.ipynb'
    output:
        jupyter='{output_dir}/summary/mapping_star.ipynb',
        html='{output_dir}/summary/mapping_star.html'
    run:
        # nbconvert_command is defined in common.snakemake
        shell(nbconvert_command)