a b/exseek/snakefiles/mapping_long_se.snakemake
1
include: 'common.snakemake'
2
3
map_steps = ['rRNA', 'genome', 'circRNA']
4
if config['remove_duplicates_long']:
5
    map_steps += ['genome_rmdup', 'circRNA_rmdup', 'rRNA_rmdup']
6
7
def get_all_inputs(wildcards):
8
    available_inputs = dict(
9
        map_rRNA_paired=expand('{output_dir}/bam/{sample_id}/{map_step}.bam',
10
            output_dir=output_dir, sample_id=sample_ids, map_step=map_steps),
11
        summarize_read_length=expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
12
            output_dir=output_dir, sample_id=sample_ids),
13
        count_clean_reads=expand('{output_dir}/stats/read_counts/{sample_id}/clean',
14
            output_dir=output_dir, sample_id=sample_ids)
15
    )
16
    enabled_inputs = list(available_inputs.keys())
17
    inputs = []
18
    for key, l in available_inputs.items():
19
        if key in enabled_inputs:
20
            inputs += l
21
    return inputs
22
23
rule all:
24
    input:
25
        get_all_inputs
26
27
map_command_se = '''STAR --genomeDir {params.index} \
28
            --readFilesIn {input.reads1} {input.reads2} \
29
            --runThreadN {threads} \
30
            --outFileNamePrefix {params.output_prefix} \
31
            --outSAMtype BAM Unsorted \
32
            --outReadsUnmapped Fastx \
33
            --readFilesCommand gzip -d -c \
34
            --outSAMmultNmax 1 \
35
            --seedPerWindowNmax {params.seedPerWindowNmax}
36
        mv {params.output_prefix}Aligned.out.bam {output.bam}
37
        gzip -c {params.output_prefix}Unmapped.out > {output.unmapped}
38
        rm -f {params.output_prefix}Unmapped.out
39
        '''
40
    
41
rule map_rRNA_se:
42
    input:
43
        reads='{output_dir}/unmapped/{sample_id}/clean.fa.gz',
44
        index=genome_dir + '/index/star/rRNA/SA'
45
    output:
46
        bam='{output_dir}/bam/{sample_id}/rRNA.bam',
47
        unmapped='{output_dir}/unmapped/{sample_id}/rRNA.fa.gz',
48
        log='{output_dir}/mapping_star/{sample_id}/rRNA/Log.final.out'
49
    params:
50
        output_prefix='{output_dir}/mapping_star/{sample_id}/rRNA/',
51
        index=genome_dir + '/index/star/rRNA',
52
        seedPerWindowNmax=20
53
    threads:
54
        config['threads_mapping']
55
    run:
56
        shell(map_command_se)
57
58
rule map_genome_se:
59
    input:
60
        reads='{output_dir}/unmapped/{sample_id}/rRNA.fa.gz',
61
        index=genome_dir + '/long_index/star/SA'
62
    output:
63
        bam='{output_dir}/bam/{sample_id}/genome.bam',
64
        unmapped='{output_dir}/unmapped/{sample_id}/genome.fa.gz',
65
        log='{output_dir}/mapping_star/{sample_id}/genome/Log.final.out'
66
    params:
67
        output_prefix='{output_dir}/mapping_star/{sample_id}/genome/',
68
        index=genome_dir + '/long_index/star',
69
        seedPerWindowNmax=50
70
    threads:
71
        config['threads_mapping']
72
    run:
73
        shell(map_command_pe)
74
    
75
rule map_circRNA_se:
76
    input:
77
        reads='{output_dir}/unmapped/{sample_id}/genome.fa.gz'
78
        index=genome_dir + '/index/star/circRNA/SA'
79
    output:
80
        bam='{output_dir}/bam/{sample_id}/circRNA.bam',
81
        unmapped='{output_dir}/unmapped/{sample_id}/circRNA.fa.gz',
82
        log='{output_dir}/mapping_star/{sample_id}/circRNA/Log.final.out'
83
    params:
84
        output_prefix='{output_dir}/mapping_star/{sample_id}/circRNA/',
85
        index=genome_dir + '/index/star/circRNA',
86
        seedPerWindowNmax=20
87
    threads:
88
        config['threads_mapping']
89
    run:
90
        shell(map_command_pe)
91
92
rule sort_bam_by_name:
93
    input:
94
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
95
    output:
96
        '{output_dir}/bam_sorted_by_name/{sample_id}/{map_step}.bam'
97
    params:
98
        temp_dir=config['temp_dir']
99
    shell:
100
        '''samtools sort -n -T {params.temp_dir} -o {output} {input}
101
        '''
102
103
rule remove_duplicates:
104
    input:
105
        bam='{output_dir}/bam_sorted_by_name/{sample_id}/{map_step}.bam'
106
    output:
107
        bam='{output_dir}/bam/{sample_id}/{map_step}_rmdup.bam',
108
        metrics='{output_dir}/log/{map_step}_rmdup/{sample_id}'
109
    wildcard_constraints:
110
        map_step='(rRNA)|(genome)|(circRNA)'
111
    shell:
112
        '''picard MarkDuplicates REMOVE_DUPLICATES=true \
113
            ASSUME_SORT_ORDER=queryname \
114
            I={input.bam} \
115
            O={output.bam} \
116
            M={output.metrics}
117
        '''
118
119
rule samtools_stats:
120
    input:
121
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
122
    output:
123
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
124
    shell:
125
        '''samtools stats {input} > {output}
126
        '''
127
128
rule count_clean_reads:
129
    input:
130
        '{output_dir}/unmapped/{sample_id}/clean_1.fa.gz'
131
    output:
132
        '{output_dir}/stats/read_counts/{sample_id}/clean'
133
    threads:
134
        config['threads_compress']
135
    shell:
136
        '''pigz -p {threads} -d -c {input} | wc -l | awk '{{print int($0/2)}}' > {output}
137
        '''
138
139
rule parse_samtools_stats_se:
140
    input:
141
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
142
    output:
143
        fragment_counts='{output_dir}/stats/read_counts/{sample_id}/{map_step}',
144
        read_length_hist='{output_dir}/stats/read_length_hist/{sample_id}/{map_step}'
145
    wildcard_constraints:
146
        map_step='(?!clean).*'
147
    shell:
148
        '''
149
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "reads mapped:") print int($3/2)}}' {input} > {output.fragment_counts}
150
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^RL/{{print $2,$3}}' {input} > {output.read_length_hist}
151
        '''
152
153
rule summarize_read_length:
154
    input:
155
        lambda wildcards: expand('{output_dir}/stats/read_length_hist/{sample_id}/{map_step}',
156
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, map_step=map_steps)
157
    output:
158
        '{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
159
    run:
160
        import pandas as pd
161
162
        matrix = {}
163
        for filename in input:
164
            map_step = filename.split('/')[-1]
165
            matrix[map_step] = pd.read_table(filename, sep='\t', index_col=0, header=None, names=['read_length', map_step]).iloc[:, 0]
166
        matrix = pd.DataFrame(matrix)
167
        matrix = matrix.loc[:, map_steps]
168
        matrix.fillna(0, inplace=True)
169
        matrix = matrix.astype('int')
170
        matrix.index.name = 'read_length'
171
        matrix.to_csv(output[0], sep='\t', header=True, index=True)
172
173