exseek/snakefiles/count_matrix_small.snakemake

include: 'common.snakemake'

# remove rRNA and spikein
rna_types = list(filter(lambda x: x not in ('rRNA', 'spikein', 'univec'), rna_types))

def get_all_inputs(wildcards):
    available_inputs = dict(
        count_matrix=expand('{output_dir}/count_matrix/transcript.txt', output_dir=output_dir),
        count_matrix_mirna=expand('{output_dir}/count_matrix/transcript_mirna.txt', output_dir=output_dir),
        count_matrix_mirna_and_long_fragments=expand('{output_dir}/count_matrix/mirna_and_long_fragments.txt', output_dir=output_dir)
    )
    if 'spikein' in config['rna_types']:
        available_inputs['count_matrix_spikein'] = expand('{output_dir}/count_matrix/spikein.txt', output_dir=output_dir)

    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs

rule all:
    input:
        get_all_inputs

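# Counting strategies implemented below:
#   - featurecounts_gbam / htseq_gbam: count reads in genome BAM files ({output_dir}/gbam)
#     against per-biotype GTF annotations
#   - count_transcript: count reads directly from transcript BAM files ({output_dir}/tbam)
# Per-biotype counts are concatenated into one file per sample and then merged into a
# feature x sample count matrix.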
rule featurecounts_gbam:
    input:
        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam',
        gtf=genome_dir + '/gtf_by_biotype/{rna_type}.gtf'
    output:
        counts='{output_dir}/counts_by_biotype/featurecounts/{sample_id}/{rna_type}',
        summary='{output_dir}/counts_by_biotype/featurecounts/{sample_id}/{rna_type}.summary'
    params:
        strandness={'forward': 1, 'reverse': 2}.get(config['strandness'], 0),
        paired_end={True: '-p', False: ''}[config['paired_end']],
        min_mapping_quality=config['min_mapping_quality']
    log:
        '{output_dir}/log/featurecounts_gbam/{sample_id}/{rna_type}'
    shell:
        '''featureCounts -t exon -g gene_id -s {params.strandness} -Q {params.min_mapping_quality} \
            {params.paired_end} -a {input.gtf} -o {output.counts} {input.bam} > {log} 2>&1
        '''

rule merge_featurecounts_by_biotype:
    input:
        lambda wildcards: expand('{output_dir}/counts_by_biotype/featurecounts/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
    output:
        '{output_dir}/counts/featurecounts/{sample_id}'
    shell:
        '''cat {input} | awk 'BEGIN{{OFS="\t";FS="\t"}} !($0 ~ /^#/) && !($0 ~ /^Geneid/) {{print $1,$NF}}' > {output}
        '''

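# Feature names in the transcript count matrix are pipe-separated:
#   gene_id|gene_type|gene_name|gene_id|transcript_id|0|length
# (see count_matrix_full_length below); the awk filters in the next rules select
# rows by the second field (gene_type).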
rule count_matrix_mirna:
    '''Count matrix of miRNA only
    '''
    input:
        '{output_dir}/count_matrix/transcript.txt'
    output:
        '{output_dir}/count_matrix/transcript_mirna.txt'
    shell:
        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if(a[2] == "miRNA") print}}' {input} > {output}
        '''

rule combine_mirna_and_domains_rna:
    '''Count matrix of miRNA and domains
    Remove genomic regions
    '''
    input:
        mirna='{output_dir}/count_matrix/transcript_mirna.txt',
        long_fragments='{output_dir}/count_matrix/long_fragments.txt'
    output:
        '{output_dir}/count_matrix/mirna_and_long_fragments.txt'
    shell:
        '''{{
            cat {input.mirna}
            awk '(NR>1)&&(!($0 ~ /genomic/))' {input.long_fragments}
        }} > {output}
        '''

rule count_matrix_small:
    '''Count matrix of miRNA and piRNA
    '''
    input:
        '{output_dir}/count_matrix/transcript.txt'
    output:
        '{output_dir}/count_matrix/transcript_small.txt'
    shell:
        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if((a[2] == "miRNA") || (a[2] == "piRNA")) print}}' {input} > {output}
        '''

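# Select the inputs of count_matrix_full_length according to the count method:
# 'transcript' uses per-RNA-type transcript tables and sizes from genome_dir,
# 'spikein' uses the spike-in annotation from config['spikein_dir'].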
def get_count_matrix_full_length_inputs(wildcards):
    if wildcards.count_method == 'transcript':
        rna_types = list(rna_types_with_gtf)
        return {
            'counts': expand('{output_dir}/counts/transcript/{sample_id}',
                output_dir=wildcards.output_dir, sample_id=sample_ids),
            'transcript_table': expand(genome_dir + '/transcript_table/{rna_type}.txt', rna_type=rna_types),
            'transcript_sizes': expand(genome_dir + '/chrom_sizes/{rna_type}', rna_type=rna_types)
        }
    elif wildcards.count_method == 'spikein':
        return {
            'counts': expand('{output_dir}/counts/spikein/{sample_id}',
                output_dir=wildcards.output_dir, sample_id=sample_ids),
            'transcript_table': [config['spikein_dir'] + '/transcript_table/spikein.txt'],
            'transcript_sizes': [config['spikein_dir'] + '/chrom_sizes/spikein']
        }
    else:
        raise ValueError('unknown count method: {}'.format(wildcards.count_method))

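# The resulting matrix has one row per feature detected in at least one sample
# (named gene_id|gene_type|gene_name|gene_id|transcript_id|0|length) and one column
# per sample; features not detected in a sample are filled with zeros.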
rule count_matrix_full_length:
    '''Count matrix of full length transcripts (all types of RNAs)
    '''
    input:
        unpack(get_count_matrix_full_length_inputs)
    output:
        '{output_dir}/count_matrix/{count_method}.txt'
    wildcard_constraints:
        count_method='(transcript)|(spikein).*'
    run:
        import pandas as pd
        import os
        from collections import OrderedDict
        import numpy as np

        counts = OrderedDict()
        gene_ids = np.zeros(0, dtype='str')
        for filename in input.counts:
            sample_id = os.path.basename(filename)
            counts[sample_id] = pd.read_table(filename, sep='\t', header=None, index_col=0,
                names=['feature', 'count'], dtype={'feature': 'str', 'count': 'int'}).iloc[:, 0]
            counts[sample_id].index = counts[sample_id].index.astype('str')
            counts[sample_id] = counts[sample_id][counts[sample_id] > 0]
            gene_ids = np.union1d(gene_ids, counts[sample_id].index.values)
        # annotate features
        transcript_table = []
        for filename in input.transcript_table:
            transcript_table.append(pd.read_table(filename, sep='\t', dtype='str'))
        transcript_table = pd.concat(transcript_table, axis=0)
        transcript_table = transcript_table.drop_duplicates('transcript_id', keep='first')
        transcript_table.set_index('transcript_id', inplace=True, drop=False)

        transcript_sizes = []
        for filename in input.transcript_sizes:
            transcript_sizes.append(pd.read_table(filename,
                sep='\t', header=None, names=['transcript_id', 'length'], dtype='str'))
        transcript_sizes = pd.concat(transcript_sizes, axis=0)
        transcript_sizes = transcript_sizes.drop_duplicates('transcript_id', keep='first')
        transcript_sizes.set_index('transcript_id', inplace=True)
        transcript_sizes = transcript_sizes.loc[:, 'length']

        print(gene_ids)
        print(transcript_table.head())
        reindex_gene_ids = transcript_table.loc[:, 'transcript_id'].reindex(gene_ids)
        isna_index = reindex_gene_ids.isna()
        print(reindex_gene_ids[isna_index].head(25))
        gene_ids = reindex_gene_ids[~isna_index].index.values
        feature_names = transcript_table.loc[gene_ids, 'gene_id'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_type'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_name'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_id'].values \
            + '|' + transcript_table.loc[gene_ids, 'transcript_id'].values \
            + '|0|' + transcript_sizes[transcript_table.loc[gene_ids, 'transcript_id'].values].values.astype('str')

        #print('len(feature_names) = {}'.format(len(feature_names)))
        #print('len(gene_ids) = {}'.format(len(gene_ids)))
        # create matrix
        matrix = pd.DataFrame(np.zeros((len(gene_ids), len(counts)), dtype=np.int32),
            index=gene_ids, columns=list(counts.keys()))
        for sample_id in sample_ids:
            counts[sample_id] = counts[sample_id].reindex(gene_ids).dropna()
            matrix.loc[counts[sample_id].index.values, sample_id] = counts[sample_id].values
        matrix.index = feature_names
        matrix.index.name = 'feature'

        matrix.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')

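# Alternative per-biotype counting based on htseq-count; merged per sample under
# counts/htseq by the rule that follows.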
rule htseq_gbam:
    input:
        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam',
        gtf=genome_dir + '/gtf_by_biotype/{rna_type}.gtf'
    output:
        counts='{output_dir}/counts_by_biotype/htseq/{sample_id}/{rna_type}'
    params:
        strandness={'forward': 'yes', 'reverse': 'reverse'}.get(config['strandness'], 'no'),
        min_mapping_quality=config['min_mapping_quality']
    shell:
        '''htseq-count -t exon -i gene_id -f bam -m intersection-strict -a {params.min_mapping_quality} \
            -s {params.strandness} {input.bam} {input.gtf} > {output.counts}
        '''

rule merge_htseq_by_biotype:
    input:
        lambda wildcards: expand('{output_dir}/counts_by_biotype/htseq/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
    output:
        '{output_dir}/counts/htseq/{sample_id}'
    shell:
        '''
        cat {input} | grep -v '^__' > {output}
        '''

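# Count reads per transcript directly from the transcript BAM files ({output_dir}/tbam)
# using the count_reads.py script in {bin_dir}.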
rule count_transcript:
    input:
        bam='{output_dir}/tbam/{sample_id}/{rna_type}.bam'
    output:
        '{output_dir}/counts_by_biotype/transcript/{sample_id}/{rna_type}'
    params:
        min_mapping_quality=config['min_mapping_quality'],
        strandness=config['strandness']
    #wildcard_constraints:
    #    rna_type='(?!miRNA).*'
    shell:
        '''{bin_dir}/count_reads.py count_transcript -i {input.bam} -s {params.strandness} -q {params.min_mapping_quality} -o {output}
        '''

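# Disabled rule (wrapped in a module-level string, kept for reference): counts mature
# miRNAs using the miRBase GFF3 annotation.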
"""
220
rule count_mature_mirna:
221
    input:
222
        bam='{output_dir}/tbam/{sample_id}/miRNA.bam',
223
        annotation=genome_dir + '/gff3/miRBase.gff3'
224
    output:
225
        '{output_dir}/counts_by_biotype/transcript/{sample_id}/miRNA'
226
    params:
227
        min_mapping_quality=config['min_mapping_quality']
228
    shell:
229
        '''{bin_dir}/count_reads.py count_mature_mirna -i {input.bam} -a {input.annotation} -q {params.min_mapping_quality} -o {output}
230
        '''
231
"""
232
233
rule merge_transcript_by_biotype:
234
    input:
235
        lambda wildcards: expand('{output_dir}/counts_by_biotype/transcript/{sample_id}/{rna_type}',
236
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
237
    output:
238
        '{output_dir}/counts/transcript/{sample_id}'
239
    shell:
240
        '''cat {input} > {output}
241
        '''
242
243
rule merge_spikein:
244
    input:
245
        lambda wildcards: expand('{output_dir}/counts_by_biotype/transcript/{sample_id}/spikein',
246
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id)
247
    output:
248
        '{output_dir}/counts/spikein/{sample_id}'
249
    shell:
250
        '''cat {input} > {output}
251
        '''