exseek/snakefiles/count_matrix_small.snakemake

include: 'common.snakemake'

# remove rRNA and spikein
rna_types = list(filter(lambda x: x not in ('rRNA', 'spikein', 'univec'), rna_types))

def get_all_inputs(wildcards):
    available_inputs = dict(
        count_matrix=expand('{output_dir}/count_matrix/transcript.txt', output_dir=output_dir),
        count_matrix_mirna=expand('{output_dir}/count_matrix/transcript_mirna.txt', output_dir=output_dir),
        count_matrix_mirna_and_long_fragments=expand('{output_dir}/count_matrix/mirna_and_long_fragments.txt', output_dir=output_dir)
    )
    if 'spikein' in config['rna_types']:
        available_inputs['count_matrix_spikein'] = expand('{output_dir}/count_matrix/spikein.txt', output_dir=output_dir)

    enabled_inputs = list(available_inputs.keys())
    inputs = []
    for key, l in available_inputs.items():
        if key in enabled_inputs:
            inputs += l
    return inputs

rule all:
    input:
        get_all_inputs

rule featurecounts_gbam:
    input:
        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam',
        gtf=genome_dir + '/gtf_by_biotype/{rna_type}.gtf'
    output:
        counts='{output_dir}/counts_by_biotype/featurecounts/{sample_id}/{rna_type}',
        summary='{output_dir}/counts_by_biotype/featurecounts/{sample_id}/{rna_type}.summary'
    params:
        strandness={'forward': 1, 'reverse': 2}.get(config['strandness'], 0),
        paired_end={True: '-p', False: ''}[config['paired_end']],
        min_mapping_quality=config['min_mapping_quality']
    log:
        '{output_dir}/log/featurecounts_gbam/{sample_id}/{rna_type}'
    shell:
        '''featureCounts -t exon -g gene_id -s {params.strandness} -Q {params.min_mapping_quality} \
            {params.paired_end} -a {input.gtf} -o {output.counts} {input.bam}
        '''
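# Note on the featureCounts output consumed below: each counts file is a tab-separated
# table with a leading comment line (starting with '#'), a 'Geneid ... Length <bam>'
# header row, and one row per feature with the read count in the last column. The merge
# rule below relies on this layout when it drops the '#' and 'Geneid' lines and keeps
# only $1 (feature id) and $NF (count).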
|
|
rule merge_featurecounts_by_biotype:
    input:
        lambda wildcards: expand('{output_dir}/counts_by_biotype/featurecounts/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
    output:
        '{output_dir}/counts/featurecounts/{sample_id}'
    shell:
        '''cat {input} | awk 'BEGIN{{OFS="\t";FS="\t"}} !($0 ~ /^#/) && !($0 ~ /^Geneid/) {{print $1,$NF}}' > {output}
        '''
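# The merged per-sample file is therefore a headerless two-column (feature id, count)
# table, e.g. (feature ids and counts below are illustrative only):
#   gene_001    12
#   gene_002    305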
|
|
rule count_matrix_mirna:
    '''Count matrix of miRNA only
    '''
    input:
        '{output_dir}/count_matrix/transcript.txt'
    output:
        '{output_dir}/count_matrix/transcript_mirna.txt'
    shell:
        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if(a[2] == "miRNA") print}}' {input} > {output}
        '''
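# The split on '|' above assumes feature names of the form built by count_matrix_full_length
# below, i.e. gene_id|gene_type|gene_name|gene_id|transcript_id|0|length, so a[2] is the
# RNA biotype (e.g. "miRNA").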
|
|
rule combine_mirna_and_domains_rna:
    '''Count matrix of miRNA and domains
    Remove genomic regions
    '''
    input:
        mirna='{output_dir}/count_matrix/transcript_mirna.txt',
        long_fragments='{output_dir}/count_matrix/long_fragments.txt'
    output:
        '{output_dir}/count_matrix/mirna_and_long_fragments.txt'
    shell:
        '''{{
        cat {input.mirna}
        awk '(NR>1)&&(!($0 ~ /genomic/))' {input.long_fragments}
        }} > {output}
        '''
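# The header of the combined matrix comes from the miRNA matrix; NR>1 skips the
# long-fragments header, and rows whose feature name contains "genomic" are dropped.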
|
|
rule count_matrix_small:
    '''Count matrix of miRNA and piRNA
    '''
    input:
        '{output_dir}/count_matrix/transcript.txt'
    output:
        '{output_dir}/count_matrix/transcript_small.txt'
    shell:
        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if((a[2] == "miRNA") || (a[2] == "piRNA")) print}}' {input} > {output}
        '''
|
|
def get_count_matrix_full_length_inputs(wildcards):
    if wildcards.count_method == 'transcript':
        rna_types = [rna_type for rna_type in rna_types_with_gtf]
        return {
            'counts': expand('{output_dir}/counts/transcript/{sample_id}',
                output_dir=wildcards.output_dir, sample_id=sample_ids),
            'transcript_table': expand(genome_dir + '/transcript_table/{rna_type}.txt', rna_type=rna_types),
            'transcript_sizes': expand(genome_dir + '/chrom_sizes/{rna_type}', rna_type=rna_types)
        }
    elif wildcards.count_method == 'spikein':
        return {
            'counts': expand('{output_dir}/counts/spikein/{sample_id}',
                output_dir=wildcards.output_dir, sample_id=sample_ids),
            'transcript_table': [config['spikein_dir'] + '/transcript_table/spikein.txt'],
            'transcript_sizes': [config['spikein_dir'] + '/chrom_sizes/spikein'],
        }
    else:
        raise ValueError('unknown count method: {}'.format(wildcards.count_method))
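# unpack() in the rule below exposes the keys of the returned dict as named inputs
# (input.counts, input.transcript_table, input.transcript_sizes).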
|
|
rule count_matrix_full_length:
    '''Count matrix of full length transcripts (all types of RNAs)
    '''
    input:
        unpack(get_count_matrix_full_length_inputs)
    output:
        '{output_dir}/count_matrix/{count_method}.txt'
    wildcard_constraints:
        count_method='(transcript)|(spikein).*'
    run:
        import pandas as pd
        import os
        from collections import OrderedDict
        import numpy as np

        counts = OrderedDict()
        gene_ids = np.zeros(0, dtype='str')
        for filename in input.counts:
            sample_id = os.path.basename(filename)
            counts[sample_id] = pd.read_table(filename, sep='\t', header=None, index_col=0,
                names=['feature', 'count'], dtype={'feature': 'str', 'count': 'int'}).iloc[:, 0]
            counts[sample_id].index = counts[sample_id].index.astype('str')
            counts[sample_id] = counts[sample_id][counts[sample_id] > 0]
            gene_ids = np.union1d(gene_ids, counts[sample_id].index.values)
        # annotate features
        transcript_table = []
        for filename in input.transcript_table:
            transcript_table.append(pd.read_table(filename, sep='\t', dtype='str'))
        transcript_table = pd.concat(transcript_table, axis=0)
        transcript_table = transcript_table.drop_duplicates('transcript_id', keep='first')
        transcript_table.set_index('transcript_id', inplace=True, drop=False)

        transcript_sizes = []
        for filename in input.transcript_sizes:
            transcript_sizes.append(pd.read_table(filename,
                sep='\t', header=None, names=['transcript_id', 'length'], dtype='str'))
        transcript_sizes = pd.concat(transcript_sizes, axis=0)
        transcript_sizes = transcript_sizes.drop_duplicates('transcript_id', keep='first')
        transcript_sizes.set_index('transcript_id', inplace=True)
        transcript_sizes = transcript_sizes.loc[:, 'length']

        print(gene_ids)
        print(transcript_table.head())
        reindex_gene_ids = transcript_table.loc[:, 'transcript_id'].reindex(gene_ids)
        isna_index = reindex_gene_ids.isna()
        print(reindex_gene_ids[isna_index].head(25))
        gene_ids = reindex_gene_ids[~isna_index].index.values
        feature_names = transcript_table.loc[gene_ids, 'gene_id'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_type'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_name'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_id'].values \
            + '|' + transcript_table.loc[gene_ids, 'transcript_id'].values \
            + '|0|' + transcript_sizes[transcript_table.loc[gene_ids, 'transcript_id'].values].values.astype('str')

        #print('len(feature_names) = {}'.format(len(feature_names)))
        #print('len(gene_ids) = {}'.format(len(gene_ids)))
        # create matrix
        matrix = pd.DataFrame(np.zeros((len(gene_ids), len(counts)), dtype=np.int32),
            index=gene_ids, columns=list(counts.keys()))
        for sample_id in sample_ids:
            counts[sample_id] = counts[sample_id].reindex(gene_ids).dropna()
            matrix.loc[counts[sample_id].index.values, sample_id] = counts[sample_id].values
        matrix.index = feature_names
        matrix.index.name = 'feature'

        matrix.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')
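# The resulting count matrix is tab-separated: the first column ('feature') holds the
# pipe-separated feature names built above, followed by one integer column per sample.
# Illustrative layout (names and values are made up):
#   feature                                        sample_01  sample_02
#   geneA|miRNA|mir-x|geneA|transcriptA|0|22       15         0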
|
|
rule htseq_gbam:
    input:
        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam',
        gtf=genome_dir + '/gtf_by_biotype/{rna_type}.gtf'
    output:
        counts='{output_dir}/counts_by_biotype/htseq/{sample_id}/{rna_type}'
    params:
        strandness={'forward': 'yes', 'reverse': 'reverse'}.get(config['strandness'], 'no'),
        min_mapping_quality=config['min_mapping_quality']
    shell:
        '''htseq-count -t exon -i gene_id -f bam -m intersection-strict -a {params.min_mapping_quality} \
            -s {params.strandness} {input.bam} {input.gtf} > {output.counts}
        '''
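# htseq-count appends summary counters prefixed with '__' (e.g. __no_feature, __ambiguous)
# after the per-gene counts; merge_htseq_by_biotype below removes them with grep -v '^__'.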
|
|
rule merge_htseq_by_biotype:
    input:
        lambda wildcards: expand('{output_dir}/counts_by_biotype/htseq/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
    output:
        '{output_dir}/counts/htseq/{sample_id}'
    shell:
        '''
        cat {input} | grep -v '^__' > {output}
        '''
|
|
rule count_transcript:
    input:
        bam='{output_dir}/tbam/{sample_id}/{rna_type}.bam'
    output:
        '{output_dir}/counts_by_biotype/transcript/{sample_id}/{rna_type}'
    params:
        min_mapping_quality=config['min_mapping_quality'],
        strandness=config['strandness']
    #wildcard_constraints:
    #    rna_type='(?!miRNA).*'
    shell:
        '''{bin_dir}/count_reads.py count_transcript -i {input.bam} -s {params.strandness} -q {params.min_mapping_quality} -o {output}
        '''
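# The count_mature_mirna rule below is disabled by wrapping it in a module-level string;
# if re-enabled, its output would collide with count_transcript above for rna_type=miRNA
# (hence the commented-out wildcard_constraints in count_transcript).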
|
|
"""
rule count_mature_mirna:
    input:
        bam='{output_dir}/tbam/{sample_id}/miRNA.bam',
        annotation=genome_dir + '/gff3/miRBase.gff3'
    output:
        '{output_dir}/counts_by_biotype/transcript/{sample_id}/miRNA'
    params:
        min_mapping_quality=config['min_mapping_quality']
    shell:
        '''{bin_dir}/count_reads.py count_mature_mirna -i {input.bam} -a {input.annotation} -q {params.min_mapping_quality} -o {output}
        '''
"""
|
|
rule merge_transcript_by_biotype:
    input:
        lambda wildcards: expand('{output_dir}/counts_by_biotype/transcript/{sample_id}/{rna_type}',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
    output:
        '{output_dir}/counts/transcript/{sample_id}'
    shell:
        '''cat {input} > {output}
        '''
|
|
rule merge_spikein:
    input:
        lambda wildcards: expand('{output_dir}/counts_by_biotype/transcript/{sample_id}/spikein',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id)
    output:
        '{output_dir}/counts/spikein/{sample_id}'
    shell:
        '''cat {input} > {output}
        '''