#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import singlecellmultiomics.modularDemultiplexer
import os
import sys
import pysam
import collections
import argparse
import pandas as pd
import numpy as np
import itertools
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods
import gzip  # for loading blacklist bedfiles
TagDefinitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions


def coordinate_to_sliding_bin_locations(dp, bin_size, sliding_increment):
    """
    Find the sliding bins that overlap a single coordinate

    Parameters
    ----------
    dp : int
        coordinate to look up

    bin_size : int
        bin size

    sliding_increment : int
        sliding window offset, this is the increment between bins

    Returns
    -------
    start : int
        the start coordinate of the first overlapping bin
    end : int
        the end of the last overlapping bin
    start_id : int
        the index of the first overlapping bin
    end_id : int
        the index of the last overlapping bin
    """
    start_id = int(np.ceil((dp - bin_size) / sliding_increment))
    start = start_id * sliding_increment
    end_id = int(np.floor(dp / sliding_increment))
    end = end_id * sliding_increment + bin_size
    return start, end, start_id, end_id
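
# Worked example (editorial sketch, values hypothetical): with bin_size=1000 and
# sliding_increment=500, the coordinate 1300 is covered by the bins starting at
# 500 and at 1000:
#   start_id = ceil((1300 - 1000) / 500) = 1  ->  start = 1 * 500        = 500
#   end_id   = floor(1300 / 500)         = 2  ->  end   = 2 * 500 + 1000 = 2000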
|
|
def coordinate_to_bins(point, bin_size, sliding_increment):
    """
    Convert a single value to a list of overlapping bins

    Parameters
    ----------
    point : int
        coordinate to look up

    bin_size : int
        bin size

    sliding_increment : int
        sliding window offset, this is the increment between bins

    Returns
    -------
    list: [(bin_start, bin_end), .. ]
    """
    start, end, start_id, end_id = coordinate_to_sliding_bin_locations(
        point, bin_size, sliding_increment)
    return [(i * sliding_increment, i * sliding_increment + bin_size)
            for i in range(start_id, end_id + 1)]
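
# Example (illustrative values):
#   coordinate_to_bins(1300, bin_size=1000, sliding_increment=500)
#   -> [(500, 1500), (1000, 2000)]   # both bins contain position 1300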
|
|
def read_has_alternative_hits_to_non_alts(read):
    if read.has_tag('XA'):
        for alt_align in read.get_tag('XA').split(';'):
            if len(alt_align) == 0:  # Sometimes this tag is empty for some reason
                continue

            hchrom, hpos, hcigar, hflag = alt_align.split(',')
            if not hchrom.endswith('_alt'):
                return True
    return False
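
# Note: bwa writes the XA tag as a ';'-separated list of 'chrom,pos,CIGAR,NM'
# entries, e.g. (hypothetical) 'chr1,+1523,100M,0;chr1_alt,-889,100M,1;'.
# Only alternative hits on contigs not ending in '_alt' make a read count as
# truly multimapping here.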
|
|
def readTag(read, tag, defective='None'):
    try:
        value = singlecellmultiomics.modularDemultiplexer.metaFromRead(
            read, tag)
    except Exception:
        value = defective
    return value

# define if a read should be used
|
|
def read_should_be_counted(read, args, blacklist_dic=None):
    """
    Check if a read should be counted given the filter arguments

    Parameters
    ----------
    read : pysam.AlignedSegment or None
        read to check if it should be counted

    args : argparse.Namespace
        parsed filter arguments

    blacklist_dic : dict or None
        {contig: [(start, end), ..]} regions in which reads are not counted

    Returns
    -------
    bool
    """
    # Reject missing, QC-failed and unmapped reads first, before any other
    # attribute is accessed
    if read is None or read.is_qcfail or read.is_unmapped:
        return False

    if args.r1only and read.is_read2:
        return False
    if args.r2only and read.is_read1:
        return False

    if args.filterMP:
        if not read.has_tag('mp'):
            return False

        if read.get_tag('mp') != 'unique':
            return False

    # Mapping quality below threshold
    if read.mapping_quality < args.minMQ:
        return False

    if args.proper_pairs_only and not read.is_proper_pair:
        return False

    if args.no_indels and ('I' in read.cigarstring or 'D' in read.cigarstring):
        return False

    if args.max_base_edits is not None and read.has_tag('NM') and int(
            read.get_tag('NM')) > args.max_base_edits:
        return False

    if args.no_softclips and 'S' in read.cigarstring:
        return False

    # Read has alternative hits
    if args.filterXA:
        if read_has_alternative_hits_to_non_alts(read):
            return False

    # Read is a duplicate
    if args.dedup and (read.has_tag("RR") or read.is_duplicate):
        return False

    # Read overlaps a blacklisted region
    if blacklist_dic is not None:
        if read.reference_name in blacklist_dic:
            # Iterate through the (start, end) tuples of this contig
            for startend in blacklist_dic[read.reference_name]:
                # start is 0-based inclusive, end is 0-based exclusive
                start_bad = startend[0] <= read.reference_start < startend[1]
                end_bad = startend[0] <= read.reference_end < startend[1]
                if start_bad or end_bad:
                    return False

    return True
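
# Usage sketch (hypothetical argument values, mirroring the CLI flags defined
# at the bottom of this file):
#   args = argparse.Namespace(
#       r1only=False, r2only=False, filterMP=False, minMQ=20,
#       proper_pairs_only=True, no_indels=False, max_base_edits=None,
#       no_softclips=False, filterXA=False, dedup=True)
#   keep = read_should_be_counted(read, args)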
|
|
def tagToHumanName(tag, TagDefinitions):
    if tag not in TagDefinitions:
        return tag
    return TagDefinitions[tag].humanName
|
|
def assignReads(
        read,
        countTable,
        args,
        joinFeatures,
        featureTags,
        sampleTags,
        more_args=[],
        blacklist_dic=None):

    assigned = 0
    if not read_should_be_counted(read, args, blacklist_dic):
        return assigned

    # Get the sample to which this read belongs
    sample = tuple(readTag(read, tag) for tag in sampleTags)

    # Decide how many counts this read yields
    countToAdd = 1
    if args.r1only or args.r2only:
        countToAdd = 1
    elif not args.doNotDivideFragments:
        # If the read is paired and the mate mapped, count 0.5 per read so the
        # fragment ends up with 1 in total
        countToAdd = (0.5 if (read.is_paired and not read.mate_is_unmapped) else 1)

    assigned += 1

    if args.divideMultimapping:
        if read.has_tag('XA'):
            countToAdd = countToAdd / len(read.get_tag('XA').split(';'))
        elif read.has_tag('NH'):
            countToAdd = countToAdd / int(read.get_tag('NH'))

    # Define what counts to add to what samples
    # [ (sample, feature, increment), .. ]
    count_increment = []

    feature_dict = {}
    joined_feature = []
    used_features = []
    for tag in featureTags:
        feat = str(readTag(read, tag))
        feature_dict[tag] = feat
        if args.bin is not None and args.binTag == tag:
            continue
        if args.byValue is not None and tag == args.byValue:
            continue
        joined_feature.append(feat)
        used_features.append(tag)

    if joinFeatures:
        if args.splitFeatures:
            # Obtain all key states:
            key_states = []
            for tag in featureTags:
                value = feature_dict.get(tag)
                key_states.append(value.split(args.featureDelimiter))

            for state in itertools.product(*key_states):
                joined_feature = []
                feature_dict = {
                    feature: value
                    for feature, value in zip(featureTags, state)
                }
                for feature, value in zip(featureTags, state):
                    if args.bin is not None and args.binTag == feature:
                        continue

                    if len(value) > 0:
                        joined_feature.append(value)
                    else:
                        joined_feature.append('None')

                if args.byValue is not None:
                    raise NotImplementedError(
                        '-byValue is not implemented for --splitFeatures')
                else:
                    count_increment.append({
                        'key': tuple(joined_feature),
                        'features': feature_dict,
                        'samples': [sample],
                        'increment': countToAdd})
        else:
            if args.byValue is not None:
                try:
                    add = float(feature_dict.get(args.byValue, 0))
                except ValueError:
                    add = 0

                count_increment.append({
                    'key': tuple(joined_feature),
                    'features': feature_dict,
                    'samples': [sample],
                    'increment': add})
            else:
                count_increment.append({
                    'key': tuple(joined_feature),
                    'features': feature_dict,
                    'samples': [sample],
                    'increment': countToAdd})
    else:
        if args.bin is not None:
            raise NotImplementedError('Try using -joinedFeatureTags')

        for feature, value in feature_dict.items():
            if args.byValue is not None and feature == args.byValue:
                try:
                    add = float(feature_dict.get(args.byValue, 0))
                except ValueError:
                    add = 0

                count_increment.append({
                    'key': tuple(joined_feature),
                    'features': feature_dict,
                    'samples': [sample],
                    'increment': add})

            elif args.splitFeatures:
                for f in value.split(args.featureDelimiter):
                    count_increment.append({
                        'key': (f, ),
                        'features': {feature: f},
                        'samples': [sample],
                        'increment': countToAdd
                    })

            else:
                count_increment.append({
                    'key': (value, ),
                    'features': {feature: value},
                    'samples': [sample],
                    'increment': countToAdd})

    # Now count_increment is a list of dicts of the form:
    # {
    #     'key': feature key tuple,
    #     'features': {feature: value},
    #     'samples': [sample],
    #     'increment': countToAdd
    # }
|
|
    # Increment the count table accordingly:
    if args.bin is not None:
        for dtable in count_increment:
            key = dtable['key']
            countToAdd = dtable['increment']
            samples = dtable['samples']
            value_to_be_binned = dtable['features'].get(args.binTag, None)

            if value_to_be_binned is None or value_to_be_binned == 'None':
                continue

            for start, end in coordinate_to_bins(
                    int(value_to_be_binned), args.bin, args.sliding):
                # Reject bins outside the chromosome bounds
                if not args.keepOverBounds and (
                        start < 0 or end > args.ref_lengths[read.reference_name]):
                    continue
                for sample in samples:
                    countTable[sample][tuple(
                        list(key) + [start, end])] += countToAdd

    elif args.bedfile is not None:

        for dtable in count_increment:
            if args.byValue:
                key = [args.byValue]
            else:
                key = dtable['key']
            countToAdd = dtable['increment']
            samples = dtable['samples']

            # Get features from the bed file
            start, end, bname = more_args
            jfeat = tuple(list(key) + [start, end, bname])
            if len(key):
                for sample in samples:
                    countTable[sample][jfeat] += countToAdd
            # else: this would also emit non-assigned reads:
            # countTable[sample]['None'] += countToAdd

    else:
        for dtable in count_increment:
            key = dtable['key']
            countToAdd = dtable['increment']
            samples = dtable['samples']

            for sample in samples:
                if len(key) == 1:
                    countTable[sample][key[0]] += countToAdd
                else:
                    countTable[sample][key] += countToAdd

    return assigned
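
# After assignment the table maps sample -> feature key -> count; an
# illustrative (hypothetical) entry with binning enabled:
#   countTable[('cellA',)][('chr1', 0, 100000)] == 1.5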
|
|
def create_count_table(args, return_df=False):

    if len(args.alignmentfiles) == 0:
        raise ValueError("Supply at least one bam file")
    if args.bedfile is not None:
        assert os.path.isfile(args.bedfile)

    if args.sliding is None:
        args.sliding = args.bin

    if not return_df and args.o is None and args.alignmentfiles is not None:
        args.showtags = True

    if args.showtags:
        # Find which tags are available in the file:
        head = 1000
        tagObs = collections.Counter()
        for bamFile in args.alignmentfiles:
            with pysam.AlignmentFile(bamFile) as f:
                for i, read in enumerate(f):
                    tagObs += collections.Counter(
                        [k for k, v in read.get_tags(with_value_type=False)])
                    if i == (head - 1):
                        break
        import colorama

        print(
            f'{colorama.Style.BRIGHT}Tags seen in the supplied bam file(s):{colorama.Style.RESET_ALL}')
        for observedTag in tagObs:
            tag = observedTag
            if observedTag in TagDefinitions:
                t = TagDefinitions[observedTag]
                humanName = t.humanName
                isPhred = t.isPhred
            else:
                t = observedTag
                isPhred = False
                humanName = f'{colorama.Style.RESET_ALL}<No information available>'

            allReadsHaveTag = (tagObs[tag] == head)
            color = colorama.Fore.GREEN if allReadsHaveTag else colorama.Fore.YELLOW
            print(
                f'{color}{colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{color + humanName}\t{colorama.Style.DIM}{"PHRED" if isPhred else ""}{colorama.Style.RESET_ALL}' +
                f'\t{"" if allReadsHaveTag else "Not all reads have this tag"}')

        print(f'{colorama.Style.BRIGHT}\nAVO lab tag list:{colorama.Style.RESET_ALL}')
        for tag, t in TagDefinitions.items():
            print(f'{colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{t.humanName}\t{colorama.Style.DIM}{"PHRED" if t.isPhred else ""}{colorama.Style.RESET_ALL}')
        sys.exit()

    if args.o is None and not return_df:
        raise ValueError('Supply an output file')

    if args.alignmentfiles is None:
        raise ValueError('Supply alignment (BAM) files')

    joinFeatures = False
    featureTags = []
    if args.featureTags is not None:
        featureTags = args.featureTags.split(',')

    if args.joinedFeatureTags is not None:
        featureTags = args.joinedFeatureTags.split(',')
        joinFeatures = True

    # When -byValue is used together with joined feature tags, automatically
    # append the args.byValue tag, otherwise it would get lost
    if args.byValue is not None and len(featureTags) > 0 and args.byValue not in featureTags:
        featureTags.append(args.byValue)

    if args.bin is not None and args.binTag not in featureTags:
        print("The bin tag was not supplied as feature, automatically appending the bin feature.")
        featureTags.append(args.binTag)

    if len(featureTags) == 0:
        raise ValueError(
            "No features supplied! Please supply -featureTags, -joinedFeatureTags and/or -binTag")

    sampleTags = args.sampleTags.split(',')
    countTable = collections.defaultdict(
        collections.Counter)  # cell -> feature -> count

    if args.blacklist is not None:
        # Create a blacklist dictionary {chromosome: [(start1, end1), ..., (startN, endN)]},
        # used to check each read and exclude it if it falls within any of these regions
        blacklist_dic = {}
        print("Creating blacklist dictionary:")
        with open(args.blacklist, mode='r') as blfile:
            for row in blfile:
                parts = row.strip().split()
                chromo, start, end = parts[0], int(parts[1]), int(parts[2])
                if chromo not in blacklist_dic:
                    # init chromo
                    blacklist_dic[chromo] = []  # list of tuples
                blacklist_dic[chromo].append((start, end))
        print(blacklist_dic)
    else:
        blacklist_dic = None
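
    # The blacklist is a plain whitespace-delimited BED-like file; illustrative
    # content (hypothetical coordinates):
    #   chr1    564449    570371
    #   chr1    724136    727043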
|
|
    assigned = 0
    for bamFile in args.alignmentfiles:

        with pysam.AlignmentFile(bamFile) as f:
            i = 0  # make sure i is defined
            if args.bin:
                # Obtain the reference sequence lengths
                ref_lengths = {
                    r: f.get_reference_length(r) for r in f.references}
                args.ref_lengths = ref_lengths
            if args.bedfile is None:
                # For adding counts associated with a tag OR with binning
                if args.contig is not None:
                    pysam_iterator = f.fetch(args.contig)
                else:
                    pysam_iterator = f

                for i, read in enumerate(pysam_iterator):
                    if i % 1_000_000 == 0:
                        print(
                            f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100 * (i / (0.001 + f.mapped + f.unmapped + f.nocoordinate))}%")

                    if args.head is not None and i > args.head:
                        break

                    assigned += assignReads(read,
                                            countTable,
                                            args,
                                            joinFeatures,
                                            featureTags,
                                            sampleTags,
                                            blacklist_dic=blacklist_dic)
            else:  # args.bedfile is not None
                # For adding counts associated with a bed file
                with open(args.bedfile, "r") as bfile:
                    for row in bfile:

                        parts = row.strip().split()
                        chromo, start, end, bname = parts[0], int(float(
                            parts[1])), int(float(parts[2])), parts[3]
                        if args.contig is not None and chromo != args.contig:
                            continue
                        for i, read in enumerate(f.fetch(chromo, start, end)):
                            if i % 1_000_000 == 0:
                                print(
                                    f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100 * (i / (0.001 + f.mapped + f.unmapped + f.nocoordinate))}%")
                            assigned += assignReads(read,
                                                    countTable,
                                                    args,
                                                    joinFeatures,
                                                    featureTags,
                                                    sampleTags,
                                                    more_args=[start,
                                                               end,
                                                               bname],
                                                    blacklist_dic=blacklist_dic)

                            if args.head is not None and i > args.head:
                                break

        print(
            f"Finished: {bamFile} Processed {i} reads, assigned {assigned}")
    print(f"Finished counting, now exporting to {args.o}")
|
|
    df = pd.DataFrame.from_dict(countTable)

    # Set names of indices
    if not args.noNames:
        df.columns.set_names([tagToHumanName(t, TagDefinitions)
                              for t in sampleTags], inplace=True)

        try:
            if args.bin is not None:
                index_names = [tagToHumanName(
                    t, TagDefinitions) for t in featureTags if t != args.binTag] + ['start', 'end']
                df.index.set_names(index_names, inplace=True)
            elif args.bedfile is not None:
                index_names = [tagToHumanName(
                    t, TagDefinitions) for t in featureTags if t != args.binTag] + ['start', 'end', 'bname']
                df.index.set_names(index_names, inplace=True)
            elif joinFeatures:
                index_names = [
                    tagToHumanName(t, TagDefinitions) for t in featureTags]
                df.index.set_names(index_names, inplace=True)
            else:
                index_names = ','.join(
                    [tagToHumanName(t, TagDefinitions) for t in featureTags])
                df.index.set_names(index_names, inplace=True)
            print(index_names)
        except Exception:
            # Index naming is best-effort; mismatching level counts are ignored
            pass
|
|
    if return_df:
        return df

    if args.bulk:
        sum_row = df.sum(axis=1)
        df = pd.DataFrame(sum_row)
        df.columns = ["Bulkseq"]

    if args.o.endswith('.pickle') or args.o.endswith('.pickle.gz'):
        df.to_pickle(args.o)
    else:
        df.to_csv(args.o)
    print("Finished export.")
    return args.o
|
|
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Convert BAM file(s) which have been annotated using featureCounts or otherwise to a count matrix')
    argparser.add_argument(
        '-o',
        type=str,
        help="output csv path; a pickled pandas DataFrame is written when the path ends with .pickle or .pickle.gz",
        required=False)
    argparser.add_argument(
        '-featureTags',
        type=str,
        default=None,
        help='Tag(s) used for defining the ROWS of the matrix. Single dimension.')
    argparser.add_argument(
        '-joinedFeatureTags',
        type=str,
        default=None,
        help='These define the ROWS of your matrix. For example if you want allele (DA) and restriction site (DS) use DA,DS. If you want a row level containing the chromosome mapped to, use "chrom" as feature. Use this argument if you want to use a multidimensional index')
    argparser.add_argument(
        '-sampleTags',
        type=str,
        default='SM',
        help='Comma separated tags defining the names of the COLUMNS in the output matrix')
    argparser.add_argument('alignmentfiles', type=str, nargs='*')
    argparser.add_argument(
        '-head',
        type=int,
        help='Run the algorithm only on the first N reads to check if the result looks like what you expect.')

    argparser.add_argument(
        '--bulk',
        action='store_true',
        help='Include this when making a count table for bulkseq data. This operation will sum the counts of all sampleTags into a single column.')
    argparser.add_argument(
        '--r1only',
        action='store_true',
        help='Only count R1')

    argparser.add_argument(
        '--r2only',
        action='store_true',
        help='Only count R2')

    argparser.add_argument(
        '--splitFeatures',
        action='store_true',
        help='Split features by the feature delimiter. For example, if a read has a feature Foo,Bar, increase the counts for both Foo and Bar')
    argparser.add_argument('-featureDelimiter', type=str, default=',')
    argparser.add_argument(
        '-contig',
        type=str,
        help='Run only on this chromosome')

    multimapping_args = argparser.add_argument_group('Multimapping', '')
    multimapping_args.add_argument(
        '--divideMultimapping',
        action='store_true',
        help='Divide multimapping reads over all targets. Requires the XA or NH tag to be set.')
    multimapping_args.add_argument(
        '--doNotDivideFragments',
        action='store_true',
        help='When used, every read is counted once, so a fragment counts as two reads; otherwise each read of a mapped pair counts 0.5.')

    binning_args = argparser.add_argument_group('Binning', '')
    binning_args.add_argument(
        '-sliding',
        type=int,
        help="Make bins overlapping; the step size is equal to the value supplied here. If nothing is supplied this value equals the bin size")
    binning_args.add_argument(
        '--keepOverBounds',
        action='store_true',
        help="Keep bins which go over chromosome bounds (start < 0 or end > chromosome length)")

    binning_args.add_argument(
        '-bin',
        type=int,
        help="Divide and floor to bin features. If bin=1000, f=1999 -> 1000.")
    # The bin range in which a read was counted is always shown as start and
    # end (e.g. 120000-220000 rather than 120000 only).
    binning_args.add_argument('-binTag', default='DS')
    binning_args.add_argument(
        '-byValue',
        type=str,
        help='Extract the value from the supplied tag and use this as the count to add')
    binning_args.add_argument(
        '-blacklist',
        metavar='BED FILE BLACKLIST TO FILTER OUT',
        help='Bed file of blacklist regions to exclude',
        default=None)
    bed_args = argparser.add_argument_group('Bedfiles', '')
    bed_args.add_argument(
        '-bedfile',
        type=str,
        help="Bed file containing 4 columns (contig, start, end, name); counts are fetched for reads overlapping these regions")

    filters = argparser.add_argument_group('Filters', '')

    filters.add_argument(
        '--proper_pairs_only',
        action='store_true',
        help='Only count reads mapped in a proper pair (within the expected insert size range)')

    filters.add_argument(
        '--no_softclips',
        action='store_true',
        help='Only count reads without softclips')

    filters.add_argument(
        '-max_base_edits',
        type=int,
        help='Count only reads with at most this many bases differing from the reference')

    filters.add_argument(
        '--no_indels',
        action='store_true',
        help='Only count reads without indels')

    filters.add_argument(
        '--dedup',
        action='store_true',
        help='Count only the first occurrence of a molecule. Requires the RC tag to be set. Reads without the RC tag will be ignored!')

    filters.add_argument(
        '-minMQ',
        type=int,
        default=0,
        help="Minimum mapping quality")

    filters.add_argument(
        '--filterMP',
        action='store_true',
        help="Filter reads which are not uniquely mappable; this is based on the presence of the `mp` tag, which can be generated by bamtagmultiome.py")

    filters.add_argument(
        '--filterXA',
        action='store_true',
        help="Do not count reads where the XA (alternative hits) tag has been set for a non-alternative locus.")

    argparser.add_argument(
        '--noNames',
        action='store_true',
        help='Do not set names of the index and columns; this simplifies the resulting CSV/pickle file')
    argparser.add_argument(
        '--showtags',
        action='store_true',
        help='Show a list of commonly used tags')

    args = argparser.parse_args()
    create_count_table(args)