|
a |
|
b/singlecellmultiomics/fragment/fragment.py |
|
|
1 |
import itertools |
|
|
2 |
from singlecellmultiomics.utils.sequtils import hamming_distance, get_consensus_dictionaries, pick_best_base_call |
|
|
3 |
import pysamiterators.iterators |
|
|
4 |
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods |
|
|
5 |
from singlecellmultiomics.utils import style_str |
|
|
6 |
from singlecellmultiomics.bamProcessing import get_read_group_from_read |
|
|
7 |
from singlecellmultiomics.features import FeatureAnnotatedObject |
|
|
8 |
|
|
|
9 |
complement = str.maketrans('ATCGN', 'TAGCN') |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
class Fragment(): |
|
|
13 |
""" |
|
|
14 |
This class holds 1 or more reads which are derived from the same cluster |
|
|
15 |
|
|
|
16 |
Example: |
|
|
17 |
Generate a Fragment with a single associated read:: |
|
|
18 |
|
|
|
19 |
>>> from singlecellmultiomics.molecule import Molecule |
|
|
20 |
>>> from singlecellmultiomics.fragment import Fragment |
|
|
21 |
>>> import pysam |
|
|
22 |
>>> read = pysam.AlignedSegment() |
|
|
23 |
>>> read.reference_start = 30 |
|
|
24 |
>>> read.query_name = 'R1' |
|
|
25 |
>>> read.mapping_quality = 30 |
|
|
26 |
>>> read.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag |
|
|
27 |
>>> read.set_tag('RX','CAT') # The UMI is extracted from the RX tag |
|
|
28 |
>>> read.query_sequence = "CATGTATCCGGGCTTAA" |
|
|
29 |
>>> read.query_qualities = [30] * len(read.query_sequence) |
|
|
30 |
>>> read.cigarstring = f'{len(read.query_sequence)}M' |
|
|
31 |
>>> Fragment([read]) |
|
|
32 |
Fragment: |
|
|
33 |
sample:CELL_1 |
|
|
34 |
umi:CAT |
|
|
35 |
span:None 30-47 |
|
|
36 |
strand:+ |
|
|
37 |
has R1: yes |
|
|
38 |
has R2: no |
|
|
39 |
randomer trimmed: no |
|
|
40 |
|
|
|
41 |
Warning: |
|
|
42 |
Make sure the RX and SM tags of the read are set! If these are encoded |
|
|
43 |
in the read name, use singlecellmultiomics.universalBamTagger.customreads |
|
|
44 |
for conversion. |
|
|
45 |
|
|
|
46 |
""" |
|
|
47 |
|
|
|
48 |
def __init__(self, reads, |
|
|
49 |
assignment_radius: int = 0, |
|
|
50 |
umi_hamming_distance: int = 1, |
|
|
51 |
R1_primer_length: int = 0, |
|
|
52 |
R2_primer_length: int = 6, |
|
|
53 |
tag_definitions: list = None, |
|
|
54 |
max_fragment_size: int = None, |
|
|
55 |
mapping_dir=(False, True), |
|
|
56 |
max_NUC_stretch: int = None, |
|
|
57 |
read_group_format: int = 0, # R1 forward, R2 reverse |
|
|
58 |
library_name: str = None, # Overwrites the library name |
|
|
59 |
single_end: bool = False |
|
|
60 |
): |
|
|
61 |
""" |
|
|
62 |
Initialise Fragment |
|
|
63 |
|
|
|
64 |
Args: |
|
|
65 |
reads( list ): |
|
|
66 |
list containing pysam AlignedSegment reads |
|
|
67 |
|
|
|
68 |
assignment_radius( int ): |
|
|
69 |
Assignment radius for molecule resolver |
|
|
70 |
|
|
|
71 |
umi_hamming_distance( int ): |
|
|
72 |
umi_hamming_distance distance threshold for associating multiple fragments to a molecule |
|
|
73 |
|
|
|
74 |
R1_primer_length(int): |
|
|
75 |
length of R1 primer, these bases are not taken into account when calculating a consensus |
|
|
76 |
|
|
|
77 |
R2_primer_length(int): |
|
|
78 |
length of R2 primer, these bases are not taken into account when calculating a consensus, this length is auto-detected/ overwritten when the rS tag is set |
|
|
79 |
|
|
|
80 |
|
|
|
81 |
tag_definitions(list): |
|
|
82 |
sam TagDefinitions |
|
|
83 |
|
|
|
84 |
max_fragment_size (int): |
|
|
85 |
When specified, fragments with a fragment size larger than the specified value are flagged as qcfail |
|
|
86 |
|
|
|
87 |
mapping_dir( tuple ): |
|
|
88 |
expected mapping direction of mates, |
|
|
89 |
True for reverse, False for forward. This parameter is used in dovetail detection |
|
|
90 |
|
|
|
91 |
read_group_format(int) : see get_read_group() |
|
|
92 |
Returns: |
|
|
93 |
html(string) : html representation of the fragment |
|
|
94 |
""" |
|
|
95 |
self.R1_primer_length = R1_primer_length |
|
|
96 |
self.R2_primer_length = R2_primer_length |
|
|
97 |
if tag_definitions is None: |
|
|
98 |
tag_definitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions |
|
|
99 |
self.tag_definitions = tag_definitions |
|
|
100 |
self.assignment_radius = assignment_radius |
|
|
101 |
self.umi_hamming_distance = umi_hamming_distance |
|
|
102 |
self.mapping_dir = mapping_dir |
|
|
103 |
self.reads = reads |
|
|
104 |
self.strand = None |
|
|
105 |
self.meta = {} # dictionary of meta data |
|
|
106 |
self.is_mapped = None |
|
|
107 |
# Check for multimapping |
|
|
108 |
self.is_multimapped = True |
|
|
109 |
self.mapping_quality = 0 |
|
|
110 |
self.match_hash = None |
|
|
111 |
self.safe_span = None # wether the span of the fragment could be determined |
|
|
112 |
self.unsafe_trimmed = False # wether primers have been trimmed off |
|
|
113 |
self.random_primer_sequence = None |
|
|
114 |
self.max_fragment_size = max_fragment_size |
|
|
115 |
self.read_group_format = read_group_format |
|
|
116 |
self.max_NUC_stretch = max_NUC_stretch |
|
|
117 |
self.qcfail = False |
|
|
118 |
self.single_end = single_end |
|
|
119 |
|
|
|
120 |
# Span:\ |
|
|
121 |
self.span = [None, None, None] |
|
|
122 |
|
|
|
123 |
self.umi = None |
|
|
124 |
|
|
|
125 |
# Force R1=read1 R2=read2: |
|
|
126 |
|
|
|
127 |
for i, read in enumerate(self.reads): |
|
|
128 |
|
|
|
129 |
if read is None: |
|
|
130 |
continue |
|
|
131 |
|
|
|
132 |
if self.max_NUC_stretch is not None and ( |
|
|
133 |
self.max_NUC_stretch*'A' in read.seq or |
|
|
134 |
self.max_NUC_stretch*'G' in read.seq or |
|
|
135 |
self.max_NUC_stretch*'T' in read.seq or |
|
|
136 |
self.max_NUC_stretch*'C' in read.seq): |
|
|
137 |
|
|
|
138 |
self.set_rejection_reason('HomoPolymer', set_qcfail=True) |
|
|
139 |
self.qcfail=True |
|
|
140 |
break |
|
|
141 |
|
|
|
142 |
if read.is_qcfail: |
|
|
143 |
self.qcfail = True |
|
|
144 |
|
|
|
145 |
if read.has_tag('rS'): |
|
|
146 |
self.random_primer_sequence = read.get_tag('rS') |
|
|
147 |
self.unsafe_trimmed = True |
|
|
148 |
self.R2_primer_length = 0 # Set R2 primer length to zero, as the primer has been removed |
|
|
149 |
|
|
|
150 |
if not read.is_unmapped: |
|
|
151 |
self.is_mapped = True |
|
|
152 |
if read.mapping_quality > 0: |
|
|
153 |
self.is_multimapped = False |
|
|
154 |
self.mapping_quality = max( |
|
|
155 |
self.mapping_quality, read.mapping_quality) |
|
|
156 |
|
|
|
157 |
if i == 0: |
|
|
158 |
if read.is_read2: |
|
|
159 |
if not read.is_qcfail: |
|
|
160 |
raise ValueError('Supply first R1 then R2') |
|
|
161 |
read.is_read1 = True |
|
|
162 |
read.is_read2 = False |
|
|
163 |
elif i == 1: |
|
|
164 |
read.is_read1 = False |
|
|
165 |
read.is_read2 = True |
|
|
166 |
self.set_sample(library_name=library_name) |
|
|
167 |
self.update_umi() |
|
|
168 |
if self.qcfail: |
|
|
169 |
return |
|
|
170 |
self.set_strand(self.identify_strand()) |
|
|
171 |
self.update_span() |
|
|
172 |
|
|
|
173 |
|
|
|
174 |
def write_tensor(self, chromosome=None, span_start=None, span_end=None, height=30, index_start=0, |
|
|
175 |
base_content_table=None, |
|
|
176 |
base_mismatches_table=None, |
|
|
177 |
base_indel_table=None, |
|
|
178 |
base_qual_table=None, |
|
|
179 |
base_clip_table=None, |
|
|
180 |
mask_reference_bases=None, |
|
|
181 |
# set with reference bases to mask (converted to N ) |
|
|
182 |
# (chrom,pos) |
|
|
183 |
reference=None, |
|
|
184 |
skip_missing_reads=False |
|
|
185 |
): |
|
|
186 |
""" |
|
|
187 |
Write tensor representation of the fragment to supplied 2d arrays |
|
|
188 |
|
|
|
189 |
Args: |
|
|
190 |
chromosome( str ): |
|
|
191 |
chromosome to view |
|
|
192 |
|
|
|
193 |
span_start( int ): |
|
|
194 |
first base to show |
|
|
195 |
|
|
|
196 |
span_end( int ): |
|
|
197 |
last base to show |
|
|
198 |
|
|
|
199 |
height (int) : |
|
|
200 |
height of the tensor (reads) |
|
|
201 |
|
|
|
202 |
index_start(int): start writing tensor at this row |
|
|
203 |
|
|
|
204 |
base_content_table(np.array) : 2d array to write base contents to |
|
|
205 |
|
|
|
206 |
base_indel_table(np.array) : 2d array to write indel information to |
|
|
207 |
|
|
|
208 |
base_content_table(np.array) : 2d array to write base contents to |
|
|
209 |
|
|
|
210 |
base_content_table(np.array) : 2d array to write base contents to |
|
|
211 |
|
|
|
212 |
mask_reference_bases(set): mask reference bases in this set with a N set( (chrom,pos), ... ) |
|
|
213 |
|
|
|
214 |
reference(pysam.FastaFile) : Handle to reference file to use instead of MD tag. If None: MD tag is used. |
|
|
215 |
|
|
|
216 |
skip_missing_reads (bool) : when enabled only existing (non None) reads are added to the tensor. Use this option when mapping single-end |
|
|
217 |
|
|
|
218 |
Returns: |
|
|
219 |
None |
|
|
220 |
|
|
|
221 |
""" |
|
|
222 |
self.base_mapper = {} |
|
|
223 |
for i, (strand, base) in enumerate( |
|
|
224 |
itertools.product((True, False), 'NACTG-')): |
|
|
225 |
self.base_mapper[(strand, base)] = i + 2 |
|
|
226 |
|
|
|
227 |
if chromosome is None and span_start is None and span_end is None: |
|
|
228 |
chromosome, span_start, span_end = self.get_span() |
|
|
229 |
|
|
|
230 |
span_len = span_end - span_start |
|
|
231 |
|
|
|
232 |
reads = self |
|
|
233 |
last_ref = None |
|
|
234 |
used_reads = 0 |
|
|
235 |
for ri, read in enumerate(reads): |
|
|
236 |
|
|
|
237 |
if read is None: |
|
|
238 |
continue |
|
|
239 |
|
|
|
240 |
alignment_operations = list(itertools.chain( |
|
|
241 |
*([operation] * l for operation, l in read.cigartuples if operation != 2))) |
|
|
242 |
print(alignment_operations) |
|
|
243 |
# Obtain the amount of operations before the alignment start |
|
|
244 |
operations_before_alignment_start = 0 |
|
|
245 |
for query_pos, ref_pos in read.get_aligned_pairs(): |
|
|
246 |
if ref_pos is None: |
|
|
247 |
operations_before_alignment_start += 1 |
|
|
248 |
else: |
|
|
249 |
break |
|
|
250 |
initial_base_offset = read.reference_start - operations_before_alignment_start |
|
|
251 |
|
|
|
252 |
# Obtain row index in the tensor |
|
|
253 |
if skip_missing_reads: |
|
|
254 |
row_index = (used_reads + index_start) % height |
|
|
255 |
else: |
|
|
256 |
row_index = (ri + index_start) % height |
|
|
257 |
|
|
|
258 |
# Where in the reference are we? |
|
|
259 |
ref_pointer = read.reference_start |
|
|
260 |
|
|
|
261 |
alignment_started = False |
|
|
262 |
for operation, (cycle, query_pos, ref_pos, ref_base) in zip( |
|
|
263 |
alignment_operations, |
|
|
264 |
pysamiterators.iterators.ReadCycleIterator( |
|
|
265 |
read, |
|
|
266 |
with_seq=True, |
|
|
267 |
reference=reference)): |
|
|
268 |
|
|
|
269 |
# Mask locations from mask_reference_bases |
|
|
270 |
if mask_reference_bases is not None and ref_pos is not None and ( |
|
|
271 |
chromosome, ref_pos) in mask_reference_bases: |
|
|
272 |
ref_base = 'N' |
|
|
273 |
|
|
|
274 |
if ref_pos is None: |
|
|
275 |
if not alignment_started: |
|
|
276 |
ref_pointer = initial_base_offset + i |
|
|
277 |
else: |
|
|
278 |
ref_pointer += 1 |
|
|
279 |
if operation == 4: |
|
|
280 |
try: |
|
|
281 |
query_base = read.seq[query_pos] |
|
|
282 |
base_clip_table[row_index, ref_pointer - |
|
|
283 |
span_start] = self.base_mapper[(read.is_reverse, query_base)] |
|
|
284 |
except IndexError: |
|
|
285 |
pass |
|
|
286 |
continue |
|
|
287 |
ref_pointer = ref_pos |
|
|
288 |
|
|
|
289 |
if (ref_pos - span_start) < 0 or (ref_pos - |
|
|
290 |
span_start) > (span_len - 1): |
|
|
291 |
continue |
|
|
292 |
|
|
|
293 |
ref_base = ref_base.upper() |
|
|
294 |
if query_pos is None: |
|
|
295 |
query_base = '-' |
|
|
296 |
qual = 0 |
|
|
297 |
base_indel_table[row_index, ref_pos - span_start] = 1 |
|
|
298 |
else: |
|
|
299 |
query_base = read.seq[query_pos] |
|
|
300 |
qual = ord(read.qual[query_pos]) |
|
|
301 |
|
|
|
302 |
if ref_pos is not None: |
|
|
303 |
base_qual_table[row_index, ref_pos - span_start] = qual |
|
|
304 |
if query_base != ref_base: |
|
|
305 |
base_content_table[row_index, ref_pos - |
|
|
306 |
span_start] = self.base_mapper[(read.is_reverse, query_base)] |
|
|
307 |
else: |
|
|
308 |
base_mismatches_table[row_index, |
|
|
309 |
ref_pos - span_start] = 0.2 |
|
|
310 |
base_content_table[row_index, ref_pos - |
|
|
311 |
span_start] = self.base_mapper[(read.is_reverse, query_base)] |
|
|
312 |
|
|
|
313 |
used_reads += 1 |
|
|
314 |
if skip_missing_reads: |
|
|
315 |
return used_reads + index_start + 1 |
|
|
316 |
else: |
|
|
317 |
return ri + index_start + 1 |
|
|
318 |
|
|
|
319 |
def get_read_group(self, with_attr_dict=False) -> str: |
|
|
320 |
|
|
|
321 |
r = None |
|
|
322 |
for read in self.reads: |
|
|
323 |
if read is None: |
|
|
324 |
continue |
|
|
325 |
|
|
|
326 |
r = get_read_group_from_read(read, |
|
|
327 |
format=self.read_group_format, |
|
|
328 |
with_attr_dict=with_attr_dict) |
|
|
329 |
|
|
|
330 |
break |
|
|
331 |
|
|
|
332 |
return r |
|
|
333 |
|
|
|
334 |
|
|
|
335 |
def set_duplicate(self, is_duplicate): |
|
|
336 |
"""Define this fragment as duplicate, sets the corresponding bam bit flag |
|
|
337 |
Args: |
|
|
338 |
value(bool) : is_duplicate |
|
|
339 |
""" |
|
|
340 |
for read in self.reads: |
|
|
341 |
if read is not None: |
|
|
342 |
read.is_duplicate = is_duplicate |
|
|
343 |
|
|
|
344 |
def get_fragment_size(self): |
|
|
345 |
return abs(self.span[2] - self.span[1]) |
|
|
346 |
|
|
|
347 |
def write_tags(self): |
|
|
348 |
self.set_meta('MQ', self.mapping_quality) |
|
|
349 |
self.set_meta('MM', self.is_multimapped) |
|
|
350 |
self.set_meta('RG', self.get_read_group()) |
|
|
351 |
if self.has_valid_span(): |
|
|
352 |
# Write fragment size: |
|
|
353 |
if self.safe_span: |
|
|
354 |
self.set_meta('fS', self.get_fragment_size()) |
|
|
355 |
self.set_meta('fe', self.span[1]) |
|
|
356 |
self.set_meta('fs', self.span[2]) |
|
|
357 |
else: |
|
|
358 |
self.remove_meta('fS') |
|
|
359 |
|
|
|
360 |
else: |
|
|
361 |
self.set_rejection_reason('FS', set_qcfail=True) |
|
|
362 |
|
|
|
363 |
# Set qcfail bit when the fragment is not valid |
|
|
364 |
|
|
|
365 |
if not self.is_valid(): |
|
|
366 |
for read in self: |
|
|
367 |
if read is not None: |
|
|
368 |
read.is_qcfail = True |
|
|
369 |
|
|
|
370 |
def write_pysam(self, pysam_handle): |
|
|
371 |
"""Write all associated reads to the target file |
|
|
372 |
|
|
|
373 |
Args: |
|
|
374 |
target_file (pysam.AlignmentFile) : Target file |
|
|
375 |
""" |
|
|
376 |
self.write_tags() |
|
|
377 |
for read in self: |
|
|
378 |
if read is not None: |
|
|
379 |
pysam_handle.write(read) |
|
|
380 |
|
|
|
381 |
def identify_strand(self): |
|
|
382 |
# If R2 is rev complement: |
|
|
383 |
if self.get_R1() is not None: |
|
|
384 |
# verify if the read has been mapped: |
|
|
385 |
if self.get_R1().is_unmapped: |
|
|
386 |
return None |
|
|
387 |
return self.get_R1().is_reverse |
|
|
388 |
elif self.get_R2() is not None: |
|
|
389 |
# verify if the read has been mapped: |
|
|
390 |
if self.get_R2().is_unmapped: |
|
|
391 |
return None |
|
|
392 |
return not self.get_R2().is_reverse |
|
|
393 |
return None |
|
|
394 |
|
|
|
395 |
def get_random_primer_hash(self): |
|
|
396 |
"""Obtain hash describing the random primer |
|
|
397 |
this assumes the random primer is on the end of R2 and has a length of self.R2_primer_length |
|
|
398 |
When the rS tag is set, the value of this tag is used as random primer sequence |
|
|
399 |
Returns None,None when the random primer cannot be described |
|
|
400 |
|
|
|
401 |
Returns: |
|
|
402 |
reference_name (str) or None |
|
|
403 |
reference_start (int) : Int or None |
|
|
404 |
sequence (str) : Int or None |
|
|
405 |
""" |
|
|
406 |
R2 = self.get_R2() |
|
|
407 |
|
|
|
408 |
if R2 is None or R2.query_sequence is None: |
|
|
409 |
return None, None, None |
|
|
410 |
|
|
|
411 |
if self.R2_primer_length == 0 and self.random_primer_sequence is None: |
|
|
412 |
return None, None, None |
|
|
413 |
|
|
|
414 |
# The read was not mapped |
|
|
415 |
if R2.is_unmapped: |
|
|
416 |
# Guess the orientation does not matter |
|
|
417 |
if self.random_primer_sequence is not None: |
|
|
418 |
return None, None, self.random_primer_sequence |
|
|
419 |
return None, None, R2.query_sequence[:self.R2_primer_length] |
|
|
420 |
|
|
|
421 |
if R2.is_reverse: |
|
|
422 |
global complement |
|
|
423 |
if self.random_primer_sequence is not None: |
|
|
424 |
return R2.reference_name, R2.reference_end, self.random_primer_sequence |
|
|
425 |
|
|
|
426 |
return(R2.reference_name, R2.reference_end, R2.query_sequence[-self.R2_primer_length:][::-1].translate(complement)) |
|
|
427 |
else: |
|
|
428 |
if self.random_primer_sequence is not None: |
|
|
429 |
return R2.reference_name, R2.reference_start, self.random_primer_sequence |
|
|
430 |
return(R2.reference_name, R2.reference_start, R2.query_sequence[:self.R2_primer_length]) |
|
|
431 |
raise ValueError() |
|
|
432 |
|
|
|
433 |
def remove_meta(self,key): |
|
|
434 |
for read in self: |
|
|
435 |
if read is not None: |
|
|
436 |
if read.has_tag(key): |
|
|
437 |
read.set_tag(key,None) |
|
|
438 |
|
|
|
439 |
|
|
|
440 |
def set_meta(self, key, value, as_set=False): |
|
|
441 |
self.meta[key] = value |
|
|
442 |
for read in self: |
|
|
443 |
if read is not None: |
|
|
444 |
if as_set and read.has_tag(key): |
|
|
445 |
|
|
|
446 |
data = set(read.get_tag(key).split(',')) |
|
|
447 |
data.add(value) |
|
|
448 |
read.set_tag(key, ','.join(sorted(list(data)))) |
|
|
449 |
|
|
|
450 |
else: |
|
|
451 |
read.set_tag(key, value) |
|
|
452 |
|
|
|
453 |
def get_R1(self): |
|
|
454 |
""" |
|
|
455 |
Obtain the AlignedSegment of read 1 of the fragment |
|
|
456 |
|
|
|
457 |
Returns: |
|
|
458 |
R1 (pysam.AlignedSegment) : Read 1 of the fragment, returns None |
|
|
459 |
when R1 is not mapped |
|
|
460 |
""" |
|
|
461 |
if len(self.reads) == 0: |
|
|
462 |
raise IndexError('The fragment has no associated reads') |
|
|
463 |
return self.reads[0] |
|
|
464 |
|
|
|
465 |
def get_R2(self): |
|
|
466 |
""" |
|
|
467 |
Obtain the AlignedSegment of read 2 of the fragment |
|
|
468 |
|
|
|
469 |
Returns: |
|
|
470 |
R1 (pysam.AlignedSegment) : Read 2 of the fragment, returns None |
|
|
471 |
when R2 is not mapped |
|
|
472 |
""" |
|
|
473 |
if len(self.reads) < 2: |
|
|
474 |
raise IndexError('The fragment has no associated R2') |
|
|
475 |
return self.reads[1] |
|
|
476 |
|
|
|
477 |
def has_R1(self): |
|
|
478 |
""" |
|
|
479 |
Check if the fragment has an associated read 1 |
|
|
480 |
|
|
|
481 |
Returns: |
|
|
482 |
has_r1 (bool) |
|
|
483 |
""" |
|
|
484 |
if len(self.reads) == 0: |
|
|
485 |
return False |
|
|
486 |
return self.reads[0] is not None |
|
|
487 |
|
|
|
488 |
def has_R2(self): |
|
|
489 |
""" |
|
|
490 |
Check if the fragment has an associated read 2 |
|
|
491 |
|
|
|
492 |
Returns: |
|
|
493 |
has_r2 (bool) |
|
|
494 |
""" |
|
|
495 |
if len(self.reads) < 2: |
|
|
496 |
return False |
|
|
497 |
return self.reads[1] is not None |
|
|
498 |
|
|
|
499 |
@property |
|
|
500 |
def R1(self): |
|
|
501 |
return self.reads[0] |
|
|
502 |
|
|
|
503 |
@property |
|
|
504 |
def R2(self): |
|
|
505 |
return self.reads[1] |
|
|
506 |
|
|
|
507 |
def get_consensus(self, only_include_refbase: str = None, dove_safe: bool = False, **get_consensus_dictionaries_kwargs) -> dict: |
|
|
508 |
""" |
|
|
509 |
a dictionary of (reference_pos) : (qbase, quality, reference_base) tuples |
|
|
510 |
|
|
|
511 |
Args: |
|
|
512 |
only_include_refbase(str) : Only report bases aligned to this reference base, uppercase only |
|
|
513 |
dove_safe(bool) : Only report bases supported within R1 and R2 start and end coordinates |
|
|
514 |
|
|
|
515 |
Returns: |
|
|
516 |
consensus(dict) : {reference_position: (qbase, quality) |
|
|
517 |
""" |
|
|
518 |
r1_consensus, r2_consensus = get_consensus_dictionaries( |
|
|
519 |
self.R1, |
|
|
520 |
self.R2, |
|
|
521 |
only_include_refbase=only_include_refbase, |
|
|
522 |
dove_safe=dove_safe, |
|
|
523 |
**get_consensus_dictionaries_kwargs) |
|
|
524 |
|
|
|
525 |
return { |
|
|
526 |
ref_pos:pick_best_base_call( r1_consensus.get(ref_pos) , r2_consensus.get(ref_pos) ) |
|
|
527 |
for ref_pos in set(r1_consensus.keys()).union(set(r2_consensus.keys())) |
|
|
528 |
} |
|
|
529 |
|
|
|
530 |
|
|
|
531 |
@property |
|
|
532 |
def estimated_length(self) -> int: |
|
|
533 |
""" |
|
|
534 |
Obtain the estimated size of the fragment, |
|
|
535 |
returns None when estimation is not possible |
|
|
536 |
Takes into account removed bases (R2_primer_length attribute) |
|
|
537 |
Assumes inwards sequencing orientation, except when self.single_end is set |
|
|
538 |
""" |
|
|
539 |
if self.single_end: |
|
|
540 |
if self[0] is None: |
|
|
541 |
return None |
|
|
542 |
return self[0].reference_end - self[0].reference_start |
|
|
543 |
|
|
|
544 |
|
|
|
545 |
if self.has_R1() and self.has_R2(): |
|
|
546 |
|
|
|
547 |
contig = self.R1.reference_name |
|
|
548 |
if self.R1.is_reverse and not self.R2.is_reverse: |
|
|
549 |
## rp |----- R2 ----> <--- R1 ----| |
|
|
550 |
## |>----------DISTANCE------------<| |
|
|
551 |
start, end = self.R2.reference_start - self.R2_primer_length, self.R1.reference_end |
|
|
552 |
if start<end: |
|
|
553 |
return end - start |
|
|
554 |
else: |
|
|
555 |
return None |
|
|
556 |
elif not self.R1.is_reverse and self.R2.is_reverse: |
|
|
557 |
|
|
|
558 |
## |----- R1 ----> <--- R2 ----| rp |
|
|
559 |
## |>----------DISTANCE------------<| |
|
|
560 |
|
|
|
561 |
start, end = self.R1.reference_start, self.R2.reference_end + self.R2_primer_length |
|
|
562 |
if start<end: |
|
|
563 |
return end - start |
|
|
564 |
else: |
|
|
565 |
return None |
|
|
566 |
else: |
|
|
567 |
return None |
|
|
568 |
return None |
|
|
569 |
|
|
|
570 |
|
|
|
571 |
def update_span(self): |
|
|
572 |
""" |
|
|
573 |
Update the span (the location the fragment maps to) stored in Fragment |
|
|
574 |
|
|
|
575 |
The span is a single stretch of coordinates on a single contig. |
|
|
576 |
The contig is determined by the reference_name assocated to the first |
|
|
577 |
mapped read in self.reads |
|
|
578 |
|
|
|
579 |
This calculation assumes the reads are sequenced inwards and |
|
|
580 |
dove-tails of the molecule cannot be trusted |
|
|
581 |
""" |
|
|
582 |
|
|
|
583 |
contig = None |
|
|
584 |
start = None |
|
|
585 |
end = None |
|
|
586 |
|
|
|
587 |
|
|
|
588 |
if self.has_R1() and self.has_R2() and \ |
|
|
589 |
self.R1.reference_start is not None and self.R1.reference_end is not None and \ |
|
|
590 |
self.R2.reference_start is not None and self.R2.reference_end is not None : |
|
|
591 |
|
|
|
592 |
contig = self.R1.reference_name |
|
|
593 |
if self.R1.is_reverse and not self.R2.is_reverse: |
|
|
594 |
start, end = self.R2.reference_start, self.R1.reference_end |
|
|
595 |
self.safe_span = True |
|
|
596 |
elif not self.R1.is_reverse and self.R2.is_reverse: |
|
|
597 |
start, end = self.R1.reference_start, self.R2.reference_end |
|
|
598 |
self.safe_span = True |
|
|
599 |
else: |
|
|
600 |
start = min(self.R1.reference_start, self.R2.reference_start) |
|
|
601 |
end = max(self.R1.reference_start, self.R2.reference_start) |
|
|
602 |
self.safe_span = False |
|
|
603 |
|
|
|
604 |
elif self.has_R1() and self.R1.reference_start is not None and self.R1.reference_end is not None : |
|
|
605 |
contig = self.R1.reference_name |
|
|
606 |
start, end = self.R1.reference_start, self.R1.reference_end |
|
|
607 |
self.safe_span = False |
|
|
608 |
|
|
|
609 |
elif self.has_R2() and self.R2.reference_start is not None and self.R2.reference_end is not None : |
|
|
610 |
contig = self.R2.reference_name |
|
|
611 |
start, end = self.R2.reference_start, self.R2.reference_end |
|
|
612 |
self.safe_span = False |
|
|
613 |
else: |
|
|
614 |
|
|
|
615 |
# Its Sometimes possible that no cigar is set for the alignment, only a start coordinate |
|
|
616 |
|
|
|
617 |
for read in self: |
|
|
618 |
if read is None: |
|
|
619 |
continue |
|
|
620 |
if len(read.cigar)!=0: |
|
|
621 |
raise NotImplementedError("Non implemented span") |
|
|
622 |
if read.reference_start is not None: |
|
|
623 |
start,end = read.reference_start, read.reference_start |
|
|
624 |
contig = read.reference_name |
|
|
625 |
else: |
|
|
626 |
raise NotImplementedError("Non implemented span, undefined alignment, and not start coordinate") |
|
|
627 |
|
|
|
628 |
|
|
|
629 |
|
|
|
630 |
|
|
|
631 |
self.span = (contig, start, end) |
|
|
632 |
|
|
|
633 |
@property |
|
|
634 |
def aligned_length(self): |
|
|
635 |
return self.span[2]-self.span[1] |
|
|
636 |
|
|
|
637 |
def get_span(self) -> tuple: |
|
|
638 |
return self.span |
|
|
639 |
|
|
|
640 |
def has_valid_span(self) -> bool: |
|
|
641 |
|
|
|
642 |
# Check if the span could be calculated: |
|
|
643 |
defined_span = not any([x is None for x in self.get_span()]) |
|
|
644 |
if not defined_span: |
|
|
645 |
return False |
|
|
646 |
|
|
|
647 |
if self.max_fragment_size is not None and self.get_fragment_size()>self.max_fragment_size: |
|
|
648 |
return False |
|
|
649 |
|
|
|
650 |
return True |
|
|
651 |
|
|
|
652 |
def set_sample(self, sample: str = None, library_name: str = None): |
|
|
653 |
"""Force sample name or obtain sample name from associated reads""" |
|
|
654 |
if sample is not None: |
|
|
655 |
self.sample = sample |
|
|
656 |
else: |
|
|
657 |
for read in self.reads: |
|
|
658 |
if read is not None and read.has_tag('SM'): |
|
|
659 |
self.sample = read.get_tag('SM') |
|
|
660 |
break |
|
|
661 |
|
|
|
662 |
if library_name is not None: |
|
|
663 |
sample = self.sample.split('_', 1)[-1] |
|
|
664 |
self.sample = library_name+'_'+sample |
|
|
665 |
for read in self.reads: |
|
|
666 |
if read is not None: |
|
|
667 |
read.set_tag('SM', self.sample) |
|
|
668 |
read.set_tag('LY', library_name) |
|
|
669 |
|
|
|
670 |
def get_sample(self) -> str: |
|
|
671 |
""" Obtain the sample name associated with the fragment |
|
|
672 |
The sample name is extracted from the SM tag of any of the associated reads. |
|
|
673 |
|
|
|
674 |
Returns: |
|
|
675 |
sample name (str) |
|
|
676 |
""" |
|
|
677 |
return self.sample |
|
|
678 |
|
|
|
679 |
def __len__(self): |
|
|
680 |
"""Obtain the amount of associated reads to the fragment |
|
|
681 |
|
|
|
682 |
Returns: |
|
|
683 |
assocoiated reads (int) |
|
|
684 |
""" |
|
|
685 |
return(sum(read is not None for read in self.reads)) |
|
|
686 |
|
|
|
687 |
def set_strand(self, strand: bool): |
|
|
688 |
"""Set mapping strand |
|
|
689 |
|
|
|
690 |
Args: |
|
|
691 |
strand (bool) : False for Forward, True for reverse |
|
|
692 |
""" |
|
|
693 |
self.strand = strand |
|
|
694 |
|
|
|
695 |
def get_strand(self): |
|
|
696 |
"""Obtain strand |
|
|
697 |
|
|
|
698 |
Returns: |
|
|
699 |
strand (bool) : False for Forward, True for reverse |
|
|
700 |
""" |
|
|
701 |
return self.strand |
|
|
702 |
|
|
|
703 |
def update_umi(self): |
|
|
704 |
""" |
|
|
705 |
Extract umi from 'RX' tag and store the UMI in the Fragment object |
|
|
706 |
""" |
|
|
707 |
for read in self.reads: |
|
|
708 |
if read is not None and read.has_tag('RX'): |
|
|
709 |
self.umi = read.get_tag('RX') |
|
|
710 |
|
|
|
711 |
def get_umi(self): |
|
|
712 |
""" |
|
|
713 |
Get the UMI sequence associated to this fragment |
|
|
714 |
|
|
|
715 |
Returns: |
|
|
716 |
umi(str) |
|
|
717 |
""" |
|
|
718 |
return self.umi |
|
|
719 |
|
|
|
720 |
def __iter__(self): |
|
|
721 |
return iter(self.reads) |
|
|
722 |
|
|
|
723 |
def umi_eq(self, other): |
|
|
724 |
""" |
|
|
725 |
Hamming distance measurement to another Fragment or Molecule, |
|
|
726 |
|
|
|
727 |
Returns : |
|
|
728 |
is_close (bool) : returns True when the hamming distance between |
|
|
729 |
the two objects <= umi_hamming_distance |
|
|
730 |
|
|
|
731 |
Example: |
|
|
732 |
>>> from singlecellmultiomics.molecule import Molecule |
|
|
733 |
>>> from singlecellmultiomics.fragment import Fragment |
|
|
734 |
>>> import pysam |
|
|
735 |
>>> # Create reads (read_A, and read_B), they both belong to the same |
|
|
736 |
>>> # cell and have 1 hamming distance between their UMI's |
|
|
737 |
>>> read_A = pysam.AlignedSegment() |
|
|
738 |
>>> read_A.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag |
|
|
739 |
>>> read_A.set_tag('RX','CAT') # The UMI is extracted from the RX tag |
|
|
740 |
>>> read_B = pysam.AlignedSegment() |
|
|
741 |
>>> read_B.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag |
|
|
742 |
>>> read_B.set_tag('RX','CAG') # The UMI is extracted from the RX tag |
|
|
743 |
|
|
|
744 |
>>> # Create fragment objects for read_A and B: |
|
|
745 |
>>> frag_A = Fragment([read_A],umi_hamming_distance=0) |
|
|
746 |
>>> frag_B = Fragment([read_B],umi_hamming_distance=0) |
|
|
747 |
>>> frag_A.umi_eq(frag_B) # This returns False, the distance is 1, which is higher than 0 (umi_hamming_distance) |
|
|
748 |
False |
|
|
749 |
|
|
|
750 |
>>> frag_A = Fragment([read_A],umi_hamming_distance=1) |
|
|
751 |
>>> frag_B = Fragment([read_B],umi_hamming_distance=1) |
|
|
752 |
>>> frag_A.umi_eq(frag_B) # This returns True, the distance is 1, which is the (umi_hamming_distance) |
|
|
753 |
True |
|
|
754 |
""" |
|
|
755 |
|
|
|
756 |
if self.umi == other.umi: |
|
|
757 |
return True |
|
|
758 |
if self.umi_hamming_distance == 0: |
|
|
759 |
return False |
|
|
760 |
else: |
|
|
761 |
if len(self.umi)!=len(other.umi): |
|
|
762 |
return False |
|
|
763 |
return hamming_distance( |
|
|
764 |
self.umi, other.umi) <= self.umi_hamming_distance |
|
|
765 |
|
|
|
766 |
def __eq__(self, other): # other can also be a Molecule! |
|
|
767 |
# Make sure fragments map to the same strand, cheap comparisons |
|
|
768 |
""" |
|
|
769 |
Check equivalence between two Fragments or Fragment and Molecule. |
|
|
770 |
|
|
|
771 |
|
|
|
772 |
Args: |
|
|
773 |
other (Fragment or Molecule) : object to compare against |
|
|
774 |
|
|
|
775 |
Returns |
|
|
776 |
is_eq (bool) : True when the other object is (likely) derived from the same molecule |
|
|
777 |
|
|
|
778 |
Example: |
|
|
779 |
>>> from singlecellmultiomics.molecule import Molecule |
|
|
780 |
>>> from singlecellmultiomics.fragment import Fragment |
|
|
781 |
>>> import pysam |
|
|
782 |
>>> # Create sam file to write some reads to: |
|
|
783 |
>>> test_sam = pysam.AlignmentFile('test.sam','w',reference_names=['chr1','chr2'],reference_lengths=[1000,1000]) |
|
|
784 |
>>> read_A = pysam.AlignedSegment(test_sam.header) |
|
|
785 |
>>> read_A.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag |
|
|
786 |
>>> read_A.set_tag('RX','CAT') # The UMI is extracted from the RX tag |
|
|
787 |
>>> # By default the molecule assignment is done based on the mapping location of read 1: |
|
|
788 |
>>> read_A.reference_name = 'chr1' |
|
|
789 |
>>> read_A.reference_start = 100 |
|
|
790 |
>>> read_A.query_sequence = 'ATCGGG' |
|
|
791 |
>>> read_A.cigarstring = '6M' |
|
|
792 |
|
|
|
793 |
>>> read_B = pysam.AlignedSegment(test_sam.header) |
|
|
794 |
>>> read_B.set_tag('SM','CELL_1') |
|
|
795 |
>>> read_B.set_tag('RX','CAT') |
|
|
796 |
>>> read_B.reference_start = 100 |
|
|
797 |
>>> read_B.query_sequence = 'ATCGG' |
|
|
798 |
>>> read_B.cigarstring = '5M' |
|
|
799 |
|
|
|
800 |
>>> frag_A = Fragment([read_A],umi_hamming_distance=0) |
|
|
801 |
>>> frag_A |
|
|
802 |
Fragment: |
|
|
803 |
sample:CELL_1 |
|
|
804 |
umi:CAT |
|
|
805 |
span:chr1 100-106 |
|
|
806 |
strand:+ |
|
|
807 |
has R1: yes |
|
|
808 |
has R2: no |
|
|
809 |
randomer trimmed: no |
|
|
810 |
>>> frag_B = Fragment([read_B],umi_hamming_distance=0) |
|
|
811 |
>>> frag_A == frag_B |
|
|
812 |
True |
|
|
813 |
# Fragment A and fragment B belong to the same molecule, |
|
|
814 |
# the UMI is identical, the starting position of R1 is identical and |
|
|
815 |
# the sample name matches |
|
|
816 |
|
|
|
817 |
When we move one of the reads, the Fragments are not equivalent any more |
|
|
818 |
|
|
|
819 |
Example: |
|
|
820 |
>>> read_B.reference_start = 150 |
|
|
821 |
>>> frag_B = Fragment([read_B],umi_hamming_distance=0) |
|
|
822 |
>>> frag_A == frag_B |
|
|
823 |
False |
|
|
824 |
|
|
|
825 |
Except if the difference <= the assignment_radius |
|
|
826 |
|
|
|
827 |
Example: |
|
|
828 |
>>> read_B.reference_start = 150 |
|
|
829 |
>>> read_A.reference_start = 100 |
|
|
830 |
>>> frag_B = Fragment([read_B],assignment_radius=300) |
|
|
831 |
>>> frag_A = Fragment([read_A],assignment_radius=300) |
|
|
832 |
>>> frag_A == frag_B |
|
|
833 |
True |
|
|
834 |
|
|
|
835 |
When the UMI's are too far apart, the eq function returns `False` |
|
|
836 |
|
|
|
837 |
Example: |
|
|
838 |
>>> read_B.reference_start = 100 |
|
|
839 |
>>> read_A.reference_start = 100 |
|
|
840 |
>>> read_A.set_tag('RX','GGG') |
|
|
841 |
>>> frag_B = Fragment([read_B]) |
|
|
842 |
>>> frag_A = Fragment([read_A]) |
|
|
843 |
>>> frag_A == frag_B |
|
|
844 |
False |
|
|
845 |
|
|
|
846 |
When the sample of the Fragments are not identical, the eq function returns `False` |
|
|
847 |
|
|
|
848 |
Example: |
|
|
849 |
>>> read_B.reference_start = 100 |
|
|
850 |
>>> read_A.reference_start = 100 |
|
|
851 |
>>> read_A.set_tag('RX','AAA') |
|
|
852 |
>>> read_B.set_tag('RX','AAA') |
|
|
853 |
>>> read_B.set_tag('SM', 'CELL_2' ) |
|
|
854 |
>>> frag_B = Fragment([read_B]) |
|
|
855 |
>>> frag_A = Fragment([read_A]) |
|
|
856 |
>>> frag_A == frag_B |
|
|
857 |
False |
|
|
858 |
|
|
|
859 |
""" |
|
|
860 |
if self.sample != other.sample: |
|
|
861 |
return False |
|
|
862 |
|
|
|
863 |
if self.strand != other.strand: |
|
|
864 |
return False |
|
|
865 |
|
|
|
866 |
if not self.has_valid_span() or not other.has_valid_span(): |
|
|
867 |
return False |
|
|
868 |
|
|
|
869 |
if min(abs(self.span[1] - |
|
|
870 |
other.span[1]), abs(self.span[2] - |
|
|
871 |
other.span[2])) > self.assignment_radius: |
|
|
872 |
return False |
|
|
873 |
|
|
|
874 |
# Make sure UMI's are similar enough, more expensive hamming distance |
|
|
875 |
# calculation |
|
|
876 |
return self.umi_eq(other) |
|
|
877 |
|
|
|
878 |
def __getitem__(self, index): |
|
|
879 |
""" |
|
|
880 |
Get a read from the fragment |
|
|
881 |
|
|
|
882 |
Args: |
|
|
883 |
index( int ): |
|
|
884 |
0 : Read 1 |
|
|
885 |
1: Read 2 |
|
|
886 |
.. |
|
|
887 |
""" |
|
|
888 |
return self.reads[index] |
|
|
889 |
|
|
|
890 |
def is_valid(self): |
|
|
891 |
if self.qcfail: |
|
|
892 |
return False |
|
|
893 |
return self.has_valid_span() |
|
|
894 |
|
|
|
895 |
def get_strand_repr(self): |
|
|
896 |
s = self.get_strand() |
|
|
897 |
if s is None: |
|
|
898 |
return '?' |
|
|
899 |
if s: |
|
|
900 |
return '-' |
|
|
901 |
else: |
|
|
902 |
return '+' |
|
|
903 |
|
|
|
904 |
def __repr__(self): |
|
|
905 |
rep = f"""Fragment: |
|
|
906 |
sample:{self.get_sample()} |
|
|
907 |
umi:{self.get_umi()} |
|
|
908 |
span:{('%s %s-%s' % self.get_span() if self.has_valid_span() else 'no span')} |
|
|
909 |
strand:{self.get_strand_repr()} |
|
|
910 |
has R1: {"yes" if self.has_R1() else "no"} |
|
|
911 |
has R2: {"yes" if self.has_R2() else "no"} |
|
|
912 |
randomer trimmed: {"yes" if self.unsafe_trimmed else "no"} |
|
|
913 |
""" |
|
|
914 |
|
|
|
915 |
try: |
|
|
916 |
rep += '\n\t'.join([f'{key}:{str(value)}' for key, value in self.meta.items()]) |
|
|
917 |
except Exception as e: |
|
|
918 |
print(self.meta) |
|
|
919 |
raise |
|
|
920 |
return rep |
|
|
921 |
|
|
|
922 |
def get_html( |
|
|
923 |
self, |
|
|
924 |
chromosome=None, |
|
|
925 |
span_start=None, |
|
|
926 |
span_end=None, |
|
|
927 |
show_read1=None, |
|
|
928 |
show_read2=None): |
|
|
929 |
""" |
|
|
930 |
Get HTML representation of the fragment |
|
|
931 |
|
|
|
932 |
Args: |
|
|
933 |
chromosome( str ): |
|
|
934 |
chromosome to view |
|
|
935 |
|
|
|
936 |
span_start( int ): |
|
|
937 |
first base to show |
|
|
938 |
|
|
|
939 |
span_end( int ): |
|
|
940 |
last base to show |
|
|
941 |
|
|
|
942 |
show_read1(bool): |
|
|
943 |
show read1 |
|
|
944 |
|
|
|
945 |
show_read2(bool): |
|
|
946 |
show read2 |
|
|
947 |
|
|
|
948 |
Returns: |
|
|
949 |
html(string) : html representation of the fragment |
|
|
950 |
|
|
|
951 |
""" |
|
|
952 |
|
|
|
953 |
if chromosome is None and span_start is None and span_end is None: |
|
|
954 |
chromosome, span_start, span_end = self.get_span() |
|
|
955 |
|
|
|
956 |
span_len = abs(span_end - span_start) |
|
|
957 |
visualized_frag = ['.'] * span_len |
|
|
958 |
|
|
|
959 |
if show_read1 is None and show_read2 is None: |
|
|
960 |
reads = self |
|
|
961 |
else: |
|
|
962 |
reads = [] |
|
|
963 |
if show_read1: |
|
|
964 |
reads.append(self.get_R1()) |
|
|
965 |
if show_read2: |
|
|
966 |
reads.append(self.get_R2()) |
|
|
967 |
|
|
|
968 |
for read in reads: |
|
|
969 |
if read is None: |
|
|
970 |
continue |
|
|
971 |
# for cycle, query_pos, ref_pos, ref_base in |
|
|
972 |
# pysamiterators.iterators.ReadCycleIterator(read,with_seq=True): |
|
|
973 |
for query_pos, ref_pos, ref_base in read.get_aligned_pairs( |
|
|
974 |
with_seq=True, matches_only=True): |
|
|
975 |
if ref_pos is None or ref_pos < span_start or ref_pos > span_end: |
|
|
976 |
continue |
|
|
977 |
|
|
|
978 |
ref_base = ref_base.upper() |
|
|
979 |
if query_pos is None: |
|
|
980 |
query_base = '-' |
|
|
981 |
else: |
|
|
982 |
query_base = read.seq[query_pos] |
|
|
983 |
if ref_pos is not None: |
|
|
984 |
if query_base != ref_base: |
|
|
985 |
v = style_str(query_base, color='red', weight=500) |
|
|
986 |
else: |
|
|
987 |
v = query_base |
|
|
988 |
try: |
|
|
989 |
visualized_frag[ref_pos - span_start] = v |
|
|
990 |
except IndexError: |
|
|
991 |
pass # the alignment is outside the requested view |
|
|
992 |
return ''.join(visualized_frag) |
|
|
993 |
|
|
|
994 |
def set_rejection_reason(self, reason, set_qcfail=False): |
|
|
995 |
self.set_meta('RR', reason, as_set=True) |
|
|
996 |
if set_qcfail: |
|
|
997 |
for read in self: |
|
|
998 |
if read is not None: |
|
|
999 |
read.is_qcfail = True |
|
|
1000 |
|
|
|
1001 |
def set_recognized_sequence(self, seq): |
|
|
1002 |
self.set_meta('RZ', seq) |
|
|
1003 |
|
|
|
1004 |
|
|
|
1005 |
|
|
|
1006 |
class SingleEndTranscriptFragment(Fragment, FeatureAnnotatedObject): |
|
|
1007 |
def __init__(self, reads, features, assignment_radius=0, stranded=None, capture_locations=False, auto_set_intron_exon_features=True,**kwargs): |
|
|
1008 |
|
|
|
1009 |
Fragment.__init__(self, reads, assignment_radius=assignment_radius, **kwargs) |
|
|
1010 |
FeatureAnnotatedObject.__init__(self, |
|
|
1011 |
features=features, |
|
|
1012 |
stranded=stranded, |
|
|
1013 |
capture_locations=capture_locations, |
|
|
1014 |
auto_set_intron_exon_features=auto_set_intron_exon_features ) |
|
|
1015 |
|
|
|
1016 |
self.match_hash = (self.sample, |
|
|
1017 |
self.span[0], |
|
|
1018 |
self.strand) |
|
|
1019 |
|
|
|
1020 |
|
|
|
1021 |
def write_tags(self): |
|
|
1022 |
# First decide on what values to write |
|
|
1023 |
FeatureAnnotatedObject.write_tags(self) |
|
|
1024 |
# Additional fragment data |
|
|
1025 |
Fragment.write_tags(self) |
|
|
1026 |
|
|
|
1027 |
def is_valid(self): |
|
|
1028 |
if len(self.genes)==0: |
|
|
1029 |
return False |
|
|
1030 |
return True |
|
|
1031 |
|
|
|
1032 |
def annotate(self): |
|
|
1033 |
for read in self: |
|
|
1034 |
if read is None: |
|
|
1035 |
continue |
|
|
1036 |
|
|
|
1037 |
read_strand = read.is_reverse |
|
|
1038 |
if self.stranded is not None and self.stranded==False: |
|
|
1039 |
read_strand=not read_strand |
|
|
1040 |
|
|
|
1041 |
|
|
|
1042 |
strand = ('-' if read_strand else '+') |
|
|
1043 |
read.set_tag('mr',strand) |
|
|
1044 |
for start, end in read.get_blocks(): |
|
|
1045 |
|
|
|
1046 |
for hit in self.features.findFeaturesBetween( |
|
|
1047 |
chromosome=read.reference_name, |
|
|
1048 |
sampleStart=start, |
|
|
1049 |
sampleEnd=end, |
|
|
1050 |
strand=(None if self.stranded is None else strand)) : |
|
|
1051 |
|
|
|
1052 |
hit_start, hit_end, hit_id, hit_strand, hit_ids = hit |
|
|
1053 |
self.hits[hit_ids].add( |
|
|
1054 |
(read.reference_name, (hit_start, hit_end))) |
|
|
1055 |
|
|
|
1056 |
if self.capture_locations: |
|
|
1057 |
if not hit_id in self.feature_locations: |
|
|
1058 |
self.feature_locations[hit_id] = [] |
|
|
1059 |
self.feature_locations[hit_id].append( (hit_start, hit_end, hit_strand)) |
|
|
1060 |
|
|
|
1061 |
|
|
|
1062 |
def get_site_location(self): |
|
|
1063 |
return self.span[0], self.start |
|
|
1064 |
|
|
|
1065 |
def __eq__(self, other): |
|
|
1066 |
# Check if the other fragment/molecule is aligned to the same gene |
|
|
1067 |
if self.match_hash != other.match_hash: |
|
|
1068 |
return False |
|
|
1069 |
|
|
|
1070 |
if len(self.genes.intersection(other.genes)): |
|
|
1071 |
return self.umi_eq(other) |
|
|
1072 |
|
|
|
1073 |
return False |
|
|
1074 |
|
|
|
1075 |
|
|
|
1076 |
|
|
|
1077 |
|
|
|
1078 |
class FeatureCountsSingleEndFragment(Fragment): |
|
|
1079 |
""" Class for fragments annotated with featureCounts |
|
|
1080 |
|
|
|
1081 |
Extracts annotated gene from the XT tag. |
|
|
1082 |
Deduplicates using XT tag and UMI |
|
|
1083 |
Reads without XT tag are flagged as invalid |
|
|
1084 |
|
|
|
1085 |
""" |
|
|
1086 |
|
|
|
1087 |
def __init__(self, |
|
|
1088 |
reads, |
|
|
1089 |
R1_primer_length=4, |
|
|
1090 |
R2_primer_length=6, |
|
|
1091 |
assignment_radius=100_000, |
|
|
1092 |
umi_hamming_distance=1, |
|
|
1093 |
invert_strand=False, |
|
|
1094 |
**kwargs |
|
|
1095 |
): |
|
|
1096 |
|
|
|
1097 |
Fragment.__init__(self, |
|
|
1098 |
reads, |
|
|
1099 |
assignment_radius=assignment_radius, |
|
|
1100 |
R1_primer_length=R1_primer_length, |
|
|
1101 |
R2_primer_length=R2_primer_length, |
|
|
1102 |
umi_hamming_distance=umi_hamming_distance,**kwargs) |
|
|
1103 |
|
|
|
1104 |
self.strand = None |
|
|
1105 |
self.gene = None |
|
|
1106 |
self.identify_gene() |
|
|
1107 |
|
|
|
1108 |
if self.is_valid(): |
|
|
1109 |
self.match_hash = (self.strand, self.gene, self.sample) |
|
|
1110 |
else: |
|
|
1111 |
self.match_hash = None |
|
|
1112 |
|
|
|
1113 |
def __eq__(self, other): |
|
|
1114 |
# Make sure fragments map to the same strand, cheap comparisons |
|
|
1115 |
if self.match_hash != other.match_hash: |
|
|
1116 |
return False |
|
|
1117 |
|
|
|
1118 |
# Make sure UMI's are similar enough, more expensive hamming distance |
|
|
1119 |
# calculation |
|
|
1120 |
return self.umi_eq(other) |
|
|
1121 |
|
|
|
1122 |
def is_valid(self): |
|
|
1123 |
return self.gene is not None |
|
|
1124 |
|
|
|
1125 |
def identify_gene(self): |
|
|
1126 |
self.valid = False |
|
|
1127 |
for read in self.reads: |
|
|
1128 |
if read is not None and read.has_tag('XT'): |
|
|
1129 |
self.gene = read.get_tag('XT') |
|
|
1130 |
self.valid = True |
|
|
1131 |
return self.gene |
|
|
1132 |
|
|
|
1133 |
|
|
|
1134 |
|
|
|
1135 |
class FeatureCountsFullLengthFragment(FeatureCountsSingleEndFragment): |
|
|
1136 |
""" Class for fragments annotated with featureCounts, with multiple reads covering a gene |
|
|
1137 |
|
|
|
1138 |
Extracts annotated gene from the XT tag. |
|
|
1139 |
Deduplicates using XT tag and UMI |
|
|
1140 |
Reads without XT tag are flagged as invalid |
|
|
1141 |
|
|
|
1142 |
""" |
|
|
1143 |
|
|
|
1144 |
def __init__(self, |
|
|
1145 |
reads, |
|
|
1146 |
R1_primer_length=4, |
|
|
1147 |
R2_primer_length=6, |
|
|
1148 |
assignment_radius=10_000, |
|
|
1149 |
umi_hamming_distance=1, |
|
|
1150 |
invert_strand=False, |
|
|
1151 |
**kwargs |
|
|
1152 |
): |
|
|
1153 |
|
|
|
1154 |
FeatureCountsSingleEndFragment.__init__(self, |
|
|
1155 |
reads, |
|
|
1156 |
assignment_radius=assignment_radius, |
|
|
1157 |
R1_primer_length=R1_primer_length, |
|
|
1158 |
R2_primer_length=R2_primer_length, |
|
|
1159 |
umi_hamming_distance=umi_hamming_distance, |
|
|
1160 |
**kwargs |
|
|
1161 |
|
|
|
1162 |
) |
|
|
1163 |
|
|
|
1164 |
def __eq__(self, other): |
|
|
1165 |
# Make sure fragments map to the same strand, cheap comparisons |
|
|
1166 |
if self.match_hash != other.match_hash: |
|
|
1167 |
return False |
|
|
1168 |
|
|
|
1169 |
#Calculate distance |
|
|
1170 |
if min(abs(self.span[1] - |
|
|
1171 |
other.span[1]), abs(self.span[2] - |
|
|
1172 |
other.span[2])) > self.assignment_radius: |
|
|
1173 |
return False |
|
|
1174 |
|
|
|
1175 |
# Make sure UMI's are similar enough, more expensive hamming distance |
|
|
1176 |
# calculation |
|
|
1177 |
return self.umi_eq(other) |
|
|
1178 |
|
|
|
1179 |
|
|
|
1180 |
class FragmentWithoutPosition(Fragment): |
|
|
1181 |
""" Fragment without a specific location on a contig""" |
|
|
1182 |
def get_site_location(self): |
|
|
1183 |
if self.has_valid_span(): |
|
|
1184 |
return self.span[0], 0 |
|
|
1185 |
else: |
|
|
1186 |
return '*', 0 |
|
|
1187 |
|
|
|
1188 |
class FragmentStartPosition(Fragment): |
|
|
1189 |
""" Fragment without a specific location on a contig""" |
|
|
1190 |
def get_site_location(self): |
|
|
1191 |
for read in self: |
|
|
1192 |
if read is not None and not read.is_unmapped: |
|
|
1193 |
return read.reference_name, read.reference_start |
|
|
1194 |
return None |
|
|
1195 |
|
|
|
1196 |
|
|
|
1197 |
|
|
|
1198 |
class FragmentWithoutUMI(Fragment): |
|
|
1199 |
""" |
|
|
1200 |
Use this class when no UMI information is available |
|
|
1201 |
""" |
|
|
1202 |
|
|
|
1203 |
def __init__(self, reads, **kwargs): |
|
|
1204 |
Fragment.__init__(self, reads, **kwargs) |
|
|
1205 |
|
|
|
1206 |
# remove the set_umi function |
|
|
1207 |
def set_umi(self, **kwargs): |
|
|
1208 |
pass |
|
|
1209 |
|
|
|
1210 |
# Replace the equality function |
|
|
1211 |
def __eq__(self, other): # other can also be a Molecule! |
|
|
1212 |
# Make sure fragments map to the same strand, cheap comparisons |
|
|
1213 |
if self.sample != other.sample: |
|
|
1214 |
return False |
|
|
1215 |
|
|
|
1216 |
if self.strand != other.strand: |
|
|
1217 |
return False |
|
|
1218 |
|
|
|
1219 |
if min(abs(self.span[1] - |
|
|
1220 |
other.span[1]), abs(self.span[2] - |
|
|
1221 |
other.span[2])) > self.assignment_radius: |
|
|
1222 |
return False |
|
|
1223 |
|
|
|
1224 |
# Sample matches and starting position is within the defined span |
|
|
1225 |
# radius |
|
|
1226 |
return True |