Switch to unified view

a b/singlecellmultiomics/fragment/fragment.py
1
import itertools
2
from singlecellmultiomics.utils.sequtils import hamming_distance, get_consensus_dictionaries, pick_best_base_call
3
import pysamiterators.iterators
4
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods
5
from singlecellmultiomics.utils import style_str
6
from singlecellmultiomics.bamProcessing import get_read_group_from_read
7
from singlecellmultiomics.features import FeatureAnnotatedObject
8
9
complement = str.maketrans('ATCGN', 'TAGCN')
10
11
12
class Fragment():
13
    """
14
    This class holds 1 or more reads which are derived from the same cluster
15
16
    Example:
17
        Generate a Fragment with a single associated read::
18
19
            >>> from singlecellmultiomics.molecule import Molecule
20
            >>> from singlecellmultiomics.fragment import Fragment
21
            >>> import pysam
22
            >>> read = pysam.AlignedSegment()
23
            >>> read.reference_start = 30
24
            >>> read.query_name = 'R1'
25
            >>> read.mapping_quality = 30
26
            >>> read.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag
27
            >>> read.set_tag('RX','CAT') # The UMI is extracted from the RX tag
28
            >>> read.query_sequence = "CATGTATCCGGGCTTAA"
29
            >>> read.query_qualities = [30] * len(read.query_sequence)
30
            >>> read.cigarstring = f'{len(read.query_sequence)}M'
31
            >>> Fragment([read])
32
            Fragment:
33
                sample:CELL_1
34
                umi:CAT
35
                span:None 30-47
36
                strand:+
37
                has R1: yes
38
                has R2: no
39
                randomer trimmed: no
40
41
    Warning:
42
        Make sure the RX and SM tags of the read are set! If these are encoded
43
        in the read name, use singlecellmultiomics.universalBamTagger.customreads
44
        for conversion.
45
46
    """
47
48
    def __init__(self, reads,
49
                 assignment_radius: int = 0,
50
                 umi_hamming_distance: int = 1,
51
                 R1_primer_length: int = 0,
52
                 R2_primer_length: int = 6,
53
                 tag_definitions: list = None,
54
                 max_fragment_size: int = None,
55
                 mapping_dir=(False, True),
56
                 max_NUC_stretch: int = None,
57
                 read_group_format: int = 0,  # R1 forward, R2 reverse
58
                 library_name: str = None, # Overwrites the library name
59
                 single_end: bool = False
60
                 ):
61
        """
62
        Initialise Fragment
63
64
            Args:
65
                reads( list ):
66
                    list containing pysam AlignedSegment reads
67
68
                assignment_radius( int ):
69
                    Assignment radius for molecule resolver
70
71
                umi_hamming_distance( int ):
72
                    umi_hamming_distance distance threshold for associating multiple fragments to a molecule
73
74
                R1_primer_length(int):
75
                    length of R1 primer, these bases are not taken into account when calculating a consensus
76
77
                R2_primer_length(int):
78
                    length of R2 primer, these bases are not taken into account when calculating a consensus, this length is auto-detected/ overwritten when the rS tag is set
79
80
81
                tag_definitions(list):
82
                    sam TagDefinitions
83
84
                max_fragment_size (int):
85
                    When specified, fragments with a fragment size larger than the specified value are flagged as qcfail
86
87
                mapping_dir( tuple ):
88
                    expected mapping direction of mates,
89
                    True for reverse, False for forward. This parameter is used in dovetail detection
90
91
                read_group_format(int) : see get_read_group()
92
            Returns:
93
                html(string) : html representation of the fragment
94
        """
95
        self.R1_primer_length = R1_primer_length
96
        self.R2_primer_length = R2_primer_length
97
        if tag_definitions is None:
98
            tag_definitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions
99
        self.tag_definitions = tag_definitions
100
        self.assignment_radius = assignment_radius
101
        self.umi_hamming_distance = umi_hamming_distance
102
        self.mapping_dir = mapping_dir
103
        self.reads = reads
104
        self.strand = None
105
        self.meta = {}  # dictionary of meta data
106
        self.is_mapped = None
107
        # Check for multimapping
108
        self.is_multimapped = True
109
        self.mapping_quality = 0
110
        self.match_hash = None
111
        self.safe_span = None  # wether the span of the fragment could be determined
112
        self.unsafe_trimmed = False  # wether primers have been trimmed off
113
        self.random_primer_sequence = None
114
        self.max_fragment_size = max_fragment_size
115
        self.read_group_format = read_group_format
116
        self.max_NUC_stretch = max_NUC_stretch
117
        self.qcfail = False
118
        self.single_end = single_end
119
120
        # Span:\
121
        self.span = [None, None, None]
122
123
        self.umi = None
124
125
        # Force R1=read1 R2=read2:
126
127
        for i, read in enumerate(self.reads):
128
129
            if read is None:
130
                continue
131
132
            if self.max_NUC_stretch is not None and (
133
                self.max_NUC_stretch*'A' in read.seq or
134
                self.max_NUC_stretch*'G' in read.seq or
135
                self.max_NUC_stretch*'T' in read.seq or
136
                self.max_NUC_stretch*'C' in read.seq):
137
138
                self.set_rejection_reason('HomoPolymer', set_qcfail=True)
139
                self.qcfail=True
140
                break
141
142
            if read.is_qcfail:
143
                self.qcfail = True
144
145
            if read.has_tag('rS'):
146
                self.random_primer_sequence = read.get_tag('rS')
147
                self.unsafe_trimmed = True
148
                self.R2_primer_length = 0 # Set R2 primer length to zero, as the primer has been removed
149
150
            if not read.is_unmapped:
151
                self.is_mapped = True
152
            if read.mapping_quality > 0:
153
                self.is_multimapped = False
154
            self.mapping_quality = max(
155
                self.mapping_quality, read.mapping_quality)
156
157
            if i == 0:
158
                if read.is_read2:
159
                    if not read.is_qcfail:
160
                        raise ValueError('Supply first R1 then R2')
161
                read.is_read1 = True
162
                read.is_read2 = False
163
            elif i == 1:
164
                read.is_read1 = False
165
                read.is_read2 = True
166
        self.set_sample(library_name=library_name)
167
        self.update_umi()
168
        if self.qcfail:
169
            return
170
        self.set_strand(self.identify_strand())
171
        self.update_span()
172
173
174
    def write_tensor(self, chromosome=None, span_start=None, span_end=None, height=30, index_start=0,
175
                     base_content_table=None,
176
                     base_mismatches_table=None,
177
                     base_indel_table=None,
178
                     base_qual_table=None,
179
                     base_clip_table=None,
180
                     mask_reference_bases=None,
181
                     # set with reference bases to mask (converted to N )
182
                     # (chrom,pos)
183
                     reference=None,
184
                     skip_missing_reads=False
185
                     ):
186
        """
187
        Write tensor representation of the fragment to supplied 2d arrays
188
189
        Args:
190
            chromosome( str ):
191
                chromosome to view
192
193
            span_start( int ):
194
                first base to show
195
196
            span_end( int ):
197
                last base to show
198
199
            height (int) :
200
                height of the tensor (reads)
201
202
            index_start(int): start writing tensor at this row
203
204
            base_content_table(np.array) : 2d array to write base contents to
205
206
            base_indel_table(np.array) : 2d array to write indel information to
207
208
            base_content_table(np.array) : 2d array to write base contents to
209
210
            base_content_table(np.array) : 2d array to write base contents to
211
212
            mask_reference_bases(set): mask reference bases in this set with a N set( (chrom,pos), ... )
213
214
            reference(pysam.FastaFile) :  Handle to reference file to use instead of MD tag. If None: MD tag is used.
215
216
            skip_missing_reads (bool) : when enabled only existing (non None) reads are added to the tensor. Use this option when mapping single-end
217
218
        Returns:
219
            None
220
221
        """
222
        self.base_mapper = {}
223
        for i, (strand, base) in enumerate(
224
                itertools.product((True, False), 'NACTG-')):
225
            self.base_mapper[(strand, base)] = i + 2
226
227
        if chromosome is None and span_start is None and span_end is None:
228
            chromosome, span_start, span_end = self.get_span()
229
230
        span_len = span_end - span_start
231
232
        reads = self
233
        last_ref = None
234
        used_reads = 0
235
        for ri, read in enumerate(reads):
236
237
            if read is None:
238
                continue
239
240
            alignment_operations = list(itertools.chain(
241
                *([operation] * l for operation, l in read.cigartuples if operation != 2)))
242
            print(alignment_operations)
243
            # Obtain the amount of operations before the alignment start
244
            operations_before_alignment_start = 0
245
            for query_pos, ref_pos in read.get_aligned_pairs():
246
                if ref_pos is None:
247
                    operations_before_alignment_start += 1
248
                else:
249
                    break
250
            initial_base_offset = read.reference_start - operations_before_alignment_start
251
252
            # Obtain row index in the tensor
253
            if skip_missing_reads:
254
                row_index = (used_reads + index_start) % height
255
            else:
256
                row_index = (ri + index_start) % height
257
258
            # Where in the reference are we?
259
            ref_pointer = read.reference_start
260
261
            alignment_started = False
262
            for operation, (cycle, query_pos, ref_pos, ref_base) in zip(
263
                alignment_operations,
264
                pysamiterators.iterators.ReadCycleIterator(
265
                    read,
266
                    with_seq=True,
267
                    reference=reference)):
268
269
                # Mask locations from mask_reference_bases
270
                if mask_reference_bases is not None and ref_pos is not None and (
271
                        chromosome, ref_pos) in mask_reference_bases:
272
                    ref_base = 'N'
273
274
                if ref_pos is None:
275
                    if not alignment_started:
276
                        ref_pointer = initial_base_offset + i
277
                    else:
278
                        ref_pointer += 1
279
                    if operation == 4:
280
                        try:
281
                            query_base = read.seq[query_pos]
282
                            base_clip_table[row_index, ref_pointer -
283
                                            span_start] = self.base_mapper[(read.is_reverse, query_base)]
284
                        except IndexError:
285
                            pass
286
                    continue
287
                ref_pointer = ref_pos
288
289
                if (ref_pos - span_start) < 0 or (ref_pos -
290
                                                  span_start) > (span_len - 1):
291
                    continue
292
293
                ref_base = ref_base.upper()
294
                if query_pos is None:
295
                    query_base = '-'
296
                    qual = 0
297
                    base_indel_table[row_index, ref_pos - span_start] = 1
298
                else:
299
                    query_base = read.seq[query_pos]
300
                    qual = ord(read.qual[query_pos])
301
302
                if ref_pos is not None:
303
                    base_qual_table[row_index, ref_pos - span_start] = qual
304
                    if query_base != ref_base:
305
                        base_content_table[row_index, ref_pos -
306
                                           span_start] = self.base_mapper[(read.is_reverse, query_base)]
307
                    else:
308
                        base_mismatches_table[row_index,
309
                                              ref_pos - span_start] = 0.2
310
                    base_content_table[row_index, ref_pos -
311
                                       span_start] = self.base_mapper[(read.is_reverse, query_base)]
312
313
            used_reads += 1
314
        if skip_missing_reads:
315
            return used_reads + index_start + 1
316
        else:
317
            return ri + index_start + 1
318
319
    def get_read_group(self, with_attr_dict=False) -> str:
320
321
        r = None
322
        for read in self.reads:
323
            if read is None:
324
                continue
325
326
            r = get_read_group_from_read(read,
327
                    format=self.read_group_format,
328
                    with_attr_dict=with_attr_dict)
329
330
            break
331
332
        return r
333
334
335
    def set_duplicate(self, is_duplicate):
336
        """Define this fragment as duplicate, sets the corresponding bam bit flag
337
        Args:
338
            value(bool) : is_duplicate
339
        """
340
        for read in self.reads:
341
            if read is not None:
342
                read.is_duplicate = is_duplicate
343
344
    def get_fragment_size(self):
345
        return abs(self.span[2] - self.span[1])
346
347
    def write_tags(self):
348
        self.set_meta('MQ', self.mapping_quality)
349
        self.set_meta('MM', self.is_multimapped)
350
        self.set_meta('RG', self.get_read_group())
351
        if self.has_valid_span():
352
            # Write fragment size:
353
            if self.safe_span:
354
                self.set_meta('fS', self.get_fragment_size())
355
                self.set_meta('fe', self.span[1])
356
                self.set_meta('fs', self.span[2])
357
            else:
358
                self.remove_meta('fS')
359
360
        else:
361
            self.set_rejection_reason('FS', set_qcfail=True)
362
363
        # Set qcfail bit when the fragment is not valid
364
365
        if not self.is_valid():
366
            for read in self:
367
                if read is not None:
368
                    read.is_qcfail = True
369
370
    def write_pysam(self, pysam_handle):
371
        """Write all associated reads to the target file
372
373
        Args:
374
            target_file (pysam.AlignmentFile) : Target file
375
        """
376
        self.write_tags()
377
        for read in self:
378
            if read is not None:
379
                pysam_handle.write(read)
380
381
    def identify_strand(self):
382
        # If R2 is rev complement:
383
        if self.get_R1() is not None:
384
            # verify if the read has been mapped:
385
            if self.get_R1().is_unmapped:
386
                return None
387
            return self.get_R1().is_reverse
388
        elif self.get_R2() is not None:
389
            # verify if the read has been mapped:
390
            if self.get_R2().is_unmapped:
391
                return None
392
            return not self.get_R2().is_reverse
393
        return None
394
395
    def get_random_primer_hash(self):
396
        """Obtain hash describing the random primer
397
        this assumes the random primer is on the end of R2 and has a length of self.R2_primer_length
398
        When the rS tag is set, the value of this tag is used as random primer sequence
399
        Returns None,None when the random primer cannot be described
400
401
        Returns:
402
            reference_name (str) or None
403
            reference_start (int) : Int or None
404
            sequence (str) : Int or None
405
        """
406
        R2 = self.get_R2()
407
408
        if R2 is None or R2.query_sequence is None:
409
            return None, None, None
410
411
        if self.R2_primer_length == 0 and self.random_primer_sequence is None:
412
            return None, None, None
413
414
        # The read was not mapped
415
        if R2.is_unmapped:
416
            # Guess the orientation does not matter
417
            if self.random_primer_sequence is not None:
418
                return None, None, self.random_primer_sequence
419
            return None, None, R2.query_sequence[:self.R2_primer_length]
420
421
        if R2.is_reverse:
422
            global complement
423
            if self.random_primer_sequence is not None:
424
                return R2.reference_name, R2.reference_end, self.random_primer_sequence
425
426
            return(R2.reference_name, R2.reference_end, R2.query_sequence[-self.R2_primer_length:][::-1].translate(complement))
427
        else:
428
            if self.random_primer_sequence is not None:
429
                return R2.reference_name, R2.reference_start, self.random_primer_sequence
430
            return(R2.reference_name, R2.reference_start, R2.query_sequence[:self.R2_primer_length])
431
        raise ValueError()
432
433
    def remove_meta(self,key):
434
        for read in self:
435
            if read is not None:
436
                if read.has_tag(key):
437
                    read.set_tag(key,None)
438
439
440
    def set_meta(self, key, value, as_set=False):
441
        self.meta[key] = value
442
        for read in self:
443
            if read is not None:
444
                if as_set and read.has_tag(key):
445
446
                    data = set(read.get_tag(key).split(','))
447
                    data.add(value)
448
                    read.set_tag(key, ','.join(sorted(list(data))))
449
450
                else:
451
                    read.set_tag(key, value)
452
453
    def get_R1(self):
454
        """
455
        Obtain the AlignedSegment of read 1 of the fragment
456
457
        Returns:
458
            R1 (pysam.AlignedSegment) : Read 1 of the fragment, returns None
459
                                        when R1 is not mapped
460
        """
461
        if len(self.reads) == 0:
462
            raise IndexError('The fragment has no associated reads')
463
        return self.reads[0]
464
465
    def get_R2(self):
466
        """
467
        Obtain the AlignedSegment of read 2 of the fragment
468
469
        Returns:
470
            R1 (pysam.AlignedSegment) : Read 2 of the fragment, returns None
471
                                        when R2 is not mapped
472
        """
473
        if len(self.reads) < 2:
474
            raise IndexError('The fragment has no associated R2')
475
        return self.reads[1]
476
477
    def has_R1(self):
478
        """
479
        Check if the fragment has an associated read 1
480
481
        Returns:
482
            has_r1 (bool)
483
        """
484
        if len(self.reads) == 0:
485
            return False
486
        return self.reads[0] is not None
487
488
    def has_R2(self):
489
        """
490
        Check if the fragment has an associated read 2
491
492
        Returns:
493
            has_r2 (bool)
494
        """
495
        if len(self.reads) < 2:
496
            return False
497
        return self.reads[1] is not None
498
499
    @property
500
    def R1(self):
501
        return self.reads[0]
502
503
    @property
504
    def R2(self):
505
        return self.reads[1]
506
507
    def get_consensus(self, only_include_refbase: str = None, dove_safe: bool = False, **get_consensus_dictionaries_kwargs) -> dict:
508
        """
509
        a dictionary of (reference_pos) : (qbase, quality, reference_base) tuples
510
511
        Args:
512
            only_include_refbase(str) : Only report bases aligned to this reference base, uppercase only
513
            dove_safe(bool) : Only report bases supported within R1 and R2 start and end coordinates
514
515
        Returns:
516
            consensus(dict) : {reference_position: (qbase, quality)
517
        """
518
        r1_consensus, r2_consensus = get_consensus_dictionaries(
519
                self.R1,
520
                self.R2,
521
                only_include_refbase=only_include_refbase,
522
                dove_safe=dove_safe,
523
                **get_consensus_dictionaries_kwargs)
524
525
        return {
526
            ref_pos:pick_best_base_call( r1_consensus.get(ref_pos) , r2_consensus.get(ref_pos) )
527
            for ref_pos in set(r1_consensus.keys()).union(set(r2_consensus.keys()))
528
        }
529
530
531
    @property
532
    def estimated_length(self) -> int:
533
        """
534
        Obtain the estimated size of the fragment,
535
        returns None when estimation is not possible
536
        Takes into account removed bases (R2_primer_length attribute)
537
        Assumes inwards sequencing orientation, except when self.single_end is set
538
        """
539
        if self.single_end:
540
            if self[0] is None:
541
                return None
542
            return self[0].reference_end - self[0].reference_start
543
544
545
        if self.has_R1() and self.has_R2():
546
547
            contig = self.R1.reference_name
548
            if self.R1.is_reverse and not self.R2.is_reverse:
549
                ##     rp |----- R2 ---->  <--- R1 ----|
550
                ##    |>----------DISTANCE------------<|
551
                start, end = self.R2.reference_start - self.R2_primer_length, self.R1.reference_end
552
                if start<end:
553
                    return end - start
554
                else:
555
                    return None
556
            elif not self.R1.is_reverse and self.R2.is_reverse:
557
558
                ##    |----- R1 ---->  <--- R2 ----|  rp
559
                ##    |>----------DISTANCE------------<|
560
561
                start, end = self.R1.reference_start, self.R2.reference_end + self.R2_primer_length
562
                if start<end:
563
                    return end - start
564
                else:
565
                    return None
566
            else:
567
                return None
568
        return None
569
570
571
    def update_span(self):
572
        """
573
        Update the span (the location the fragment maps to) stored in Fragment
574
575
        The span is a single stretch of coordinates on a single contig.
576
        The contig is determined by the reference_name assocated to the first
577
        mapped read in self.reads
578
579
        This calculation assumes the reads are sequenced inwards and
580
        dove-tails of the molecule cannot be trusted
581
        """
582
583
        contig = None
584
        start = None
585
        end = None
586
587
588
        if self.has_R1() and self.has_R2() and \
589
           self.R1.reference_start is not None and self.R1.reference_end is not None and \
590
           self.R2.reference_start is not None and self.R2.reference_end is not None :
591
592
            contig = self.R1.reference_name
593
            if self.R1.is_reverse and not self.R2.is_reverse:
594
                start, end = self.R2.reference_start, self.R1.reference_end
595
                self.safe_span = True
596
            elif not self.R1.is_reverse and self.R2.is_reverse:
597
                start, end = self.R1.reference_start, self.R2.reference_end
598
                self.safe_span = True
599
            else:
600
                start = min(self.R1.reference_start, self.R2.reference_start)
601
                end = max(self.R1.reference_start, self.R2.reference_start)
602
                self.safe_span = False
603
604
        elif self.has_R1() and self.R1.reference_start is not None and self.R1.reference_end is not None :
605
            contig = self.R1.reference_name
606
            start, end = self.R1.reference_start, self.R1.reference_end
607
            self.safe_span = False
608
609
        elif self.has_R2()  and self.R2.reference_start is not None and self.R2.reference_end is not None :
610
            contig = self.R2.reference_name
611
            start, end = self.R2.reference_start, self.R2.reference_end
612
            self.safe_span = False
613
        else:
614
615
            # Its Sometimes possible that no cigar is set for the alignment, only a start coordinate
616
617
            for read in self:
618
                if read is None:
619
                    continue
620
                if len(read.cigar)!=0:
621
                    raise NotImplementedError("Non implemented span")
622
                if read.reference_start is not None:
623
                    start,end = read.reference_start, read.reference_start
624
                    contig = read.reference_name
625
                else:
626
                    raise NotImplementedError("Non implemented span, undefined alignment, and not start coordinate")
627
628
629
630
631
        self.span = (contig, start, end)
632
633
    @property
634
    def aligned_length(self):
635
        return self.span[2]-self.span[1]
636
637
    def get_span(self) -> tuple:
638
        return self.span
639
640
    def has_valid_span(self) -> bool:
641
642
        # Check if the span could be calculated:
643
        defined_span = not any([x is None for x in self.get_span()])
644
        if not defined_span:
645
            return False
646
647
        if self.max_fragment_size is not None and self.get_fragment_size()>self.max_fragment_size:
648
            return False
649
650
        return True
651
652
    def set_sample(self, sample: str = None, library_name: str = None):
653
        """Force sample name or obtain sample name from associated reads"""
654
        if sample is not None:
655
            self.sample = sample
656
        else:
657
            for read in self.reads:
658
                if read is not None and read.has_tag('SM'):
659
                    self.sample = read.get_tag('SM')
660
                    break
661
662
        if library_name is not None:
663
            sample = self.sample.split('_', 1)[-1]
664
            self.sample = library_name+'_'+sample
665
            for read in self.reads:
666
                if read is not None:
667
                    read.set_tag('SM', self.sample)
668
                    read.set_tag('LY', library_name)
669
670
    def get_sample(self) -> str:
671
        """ Obtain the sample name associated with the fragment
672
        The sample name is extracted from the SM tag of any of the associated reads.
673
674
        Returns:
675
            sample name (str)
676
        """
677
        return self.sample
678
679
    def __len__(self):
680
        """Obtain the amount of associated reads to the fragment
681
682
        Returns:
683
            assocoiated reads (int)
684
        """
685
        return(sum(read is not None for read in self.reads))
686
687
    def set_strand(self, strand: bool):
688
        """Set mapping strand
689
690
        Args:
691
            strand (bool) : False for Forward, True for reverse
692
        """
693
        self.strand = strand
694
695
    def get_strand(self):
696
        """Obtain strand
697
698
        Returns:
699
            strand (bool) : False for Forward, True for reverse
700
        """
701
        return self.strand
702
703
    def update_umi(self):
704
        """
705
        Extract umi from 'RX' tag and store the UMI in the Fragment object
706
        """
707
        for read in self.reads:
708
            if read is not None and read.has_tag('RX'):
709
                self.umi = read.get_tag('RX')
710
711
    def get_umi(self):
712
        """
713
        Get the UMI sequence associated to this fragment
714
715
        Returns:
716
            umi(str)
717
        """
718
        return self.umi
719
720
    def __iter__(self):
721
        return iter(self.reads)
722
723
    def umi_eq(self, other):
724
        """
725
        Hamming distance measurement to another Fragment or Molecule,
726
727
        Returns :
728
            is_close (bool) : returns True when the hamming distance between
729
            the two objects <= umi_hamming_distance
730
731
        Example:
732
            >>> from singlecellmultiomics.molecule import Molecule
733
            >>> from singlecellmultiomics.fragment import Fragment
734
            >>> import pysam
735
            >>> # Create reads (read_A, and read_B), they both belong to the same
736
            >>> # cell and have 1 hamming distance between their UMI's
737
            >>> read_A = pysam.AlignedSegment()
738
            >>> read_A.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag
739
            >>> read_A.set_tag('RX','CAT') # The UMI is extracted from the RX tag
740
            >>> read_B = pysam.AlignedSegment()
741
            >>> read_B.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag
742
            >>> read_B.set_tag('RX','CAG') # The UMI is extracted from the RX tag
743
744
            >>> # Create fragment objects for read_A and B:
745
            >>> frag_A = Fragment([read_A],umi_hamming_distance=0)
746
            >>> frag_B = Fragment([read_B],umi_hamming_distance=0)
747
            >>> frag_A.umi_eq(frag_B) # This returns False, the distance is 1, which is higher than 0 (umi_hamming_distance)
748
            False
749
750
            >>> frag_A = Fragment([read_A],umi_hamming_distance=1)
751
            >>> frag_B = Fragment([read_B],umi_hamming_distance=1)
752
            >>> frag_A.umi_eq(frag_B) # This returns True, the distance is 1, which is the (umi_hamming_distance)
753
            True
754
        """
755
756
        if self.umi == other.umi:
757
            return True
758
        if self.umi_hamming_distance == 0:
759
            return False
760
        else:
761
            if len(self.umi)!=len(other.umi):
762
                return False
763
            return hamming_distance(
764
                self.umi, other.umi) <= self.umi_hamming_distance
765
766
    def __eq__(self, other):  # other can also be a Molecule!
767
        # Make sure fragments map to the same strand, cheap comparisons
768
        """
769
        Check equivalence between two Fragments or Fragment and Molecule.
770
771
772
        Args:
773
            other (Fragment or Molecule) : object to compare against
774
775
        Returns
776
            is_eq (bool) : True when the other object is (likely) derived from the same molecule
777
778
        Example:
779
            >>> from singlecellmultiomics.molecule import Molecule
780
            >>> from singlecellmultiomics.fragment import Fragment
781
            >>> import pysam
782
            >>> # Create sam file to write some reads to:
783
            >>> test_sam = pysam.AlignmentFile('test.sam','w',reference_names=['chr1','chr2'],reference_lengths=[1000,1000])
784
            >>> read_A = pysam.AlignedSegment(test_sam.header)
785
            >>> read_A.set_tag('SM','CELL_1') # The sample to which the sample belongs is extracted from the SM tag
786
            >>> read_A.set_tag('RX','CAT') # The UMI is extracted from the RX tag
787
            >>> # By default the molecule assignment is done based on the mapping location of read 1:
788
            >>> read_A.reference_name = 'chr1'
789
            >>> read_A.reference_start = 100
790
            >>> read_A.query_sequence = 'ATCGGG'
791
            >>> read_A.cigarstring = '6M'
792
793
            >>> read_B = pysam.AlignedSegment(test_sam.header)
794
            >>> read_B.set_tag('SM','CELL_1')
795
            >>> read_B.set_tag('RX','CAT')
796
            >>> read_B.reference_start = 100
797
            >>> read_B.query_sequence = 'ATCGG'
798
            >>> read_B.cigarstring = '5M'
799
800
            >>> frag_A = Fragment([read_A],umi_hamming_distance=0)
801
            >>> frag_A
802
                Fragment:
803
                    sample:CELL_1
804
                    umi:CAT
805
                    span:chr1 100-106
806
                    strand:+
807
                    has R1: yes
808
                    has R2: no
809
                    randomer trimmed: no
810
            >>> frag_B = Fragment([read_B],umi_hamming_distance=0)
811
            >>> frag_A == frag_B
812
            True
813
            # Fragment A and fragment B belong to the same molecule,
814
            # the UMI is identical, the starting position of R1 is identical and
815
            # the sample name matches
816
817
        When we move one of the reads, the Fragments are not equivalent any more
818
819
        Example:
820
            >>> read_B.reference_start = 150
821
            >>> frag_B = Fragment([read_B],umi_hamming_distance=0)
822
            >>> frag_A == frag_B
823
            False
824
825
        Except if the difference <= the assignment_radius
826
827
        Example:
828
            >>> read_B.reference_start = 150
829
            >>> read_A.reference_start = 100
830
            >>> frag_B = Fragment([read_B],assignment_radius=300)
831
            >>> frag_A = Fragment([read_A],assignment_radius=300)
832
            >>> frag_A == frag_B
833
            True
834
835
         When the UMI's are too far apart, the eq function returns `False`
836
837
        Example:
838
            >>> read_B.reference_start = 100
839
            >>> read_A.reference_start = 100
840
            >>> read_A.set_tag('RX','GGG')
841
            >>> frag_B = Fragment([read_B])
842
            >>> frag_A = Fragment([read_A])
843
            >>> frag_A == frag_B
844
            False
845
846
        When the sample of the Fragments are not identical, the eq function returns `False`
847
848
        Example:
849
            >>> read_B.reference_start = 100
850
            >>> read_A.reference_start = 100
851
            >>> read_A.set_tag('RX','AAA')
852
            >>> read_B.set_tag('RX','AAA')
853
            >>> read_B.set_tag('SM', 'CELL_2' )
854
            >>> frag_B = Fragment([read_B])
855
            >>> frag_A = Fragment([read_A])
856
            >>> frag_A == frag_B
857
            False
858
859
        """
860
        if self.sample != other.sample:
861
            return False
862
863
        if self.strand != other.strand:
864
            return False
865
866
        if not self.has_valid_span() or not other.has_valid_span():
867
            return False
868
869
        if min(abs(self.span[1] -
870
                   other.span[1]), abs(self.span[2] -
871
                                       other.span[2])) > self.assignment_radius:
872
            return False
873
874
        # Make sure UMI's are similar enough, more expensive hamming distance
875
        # calculation
876
        return self.umi_eq(other)
877
878
    def __getitem__(self, index):
879
        """
880
        Get a read from the fragment
881
882
        Args:
883
            index( int ):
884
                0 : Read 1
885
                1: Read 2
886
                ..
887
        """
888
        return self.reads[index]
889
890
    def is_valid(self):
891
        if self.qcfail:
892
            return False
893
        return self.has_valid_span()
894
895
    def get_strand_repr(self):
896
        s = self.get_strand()
897
        if s is None:
898
            return '?'
899
        if s:
900
            return '-'
901
        else:
902
            return '+'
903
904
    def __repr__(self):
905
        rep =  f"""Fragment:
906
        sample:{self.get_sample()}
907
        umi:{self.get_umi()}
908
        span:{('%s %s-%s' % self.get_span() if self.has_valid_span() else 'no span')}
909
        strand:{self.get_strand_repr()}
910
        has R1: {"yes" if self.has_R1() else "no"}
911
        has R2: {"yes" if self.has_R2() else "no"}
912
        randomer trimmed: {"yes" if self.unsafe_trimmed else "no"}
913
        """
914
915
        try:
916
            rep += '\n\t'.join([f'{key}:{str(value)}' for key, value in self.meta.items()])
917
        except Exception as e:
918
            print(self.meta)
919
            raise
920
        return rep
921
922
    def get_html(
923
            self,
924
            chromosome=None,
925
            span_start=None,
926
            span_end=None,
927
            show_read1=None,
928
            show_read2=None):
929
        """
930
        Get HTML representation of the fragment
931
932
        Args:
933
            chromosome( str ):
934
                chromosome to view
935
936
            span_start( int ):
937
                first base to show
938
939
            span_end( int ):
940
                last base to show
941
942
            show_read1(bool):
943
                show read1
944
945
            show_read2(bool):
946
                show read2
947
948
        Returns:
949
            html(string) : html representation of the fragment
950
951
        """
952
953
        if chromosome is None and span_start is None and span_end is None:
954
            chromosome, span_start, span_end = self.get_span()
955
956
        span_len = abs(span_end - span_start)
957
        visualized_frag = ['.'] * span_len
958
959
        if show_read1 is None and show_read2 is None:
960
            reads = self
961
        else:
962
            reads = []
963
            if show_read1:
964
                reads.append(self.get_R1())
965
            if show_read2:
966
                reads.append(self.get_R2())
967
968
        for read in reads:
969
            if read is None:
970
                continue
971
            # for cycle, query_pos, ref_pos, ref_base in
972
            # pysamiterators.iterators.ReadCycleIterator(read,with_seq=True):
973
            for query_pos, ref_pos, ref_base in read.get_aligned_pairs(
974
                    with_seq=True, matches_only=True):
975
                if ref_pos is None or ref_pos < span_start or ref_pos > span_end:
976
                    continue
977
978
                ref_base = ref_base.upper()
979
                if query_pos is None:
980
                    query_base = '-'
981
                else:
982
                    query_base = read.seq[query_pos]
983
                if ref_pos is not None:
984
                    if query_base != ref_base:
985
                        v = style_str(query_base, color='red', weight=500)
986
                    else:
987
                        v = query_base
988
                    try:
989
                        visualized_frag[ref_pos - span_start] = v
990
                    except IndexError:
991
                        pass  # the alignment is outside the requested view
992
        return ''.join(visualized_frag)
993
994
    def set_rejection_reason(self, reason, set_qcfail=False):
995
        self.set_meta('RR', reason, as_set=True)
996
        if set_qcfail:
997
            for read in self:
998
                if read is not None:
999
                    read.is_qcfail = True
1000
1001
    def set_recognized_sequence(self, seq):
1002
        self.set_meta('RZ', seq)
1003
1004
1005
1006
class SingleEndTranscriptFragment(Fragment, FeatureAnnotatedObject):
1007
    def __init__(self, reads, features, assignment_radius=0, stranded=None, capture_locations=False, auto_set_intron_exon_features=True,**kwargs):
1008
1009
        Fragment.__init__(self, reads, assignment_radius=assignment_radius, **kwargs)
1010
        FeatureAnnotatedObject.__init__(self,
1011
                                        features=features,
1012
                                        stranded=stranded,
1013
                                        capture_locations=capture_locations,
1014
                                        auto_set_intron_exon_features=auto_set_intron_exon_features )
1015
1016
        self.match_hash = (self.sample,
1017
                           self.span[0],
1018
                           self.strand)
1019
1020
1021
    def write_tags(self):
1022
        # First decide on what values to write
1023
        FeatureAnnotatedObject.write_tags(self)
1024
        # Additional fragment data
1025
        Fragment.write_tags(self)
1026
1027
    def is_valid(self):
1028
        if len(self.genes)==0:
1029
            return False
1030
        return True
1031
1032
    def annotate(self):
1033
        for read in self:
1034
            if read is None:
1035
                continue
1036
1037
            read_strand = read.is_reverse
1038
            if self.stranded is not None and self.stranded==False:
1039
                read_strand=not read_strand
1040
1041
1042
            strand = ('-' if read_strand else '+')
1043
            read.set_tag('mr',strand)
1044
            for start, end in read.get_blocks():
1045
1046
                for hit in self.features.findFeaturesBetween(
1047
                        chromosome=read.reference_name,
1048
                        sampleStart=start,
1049
                        sampleEnd=end,
1050
                        strand=(None if self.stranded is None else strand)) :
1051
1052
                    hit_start, hit_end, hit_id, hit_strand, hit_ids = hit
1053
                    self.hits[hit_ids].add(
1054
                        (read.reference_name, (hit_start, hit_end)))
1055
1056
                    if self.capture_locations:
1057
                        if not hit_id in self.feature_locations:
1058
                            self.feature_locations[hit_id] = []
1059
                        self.feature_locations[hit_id].append( (hit_start, hit_end, hit_strand))
1060
1061
1062
    def get_site_location(self):
1063
        return self.span[0], self.start
1064
1065
    def __eq__(self, other):
1066
        # Check if the other fragment/molecule is aligned to the same gene
1067
        if self.match_hash != other.match_hash:
1068
            return False
1069
1070
        if len(self.genes.intersection(other.genes)):
1071
            return self.umi_eq(other)
1072
1073
        return False
1074
1075
1076
1077
1078
class FeatureCountsSingleEndFragment(Fragment):
1079
    """ Class for fragments annotated with featureCounts
1080
1081
    Extracts annotated gene from the XT tag.
1082
    Deduplicates using XT tag and UMI
1083
    Reads without XT tag are flagged as invalid
1084
1085
    """
1086
1087
    def __init__(self,
1088
                 reads,
1089
                 R1_primer_length=4,
1090
                 R2_primer_length=6,
1091
                 assignment_radius=100_000,
1092
                 umi_hamming_distance=1,
1093
                 invert_strand=False,
1094
                 **kwargs
1095
                 ):
1096
1097
        Fragment.__init__(self,
1098
                          reads,
1099
                          assignment_radius=assignment_radius,
1100
                          R1_primer_length=R1_primer_length,
1101
                          R2_primer_length=R2_primer_length,
1102
                          umi_hamming_distance=umi_hamming_distance,**kwargs)
1103
1104
        self.strand = None
1105
        self.gene = None
1106
        self.identify_gene()
1107
1108
        if self.is_valid():
1109
            self.match_hash = (self.strand, self.gene, self.sample)
1110
        else:
1111
            self.match_hash = None
1112
1113
    def __eq__(self, other):
1114
        # Make sure fragments map to the same strand, cheap comparisons
1115
        if self.match_hash != other.match_hash:
1116
            return False
1117
1118
        # Make sure UMI's are similar enough, more expensive hamming distance
1119
        # calculation
1120
        return self.umi_eq(other)
1121
1122
    def is_valid(self):
1123
        return self.gene is not None
1124
1125
    def identify_gene(self):
1126
        self.valid = False
1127
        for read in self.reads:
1128
            if read is not None and read.has_tag('XT'):
1129
                self.gene = read.get_tag('XT')
1130
                self.valid = True
1131
                return self.gene
1132
1133
1134
1135
class FeatureCountsFullLengthFragment(FeatureCountsSingleEndFragment):
1136
    """ Class for fragments annotated with featureCounts, with multiple reads covering a gene
1137
1138
    Extracts annotated gene from the XT tag.
1139
    Deduplicates using XT tag and UMI
1140
    Reads without XT tag are flagged as invalid
1141
1142
    """
1143
1144
    def __init__(self,
1145
                 reads,
1146
                 R1_primer_length=4,
1147
                 R2_primer_length=6,
1148
                 assignment_radius=10_000,
1149
                 umi_hamming_distance=1,
1150
                 invert_strand=False,
1151
                 **kwargs
1152
                 ):
1153
1154
        FeatureCountsSingleEndFragment.__init__(self,
1155
                          reads,
1156
                          assignment_radius=assignment_radius,
1157
                          R1_primer_length=R1_primer_length,
1158
                          R2_primer_length=R2_primer_length,
1159
                          umi_hamming_distance=umi_hamming_distance,
1160
                          **kwargs
1161
1162
                          )
1163
1164
    def __eq__(self, other):
1165
        # Make sure fragments map to the same strand, cheap comparisons
1166
        if self.match_hash != other.match_hash:
1167
            return False
1168
1169
        #Calculate distance
1170
        if min(abs(self.span[1] -
1171
                   other.span[1]), abs(self.span[2] -
1172
                                       other.span[2])) > self.assignment_radius:
1173
            return False
1174
1175
        # Make sure UMI's are similar enough, more expensive hamming distance
1176
        # calculation
1177
        return self.umi_eq(other)
1178
1179
1180
class FragmentWithoutPosition(Fragment):
1181
    """ Fragment without a specific location on a contig"""
1182
    def get_site_location(self):
1183
        if self.has_valid_span():
1184
            return self.span[0], 0
1185
        else:
1186
            return '*', 0
1187
1188
class FragmentStartPosition(Fragment):
1189
    """ Fragment without a specific location on a contig"""
1190
    def get_site_location(self):
1191
        for read in self:
1192
            if read is not None and not read.is_unmapped:
1193
                return read.reference_name, read.reference_start
1194
        return None
1195
1196
1197
1198
class FragmentWithoutUMI(Fragment):
1199
    """
1200
    Use this class when no UMI information is available
1201
    """
1202
1203
    def __init__(self, reads, **kwargs):
1204
        Fragment.__init__(self, reads, **kwargs)
1205
1206
    # remove the set_umi function
1207
    def set_umi(self, **kwargs):
1208
        pass
1209
1210
    # Replace the equality function
1211
    def __eq__(self, other):  # other can also be a Molecule!
1212
        # Make sure fragments map to the same strand, cheap comparisons
1213
        if self.sample != other.sample:
1214
            return False
1215
1216
        if self.strand != other.strand:
1217
            return False
1218
1219
        if min(abs(self.span[1] -
1220
                   other.span[1]), abs(self.span[2] -
1221
                                       other.span[2])) > self.assignment_radius:
1222
            return False
1223
1224
        # Sample matches and starting position is within the defined span
1225
        # radius
1226
        return True