a b/singlecellmultiomics/fragment/nlaIII.py
1
from singlecellmultiomics.fragment import Fragment
2
from singlecellmultiomics.utils.sequtils import hamming_distance
3
4
5
class NlaIIIFragment(Fragment):
6
    def __init__(self,
7
                 reads,
8
                 R1_primer_length=4,
9
                 R2_primer_length=6,
10
                 assignment_radius=1_000,
11
                 umi_hamming_distance=1,
12
                 invert_strand=False,
13
                 check_motif=True,
14
                 no_overhang =False, # CATG is present OUTSIDE the fragment
15
                 cut_location_offset=-4,
16
                 reference=None, #Reference is required when no_overhang=True
17
                 allow_cycle_shift=False,
18
                 use_allele_tag = False, # Use existing DA tag for molecule deduplication
19
                 no_umi_cigar_processing=False, **kwargs
20
                 ):
21
        self.invert_strand = invert_strand
22
        self.no_overhang = no_overhang
23
        self.reference = reference
24
        self.allow_cycle_shift = allow_cycle_shift
25
        self.cut_location_offset = cut_location_offset
26
        self.no_umi_cigar_processing= no_umi_cigar_processing
27
        self.use_allele_tag = use_allele_tag
28
        self.check_motif=check_motif
29
30
        if self.no_overhang and reference  is None:
31
            raise ValueError('Supply a reference handle when no_overhang=True')
32
        if self.no_overhang and not self.check_motif:
33
            raise ValueError('no_overhang=True is not compatible with check_motif=False, as there is no way to deduplicate. Consider using method "chic"')
34
35
        Fragment.__init__(self,
36
                          reads,
37
                          assignment_radius=assignment_radius,
38
                          R1_primer_length=R1_primer_length,
39
                          R2_primer_length=R2_primer_length,
40
                          umi_hamming_distance=umi_hamming_distance, **kwargs)
41
        # set NLAIII cut site given reads
42
        self.strand = None
43
        self.site_location = None
44
        self.cut_site_strand = None
45
        if self.identify_site():
46
47
            self.found_valid_site = True
48
        else:
49
            self.found_valid_site = False
50
51
        if self.is_valid():
52
53
            if self.use_allele_tag:
54
55
                allele = 'n'
56
                for read in self.reads:
57
                    if read is not None and read.has_tag('DA'):
58
                        allele = read.get_tag('DA')
59
60
                self.match_hash = (
61
                    allele,
62
                    self.strand,
63
                    self.cut_site_strand,
64
                    self.site_location[0],
65
                    self.site_location[1],
66
                    self.sample)
67
68
            else:
69
                self.match_hash = (
70
                    self.strand,
71
                    self.cut_site_strand,
72
                    self.site_location[0],
73
                    self.site_location[1],
74
                    self.sample)
75
        else:
76
            self.match_hash = None
77
78
    def set_site(self, site_chrom, site_pos, site_strand=None, valid=True):
79
        if not valid :
80
            self.found_valid_site = False
81
        else:
82
            self.found_valid_site = True
83
            self.set_meta('DS', site_pos)
84
85
        if site_strand is not None:
86
            if self.invert_strand:
87
                self.set_meta('RS', not site_strand)
88
            else:
89
                self.set_meta('RS', site_strand)
90
        self.set_strand(site_strand)
91
        self.site_location = (site_chrom, site_pos)
92
        self.cut_site_strand = site_strand
93
94
    def get_safe_span(self, allow_unsafe=True):
95
96
    # For a mapped read pair it can be important to figure out which bases are actual genomic signal
97
    # Al sequence behind and aligned to the random primer(s) cannot be trusted and should be masked out
98
    # secondly all signal before the starting location of R1 cannot be trusted
99
    # This function returns a lower and higher bound of the locations within the fragment that can be trusted
100
    # ASCII art: (H is primer sequence)
101
    #           R1 H------------------------>
102
    #   <------------------HH R2
103
104
    # Output: (E is emitted)
105
    #           R1 HEEEEEEE----------------->
106
    #   <------------------HH R2
107
108
109
        if self.R1_primer_length==0 and self.R2_primer_length==0:
110
            starts = tuple( read.reference_start for read in self if read is not None and not read.is_unmapped )
111
            ends = tuple( read.reference_end for read in self if read is not None and not read.is_unmapped )
112
            return min( min(starts), min(ends) ), max( max(starts), max(ends) )
113
114
        R1, R2 = self.reads
115
116
        if (R1 is None or R1.is_unmapped) and allow_unsafe:
117
            return R2.reference_start,R2.reference_end
118
119
        if (R2 is None or R2.is_unmapped) and allow_unsafe:
120
            return R1.reference_start,R1.reference_end
121
122
123
        if  (R1 is None and not allow_unsafe) or R2 is None or R1.is_unmapped:
124
            # This is an annoying situation, we cannot determine what bases can be trusted
125
            raise ValueError('Genomic locations cannot be determined')
126
127
        if R2.is_unmapped:
128
            # This is an annoying situation, we cannot determine what bases can be trusted
129
            raise ValueError('Genomic locations cannot be determined')
130
131
        if R1.is_reverse==R2.is_reverse:
132
            raise ValueError('Fragment incorrectly mapped') # The fragment is not correctly mapped
133
134
        if not R1.is_reverse: # R1 is forward, R2 is reverse
135
            #           R1 H------------------------>
136
            #   <------------------HH R2
137
            start = R1.reference_start+ self.R1_primer_length
138
            end = R2.reference_end -  self.R2_primer_length
139
        else:
140
            #           R2 HH------------------------>
141
            #   <------------------HH R1
142
            start = R2.reference_start+ self.R2_primer_length
143
            end = R1.reference_end -  self.R1_primer_length
144
145
        if start>=end:
146
            raise ValueError('Fragment has no size')
147
148
        return start,end
149
150
151
    def identify_site(self):
152
        self.found_valid_site = False
153
        R1, R2 = self.reads
154
        """ Valid configs:
155
        CATG######## R1 ########## ^ ########## R2 ##########
156
        ############ R2 ########## ^ ########### R1 #####CATG  reverse case
157
        !BWA inverts the query sequence if it maps to the negative strand!
158
159
        or R2.is_unmapped:
160
            if R1.is_unmapped and R2.is_unmapped:
161
                self.set_rejection_reason(R1, 'unmapped R1;R2')
162
            elif R1.is_unmapped:
163
                self.set_rejection_reason(R1, 'unmapped R1')
164
                self.set_rejection_reason(R2, 'unmapped R1')
165
            else:
166
                self.set_rejection_reason(R1, 'unmapped R2')
167
                self.set_rejection_reason(R2, 'unmapped R2')
168
            return(None)
169
        """
170
        if R1 is None or R1.is_unmapped:
171
            self.set_rejection_reason('unmapped R1')
172
            return None
173
174
        if self.no_overhang:
175
176
            # scan 3 bp of sequence for CATG
177
            scan_extra_bp = 3
178
            site_coordinate = None
179
            if R1.is_reverse:
180
                motif = self.reference.fetch(R1.reference_name, R1.reference_end, R1.reference_end-self.cut_location_offset+scan_extra_bp)
181
                if 'CATG' in motif:
182
                    offset = motif.find('CATG')
183
                    site_coordinate = R1.reference_end + offset
184
            else:
185
                motif = self.reference.fetch(R1.reference_name,  R1.reference_start+self.cut_location_offset-scan_extra_bp,  R1.reference_start)
186
                if 'CATG' in motif:
187
                    offset = motif[::-1].find('GTAC')
188
                    site_coordinate = R1.reference_start - offset - 4
189
190
            if site_coordinate is None:
191
                self.set_rejection_reason('no_CATG_in_ref', set_qcfail=True)
192
                return None
193
194
            self.set_site(site_strand=R1.is_reverse, site_chrom=R1.reference_name, site_pos=site_coordinate)
195
            self.set_recognized_sequence(motif)
196
            return site_coordinate
197
198
        else:
199
            forward_motif = R1.seq[:4]
200
            rev_motif = R1.seq[-4:]
201
202
203
        r1_start =(R1.reference_end if R1.is_reverse else R1.reference_start)
204
        if not self.no_umi_cigar_processing:
205
            if R1.is_reverse:
206
                if R1.cigartuples[-1][0]==4: # softclipped at end
207
                    r1_start+=R1.cigartuples[-1][1]
208
            else:
209
                if R1.cigartuples[0][0]==4: # softclipped at start
210
                    r1_start-=R1.cigartuples[0][1]
211
212
        if (not self.check_motif or forward_motif == 'CATG') and not R1.is_reverse: # Not reverse = forward
213
            rpos = (R1.reference_name, r1_start)
214
            self.set_site(site_strand=R1.is_reverse, site_chrom=rpos[0], site_pos=rpos[1])
215
            self.set_recognized_sequence(forward_motif)
216
            return(rpos)
217
        elif (not self.check_motif or rev_motif == 'CATG') and R1.is_reverse:
218
            rpos = (R1.reference_name, r1_start - 4)
219
            self.set_site(site_strand=R1.is_reverse, site_chrom=rpos[0], site_pos=rpos[1])
220
            self.set_recognized_sequence(rev_motif)
221
            return(rpos)
222
223
        # Sometimes the cycle is off, this is dangerous though because the cell barcode and UMI might be shifted too!
224
        elif self.allow_cycle_shift and  forward_motif.startswith( 'ATG') and not R1.is_reverse:
225
            rpos = (R1.reference_name, R1.reference_start - 1)
226
            self.set_site(site_strand=R1.is_reverse, site_chrom=rpos[0], site_pos=rpos[1])
227
            self.set_recognized_sequence('ATG')
228
            return(rpos)
229
        elif self.allow_cycle_shift and rev_motif.startswith( 'ATG') and R1.is_reverse:  # First base was trimmed or lost
230
            rpos = (R1.reference_name, R1.reference_end - 3)
231
            self.set_site(site_strand=R1.is_reverse, site_chrom=rpos[0], site_pos=rpos[1])
232
            self.set_recognized_sequence('CAT')
233
            return(rpos)
234
235
        else:
236
237
            if forward_motif == 'CATG' and R1.is_reverse:
238
                self.set_rejection_reason('found CATG R1 REV exp FWD', set_qcfail=True)
239
240
            elif rev_motif == 'CATG' and not R1.is_reverse:
241
                self.set_rejection_reason('found CATG R1 FWD exp REV', set_qcfail=True)
242
            else:
243
                self.set_rejection_reason('no CATG', set_qcfail=True)
244
245
            # Every fragment needs to have a site. Otherwise it will get lost. Use R1 start location as anchor
246
            if R1.is_reverse:
247
                rpos = (R1.reference_name, r1_start - 4)
248
            else:
249
                rpos = (R1.reference_name, r1_start)
250
251
            self.set_site(site_strand=R1.is_reverse, site_chrom=rpos[0], site_pos=rpos[1], valid=False)
252
            return None
253
254
    def get_undigested_site_count(self, reference_handle):
255
        """
256
        Obtain the amount of undigested sites in the span of the fragment
257
258
        Parameters
259
        ----------
260
        reference : pysam.FastaFile or similiar
261
262
        Returns
263
        -------
264
        undigested_site_count : int
265
            amount of undigested cut sites in the mapping span of the fragment
266
267
        Raises:
268
        -------
269
        ValueError : when the span of the molecule is not properly defined
270
        """
271
        if any(e is None for e in self.span):
272
            raise ValueError('Span of the fragment is not well defined')
273
274
        total = reference_handle.fetch(*self.span).count('CATG')
275
        # ignore 1 CATG if it was Detected:
276
        if self.meta.get('RZ') == 'CATG':
277
            total -= 1
278
        return total
279
280
    def is_valid(self):
281
        if self.qcfail:
282
            return False
283
284
        try:
285
            if self.max_fragment_size is not None and self.get_fragment_size()>self.max_fragment_size:
286
                self.set_rejection_reason('FS', set_qcfail=True)
287
                return False
288
        except TypeError:
289
            pass
290
291
        return self.found_valid_site
292
293
    def get_site_location(self):
294
        if self.site_location is not None:
295
            return self.site_location
296
        else:
297
            # We need some kind of coordinate...
298
            for read in self:
299
                if read is not None and read.reference_name is not None and read.reference_start is not None:
300
                    return read.reference_name, read.reference_start
301
302
    def __repr__(self):
303
        site_loc = self.get_site_location()
304
305
        if site_loc is None or len(site_loc)==0:
306
            return Fragment.__repr__(
307
            self) + f'\n\tNo restriction site found'
308
        else:
309
            site_def_str = f'\n\tRestriction site:{site_loc[0]}:{site_loc[1]}'
310
            try:
311
                return Fragment.__repr__(
312
                    self) + site_def_str
313
            except Exception as e:
314
                print(self.get_site_location())
315
                raise
316
317
    def __eq__(self, other):
318
        # Make sure fragments map to the same strand, cheap comparisons
319
        if self.match_hash != other.match_hash:
320
            return False
321
322
        # Make sure UMI's are similar enough, more expensive hamming distance
323
        # calculation
324
        return self.umi_eq(other)