Switch to unified view

a b/singlecellmultiomics/fragment/chic.py
1
from singlecellmultiomics.fragment import Fragment
2
from singlecellmultiomics.utils.sequtils import hamming_distance
3
4
5
class CHICFragment(Fragment):
6
    def __init__(self,
7
                 reads,
8
                 R1_primer_length=4,
9
                 R2_primer_length=6, # Is automatically detected now
10
                 assignment_radius=0,
11
                 umi_hamming_distance=1,
12
                 invert_strand=False,
13
                 no_umi_cigar_processing=False,
14
                 **kwargs
15
                 ):
16
        self.invert_strand = invert_strand
17
18
19
        # Perform random primer autodect,
20
        # When the demux profile is MX:Z:scCHIC384C8U3l, then there is no random primer
21
        for read in reads:
22
            if read is not None and read.has_tag('MX'):
23
                if read.get_tag('MX')[-1]=='l':
24
                    R2_primer_length = 0
25
                break
26
27
        Fragment.__init__(self,
28
                          reads,
29
                          assignment_radius=assignment_radius,
30
                          R1_primer_length=R1_primer_length,
31
                          R2_primer_length=R2_primer_length,
32
                          umi_hamming_distance=umi_hamming_distance,
33
                          max_NUC_stretch = 18,
34
                          **kwargs
35
36
                )
37
38
        # set CHIC cut site given reads
39
        self.no_umi_cigar_processing = no_umi_cigar_processing
40
        self.strand = None
41
        self.ligation_motif = None
42
        self.site_location = None
43
        self.cut_site_strand = None
44
        self.found_valid_site = False
45
        self.identify_site()
46
47
48
        if self.is_valid():
49
50
            if self.assignment_radius==0:
51
                self.match_hash = (
52
                    self.strand,
53
                    self.cut_site_strand,
54
                    self.site_location[0],
55
                    self.site_location[1],
56
                    self.sample)
57
            else:
58
                self.match_hash = (
59
                    self.cut_site_strand,
60
                    self.site_location[0],
61
                    self.sample)
62
        else:
63
            self.match_hash = None
64
65
    def set_site(
66
            self,
67
            site_chrom,
68
            site_pos,
69
            site_strand=None,
70
            is_trimmed=False):
71
        if self.found_valid_site:
72
            self.set_meta('DS', site_pos)
73
        if site_strand is not None:
74
            self.set_meta('RS', site_strand)
75
        self.set_strand(site_strand)
76
        self.site_location = (site_chrom, site_pos)
77
        self.cut_site_strand = site_strand
78
79
    def identify_site(self):
80
        self.found_valid_site = False
81
        R1 = self.get_R1()
82
83
        if R1 is None:
84
            self.set_rejection_reason("R1_undefined")
85
            return None
86
87
        if R1.has_tag('lh'):
88
            self.ligation_motif = R1.get_tag('lh')
89
90
        """ Valid configs:
91
        Observed read pair:
92
        T######## R1 ########## ^ ########## R2 ##########
93
        Real molecule:
94
        A###########################################################
95
96
        Real molecule: and adapter:
97
        ADAPTER: 5' NNNNT 3'
98
                        A{A:80%,N:20%}NNN [CHIC MOLECULE]
99
                                      ^ real cut site
100
        """
101
102
        is_trimmed = (R1.has_tag('MX') and R1.get_tag(
103
            'MX').startswith('scCHIC'))
104
105
        if R1.is_unmapped:
106
            self.set_rejection_reason("R1_unmapped")
107
            return(None)
108
109
        ### Check if the orientation of the reads makes sense, reads need to point inwards
110
        # | ----- >   < ---- |
111
        if self.has_R2():
112
            R2 = self.get_R2()
113
            if not R2.is_unmapped:
114
                if R1.is_reverse and not R2.is_reverse:
115
                    pass
116
                elif not R1.is_reverse and R2.is_reverse:
117
                    pass
118
                else:
119
                    # it does not make sense ...
120
                    self.set_rejection_reason("orientation")
121
                    return(None)
122
123
124
        # Identify the start coordinate of Read 1 by reading the amount of softclips on the start of the read
125
        r1_start =(R1.reference_end+1 if R1.is_reverse else R1.reference_start-2)
126
        if not self.no_umi_cigar_processing:
127
            if R1.is_reverse:
128
                if R1.cigartuples[-1][0]==4: # softclipped at end
129
                    r1_start+=R1.cigartuples[-1][1]
130
            else:
131
                if R1.cigartuples[0][0]==4: # softclipped at start
132
                    r1_start-=R1.cigartuples[0][1]
133
134
        if is_trimmed:
135
            # The first base of the read has been taken off and the lh tag is
136
            # already set, this can be copied to RZ
137
            self.found_valid_site = True
138
            self.set_site(
139
                site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse,
140
                # We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand
141
                # On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream
142
                site_chrom=R1.reference_name,
143
                site_pos=r1_start,
144
                is_trimmed=True
145
            )
146
147
        else:
148
            self.found_valid_site = True
149
            self.set_site(
150
                site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse,
151
                # We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand
152
                # On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream
153
                site_chrom=R1.reference_name,
154
                site_pos=(r1_start - 1 if R1.is_reverse else r1_start + 1),
155
                is_trimmed=False)
156
157
    def is_valid(self):
158
        if self.qcfail:
159
            return False
160
161
        if self.max_fragment_size is not None:
162
            try:
163
                size = self.get_fragment_size()
164
                if size>self.max_fragment_size:
165
                    return False
166
            except Exception:
167
                pass
168
169
        return self.found_valid_site
170
171
    def get_site_location(self):
172
        if self.site_location is not None:
173
            return self.site_location
174
        else:
175
            # We need some kind of coordinate...
176
            for read in self:
177
                if read is not None and read.reference_name is not None and read.reference_start is not None:
178
                    return read.reference_name, read.reference_start
179
180
    def __repr__(self):
181
        return Fragment.__repr__(
182
            self) + f'\n\tMNase cut site:{self.get_site_location()}'
183
184
    def __eq__(self, other):
185
        # Make sure fragments map to the same strand, cheap comparisons
186
        if self.match_hash != other.match_hash:
187
            return False
188
189
        if self.site_location is None or other.site_location is None:
190
            return False
191
192
        # Check distance between the fragments:
193
        if self.assignment_radius>0 and abs(self.site_location[1]-other.site_location[1])>self.assignment_radius:
194
            return False
195
196
197
        # Make sure UMI's are similar enough, more expensive hamming distance
198
        # calculation
199
        return self.umi_eq(other)