|
a |
|
b/singlecellmultiomics/fragment/chic.py |
|
|
1 |
from singlecellmultiomics.fragment import Fragment |
|
|
2 |
from singlecellmultiomics.utils.sequtils import hamming_distance |
|
|
3 |
|
|
|
4 |
|
|
|
5 |
class CHICFragment(Fragment): |
|
|
6 |
def __init__(self, |
|
|
7 |
reads, |
|
|
8 |
R1_primer_length=4, |
|
|
9 |
R2_primer_length=6, # Is automatically detected now |
|
|
10 |
assignment_radius=0, |
|
|
11 |
umi_hamming_distance=1, |
|
|
12 |
invert_strand=False, |
|
|
13 |
no_umi_cigar_processing=False, |
|
|
14 |
**kwargs |
|
|
15 |
): |
|
|
16 |
self.invert_strand = invert_strand |
|
|
17 |
|
|
|
18 |
|
|
|
19 |
# Perform random primer autodect, |
|
|
20 |
# When the demux profile is MX:Z:scCHIC384C8U3l, then there is no random primer |
|
|
21 |
for read in reads: |
|
|
22 |
if read is not None and read.has_tag('MX'): |
|
|
23 |
if read.get_tag('MX')[-1]=='l': |
|
|
24 |
R2_primer_length = 0 |
|
|
25 |
break |
|
|
26 |
|
|
|
27 |
Fragment.__init__(self, |
|
|
28 |
reads, |
|
|
29 |
assignment_radius=assignment_radius, |
|
|
30 |
R1_primer_length=R1_primer_length, |
|
|
31 |
R2_primer_length=R2_primer_length, |
|
|
32 |
umi_hamming_distance=umi_hamming_distance, |
|
|
33 |
max_NUC_stretch = 18, |
|
|
34 |
**kwargs |
|
|
35 |
|
|
|
36 |
) |
|
|
37 |
|
|
|
38 |
# set CHIC cut site given reads |
|
|
39 |
self.no_umi_cigar_processing = no_umi_cigar_processing |
|
|
40 |
self.strand = None |
|
|
41 |
self.ligation_motif = None |
|
|
42 |
self.site_location = None |
|
|
43 |
self.cut_site_strand = None |
|
|
44 |
self.found_valid_site = False |
|
|
45 |
self.identify_site() |
|
|
46 |
|
|
|
47 |
|
|
|
48 |
if self.is_valid(): |
|
|
49 |
|
|
|
50 |
if self.assignment_radius==0: |
|
|
51 |
self.match_hash = ( |
|
|
52 |
self.strand, |
|
|
53 |
self.cut_site_strand, |
|
|
54 |
self.site_location[0], |
|
|
55 |
self.site_location[1], |
|
|
56 |
self.sample) |
|
|
57 |
else: |
|
|
58 |
self.match_hash = ( |
|
|
59 |
self.cut_site_strand, |
|
|
60 |
self.site_location[0], |
|
|
61 |
self.sample) |
|
|
62 |
else: |
|
|
63 |
self.match_hash = None |
|
|
64 |
|
|
|
65 |
def set_site( |
|
|
66 |
self, |
|
|
67 |
site_chrom, |
|
|
68 |
site_pos, |
|
|
69 |
site_strand=None, |
|
|
70 |
is_trimmed=False): |
|
|
71 |
if self.found_valid_site: |
|
|
72 |
self.set_meta('DS', site_pos) |
|
|
73 |
if site_strand is not None: |
|
|
74 |
self.set_meta('RS', site_strand) |
|
|
75 |
self.set_strand(site_strand) |
|
|
76 |
self.site_location = (site_chrom, site_pos) |
|
|
77 |
self.cut_site_strand = site_strand |
|
|
78 |
|
|
|
79 |
def identify_site(self): |
|
|
80 |
self.found_valid_site = False |
|
|
81 |
R1 = self.get_R1() |
|
|
82 |
|
|
|
83 |
if R1 is None: |
|
|
84 |
self.set_rejection_reason("R1_undefined") |
|
|
85 |
return None |
|
|
86 |
|
|
|
87 |
if R1.has_tag('lh'): |
|
|
88 |
self.ligation_motif = R1.get_tag('lh') |
|
|
89 |
|
|
|
90 |
""" Valid configs: |
|
|
91 |
Observed read pair: |
|
|
92 |
T######## R1 ########## ^ ########## R2 ########## |
|
|
93 |
Real molecule: |
|
|
94 |
A########################################################### |
|
|
95 |
|
|
|
96 |
Real molecule: and adapter: |
|
|
97 |
ADAPTER: 5' NNNNT 3' |
|
|
98 |
A{A:80%,N:20%}NNN [CHIC MOLECULE] |
|
|
99 |
^ real cut site |
|
|
100 |
""" |
|
|
101 |
|
|
|
102 |
is_trimmed = (R1.has_tag('MX') and R1.get_tag( |
|
|
103 |
'MX').startswith('scCHIC')) |
|
|
104 |
|
|
|
105 |
if R1.is_unmapped: |
|
|
106 |
self.set_rejection_reason("R1_unmapped") |
|
|
107 |
return(None) |
|
|
108 |
|
|
|
109 |
### Check if the orientation of the reads makes sense, reads need to point inwards |
|
|
110 |
# | ----- > < ---- | |
|
|
111 |
if self.has_R2(): |
|
|
112 |
R2 = self.get_R2() |
|
|
113 |
if not R2.is_unmapped: |
|
|
114 |
if R1.is_reverse and not R2.is_reverse: |
|
|
115 |
pass |
|
|
116 |
elif not R1.is_reverse and R2.is_reverse: |
|
|
117 |
pass |
|
|
118 |
else: |
|
|
119 |
# it does not make sense ... |
|
|
120 |
self.set_rejection_reason("orientation") |
|
|
121 |
return(None) |
|
|
122 |
|
|
|
123 |
|
|
|
124 |
# Identify the start coordinate of Read 1 by reading the amount of softclips on the start of the read |
|
|
125 |
r1_start =(R1.reference_end+1 if R1.is_reverse else R1.reference_start-2) |
|
|
126 |
if not self.no_umi_cigar_processing: |
|
|
127 |
if R1.is_reverse: |
|
|
128 |
if R1.cigartuples[-1][0]==4: # softclipped at end |
|
|
129 |
r1_start+=R1.cigartuples[-1][1] |
|
|
130 |
else: |
|
|
131 |
if R1.cigartuples[0][0]==4: # softclipped at start |
|
|
132 |
r1_start-=R1.cigartuples[0][1] |
|
|
133 |
|
|
|
134 |
if is_trimmed: |
|
|
135 |
# The first base of the read has been taken off and the lh tag is |
|
|
136 |
# already set, this can be copied to RZ |
|
|
137 |
self.found_valid_site = True |
|
|
138 |
self.set_site( |
|
|
139 |
site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse, |
|
|
140 |
# We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand |
|
|
141 |
# On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream |
|
|
142 |
site_chrom=R1.reference_name, |
|
|
143 |
site_pos=r1_start, |
|
|
144 |
is_trimmed=True |
|
|
145 |
) |
|
|
146 |
|
|
|
147 |
else: |
|
|
148 |
self.found_valid_site = True |
|
|
149 |
self.set_site( |
|
|
150 |
site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse, |
|
|
151 |
# We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand |
|
|
152 |
# On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream |
|
|
153 |
site_chrom=R1.reference_name, |
|
|
154 |
site_pos=(r1_start - 1 if R1.is_reverse else r1_start + 1), |
|
|
155 |
is_trimmed=False) |
|
|
156 |
|
|
|
157 |
def is_valid(self): |
|
|
158 |
if self.qcfail: |
|
|
159 |
return False |
|
|
160 |
|
|
|
161 |
if self.max_fragment_size is not None: |
|
|
162 |
try: |
|
|
163 |
size = self.get_fragment_size() |
|
|
164 |
if size>self.max_fragment_size: |
|
|
165 |
return False |
|
|
166 |
except Exception: |
|
|
167 |
pass |
|
|
168 |
|
|
|
169 |
return self.found_valid_site |
|
|
170 |
|
|
|
171 |
def get_site_location(self): |
|
|
172 |
if self.site_location is not None: |
|
|
173 |
return self.site_location |
|
|
174 |
else: |
|
|
175 |
# We need some kind of coordinate... |
|
|
176 |
for read in self: |
|
|
177 |
if read is not None and read.reference_name is not None and read.reference_start is not None: |
|
|
178 |
return read.reference_name, read.reference_start |
|
|
179 |
|
|
|
180 |
def __repr__(self): |
|
|
181 |
return Fragment.__repr__( |
|
|
182 |
self) + f'\n\tMNase cut site:{self.get_site_location()}' |
|
|
183 |
|
|
|
184 |
def __eq__(self, other): |
|
|
185 |
# Make sure fragments map to the same strand, cheap comparisons |
|
|
186 |
if self.match_hash != other.match_hash: |
|
|
187 |
return False |
|
|
188 |
|
|
|
189 |
if self.site_location is None or other.site_location is None: |
|
|
190 |
return False |
|
|
191 |
|
|
|
192 |
# Check distance between the fragments: |
|
|
193 |
if self.assignment_radius>0 and abs(self.site_location[1]-other.site_location[1])>self.assignment_radius: |
|
|
194 |
return False |
|
|
195 |
|
|
|
196 |
|
|
|
197 |
# Make sure UMI's are similar enough, more expensive hamming distance |
|
|
198 |
# calculation |
|
|
199 |
return self.umi_eq(other) |