|
a |
|
b/singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
import re |
|
|
4 |
import singlecellmultiomics.fastqProcessing.fastqIterator as fastqIterator |
|
|
5 |
import string |
|
|
6 |
from singlecellmultiomics.utils.sequtils import hamming_distance |
|
|
7 |
from singlecellmultiomics.tags import * |
|
|
8 |
|
|
|
9 |
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
TagDefinitions = {tag.tag: tag for tag in tags} |
|
|
13 |
|
|
|
14 |
# Obtain metadata from read |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
def metaFromRead(read, tag): |
|
|
18 |
|
|
|
19 |
if tag == 'chrom': |
|
|
20 |
return read.reference_name |
|
|
21 |
|
|
|
22 |
if read.has_tag(tag): |
|
|
23 |
return read.get_tag(tag) |
|
|
24 |
|
|
|
25 |
# Backwards compatibility with BI > bi tag: |
|
|
26 |
if tag=='BI' and read.has_tag('bi'): |
|
|
27 |
return read.get_tag('bi') |
|
|
28 |
|
|
|
29 |
# Forwards compatibility: |
|
|
30 |
if tag=='bi' and read.has_tag('BI'): |
|
|
31 |
return read.get_tag('BI') |
|
|
32 |
|
|
|
33 |
try: |
|
|
34 |
return getattr(read, tag) |
|
|
35 |
except Exception as e: |
|
|
36 |
pass |
|
|
37 |
# print(e) |
|
|
38 |
|
|
|
39 |
return None |
|
|
40 |
|
|
|
41 |
|
|
|
42 |
# Clean a string to be able to be used in a fastq file header |
|
|
43 |
fastqCleanerRegex = re.compile('[^a-zA-Z0-9-_]', re.UNICODE) |
|
|
44 |
|
|
|
45 |
def fqSafe(string) -> str: |
|
|
46 |
""" |
|
|
47 |
Convert input string into a representation which can be stored in a fastq header |
|
|
48 |
|
|
|
49 |
Input: |
|
|
50 |
string(str) : string to clean |
|
|
51 |
|
|
|
52 |
Returns: |
|
|
53 |
cleaned(str) |
|
|
54 |
""" |
|
|
55 |
global fastqCleanerRegex |
|
|
56 |
return(fastqCleanerRegex.sub('', string)) |
|
|
57 |
|
|
|
58 |
|
|
|
59 |
illuminaHeaderSplitRegex = re.compile(':| ', re.UNICODE) |
|
|
60 |
|
|
|
61 |
|
|
|
62 |
class TaggedRecord(): |
|
|
63 |
def __init__( |
|
|
64 |
self, |
|
|
65 |
tagDefinitions, |
|
|
66 |
rawRecord=False, |
|
|
67 |
library=None, |
|
|
68 |
reason=None, |
|
|
69 |
**kwargs): |
|
|
70 |
self.tags = {} # 2 character Key -> value |
|
|
71 |
self.tagDefinitions = tagDefinitions |
|
|
72 |
if rawRecord is not False: |
|
|
73 |
try: |
|
|
74 |
self.fromRawFastq(rawRecord, **kwargs) |
|
|
75 |
except NonMultiplexable: |
|
|
76 |
raise |
|
|
77 |
if library is not None and not 'LY' in self.tags: |
|
|
78 |
self.tags['LY'] = library |
|
|
79 |
if reason is not None: |
|
|
80 |
self.tags['RR'] = reason |
|
|
81 |
|
|
|
82 |
if isinstance(rawRecord, fastqIterator.FastqRecord): |
|
|
83 |
self.sequence = rawRecord.sequence |
|
|
84 |
self.qualities = rawRecord.qual |
|
|
85 |
self.plus = rawRecord.plus |
|
|
86 |
|
|
|
87 |
def addTagByTag( |
|
|
88 |
self, |
|
|
89 |
tagName, |
|
|
90 |
value, |
|
|
91 |
isPhred=None, |
|
|
92 |
decodePhred=False, |
|
|
93 |
cast_type=str, |
|
|
94 |
make_safe=True): |
|
|
95 |
|
|
|
96 |
if isPhred is None: |
|
|
97 |
isPhred = self.tagDefinitions[tagName].isPhred |
|
|
98 |
if cast_type and not isinstance(value, cast_type): |
|
|
99 |
value = cast_type(value) |
|
|
100 |
if isPhred: |
|
|
101 |
if decodePhred: |
|
|
102 |
# Convert encoded phred scores back to original ascii |
|
|
103 |
self.tags[tagName] = fastqHeaderSafeQualitiesToPhred( |
|
|
104 |
value, method=3) |
|
|
105 |
else: |
|
|
106 |
self.tags[tagName] = phredToFastqHeaderSafeQualities( |
|
|
107 |
value, method=3) |
|
|
108 |
else: |
|
|
109 |
if cast_type is str: |
|
|
110 |
if make_safe: |
|
|
111 |
self.tags[tagName] = fqSafe(value) |
|
|
112 |
else: |
|
|
113 |
self.tags[tagName] = value |
|
|
114 |
else: |
|
|
115 |
self.tags[tagName] = value |
|
|
116 |
|
|
|
117 |
def __repr__(self): |
|
|
118 |
return self.asFastq() |
|
|
119 |
|
|
|
120 |
def asFastq( |
|
|
121 |
self, |
|
|
122 |
sequence=None, |
|
|
123 |
dirAtt=None, |
|
|
124 |
baseQualities=None, |
|
|
125 |
format='illumina'): |
|
|
126 |
if sequence is None: |
|
|
127 |
if self.sequence is None: |
|
|
128 |
raise ValueError() |
|
|
129 |
sequence = self.sequence |
|
|
130 |
if dirAtt is None: |
|
|
131 |
if self.plus is None: |
|
|
132 |
raise ValueError() |
|
|
133 |
dirAtt = self.plus |
|
|
134 |
if baseQualities is None: |
|
|
135 |
if self.qualities is None: |
|
|
136 |
raise ValueError() |
|
|
137 |
baseQualities = self.qualities |
|
|
138 |
|
|
|
139 |
header = ";".join([f"{attribute}:{value}" for attribute, value in self.tags.items( |
|
|
140 |
) if not self.tagDefinitions[attribute].doNotWrite]) |
|
|
141 |
if len(header) > 255: # the header length is stored as uint_8 and includes a null character. The maximum length is thus 255 |
|
|
142 |
raise ValueError( |
|
|
143 |
f"The length of the demultiplexed header is longer than 255 characters. Try to keep your library name below 60 characters. Reduce the length of the header. For example by using -merge _ which will not put the flow cell in the sample name. The header looks like this: {header}") |
|
|
144 |
return f'@{header}\n{sequence}\n{dirAtt}\n{baseQualities}\n' |
|
|
145 |
|
|
|
146 |
def has_tag(self, tag): |
|
|
147 |
return tag in self.tags |
|
|
148 |
|
|
|
149 |
def asIlluminaHeader(self): |
|
|
150 |
return '{Is}:{RN}:{Fc}:{La}:{Ti}:{CX}:{CY}'.format(**self.tags) |
|
|
151 |
|
|
|
152 |
|
|
|
153 |
|
|
|
154 |
|
|
|
155 |
def parse_3dec_header(self,fastqRecord, indexFileParser, indexFileAlias): |
|
|
156 |
|
|
|
157 |
instrument = 'UNK' |
|
|
158 |
runNumber = 'UNK' |
|
|
159 |
flowCellId = 'UNK' |
|
|
160 |
indexSequence = 'N' |
|
|
161 |
lane = 'UNK' |
|
|
162 |
tile = 'UNK' |
|
|
163 |
clusterXpos = '-1' |
|
|
164 |
clusterYpos = '-1' |
|
|
165 |
readPairNumber = '0' |
|
|
166 |
isFiltered = '0' |
|
|
167 |
controlNumber = '0' |
|
|
168 |
|
|
|
169 |
# 3-DEC: @Cluster_s_1_1101_2 |
|
|
170 |
if fastqRecord.header.count('_') == 4: |
|
|
171 |
_cluster_, _s_, lane, tile, readPairNumber = fastqRecord.header.split( |
|
|
172 |
'_') |
|
|
173 |
# check that this s thingy is at the right place |
|
|
174 |
assert(_s_ == 's') |
|
|
175 |
else: |
|
|
176 |
raise |
|
|
177 |
|
|
|
178 |
self.tags.update({ |
|
|
179 |
'Is': instrument, |
|
|
180 |
'RN': runNumber, |
|
|
181 |
'Fc': flowCellId, |
|
|
182 |
'La': lane, |
|
|
183 |
'Ti': tile, |
|
|
184 |
'CX': clusterXpos, |
|
|
185 |
'CY': clusterYpos, |
|
|
186 |
'RP': readPairNumber, |
|
|
187 |
'Fi': isFiltered, |
|
|
188 |
'CN': controlNumber |
|
|
189 |
}) |
|
|
190 |
|
|
|
191 |
|
|
|
192 |
def _parse_illumina_header(self,header, indexFileParser = None, indexFileAlias = None): |
|
|
193 |
|
|
|
194 |
try: |
|
|
195 |
instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos, readPairNumber, isFiltered, controlNumber, indexSequence = header.replace(' ',':').split(':') |
|
|
196 |
except BaseException: |
|
|
197 |
|
|
|
198 |
try: |
|
|
199 |
instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos, readPairNumber, isFiltered, controlNumber = illuminaHeaderSplitRegex.split( |
|
|
200 |
header.replace('::', '')) |
|
|
201 |
indexSequence = "N" |
|
|
202 |
except BaseException: |
|
|
203 |
|
|
|
204 |
try: |
|
|
205 |
instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos = header.split(':') |
|
|
206 |
indexSequence = "N" |
|
|
207 |
readPairNumber = 1 |
|
|
208 |
isFiltered = 0 |
|
|
209 |
controlNumber = 0 |
|
|
210 |
except BaseException: |
|
|
211 |
raise |
|
|
212 |
|
|
|
213 |
self.tags.update({ |
|
|
214 |
'Is': instrument, |
|
|
215 |
'RN': runNumber, |
|
|
216 |
'Fc': flowCellId, |
|
|
217 |
'La': lane, |
|
|
218 |
'Ti': tile, |
|
|
219 |
'CX': clusterXpos, |
|
|
220 |
'CY': clusterYpos, |
|
|
221 |
'RP': readPairNumber, |
|
|
222 |
'Fi': isFiltered, |
|
|
223 |
'CN': controlNumber |
|
|
224 |
}) |
|
|
225 |
|
|
|
226 |
if indexFileParser is not None and indexFileAlias is not None: |
|
|
227 |
# Check if the index is an integer: |
|
|
228 |
try: |
|
|
229 |
indexInteger = int(indexSequence) |
|
|
230 |
indexIdentifier, correctedIndex, hammingDistance = indexSequence, indexSequence, 0 |
|
|
231 |
except ValueError: |
|
|
232 |
indexIdentifier, correctedIndex, hammingDistance = indexFileParser.getIndexCorrectedBarcodeAndHammingDistance( |
|
|
233 |
alias=indexFileAlias, barcode=indexSequence) |
|
|
234 |
|
|
|
235 |
self.tags['aa'] = indexSequence |
|
|
236 |
if correctedIndex is not None: |
|
|
237 |
# |
|
|
238 |
#self.addTagByTag('aA',correctedIndex, isPhred=False) |
|
|
239 |
#self.addTagByTag('aI',indexIdentifier, isPhred=False) |
|
|
240 |
self.tags.update({'aA': correctedIndex, 'aI': indexIdentifier}) |
|
|
241 |
else: |
|
|
242 |
raise NonMultiplexable( |
|
|
243 |
'Could not obtain index for %s %s %s' % |
|
|
244 |
(indexSequence, correctedIndex, indexIdentifier)) |
|
|
245 |
# self.addTagByTag('aA',"None") |
|
|
246 |
# self.addTagByTag('aI',-1) |
|
|
247 |
# self.addTagByTag('ah',hammingDistance) |
|
|
248 |
|
|
|
249 |
else: |
|
|
250 |
#self.addTagByTag('aA',indexSequence, isPhred=False) |
|
|
251 |
self.tags['aa'] = indexSequence |
|
|
252 |
|
|
|
253 |
|
|
|
254 |
def parse_illumina_header(self,fastqRecord, indexFileParser = None, indexFileAlias = None): |
|
|
255 |
return self._parse_illumina_header(fastqRecord.header, indexFileParser=indexFileParser, indexFileAlias=indexFileAlias ) |
|
|
256 |
|
|
|
257 |
|
|
|
258 |
def parse_scmo_header(self, fastqRecord, indexFileParser, indexFileAlias): |
|
|
259 |
self.tags.update( dict( kv.split(':') for kv in fastqRecord.header.strip()[1:].split(';') ) ) |
|
|
260 |
|
|
|
261 |
def fromRawFastq( |
|
|
262 |
self, |
|
|
263 |
fastqRecord, |
|
|
264 |
indexFileParser=None, |
|
|
265 |
indexFileAlias=None): |
|
|
266 |
|
|
|
267 |
try: |
|
|
268 |
self.parse_illumina_header(fastqRecord, indexFileParser, indexFileAlias) |
|
|
269 |
except BaseException: |
|
|
270 |
if fastqRecord.header.startswith('@Is'): |
|
|
271 |
self.parse_scmo_header(fastqRecord, indexFileParser, indexFileAlias) |
|
|
272 |
else: |
|
|
273 |
self.parse_3dec_header(fastqRecord, indexFileParser, indexFileAlias) |
|
|
274 |
|
|
|
275 |
|
|
|
276 |
# NS500413:32:H14TKBGXX:2:11101:16448:1664 1:N:0:: |
|
|
277 |
""" This is the nice and safe way: |
|
|
278 |
self.addTagByTag( 'Is',instrument, isPhred=False) |
|
|
279 |
self.addTagByTag('RN',runNumber, isPhred=False) |
|
|
280 |
self.addTagByTag('Fc',flowCellId, isPhred=False) |
|
|
281 |
self.addTagByTag('La',lane, isPhred=False) |
|
|
282 |
self.addTagByTag('Ti',tile, isPhred=False) |
|
|
283 |
self.addTagByTag('CX',clusterXpos, isPhred=False) |
|
|
284 |
self.addTagByTag('CY',clusterYpos, isPhred=False) |
|
|
285 |
self.addTagByTag('RP',readPairNumber, isPhred=False) |
|
|
286 |
self.addTagByTag('Fi',isFiltered, isPhred=False) |
|
|
287 |
self.addTagByTag('CN',controlNumber, isPhred=False) |
|
|
288 |
""" |
|
|
289 |
|
|
|
290 |
def tagPysamRead(self, read): |
|
|
291 |
|
|
|
292 |
moleculeIdentifier = "" |
|
|
293 |
moleculeQuality = "" |
|
|
294 |
moleculeIdentifiyingTags = [ |
|
|
295 |
('BC', 'QT', False), |
|
|
296 |
('RX', 'RQ', False), |
|
|
297 |
('aA', None, True) |
|
|
298 |
] |
|
|
299 |
try: |
|
|
300 |
QT_missing=False |
|
|
301 |
for tag, qualityTag, required in moleculeIdentifiyingTags: |
|
|
302 |
if self.has_tag(tag) and not self.tags.get(tag) is None: |
|
|
303 |
moleculeIdentifier += self.tags[tag] |
|
|
304 |
if qualityTag is None: # Padding: |
|
|
305 |
moleculeQuality += ('o' * len(self.tags[tag])) |
|
|
306 |
else: |
|
|
307 |
if qualityTag in self.tags: |
|
|
308 |
moleculeQuality += self.tags[qualityTag] |
|
|
309 |
elif qualityTag=='QT': |
|
|
310 |
QT_missing = True |
|
|
311 |
|
|
|
312 |
if required and not self.has_tag(tag): |
|
|
313 |
raise NonMultiplexable('Tag was defined to be required') |
|
|
314 |
|
|
|
315 |
# if len(moleculeQuality)!=len(moleculeIdentifier): |
|
|
316 |
# raise ValueError('Could not reconstruct molecule identifier') |
|
|
317 |
# @todo set this back when we recover QT |
|
|
318 |
|
|
|
319 |
correctedIndex = None if not self.has_tag( |
|
|
320 |
'aA') else self.tags['aA'] |
|
|
321 |
indexSequence = None if not self.has_tag('aa') else self.tags['aa'] |
|
|
322 |
|
|
|
323 |
if correctedIndex is not None and indexSequence is not None: |
|
|
324 |
hd = hamming_distance(indexSequence, correctedIndex) |
|
|
325 |
if hd is None: |
|
|
326 |
raise ValueError( |
|
|
327 |
"Could not resolve hamming distance between {correctedIndex} and {indexSequence}") |
|
|
328 |
self.addTagByTag('ah', hd, isPhred=False, cast_type=int) |
|
|
329 |
|
|
|
330 |
self.addTagByTag('MI', moleculeIdentifier, isPhred=False) |
|
|
331 |
if not QT_missing: |
|
|
332 |
self.addTagByTag('QM', moleculeQuality, isPhred=False) |
|
|
333 |
|
|
|
334 |
except NonMultiplexable: |
|
|
335 |
|
|
|
336 |
# Its bulk |
|
|
337 |
self.tags['BK'] = True |
|
|
338 |
|
|
|
339 |
# Add sample tag: (If possible) |
|
|
340 |
# If the bi tag is present it means we know the index of the cell |
|
|
341 |
# if no bi tag is present, assume the sample is bulk |
|
|
342 |
if 'bi' in self.tags: |
|
|
343 |
self.addTagByTag( |
|
|
344 |
'SM', |
|
|
345 |
f'{self.tags["LY"]}_{self.tags["bi"]}', |
|
|
346 |
isPhred=False) |
|
|
347 |
elif 'BI' in self.tags: |
|
|
348 |
self.addTagByTag( |
|
|
349 |
'SM', |
|
|
350 |
f'{self.tags["LY"]}_{self.tags["BI"]}', |
|
|
351 |
isPhred=False) |
|
|
352 |
|
|
|
353 |
# Remove BI tag |
|
|
354 |
self.tags['bi'] = self.tags["BI"] |
|
|
355 |
del self.tags['BI'] |
|
|
356 |
elif "LY" in self.tags: |
|
|
357 |
self.addTagByTag('SM', f'{self.tags["LY"]}_BULK', isPhred=False) |
|
|
358 |
|
|
|
359 |
|
|
|
360 |
|
|
|
361 |
|
|
|
362 |
# Now we defined the desired values of the tags. Write them to the |
|
|
363 |
# record: |
|
|
364 |
for tag, value in self.tags.items(): |
|
|
365 |
# print(tag,value) |
|
|
366 |
if tag in self.tagDefinitions and self.tagDefinitions[tag].isPhred: |
|
|
367 |
value = fastqHeaderSafeQualitiesToPhred(value, method=3) |
|
|
368 |
read.set_tag(tag, value) |
|
|
369 |
|
|
|
370 |
if not QT_missing and read.has_tag('QM') and len( |
|
|
371 |
read.get_tag('QM')) != len( |
|
|
372 |
read.get_tag('MI')): |
|
|
373 |
raise ValueError('QM and MI tag length not matching') |
|
|
374 |
|
|
|
375 |
def fromTaggedFastq(self, fastqRecord): |
|
|
376 |
for keyValue in fastqRecord.header.replace('@', '').strip().split(';'): |
|
|
377 |
key, value = keyValue.split(':') |
|
|
378 |
self.addTagByTag(key, value, decodePhred=True) |
|
|
379 |
|
|
|
380 |
def fromTaggedBamRecord(self, pysamRecord): |
|
|
381 |
try: |
|
|
382 |
for keyValue in pysamRecord.query_name.strip().split(';'): |
|
|
383 |
key, value = keyValue.split(':') |
|
|
384 |
self.addTagByTag(key, value, isPhred=False) |
|
|
385 |
except ValueError: |
|
|
386 |
# Try to parse "Single Cell Discoveries" header |
|
|
387 |
# These have the following header: |
|
|
388 |
#NBXXXXXX:530:HXXXXXX:2:2:17:6;SS:GTCATTAG;CB:GTCATTAG;QT:eeeeeeee;RX:CTGAAC;RQ:aaaaae;SM:SAMPLE_NAME |
|
|
389 |
illumina_header, attributes = pysamRecord.query_name.strip().split(';',1) |
|
|
390 |
|
|
|
391 |
self._parse_illumina_header(illumina_header, indexFileParser=None, indexFileAlias=None ) |
|
|
392 |
|
|
|
393 |
for keyValue in attributes.split(';'): |
|
|
394 |
key, value = keyValue.split(':') |
|
|
395 |
self.addTagByTag(key, value, isPhred=False) |
|
|
396 |
|
|
|
397 |
|
|
|
398 |
def reverseComplement(seq): |
|
|
399 |
global complement |
|
|
400 |
return("".join(complement.get(base, base) for base in reversed(seq))) |
|
|
401 |
|
|
|
402 |
|
|
|
403 |
def phredToFastqHeaderSafeQualities(asciiEncodedPhredScores, method=3): |
|
|
404 |
""" Convert ASCII encoded pred string to fastq safe string. |
|
|
405 |
numeric encoded string (method 0), |
|
|
406 |
or 65 shifted (method 1) which is safe to use in the fastq header""" |
|
|
407 |
if method == 0: |
|
|
408 |
return(",".join([str(ord(phred) - 33) for phred in asciiEncodedPhredScores])) |
|
|
409 |
elif method == 1: |
|
|
410 |
return("".join([chr(ord(phred) + 32) for phred in asciiEncodedPhredScores])) |
|
|
411 |
else: |
|
|
412 |
return("".join([string.ascii_letters[min(max(0, ord(phred) - 33), len(string.ascii_letters))] for phred in asciiEncodedPhredScores])) |
|
|
413 |
|
|
|
414 |
|
|
|
415 |
def fastqHeaderSafeQualitiesToPhred(phred, method=3): |
|
|
416 |
# print(phred) |
|
|
417 |
return "".join((chr(string.ascii_letters.index(v) + 33) for v in phred)) |
|
|
418 |
|
|
|
419 |
|
|
|
420 |
class NonMultiplexable(Exception): |
|
|
421 |
pass |
|
|
422 |
|
|
|
423 |
# The demultplexing strategy converts a tuple if fastq record(s) into a |
|
|
424 |
# demultiplexed record |
|
|
425 |
|
|
|
426 |
# Every demultiplexing strategy has : |
|
|
427 |
# a full name: the name shown in the tool |
|
|
428 |
# shortName : a short name, will be put into EVERY fastq record, so keep it really short |
|
|
429 |
# autoDetectable: a flag indicating if this method should be auto detected; |
|
|
430 |
# for example for whole genome sequencing reads, we cannot tell from the |
|
|
431 |
# reads that it is this data, and the flag should be False |
|
|
432 |
|
|
|
433 |
# Method demultiplex( *records (R1, R2, R3 ... ) |
|
|
434 |
# Raises NonMultiplexable Exception if the records do not yield a valid |
|
|
435 |
# result (which is used to determine if the demultplexing method is valid |
|
|
436 |
# to use) |
|
|
437 |
|
|
|
438 |
# Upon initialisation the strategies recieve a dictionary containing the barcodes loaded by the barcode file parser |
|
|
439 |
# (barcodeMapping) |
|
|
440 |
|
|
|
441 |
|
|
|
442 |
class DemultiplexingStrategy(object): |
|
|
443 |
|
|
|
444 |
def __init__(self): |
|
|
445 |
self.shortName = 'place holder demultiplexing method' |
|
|
446 |
self.longName = 'placeHolder' |
|
|
447 |
self.autoDetectable = False |
|
|
448 |
self.description = 'inherit this class to build your own demultipexing strategy' |
|
|
449 |
|
|
|
450 |
self.indexSummary = '' |
|
|
451 |
self.barcodeSummary = '' |
|
|
452 |
|
|
|
453 |
def demultiplex(self, records, **kwargs): |
|
|
454 |
raise NotImplementedError() |
|
|
455 |
|
|
|
456 |
def __repr__(self): |
|
|
457 |
return f'{self.longName} {self.shortName} {self.description} DemultiplexingStrategy' |
|
|
458 |
|
|
|
459 |
def getParserSummary(self): |
|
|
460 |
return(' ' + self.indexSummary + '\n barcodes:' + self.barcodeSummary) |
|
|
461 |
|
|
|
462 |
|
|
|
463 |
class IlluminaBaseDemultiplexer(DemultiplexingStrategy): |
|
|
464 |
|
|
|
465 |
def __init__( |
|
|
466 |
self, |
|
|
467 |
indexFileParser, |
|
|
468 |
indexFileAlias='illumina_merged_ThruPlex48S_RP', |
|
|
469 |
**kwargs): |
|
|
470 |
DemultiplexingStrategy.__init__(self) |
|
|
471 |
self.indexFileParser = indexFileParser |
|
|
472 |
self.illuminaIndicesAlias = indexFileAlias |
|
|
473 |
self.shortName = 'ILLU' |
|
|
474 |
self.longName = 'IlluminaDemux' |
|
|
475 |
self.description = 'Demultiplex as a bulk sample' |
|
|
476 |
self.indexSummary = f'sequencing indices: {indexFileAlias}' |
|
|
477 |
|
|
|
478 |
def demultiplex( |
|
|
479 |
self, |
|
|
480 |
records, |
|
|
481 |
inherited=False, |
|
|
482 |
library=None, |
|
|
483 |
reason=None, |
|
|
484 |
**kwargs): |
|
|
485 |
global TagDefinitions |
|
|
486 |
|
|
|
487 |
try: |
|
|
488 |
if inherited: |
|
|
489 |
return [ |
|
|
490 |
TaggedRecord( |
|
|
491 |
rawRecord=record, |
|
|
492 |
tagDefinitions=TagDefinitions, |
|
|
493 |
indexFileParser=self.indexFileParser, |
|
|
494 |
indexFileAlias=self.illuminaIndicesAlias, |
|
|
495 |
library=library, |
|
|
496 |
reason=reason) for record in records] |
|
|
497 |
else: |
|
|
498 |
return [ |
|
|
499 |
TaggedRecord( |
|
|
500 |
rawRecord=record, |
|
|
501 |
tagDefinitions=TagDefinitions, |
|
|
502 |
indexFileParser=self.indexFileParser, |
|
|
503 |
indexFileAlias=self.illuminaIndicesAlias, |
|
|
504 |
library=library, |
|
|
505 |
reason=reason).asFastq( |
|
|
506 |
record.sequence, |
|
|
507 |
record.plus, |
|
|
508 |
record.qual) for record in records] |
|
|
509 |
except NonMultiplexable: |
|
|
510 |
raise |
|
|
511 |
|
|
|
512 |
# Base strategy for read pairs which have both an umi and sample barcode, the barcode and umi are next to another and continuous |
|
|
513 |
class UmiBarcodeDemuxMethod(IlluminaBaseDemultiplexer): |
|
|
514 |
|
|
|
515 |
def __init__( |
|
|
516 |
self, |
|
|
517 |
umiRead=0, |
|
|
518 |
umiStart=0, |
|
|
519 |
umiLength=6, |
|
|
520 |
barcodeRead=0, |
|
|
521 |
barcodeStart=6, |
|
|
522 |
barcodeLength=8, |
|
|
523 |
barcodeFileParser=None, |
|
|
524 |
barcodeFileAlias=None, |
|
|
525 |
indexFileParser=None, |
|
|
526 |
indexFileAlias='illumina_merged_ThruPlex48S_RP', |
|
|
527 |
random_primer_read=None, |
|
|
528 |
random_primer_length=6, |
|
|
529 |
random_primer_end=False, # True for end, False for start |
|
|
530 |
**kwargs): |
|
|
531 |
self.description = '' |
|
|
532 |
self.barcodeFileAlias = barcodeFileAlias |
|
|
533 |
self.barcodeFileParser = barcodeFileParser |
|
|
534 |
IlluminaBaseDemultiplexer.__init__( |
|
|
535 |
self, |
|
|
536 |
indexFileParser=indexFileParser, |
|
|
537 |
indexFileAlias=indexFileAlias) |
|
|
538 |
self.barcodeSummary = self.barcodeFileAlias |
|
|
539 |
self.umiRead = umiRead # 0:Read 1, 1: Read 2 etc |
|
|
540 |
self.umiStart = umiStart # First base |
|
|
541 |
self.umiLength = umiLength |
|
|
542 |
|
|
|
543 |
self.barcodeRead = barcodeRead |
|
|
544 |
self.barcodeStart = barcodeStart |
|
|
545 |
self.barcodeLength = barcodeLength |
|
|
546 |
self.autoDetectable = False |
|
|
547 |
|
|
|
548 |
self.random_primer_read = random_primer_read |
|
|
549 |
self.random_primer_length = random_primer_length |
|
|
550 |
self.random_primer_end = random_primer_end |
|
|
551 |
|
|
|
552 |
# ranges to capture for read 1 and read 2 |
|
|
553 |
self.sequenceCapture = [slice(None), slice(None)] |
|
|
554 |
if umiLength == 0: |
|
|
555 |
# Barcode only |
|
|
556 |
if barcodeStart != 0: |
|
|
557 |
raise NotImplementedError( |
|
|
558 |
'Complicated slice where we need to capture around a region') |
|
|
559 |
self.sequenceCapture[barcodeRead] = slice(barcodeLength, None) |
|
|
560 |
else: |
|
|
561 |
if umiRead != barcodeRead: |
|
|
562 |
raise NotImplementedError() |
|
|
563 |
if not(umiStart == 0 or barcodeStart == 0): |
|
|
564 |
raise NotImplementedError( |
|
|
565 |
'Complicated slice where we need to capture around a region') |
|
|
566 |
self.sequenceCapture[barcodeRead] = slice( |
|
|
567 |
barcodeLength + umiLength, None) |
|
|
568 |
|
|
|
569 |
if random_primer_read is not None: |
|
|
570 |
|
|
|
571 |
if self.sequenceCapture[random_primer_read].stop is not None: |
|
|
572 |
raise NotImplementedError() |
|
|
573 |
if random_primer_end: |
|
|
574 |
|
|
|
575 |
self.sequenceCapture[random_primer_read] = slice( |
|
|
576 |
self.sequenceCapture[random_primer_read].start, |
|
|
577 |
-random_primer_length, |
|
|
578 |
self.sequenceCapture[random_primer_read].step |
|
|
579 |
) |
|
|
580 |
self.random_primer_slice = slice(-random_primer_length, None, None) |
|
|
581 |
else: |
|
|
582 |
self.sequenceCapture[random_primer_read] = slice( |
|
|
583 |
random_primer_length, |
|
|
584 |
None, |
|
|
585 |
self.sequenceCapture[random_primer_read].step |
|
|
586 |
) |
|
|
587 |
self.random_primer_slice = slice(0, random_primer_length, None) |
|
|
588 |
|
|
|
589 |
def __repr__(self): |
|
|
590 |
try: |
|
|
591 |
return f'{self.longName} bc: {self.barcodeStart}:{self.barcodeLength}, umi: {self.umiStart}:{self.umiLength} {self.description}' |
|
|
592 |
except AttributeError: # Happens when this class is inherited and some of the attributes might have not been set |
|
|
593 |
return f'{self.longName}, {self.description}' |
|
|
594 |
|
|
|
595 |
|
|
|
596 |
def demultiplex(self, records, **kwargs): |
|
|
597 |
|
|
|
598 |
# Check if the supplied reads are mate-pair or single end |
|
|
599 |
if len(records) not in (1, 2): |
|
|
600 |
raise NonMultiplexable('Not mate pair or single end') |
|
|
601 |
|
|
|
602 |
|
|
|
603 |
# Perform first pass demultiplexing of the illumina fragments: |
|
|
604 |
try: |
|
|
605 |
taggedRecords = IlluminaBaseDemultiplexer.demultiplex( |
|
|
606 |
self, records, inherited=True, **kwargs) |
|
|
607 |
except NonMultiplexable: |
|
|
608 |
raise |
|
|
609 |
|
|
|
610 |
rawBarcode = records[self.barcodeRead].sequence[self.barcodeStart: |
|
|
611 |
self.barcodeStart + self.barcodeLength] |
|
|
612 |
barcodeQual = records[self.barcodeRead].qual[self.barcodeStart: |
|
|
613 |
self.barcodeStart + self.barcodeLength] |
|
|
614 |
|
|
|
615 |
barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance( |
|
|
616 |
alias=self.barcodeFileAlias, barcode=rawBarcode) |
|
|
617 |
#print(self.barcodeFileParser,self.barcodeFileAlias,rawBarcode,barcodeIdentifier, barcode, hammingDistance) |
|
|
618 |
if barcodeIdentifier is None: |
|
|
619 |
raise NonMultiplexable( |
|
|
620 |
f'bc:{rawBarcode}_not_matching_{self.barcodeFileAlias}') |
|
|
621 |
|
|
|
622 |
random_primer = None |
|
|
623 |
if self.random_primer_read is not None: |
|
|
624 |
random_primer = records[self.random_primer_read].sequence[self.random_primer_slice] |
|
|
625 |
if self.umiLength != 0: |
|
|
626 |
umi = records[self.umiRead].sequence[self.umiStart:self.umiStart + self.umiLength] |
|
|
627 |
umiQual = records[self.umiRead].qual[self.umiStart:self.umiStart + self.umiLength] |
|
|
628 |
|
|
|
629 |
for tr in taggedRecords: |
|
|
630 |
#tr.addTagByTag('uL', self.umiLength, isPhred=False) |
|
|
631 |
if self.umiLength == 0: |
|
|
632 |
#tr.addTagByTag('MI', barcode, isPhred=False) |
|
|
633 |
#tr.addTagByTag('QM', barcodeQual, isPhred=True) |
|
|
634 |
pass |
|
|
635 |
else: |
|
|
636 |
tr.tags['RX'] = umi |
|
|
637 |
tr.addTagByTag('RQ', umiQual, isPhred=True,cast_type=None) |
|
|
638 |
#tr.addTagByTag('MI', barcode+umi, isPhred=False) |
|
|
639 |
#tr.addTagByTag('QM', barcodeQual+umiQual, isPhred=True) |
|
|
640 |
|
|
|
641 |
""" These can be updated at once |
|
|
642 |
tr.addTagByTag('bi', barcodeIdentifier, isPhred=False) |
|
|
643 |
tr.addTagByTag('bc', rawBarcode, isPhred=False) |
|
|
644 |
tr.addTagByTag('MX', self.shortName, isPhred=False) |
|
|
645 |
tr.addTagByTag('BC', barcode, isPhred=False ) |
|
|
646 |
""" |
|
|
647 |
tr.tags.update({ |
|
|
648 |
'bi': barcodeIdentifier, |
|
|
649 |
'bc': rawBarcode, |
|
|
650 |
'MX': self.shortName, |
|
|
651 |
'BC': barcode |
|
|
652 |
}) |
|
|
653 |
#tr.addTagByTag('hd', hammingDistance, isPhred=False) |
|
|
654 |
if random_primer is not None: |
|
|
655 |
tr.addTagByTag('rS', |
|
|
656 |
random_primer, |
|
|
657 |
isPhred=False, |
|
|
658 |
make_safe=False) |
|
|
659 |
|
|
|
660 |
#tr.addTagByTag('QT', barcodeQual, isPhred=True) |
|
|
661 |
|
|
|
662 |
if len(barcode) != len(barcodeQual): |
|
|
663 |
raise ValueError() |
|
|
664 |
|
|
|
665 |
for rid, (record, taggedRecord) in enumerate( |
|
|
666 |
zip(records, taggedRecords)): |
|
|
667 |
taggedRecord.sequence = record.sequence[self.sequenceCapture[rid]] |
|
|
668 |
taggedRecord.qualities = record.qual[self.sequenceCapture[rid]] |
|
|
669 |
taggedRecord.plus = record.plus |
|
|
670 |
|
|
|
671 |
return taggedRecords |
|
|
672 |
# return [ tr.asFastq(record.sequence[self.sequenceCapture[rid]], record.plus, record.qual[self.sequenceCapture[rid]]) for rid,(tr,record) in enumerate(zip(taggedRecords, records))] |
|
|
673 |
# Add information and rebuild header |
|
|
674 |
#header = f'@UMI:{umi};UMIQ:{umiQual};CBI:{barcodeIdentifier};CB:{barcode};CBQ:{barcodeQual};' |
|
|
675 |
|
|
|
676 |
# return fastqIterator.FastqRecord(header, records[1].sequence, |
|
|
677 |
# records[1].plus, records[1].qual ) |
|
|
678 |
|
|
|
679 |
|
|
|
680 |
# Base strategy for read pairs which have both an umi and sample barcode, but are not contiguous, |
|
|
681 |
# for example UMI-BCA-UMI_BCB |
|
|
682 |
|
|
|
683 |
def apply_slices_seq(record, slices): |
|
|
684 |
if len(slices)==0 or record is None: |
|
|
685 |
return '' |
|
|
686 |
return ''.join( (record.sequence[sc] for sc in slices)) |
|
|
687 |
|
|
|
688 |
def apply_slices_qual(record, slices): |
|
|
689 |
if len(slices)==0 or record is None: |
|
|
690 |
return '' |
|
|
691 |
return ''.join( (record.qual[sc] for sc in slices)) |
|
|
692 |
|
|
|
693 |
|
|
|
694 |
class ScatteredUmiBarcodeDemuxMethod(IlluminaBaseDemultiplexer): |
|
|
695 |
|
|
|
696 |
def __init__( |
|
|
697 |
self, |
|
|
698 |
barcode_slices = None, # 2-len Tuple of List of slices where the cell barcode is present in the read (slice(), slice(), ...], [ slice()) |
|
|
699 |
umi_slices = None, # 2-len List of slices where the UMI is present in the read [slice(), slice(), ...], [slice()] # |
|
|
700 |
capture_slices = None, # Slices which indicate what region of read 1 and read2 to store in the final read (required) (slice, slice) |
|
|
701 |
# Note that there is only one slice per read for the capture |
|
|
702 |
barcodeFileParser=None, |
|
|
703 |
barcodeFileAlias=None, |
|
|
704 |
indexFileParser=None, |
|
|
705 |
indexFileAlias='illumina_merged_ThruPlex48S_RP', |
|
|
706 |
random_primer_read=None, |
|
|
707 |
random_primer_length=6, |
|
|
708 |
random_primer_end=False, # True for end, False for start |
|
|
709 |
**kwargs): |
|
|
710 |
self.description = '' |
|
|
711 |
self.barcodeFileAlias = barcodeFileAlias |
|
|
712 |
self.barcodeFileParser = barcodeFileParser |
|
|
713 |
IlluminaBaseDemultiplexer.__init__( |
|
|
714 |
self, |
|
|
715 |
indexFileParser=indexFileParser, |
|
|
716 |
indexFileAlias=indexFileAlias) |
|
|
717 |
self.barcodeSummary = self.barcodeFileAlias |
|
|
718 |
self.barcode_slices = barcode_slices |
|
|
719 |
self.umi_slices = umi_slices |
|
|
720 |
self.capture_slices = capture_slices |
|
|
721 |
self.random_primer_read = random_primer_read |
|
|
722 |
self.random_primer_length = random_primer_length |
|
|
723 |
self.random_primer_end = random_primer_end |
|
|
724 |
|
|
|
725 |
|
|
|
726 |
|
|
|
727 |
def demultiplex(self, records, **kwargs): |
|
|
728 |
|
|
|
729 |
# Check if the supplied reads are mate-pair or single end |
|
|
730 |
if len(records) not in (1, 2): |
|
|
731 |
raise NonMultiplexable('Not mate pair or single end') |
|
|
732 |
|
|
|
733 |
|
|
|
734 |
# Perform first pass demultiplexing of the illumina fragments: |
|
|
735 |
try: |
|
|
736 |
taggedRecords = IlluminaBaseDemultiplexer.demultiplex( |
|
|
737 |
self, records, inherited=True, **kwargs) |
|
|
738 |
except NonMultiplexable: |
|
|
739 |
raise |
|
|
740 |
|
|
|
741 |
|
|
|
742 |
# Extract the UMI barcode_slices |
|
|
743 |
umi, umi_qual = ''.join( (apply_slices_seq(record, slicer) for record, slicer in zip(records, self.umi_slices)) ), ''.join( (apply_slices_qual(record, slicer) for record, slicer in zip(records, self.umi_slices)) ) |
|
|
744 |
raw_barcode, raw_barcode_qual = ''.join( (apply_slices_seq(record, slicer) for record, slicer in zip(records, self.barcode_slices)) ), ''.join( (apply_slices_qual(record, slicer) for record, slicer in zip(records, self.barcode_slices)) ) |
|
|
745 |
|
|
|
746 |
barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance( |
|
|
747 |
alias=self.barcodeFileAlias, barcode=raw_barcode) |
|
|
748 |
|
|
|
749 |
if barcodeIdentifier is None: |
|
|
750 |
raise NonMultiplexable( |
|
|
751 |
f'bc:{raw_barcode}_not_matching_{self.barcodeFileAlias}') |
|
|
752 |
|
|
|
753 |
random_primer = None |
|
|
754 |
if self.random_primer_read is not None: |
|
|
755 |
random_primer = records[self.random_primer_read].sequence[self.random_primer_slice] |
|
|
756 |
|
|
|
757 |
|
|
|
758 |
for tr in taggedRecords: |
|
|
759 |
#tr.addTagByTag('uL', self.umiLength, isPhred=False) |
|
|
760 |
if len(umi)>0: |
|
|
761 |
tr.tags['RX'] = umi |
|
|
762 |
tr.addTagByTag('RQ', umi_qual, isPhred=True,cast_type=None) |
|
|
763 |
|
|
|
764 |
tr.tags.update({ |
|
|
765 |
'bi': barcodeIdentifier, |
|
|
766 |
'bc': raw_barcode, |
|
|
767 |
'MX': self.shortName, |
|
|
768 |
'BC': barcode |
|
|
769 |
}) |
|
|
770 |
if random_primer is not None: |
|
|
771 |
tr.addTagByTag('rS', |
|
|
772 |
random_primer, |
|
|
773 |
isPhred=False, |
|
|
774 |
make_safe=False) |
|
|
775 |
|
|
|
776 |
for rid, (record, taggedRecord) in enumerate( |
|
|
777 |
zip(records, taggedRecords)): |
|
|
778 |
taggedRecord.sequence = record.sequence[self.capture_slices[rid]] |
|
|
779 |
taggedRecord.qualities = record.qual[self.capture_slices[rid]] |
|
|
780 |
taggedRecord.plus = record.plus |
|
|
781 |
|
|
|
782 |
return taggedRecords |