Switch to unified view

a b/singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
import re
4
import singlecellmultiomics.fastqProcessing.fastqIterator as fastqIterator
5
import string
6
from singlecellmultiomics.utils.sequtils import hamming_distance
7
from singlecellmultiomics.tags import *
8
9
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
10
11
12
TagDefinitions = {tag.tag: tag for tag in tags}
13
14
# Obtain metadata from read
15
16
17
def metaFromRead(read, tag):
18
19
    if tag == 'chrom':
20
        return read.reference_name
21
22
    if read.has_tag(tag):
23
        return read.get_tag(tag)
24
25
    # Backwards compatibility with BI > bi tag:
26
    if tag=='BI' and read.has_tag('bi'):
27
        return read.get_tag('bi')
28
29
    # Forwards compatibility:
30
    if tag=='bi' and read.has_tag('BI'):
31
        return read.get_tag('BI')
32
33
    try:
34
        return getattr(read, tag)
35
    except Exception as e:
36
        pass
37
        # print(e)
38
39
    return None
40
41
42
# Clean a string to be able to be used in a fastq file header
43
fastqCleanerRegex = re.compile('[^a-zA-Z0-9-_]', re.UNICODE)
44
45
def fqSafe(string) -> str:
46
    """
47
    Convert input string into a representation which can be stored in a fastq header
48
49
    Input:
50
        string(str) : string to clean
51
52
    Returns:
53
        cleaned(str)
54
    """
55
    global fastqCleanerRegex
56
    return(fastqCleanerRegex.sub('', string))
57
58
59
illuminaHeaderSplitRegex = re.compile(':| ', re.UNICODE)
60
61
62
class TaggedRecord():
63
    def __init__(
64
            self,
65
            tagDefinitions,
66
            rawRecord=False,
67
            library=None,
68
            reason=None,
69
            **kwargs):
70
        self.tags = {}  # 2 character Key -> value
71
        self.tagDefinitions = tagDefinitions
72
        if rawRecord is not False:
73
            try:
74
                self.fromRawFastq(rawRecord, **kwargs)
75
            except NonMultiplexable:
76
                raise
77
        if library is not None and not 'LY' in self.tags:
78
            self.tags['LY'] = library
79
        if reason is not None:
80
            self.tags['RR'] = reason
81
82
        if isinstance(rawRecord, fastqIterator.FastqRecord):
83
            self.sequence = rawRecord.sequence
84
            self.qualities = rawRecord.qual
85
            self.plus = rawRecord.plus
86
87
    def addTagByTag(
88
            self,
89
            tagName,
90
            value,
91
            isPhred=None,
92
            decodePhred=False,
93
            cast_type=str,
94
            make_safe=True):
95
96
        if isPhred is None:
97
            isPhred = self.tagDefinitions[tagName].isPhred
98
        if cast_type and not isinstance(value, cast_type):
99
            value = cast_type(value)
100
        if isPhred:
101
            if decodePhred:
102
                # Convert encoded phred scores back to original ascii
103
                self.tags[tagName] = fastqHeaderSafeQualitiesToPhred(
104
                    value, method=3)
105
            else:
106
                self.tags[tagName] = phredToFastqHeaderSafeQualities(
107
                    value, method=3)
108
        else:
109
            if cast_type is str:
110
                if make_safe:
111
                    self.tags[tagName] = fqSafe(value)
112
                else:
113
                    self.tags[tagName] = value
114
            else:
115
                self.tags[tagName] = value
116
117
    def __repr__(self):
118
        return self.asFastq()
119
120
    def asFastq(
121
            self,
122
            sequence=None,
123
            dirAtt=None,
124
            baseQualities=None,
125
            format='illumina'):
126
        if sequence is None:
127
            if self.sequence is None:
128
                raise ValueError()
129
            sequence = self.sequence
130
        if dirAtt is None:
131
            if self.plus is None:
132
                raise ValueError()
133
            dirAtt = self.plus
134
        if baseQualities is None:
135
            if self.qualities is None:
136
                raise ValueError()
137
            baseQualities = self.qualities
138
139
        header = ";".join([f"{attribute}:{value}" for attribute, value in self.tags.items(
140
        ) if not self.tagDefinitions[attribute].doNotWrite])
141
        if len(header) > 255:  # the header length is stored as uint_8 and includes a null character. The maximum length is thus 255
142
            raise ValueError(
143
                f"The length of the demultiplexed header is longer than 255 characters. Try to keep your library name below 60 characters. Reduce the length of the header. For example by using -merge _ which will not put the flow cell in the sample name. The header looks like this: {header}")
144
        return f'@{header}\n{sequence}\n{dirAtt}\n{baseQualities}\n'
145
146
    def has_tag(self, tag):
147
        return tag in self.tags
148
149
    def asIlluminaHeader(self):
150
        return '{Is}:{RN}:{Fc}:{La}:{Ti}:{CX}:{CY}'.format(**self.tags)
151
152
153
154
155
    def parse_3dec_header(self,fastqRecord, indexFileParser,  indexFileAlias):
156
157
        instrument = 'UNK'
158
        runNumber = 'UNK'
159
        flowCellId = 'UNK'
160
        indexSequence = 'N'
161
        lane = 'UNK'
162
        tile = 'UNK'
163
        clusterXpos = '-1'
164
        clusterYpos = '-1'
165
        readPairNumber = '0'
166
        isFiltered = '0'
167
        controlNumber = '0'
168
169
        # 3-DEC: @Cluster_s_1_1101_2
170
        if fastqRecord.header.count('_') == 4:
171
            _cluster_, _s_, lane, tile, readPairNumber = fastqRecord.header.split(
172
                '_')
173
            # check  that this s thingy is at the right place
174
            assert(_s_ == 's')
175
        else:
176
            raise
177
178
        self.tags.update({
179
            'Is': instrument,
180
            'RN': runNumber,
181
            'Fc': flowCellId,
182
            'La': lane,
183
            'Ti': tile,
184
            'CX': clusterXpos,
185
            'CY': clusterYpos,
186
            'RP': readPairNumber,
187
            'Fi': isFiltered,
188
            'CN': controlNumber
189
        })
190
191
192
    def _parse_illumina_header(self,header, indexFileParser = None,  indexFileAlias = None):
193
194
        try:
195
            instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos, readPairNumber, isFiltered, controlNumber, indexSequence  = header.replace(' ',':').split(':')
196
        except BaseException:
197
198
            try:
199
                instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos, readPairNumber, isFiltered, controlNumber = illuminaHeaderSplitRegex.split(
200
                    header.replace('::', ''))
201
                indexSequence = "N"
202
            except BaseException:
203
204
                try:
205
                    instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos = header.split(':')
206
                    indexSequence = "N"
207
                    readPairNumber = 1
208
                    isFiltered = 0
209
                    controlNumber = 0
210
                except BaseException:
211
                    raise
212
213
        self.tags.update({
214
            'Is': instrument,
215
            'RN': runNumber,
216
            'Fc': flowCellId,
217
            'La': lane,
218
            'Ti': tile,
219
            'CX': clusterXpos,
220
            'CY': clusterYpos,
221
            'RP': readPairNumber,
222
            'Fi': isFiltered,
223
            'CN': controlNumber
224
        })
225
226
        if indexFileParser is not None and indexFileAlias is not None:
227
            # Check if the index is an integer:
228
            try:
229
                indexInteger = int(indexSequence)
230
                indexIdentifier, correctedIndex, hammingDistance = indexSequence, indexSequence, 0
231
            except ValueError:
232
                indexIdentifier, correctedIndex, hammingDistance = indexFileParser.getIndexCorrectedBarcodeAndHammingDistance(
233
                    alias=indexFileAlias, barcode=indexSequence)
234
235
            self.tags['aa']  = indexSequence
236
            if correctedIndex is not None:
237
                #
238
                #self.addTagByTag('aA',correctedIndex, isPhred=False)
239
                #self.addTagByTag('aI',indexIdentifier, isPhred=False)
240
                self.tags.update({'aA': correctedIndex, 'aI': indexIdentifier})
241
            else:
242
                raise NonMultiplexable(
243
                    'Could not obtain index for %s  %s %s' %
244
                    (indexSequence, correctedIndex, indexIdentifier))
245
                # self.addTagByTag('aA',"None")
246
                # self.addTagByTag('aI',-1)
247
                # self.addTagByTag('ah',hammingDistance)
248
249
        else:
250
            #self.addTagByTag('aA',indexSequence, isPhred=False)
251
            self.tags['aa'] = indexSequence
252
253
254
    def parse_illumina_header(self,fastqRecord, indexFileParser = None,  indexFileAlias = None):
255
        return self._parse_illumina_header(fastqRecord.header, indexFileParser=indexFileParser, indexFileAlias=indexFileAlias )
256
257
258
    def parse_scmo_header(self, fastqRecord, indexFileParser,  indexFileAlias):
259
        self.tags.update( dict( kv.split(':') for kv in fastqRecord.header.strip()[1:].split(';') ) )
260
261
    def fromRawFastq(
262
            self,
263
            fastqRecord,
264
            indexFileParser=None,
265
            indexFileAlias=None):
266
267
        try:
268
            self.parse_illumina_header(fastqRecord, indexFileParser,  indexFileAlias)
269
        except BaseException:
270
            if fastqRecord.header.startswith('@Is'):
271
                self.parse_scmo_header(fastqRecord, indexFileParser,  indexFileAlias)
272
            else:
273
                self.parse_3dec_header(fastqRecord, indexFileParser,  indexFileAlias)
274
275
276
            # NS500413:32:H14TKBGXX:2:11101:16448:1664 1:N:0::
277
        """ This is the nice and safe way:
278
        self.addTagByTag( 'Is',instrument, isPhred=False)
279
        self.addTagByTag('RN',runNumber, isPhred=False)
280
        self.addTagByTag('Fc',flowCellId, isPhred=False)
281
        self.addTagByTag('La',lane, isPhred=False)
282
        self.addTagByTag('Ti',tile, isPhred=False)
283
        self.addTagByTag('CX',clusterXpos, isPhred=False)
284
        self.addTagByTag('CY',clusterYpos, isPhred=False)
285
        self.addTagByTag('RP',readPairNumber, isPhred=False)
286
        self.addTagByTag('Fi',isFiltered, isPhred=False)
287
        self.addTagByTag('CN',controlNumber, isPhred=False)
288
        """
289
290
    def tagPysamRead(self, read):
291
292
        moleculeIdentifier = ""
293
        moleculeQuality = ""
294
        moleculeIdentifiyingTags = [
295
            ('BC', 'QT', False),
296
            ('RX', 'RQ', False),
297
            ('aA', None, True)
298
        ]
299
        try:
300
            QT_missing=False
301
            for tag, qualityTag, required in moleculeIdentifiyingTags:
302
                if self.has_tag(tag) and not self.tags.get(tag) is None:
303
                    moleculeIdentifier += self.tags[tag]
304
                    if qualityTag is None:  # Padding:
305
                        moleculeQuality += ('o' * len(self.tags[tag]))
306
                    else:
307
                        if qualityTag in self.tags:
308
                            moleculeQuality += self.tags[qualityTag]
309
                        elif qualityTag=='QT':
310
                            QT_missing = True
311
312
                if required and not self.has_tag(tag):
313
                    raise NonMultiplexable('Tag was defined to be required')
314
315
            # if len(moleculeQuality)!=len(moleculeIdentifier):
316
            # raise ValueError('Could not reconstruct molecule identifier')
317
            # @todo set this back when we recover QT
318
319
            correctedIndex = None if not self.has_tag(
320
                'aA') else self.tags['aA']
321
            indexSequence = None if not self.has_tag('aa') else self.tags['aa']
322
323
            if correctedIndex is not None and indexSequence is not None:
324
                hd = hamming_distance(indexSequence, correctedIndex)
325
                if hd is None:
326
                    raise ValueError(
327
                        "Could not resolve hamming distance between {correctedIndex} and {indexSequence}")
328
                self.addTagByTag('ah', hd, isPhred=False, cast_type=int)
329
330
            self.addTagByTag('MI', moleculeIdentifier, isPhred=False)
331
            if not QT_missing:
332
                self.addTagByTag('QM', moleculeQuality, isPhred=False)
333
334
        except NonMultiplexable:
335
336
            # Its bulk
337
            self.tags['BK'] = True
338
339
        # Add sample tag: (If possible)
340
        # If the bi tag is present it means we know the index of the cell
341
        # if no bi tag is present, assume the sample is bulk
342
        if 'bi' in self.tags:
343
            self.addTagByTag(
344
                'SM',
345
                f'{self.tags["LY"]}_{self.tags["bi"]}',
346
                isPhred=False)
347
        elif 'BI' in self.tags:
348
            self.addTagByTag(
349
                'SM',
350
                f'{self.tags["LY"]}_{self.tags["BI"]}',
351
                isPhred=False)
352
353
            # Remove BI tag
354
            self.tags['bi'] = self.tags["BI"]
355
            del self.tags['BI']
356
        elif "LY" in self.tags:
357
            self.addTagByTag('SM', f'{self.tags["LY"]}_BULK', isPhred=False)
358
359
360
361
362
        # Now we defined the desired values of the tags. Write them to the
363
        # record:
364
        for tag, value in self.tags.items():
365
            # print(tag,value)
366
            if tag in self.tagDefinitions and self.tagDefinitions[tag].isPhred:
367
                value = fastqHeaderSafeQualitiesToPhred(value, method=3)
368
            read.set_tag(tag, value)
369
370
        if not QT_missing and read.has_tag('QM') and len(
371
                read.get_tag('QM')) != len(
372
                read.get_tag('MI')):
373
            raise ValueError('QM and MI tag length not matching')
374
375
    def fromTaggedFastq(self, fastqRecord):
376
        for keyValue in fastqRecord.header.replace('@', '').strip().split(';'):
377
            key, value = keyValue.split(':')
378
            self.addTagByTag(key, value, decodePhred=True)
379
380
    def fromTaggedBamRecord(self, pysamRecord):
381
        try:
382
            for keyValue in pysamRecord.query_name.strip().split(';'):
383
                key, value = keyValue.split(':')
384
                self.addTagByTag(key, value, isPhred=False)
385
        except ValueError:
386
            # Try to parse "Single Cell Discoveries" header
387
            # These have the following header:
388
            #NBXXXXXX:530:HXXXXXX:2:2:17:6;SS:GTCATTAG;CB:GTCATTAG;QT:eeeeeeee;RX:CTGAAC;RQ:aaaaae;SM:SAMPLE_NAME
389
            illumina_header, attributes = pysamRecord.query_name.strip().split(';',1)
390
391
            self._parse_illumina_header(illumina_header, indexFileParser=None, indexFileAlias=None )
392
393
            for keyValue in attributes.split(';'):
394
                key, value = keyValue.split(':')
395
                self.addTagByTag(key, value, isPhred=False)
396
397
398
def reverseComplement(seq):
399
    global complement
400
    return("".join(complement.get(base, base) for base in reversed(seq)))
401
402
403
def phredToFastqHeaderSafeQualities(asciiEncodedPhredScores, method=3):
404
    """ Convert ASCII encoded pred string to fastq safe string.
405
    numeric encoded string (method 0),
406
    or 65 shifted (method 1) which is safe to use in the fastq header"""
407
    if method == 0:
408
        return(",".join([str(ord(phred) - 33) for phred in asciiEncodedPhredScores]))
409
    elif method == 1:
410
        return("".join([chr(ord(phred) + 32) for phred in asciiEncodedPhredScores]))
411
    else:
412
        return("".join([string.ascii_letters[min(max(0, ord(phred) - 33), len(string.ascii_letters))] for phred in asciiEncodedPhredScores]))
413
414
415
def fastqHeaderSafeQualitiesToPhred(phred, method=3):
416
    # print(phred)
417
    return "".join((chr(string.ascii_letters.index(v) + 33) for v in phred))
418
419
420
class NonMultiplexable(Exception):
421
    pass
422
423
# The demultplexing strategy converts a tuple if fastq record(s) into a
424
# demultiplexed record
425
426
# Every demultiplexing strategy has :
427
# a full name: the name shown in the tool
428
# shortName : a short name, will be put into EVERY fastq record, so keep it really short
429
# autoDetectable: a flag indicating if this method should be auto detected;
430
# for example for whole genome sequencing reads, we cannot tell from the
431
# reads that it is this data, and the flag should be False
432
433
# Method demultiplex( *records (R1, R2, R3 ... )
434
# Raises NonMultiplexable Exception if the records do not yield a valid
435
# result (which is used to determine if the demultplexing method is valid
436
# to use)
437
438
# Upon initialisation the strategies recieve a dictionary containing the barcodes loaded by the barcode file parser
439
# (barcodeMapping)
440
441
442
class DemultiplexingStrategy(object):
443
444
    def __init__(self):
445
        self.shortName = 'place holder demultiplexing method'
446
        self.longName = 'placeHolder'
447
        self.autoDetectable = False
448
        self.description = 'inherit this class to build your own demultipexing strategy'
449
450
        self.indexSummary = ''
451
        self.barcodeSummary = ''
452
453
    def demultiplex(self, records, **kwargs):
454
        raise NotImplementedError()
455
456
    def __repr__(self):
457
        return f'{self.longName} {self.shortName} {self.description} DemultiplexingStrategy'
458
459
    def getParserSummary(self):
460
        return(' ' + self.indexSummary + '\n barcodes:' + self.barcodeSummary)
461
462
463
class IlluminaBaseDemultiplexer(DemultiplexingStrategy):
464
465
    def __init__(
466
            self,
467
            indexFileParser,
468
            indexFileAlias='illumina_merged_ThruPlex48S_RP',
469
            **kwargs):
470
        DemultiplexingStrategy.__init__(self)
471
        self.indexFileParser = indexFileParser
472
        self.illuminaIndicesAlias = indexFileAlias
473
        self.shortName = 'ILLU'
474
        self.longName = 'IlluminaDemux'
475
        self.description = 'Demultiplex as a bulk sample'
476
        self.indexSummary = f'sequencing indices: {indexFileAlias}'
477
478
    def demultiplex(
479
            self,
480
            records,
481
            inherited=False,
482
            library=None,
483
            reason=None,
484
            **kwargs):
485
        global TagDefinitions
486
487
        try:
488
            if inherited:
489
                return [
490
                    TaggedRecord(
491
                        rawRecord=record,
492
                        tagDefinitions=TagDefinitions,
493
                        indexFileParser=self.indexFileParser,
494
                        indexFileAlias=self.illuminaIndicesAlias,
495
                        library=library,
496
                        reason=reason) for record in records]
497
            else:
498
                return [
499
                    TaggedRecord(
500
                        rawRecord=record,
501
                        tagDefinitions=TagDefinitions,
502
                        indexFileParser=self.indexFileParser,
503
                        indexFileAlias=self.illuminaIndicesAlias,
504
                        library=library,
505
                        reason=reason).asFastq(
506
                        record.sequence,
507
                        record.plus,
508
                        record.qual) for record in records]
509
        except NonMultiplexable:
510
            raise
511
512
# Base strategy for read pairs which have both an umi and sample barcode, the barcode and umi are next to another and continuous 
513
class UmiBarcodeDemuxMethod(IlluminaBaseDemultiplexer):
514
515
    def __init__(
516
            self,
517
            umiRead=0,
518
            umiStart=0,
519
            umiLength=6,
520
            barcodeRead=0,
521
            barcodeStart=6,
522
            barcodeLength=8,
523
            barcodeFileParser=None,
524
            barcodeFileAlias=None,
525
            indexFileParser=None,
526
            indexFileAlias='illumina_merged_ThruPlex48S_RP',
527
            random_primer_read=None,
528
            random_primer_length=6,
529
            random_primer_end=False, # True for end, False for start
530
            **kwargs):
531
        self.description = ''
532
        self.barcodeFileAlias = barcodeFileAlias
533
        self.barcodeFileParser = barcodeFileParser
534
        IlluminaBaseDemultiplexer.__init__(
535
            self,
536
            indexFileParser=indexFileParser,
537
            indexFileAlias=indexFileAlias)
538
        self.barcodeSummary = self.barcodeFileAlias
539
        self.umiRead = umiRead  # 0:Read 1, 1: Read 2 etc
540
        self.umiStart = umiStart  # First base
541
        self.umiLength = umiLength
542
543
        self.barcodeRead = barcodeRead
544
        self.barcodeStart = barcodeStart
545
        self.barcodeLength = barcodeLength
546
        self.autoDetectable = False
547
548
        self.random_primer_read = random_primer_read
549
        self.random_primer_length = random_primer_length
550
        self.random_primer_end = random_primer_end
551
552
        # ranges to capture for read 1 and read 2
553
        self.sequenceCapture = [slice(None), slice(None)]
554
        if umiLength == 0:
555
            # Barcode only
556
            if barcodeStart != 0:
557
                raise NotImplementedError(
558
                    'Complicated slice where we need to capture around a region')
559
            self.sequenceCapture[barcodeRead] = slice(barcodeLength, None)
560
        else:
561
            if umiRead != barcodeRead:
562
                raise NotImplementedError()
563
            if not(umiStart == 0 or barcodeStart == 0):
564
                raise NotImplementedError(
565
                    'Complicated slice where we need to capture around a region')
566
            self.sequenceCapture[barcodeRead] = slice(
567
                barcodeLength + umiLength, None)
568
569
        if random_primer_read is not None:
570
571
            if self.sequenceCapture[random_primer_read].stop is not None:
572
                raise NotImplementedError()
573
            if random_primer_end:
574
575
                self.sequenceCapture[random_primer_read] = slice(
576
                    self.sequenceCapture[random_primer_read].start,
577
                    -random_primer_length,
578
                    self.sequenceCapture[random_primer_read].step
579
                )
580
                self.random_primer_slice = slice(-random_primer_length, None, None)
581
            else:
582
                self.sequenceCapture[random_primer_read] = slice(
583
                    random_primer_length,
584
                    None,
585
                    self.sequenceCapture[random_primer_read].step
586
                )
587
                self.random_primer_slice = slice(0, random_primer_length, None)
588
589
    def __repr__(self):
590
        try:
591
            return f'{self.longName} bc: {self.barcodeStart}:{self.barcodeLength}, umi: {self.umiStart}:{self.umiLength} {self.description}'
592
        except AttributeError: # Happens when this class is inherited and some of the attributes might have not been set
593
            return f'{self.longName}, {self.description}'
594
    
595
596
    def demultiplex(self, records, **kwargs):
597
598
        # Check if the supplied reads are mate-pair or single end
599
        if len(records) not in (1, 2):
600
            raise NonMultiplexable('Not mate pair or single end')
601
602
603
        # Perform first pass demultiplexing of the illumina fragments:
604
        try:
605
            taggedRecords = IlluminaBaseDemultiplexer.demultiplex(
606
                self, records, inherited=True, **kwargs)
607
        except NonMultiplexable:
608
            raise
609
610
        rawBarcode = records[self.barcodeRead].sequence[self.barcodeStart:
611
                                                        self.barcodeStart + self.barcodeLength]
612
        barcodeQual = records[self.barcodeRead].qual[self.barcodeStart:
613
                                                     self.barcodeStart + self.barcodeLength]
614
615
        barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance(
616
            alias=self.barcodeFileAlias, barcode=rawBarcode)
617
        #print(self.barcodeFileParser,self.barcodeFileAlias,rawBarcode,barcodeIdentifier, barcode, hammingDistance)
618
        if barcodeIdentifier is None:
619
            raise NonMultiplexable(
620
                f'bc:{rawBarcode}_not_matching_{self.barcodeFileAlias}')
621
622
        random_primer = None
623
        if self.random_primer_read is not None:
624
            random_primer = records[self.random_primer_read].sequence[self.random_primer_slice]
625
        if self.umiLength != 0:
626
            umi = records[self.umiRead].sequence[self.umiStart:self.umiStart + self.umiLength]
627
            umiQual = records[self.umiRead].qual[self.umiStart:self.umiStart + self.umiLength]
628
629
        for tr in taggedRecords:
630
            #tr.addTagByTag('uL', self.umiLength, isPhred=False)
631
            if self.umiLength == 0:
632
                #tr.addTagByTag('MI', barcode, isPhred=False)
633
                #tr.addTagByTag('QM', barcodeQual, isPhred=True)
634
                pass
635
            else:
636
                tr.tags['RX'] =  umi
637
                tr.addTagByTag('RQ', umiQual, isPhred=True,cast_type=None)
638
                #tr.addTagByTag('MI', barcode+umi, isPhred=False)
639
                #tr.addTagByTag('QM', barcodeQual+umiQual, isPhred=True)
640
641
            """ These can be updated at once
642
            tr.addTagByTag('bi', barcodeIdentifier, isPhred=False)
643
            tr.addTagByTag('bc', rawBarcode, isPhred=False)
644
            tr.addTagByTag('MX', self.shortName, isPhred=False)
645
            tr.addTagByTag('BC', barcode, isPhred=False )
646
            """
647
            tr.tags.update({
648
                'bi': barcodeIdentifier,
649
                'bc': rawBarcode,
650
                'MX': self.shortName,
651
                'BC': barcode
652
            })
653
            #tr.addTagByTag('hd', hammingDistance, isPhred=False)
654
            if random_primer is not None:
655
                tr.addTagByTag('rS',
656
                               random_primer,
657
                               isPhred=False,
658
                               make_safe=False)
659
660
            #tr.addTagByTag('QT', barcodeQual, isPhred=True)
661
662
            if len(barcode) != len(barcodeQual):
663
                raise ValueError()
664
665
        for rid, (record, taggedRecord) in enumerate(
666
                zip(records, taggedRecords)):
667
            taggedRecord.sequence = record.sequence[self.sequenceCapture[rid]]
668
            taggedRecord.qualities = record.qual[self.sequenceCapture[rid]]
669
            taggedRecord.plus = record.plus
670
671
        return taggedRecords
672
        # return [ tr.asFastq(record.sequence[self.sequenceCapture[rid]], record.plus, record.qual[self.sequenceCapture[rid]]) for rid,(tr,record) in enumerate(zip(taggedRecords, records))]
673
        # Add information and rebuild header
674
        #header = f'@UMI:{umi};UMIQ:{umiQual};CBI:{barcodeIdentifier};CB:{barcode};CBQ:{barcodeQual};'
675
676
        # return fastqIterator.FastqRecord(header, records[1].sequence,
677
        # records[1].plus,  records[1].qual )
678
679
680
# Base strategy for read pairs which have both an umi and sample barcode, but are not contiguous, 
681
# for example UMI-BCA-UMI_BCB
682
683
def apply_slices_seq(record, slices):
684
    if len(slices)==0 or record is None:
685
        return ''
686
    return ''.join( (record.sequence[sc] for sc in slices))
687
688
def apply_slices_qual(record, slices):
689
    if len(slices)==0 or record is None:
690
        return ''
691
    return ''.join( (record.qual[sc] for sc in slices))
692
693
694
class ScatteredUmiBarcodeDemuxMethod(IlluminaBaseDemultiplexer):
695
696
    def __init__(
697
            self,
698
            barcode_slices = None, # 2-len Tuple of List of slices where the cell barcode is present in the read (slice(), slice(), ...], [ slice())
699
            umi_slices = None, # 2-len List of slices where the UMI is present in the read [slice(), slice(), ...], [slice()] # 
700
            capture_slices = None, # Slices which indicate what region of read 1 and read2 to store in the final read (required) (slice, slice)
701
            # Note that there is only one slice per read for the capture
702
            barcodeFileParser=None,
703
            barcodeFileAlias=None,
704
            indexFileParser=None,
705
            indexFileAlias='illumina_merged_ThruPlex48S_RP',
706
            random_primer_read=None,
707
            random_primer_length=6,
708
            random_primer_end=False, # True for end, False for start
709
            **kwargs):
710
        self.description = ''
711
        self.barcodeFileAlias = barcodeFileAlias
712
        self.barcodeFileParser = barcodeFileParser
713
        IlluminaBaseDemultiplexer.__init__(
714
            self,
715
            indexFileParser=indexFileParser,
716
            indexFileAlias=indexFileAlias)
717
        self.barcodeSummary = self.barcodeFileAlias
718
        self.barcode_slices = barcode_slices
719
        self.umi_slices = umi_slices
720
        self.capture_slices = capture_slices
721
        self.random_primer_read = random_primer_read
722
        self.random_primer_length = random_primer_length
723
        self.random_primer_end = random_primer_end
724
725
        
726
727
    def demultiplex(self, records, **kwargs):
728
729
        # Check if the supplied reads are mate-pair or single end
730
        if len(records) not in (1, 2):
731
            raise NonMultiplexable('Not mate pair or single end')
732
733
734
        # Perform first pass demultiplexing of the illumina fragments:
735
        try:
736
            taggedRecords = IlluminaBaseDemultiplexer.demultiplex(
737
                self, records, inherited=True, **kwargs)
738
        except NonMultiplexable:
739
            raise
740
        
741
742
        # Extract the UMI barcode_slices
743
        umi, umi_qual = ''.join( (apply_slices_seq(record, slicer) for record, slicer in zip(records, self.umi_slices)) ), ''.join( (apply_slices_qual(record, slicer) for record, slicer in zip(records, self.umi_slices)) )
744
        raw_barcode, raw_barcode_qual = ''.join( (apply_slices_seq(record, slicer) for record, slicer in zip(records, self.barcode_slices)) ), ''.join( (apply_slices_qual(record, slicer) for record, slicer in zip(records, self.barcode_slices)) )
745
746
        barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance(
747
            alias=self.barcodeFileAlias, barcode=raw_barcode)
748
        
749
        if barcodeIdentifier is None:
750
            raise NonMultiplexable(
751
                f'bc:{raw_barcode}_not_matching_{self.barcodeFileAlias}')
752
753
        random_primer = None
754
        if self.random_primer_read is not None:
755
            random_primer = records[self.random_primer_read].sequence[self.random_primer_slice]
756
        
757
758
        for tr in taggedRecords:
759
            #tr.addTagByTag('uL', self.umiLength, isPhred=False)
760
            if len(umi)>0:
761
                tr.tags['RX'] =  umi
762
                tr.addTagByTag('RQ', umi_qual, isPhred=True,cast_type=None)
763
               
764
            tr.tags.update({
765
                'bi': barcodeIdentifier,
766
                'bc': raw_barcode,
767
                'MX': self.shortName,
768
                'BC': barcode
769
            })
770
            if random_primer is not None:
771
                tr.addTagByTag('rS',
772
                               random_primer,
773
                               isPhred=False,
774
                               make_safe=False)
775
776
        for rid, (record, taggedRecord) in enumerate(
777
                zip(records, taggedRecords)):
778
            taggedRecord.sequence = record.sequence[self.capture_slices[rid]]
779
            taggedRecord.qualities = record.qual[self.capture_slices[rid]]
780
            taggedRecord.plus = record.plus
781
782
        return taggedRecords