SingleCellMultiOmics / Git / Diff of /singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
Diff of /singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py [000000] .. [2c420a]
Switch to side-by-side view

--- a
+++ b/singlecellmultiomics/modularDemultiplexer/baseDemultiplexMethods.py
@@ -0,0 +1,782 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import re
+import singlecellmultiomics.fastqProcessing.fastqIterator as fastqIterator
+import string
+from singlecellmultiomics.utils.sequtils import hamming_distance
+from singlecellmultiomics.tags import *
+
+complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
+
+
+TagDefinitions = {tag.tag: tag for tag in tags}
+
+# Obtain metadata from read
+
+
+def metaFromRead(read, tag):
+
+    if tag == 'chrom':
+        return read.reference_name
+
+    if read.has_tag(tag):
+        return read.get_tag(tag)
+
+    # Backwards compatibility with BI > bi tag:
+    if tag=='BI' and read.has_tag('bi'):
+        return read.get_tag('bi')
+
+    # Forwards compatibility:
+    if tag=='bi' and read.has_tag('BI'):
+        return read.get_tag('BI')
+
+    try:
+        return getattr(read, tag)
+    except Exception as e:
+        pass
+        # print(e)
+
+    return None
+
+
+# Clean a string to be able to be used in a fastq file header
+fastqCleanerRegex = re.compile('[^a-zA-Z0-9-_]', re.UNICODE)
+
+def fqSafe(string) -> str:
+    """
+    Convert input string into a representation which can be stored in a fastq header
+
+    Input:
+        string(str) : string to clean
+
+    Returns:
+        cleaned(str)
+    """
+    global fastqCleanerRegex
+    return(fastqCleanerRegex.sub('', string))
+
+
+illuminaHeaderSplitRegex = re.compile(':| ', re.UNICODE)
+
+
+class TaggedRecord():
+    def __init__(
+            self,
+            tagDefinitions,
+            rawRecord=False,
+            library=None,
+            reason=None,
+            **kwargs):
+        self.tags = {}  # 2 character Key -> value
+        self.tagDefinitions = tagDefinitions
+        if rawRecord is not False:
+            try:
+                self.fromRawFastq(rawRecord, **kwargs)
+            except NonMultiplexable:
+                raise
+        if library is not None and not 'LY' in self.tags:
+            self.tags['LY'] = library
+        if reason is not None:
+            self.tags['RR'] = reason
+
+        if isinstance(rawRecord, fastqIterator.FastqRecord):
+            self.sequence = rawRecord.sequence
+            self.qualities = rawRecord.qual
+            self.plus = rawRecord.plus
+
+    def addTagByTag(
+            self,
+            tagName,
+            value,
+            isPhred=None,
+            decodePhred=False,
+            cast_type=str,
+            make_safe=True):
+
+        if isPhred is None:
+            isPhred = self.tagDefinitions[tagName].isPhred
+        if cast_type and not isinstance(value, cast_type):
+            value = cast_type(value)
+        if isPhred:
+            if decodePhred:
+                # Convert encoded phred scores back to original ascii
+                self.tags[tagName] = fastqHeaderSafeQualitiesToPhred(
+                    value, method=3)
+            else:
+                self.tags[tagName] = phredToFastqHeaderSafeQualities(
+                    value, method=3)
+        else:
+            if cast_type is str:
+                if make_safe:
+                    self.tags[tagName] = fqSafe(value)
+                else:
+                    self.tags[tagName] = value
+            else:
+                self.tags[tagName] = value
+
+    def __repr__(self):
+        return self.asFastq()
+
+    def asFastq(
+            self,
+            sequence=None,
+            dirAtt=None,
+            baseQualities=None,
+            format='illumina'):
+        if sequence is None:
+            if self.sequence is None:
+                raise ValueError()
+            sequence = self.sequence
+        if dirAtt is None:
+            if self.plus is None:
+                raise ValueError()
+            dirAtt = self.plus
+        if baseQualities is None:
+            if self.qualities is None:
+                raise ValueError()
+            baseQualities = self.qualities
+
+        header = ";".join([f"{attribute}:{value}" for attribute, value in self.tags.items(
+        ) if not self.tagDefinitions[attribute].doNotWrite])
+        if len(header) > 255:  # the header length is stored as uint_8 and includes a null character. The maximum length is thus 255
+            raise ValueError(
+                f"The length of the demultiplexed header is longer than 255 characters. Try to keep your library name below 60 characters. Reduce the length of the header. For example by using -merge _ which will not put the flow cell in the sample name. The header looks like this: {header}")
+        return f'@{header}\n{sequence}\n{dirAtt}\n{baseQualities}\n'
+
+    def has_tag(self, tag):
+        return tag in self.tags
+
+    def asIlluminaHeader(self):
+        return '{Is}:{RN}:{Fc}:{La}:{Ti}:{CX}:{CY}'.format(**self.tags)
+
+
+
+
+    def parse_3dec_header(self,fastqRecord, indexFileParser,  indexFileAlias):
+
+        instrument = 'UNK'
+        runNumber = 'UNK'
+        flowCellId = 'UNK'
+        indexSequence = 'N'
+        lane = 'UNK'
+        tile = 'UNK'
+        clusterXpos = '-1'
+        clusterYpos = '-1'
+        readPairNumber = '0'
+        isFiltered = '0'
+        controlNumber = '0'
+
+        # 3-DEC: @Cluster_s_1_1101_2
+        if fastqRecord.header.count('_') == 4:
+            _cluster_, _s_, lane, tile, readPairNumber = fastqRecord.header.split(
+                '_')
+            # check  that this s thingy is at the right place
+            assert(_s_ == 's')
+        else:
+            raise
+
+        self.tags.update({
+            'Is': instrument,
+            'RN': runNumber,
+            'Fc': flowCellId,
+            'La': lane,
+            'Ti': tile,
+            'CX': clusterXpos,
+            'CY': clusterYpos,
+            'RP': readPairNumber,
+            'Fi': isFiltered,
+            'CN': controlNumber
+        })
+
+
+    def _parse_illumina_header(self,header, indexFileParser = None,  indexFileAlias = None):
+
+        try:
+            instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos, readPairNumber, isFiltered, controlNumber, indexSequence  = header.replace(' ',':').split(':')
+        except BaseException:
+
+            try:
+                instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos, readPairNumber, isFiltered, controlNumber = illuminaHeaderSplitRegex.split(
+                    header.replace('::', ''))
+                indexSequence = "N"
+            except BaseException:
+
+                try:
+                    instrument, runNumber, flowCellId, lane, tile, clusterXpos, clusterYpos = header.split(':')
+                    indexSequence = "N"
+                    readPairNumber = 1
+                    isFiltered = 0
+                    controlNumber = 0
+                except BaseException:
+                    raise
+
+        self.tags.update({
+            'Is': instrument,
+            'RN': runNumber,
+            'Fc': flowCellId,
+            'La': lane,
+            'Ti': tile,
+            'CX': clusterXpos,
+            'CY': clusterYpos,
+            'RP': readPairNumber,
+            'Fi': isFiltered,
+            'CN': controlNumber
+        })
+
+        if indexFileParser is not None and indexFileAlias is not None:
+            # Check if the index is an integer:
+            try:
+                indexInteger = int(indexSequence)
+                indexIdentifier, correctedIndex, hammingDistance = indexSequence, indexSequence, 0
+            except ValueError:
+                indexIdentifier, correctedIndex, hammingDistance = indexFileParser.getIndexCorrectedBarcodeAndHammingDistance(
+                    alias=indexFileAlias, barcode=indexSequence)
+
+            self.tags['aa']  = indexSequence
+            if correctedIndex is not None:
+                #
+                #self.addTagByTag('aA',correctedIndex, isPhred=False)
+                #self.addTagByTag('aI',indexIdentifier, isPhred=False)
+                self.tags.update({'aA': correctedIndex, 'aI': indexIdentifier})
+            else:
+                raise NonMultiplexable(
+                    'Could not obtain index for %s  %s %s' %
+                    (indexSequence, correctedIndex, indexIdentifier))
+                # self.addTagByTag('aA',"None")
+                # self.addTagByTag('aI',-1)
+                # self.addTagByTag('ah',hammingDistance)
+
+        else:
+            #self.addTagByTag('aA',indexSequence, isPhred=False)
+            self.tags['aa'] = indexSequence
+
+
+    def parse_illumina_header(self,fastqRecord, indexFileParser = None,  indexFileAlias = None):
+        return self._parse_illumina_header(fastqRecord.header, indexFileParser=indexFileParser, indexFileAlias=indexFileAlias )
+
+
+    def parse_scmo_header(self, fastqRecord, indexFileParser,  indexFileAlias):
+        self.tags.update( dict( kv.split(':') for kv in fastqRecord.header.strip()[1:].split(';') ) )
+
+    def fromRawFastq(
+            self,
+            fastqRecord,
+            indexFileParser=None,
+            indexFileAlias=None):
+
+        try:
+            self.parse_illumina_header(fastqRecord, indexFileParser,  indexFileAlias)
+        except BaseException:
+            if fastqRecord.header.startswith('@Is'):
+                self.parse_scmo_header(fastqRecord, indexFileParser,  indexFileAlias)
+            else:
+                self.parse_3dec_header(fastqRecord, indexFileParser,  indexFileAlias)
+
+
+            # NS500413:32:H14TKBGXX:2:11101:16448:1664 1:N:0::
+        """ This is the nice and safe way:
+        self.addTagByTag( 'Is',instrument, isPhred=False)
+        self.addTagByTag('RN',runNumber, isPhred=False)
+        self.addTagByTag('Fc',flowCellId, isPhred=False)
+        self.addTagByTag('La',lane, isPhred=False)
+        self.addTagByTag('Ti',tile, isPhred=False)
+        self.addTagByTag('CX',clusterXpos, isPhred=False)
+        self.addTagByTag('CY',clusterYpos, isPhred=False)
+        self.addTagByTag('RP',readPairNumber, isPhred=False)
+        self.addTagByTag('Fi',isFiltered, isPhred=False)
+        self.addTagByTag('CN',controlNumber, isPhred=False)
+        """
+
+    def tagPysamRead(self, read):
+
+        moleculeIdentifier = ""
+        moleculeQuality = ""
+        moleculeIdentifiyingTags = [
+            ('BC', 'QT', False),
+            ('RX', 'RQ', False),
+            ('aA', None, True)
+        ]
+        try:
+            QT_missing=False
+            for tag, qualityTag, required in moleculeIdentifiyingTags:
+                if self.has_tag(tag) and not self.tags.get(tag) is None:
+                    moleculeIdentifier += self.tags[tag]
+                    if qualityTag is None:  # Padding:
+                        moleculeQuality += ('o' * len(self.tags[tag]))
+                    else:
+                        if qualityTag in self.tags:
+                            moleculeQuality += self.tags[qualityTag]
+                        elif qualityTag=='QT':
+                            QT_missing = True
+
+                if required and not self.has_tag(tag):
+                    raise NonMultiplexable('Tag was defined to be required')
+
+            # if len(moleculeQuality)!=len(moleculeIdentifier):
+            # raise ValueError('Could not reconstruct molecule identifier')
+            # @todo set this back when we recover QT
+
+            correctedIndex = None if not self.has_tag(
+                'aA') else self.tags['aA']
+            indexSequence = None if not self.has_tag('aa') else self.tags['aa']
+
+            if correctedIndex is not None and indexSequence is not None:
+                hd = hamming_distance(indexSequence, correctedIndex)
+                if hd is None:
+                    raise ValueError(
+                        "Could not resolve hamming distance between {correctedIndex} and {indexSequence}")
+                self.addTagByTag('ah', hd, isPhred=False, cast_type=int)
+
+            self.addTagByTag('MI', moleculeIdentifier, isPhred=False)
+            if not QT_missing:
+                self.addTagByTag('QM', moleculeQuality, isPhred=False)
+
+        except NonMultiplexable:
+
+            # Its bulk
+            self.tags['BK'] = True
+
+        # Add sample tag: (If possible)
+        # If the bi tag is present it means we know the index of the cell
+        # if no bi tag is present, assume the sample is bulk
+        if 'bi' in self.tags:
+            self.addTagByTag(
+                'SM',
+                f'{self.tags["LY"]}_{self.tags["bi"]}',
+                isPhred=False)
+        elif 'BI' in self.tags:
+            self.addTagByTag(
+                'SM',
+                f'{self.tags["LY"]}_{self.tags["BI"]}',
+                isPhred=False)
+
+            # Remove BI tag
+            self.tags['bi'] = self.tags["BI"]
+            del self.tags['BI']
+        elif "LY" in self.tags:
+            self.addTagByTag('SM', f'{self.tags["LY"]}_BULK', isPhred=False)
+
+
+
+
+        # Now we defined the desired values of the tags. Write them to the
+        # record:
+        for tag, value in self.tags.items():
+            # print(tag,value)
+            if tag in self.tagDefinitions and self.tagDefinitions[tag].isPhred:
+                value = fastqHeaderSafeQualitiesToPhred(value, method=3)
+            read.set_tag(tag, value)
+
+        if not QT_missing and read.has_tag('QM') and len(
+                read.get_tag('QM')) != len(
+                read.get_tag('MI')):
+            raise ValueError('QM and MI tag length not matching')
+
+    def fromTaggedFastq(self, fastqRecord):
+        for keyValue in fastqRecord.header.replace('@', '').strip().split(';'):
+            key, value = keyValue.split(':')
+            self.addTagByTag(key, value, decodePhred=True)
+
+    def fromTaggedBamRecord(self, pysamRecord):
+        try:
+            for keyValue in pysamRecord.query_name.strip().split(';'):
+                key, value = keyValue.split(':')
+                self.addTagByTag(key, value, isPhred=False)
+        except ValueError:
+            # Try to parse "Single Cell Discoveries" header
+            # These have the following header:
+            #NBXXXXXX:530:HXXXXXX:2:2:17:6;SS:GTCATTAG;CB:GTCATTAG;QT:eeeeeeee;RX:CTGAAC;RQ:aaaaae;SM:SAMPLE_NAME
+            illumina_header, attributes = pysamRecord.query_name.strip().split(';',1)
+
+            self._parse_illumina_header(illumina_header, indexFileParser=None, indexFileAlias=None )
+
+            for keyValue in attributes.split(';'):
+                key, value = keyValue.split(':')
+                self.addTagByTag(key, value, isPhred=False)
+
+
+def reverseComplement(seq):
+    global complement
+    return("".join(complement.get(base, base) for base in reversed(seq)))
+
+
+def phredToFastqHeaderSafeQualities(asciiEncodedPhredScores, method=3):
+    """ Convert ASCII encoded pred string to fastq safe string.
+    numeric encoded string (method 0),
+    or 65 shifted (method 1) which is safe to use in the fastq header"""
+    if method == 0:
+        return(",".join([str(ord(phred) - 33) for phred in asciiEncodedPhredScores]))
+    elif method == 1:
+        return("".join([chr(ord(phred) + 32) for phred in asciiEncodedPhredScores]))
+    else:
+        return("".join([string.ascii_letters[min(max(0, ord(phred) - 33), len(string.ascii_letters))] for phred in asciiEncodedPhredScores]))
+
+
+def fastqHeaderSafeQualitiesToPhred(phred, method=3):
+    # print(phred)
+    return "".join((chr(string.ascii_letters.index(v) + 33) for v in phred))
+
+
+class NonMultiplexable(Exception):
+    pass
+
+# The demultplexing strategy converts a tuple if fastq record(s) into a
+# demultiplexed record
+
+# Every demultiplexing strategy has :
+# a full name: the name shown in the tool
+# shortName : a short name, will be put into EVERY fastq record, so keep it really short
+# autoDetectable: a flag indicating if this method should be auto detected;
+# for example for whole genome sequencing reads, we cannot tell from the
+# reads that it is this data, and the flag should be False
+
+# Method demultiplex( *records (R1, R2, R3 ... )
+# Raises NonMultiplexable Exception if the records do not yield a valid
+# result (which is used to determine if the demultplexing method is valid
+# to use)
+
+# Upon initialisation the strategies recieve a dictionary containing the barcodes loaded by the barcode file parser
+# (barcodeMapping)
+
+
+class DemultiplexingStrategy(object):
+
+    def __init__(self):
+        self.shortName = 'place holder demultiplexing method'
+        self.longName = 'placeHolder'
+        self.autoDetectable = False
+        self.description = 'inherit this class to build your own demultipexing strategy'
+
+        self.indexSummary = ''
+        self.barcodeSummary = ''
+
+    def demultiplex(self, records, **kwargs):
+        raise NotImplementedError()
+
+    def __repr__(self):
+        return f'{self.longName} {self.shortName} {self.description} DemultiplexingStrategy'
+
+    def getParserSummary(self):
+        return(' ' + self.indexSummary + '\n barcodes:' + self.barcodeSummary)
+
+
+class IlluminaBaseDemultiplexer(DemultiplexingStrategy):
+
+    def __init__(
+            self,
+            indexFileParser,
+            indexFileAlias='illumina_merged_ThruPlex48S_RP',
+            **kwargs):
+        DemultiplexingStrategy.__init__(self)
+        self.indexFileParser = indexFileParser
+        self.illuminaIndicesAlias = indexFileAlias
+        self.shortName = 'ILLU'
+        self.longName = 'IlluminaDemux'
+        self.description = 'Demultiplex as a bulk sample'
+        self.indexSummary = f'sequencing indices: {indexFileAlias}'
+
+    def demultiplex(
+            self,
+            records,
+            inherited=False,
+            library=None,
+            reason=None,
+            **kwargs):
+        global TagDefinitions
+
+        try:
+            if inherited:
+                return [
+                    TaggedRecord(
+                        rawRecord=record,
+                        tagDefinitions=TagDefinitions,
+                        indexFileParser=self.indexFileParser,
+                        indexFileAlias=self.illuminaIndicesAlias,
+                        library=library,
+                        reason=reason) for record in records]
+            else:
+                return [
+                    TaggedRecord(
+                        rawRecord=record,
+                        tagDefinitions=TagDefinitions,
+                        indexFileParser=self.indexFileParser,
+                        indexFileAlias=self.illuminaIndicesAlias,
+                        library=library,
+                        reason=reason).asFastq(
+                        record.sequence,
+                        record.plus,
+                        record.qual) for record in records]
+        except NonMultiplexable:
+            raise
+
+# Base strategy for read pairs which have both an umi and sample barcode, the barcode and umi are next to another and continuous 
+class UmiBarcodeDemuxMethod(IlluminaBaseDemultiplexer):
+
+    def __init__(
+            self,
+            umiRead=0,
+            umiStart=0,
+            umiLength=6,
+            barcodeRead=0,
+            barcodeStart=6,
+            barcodeLength=8,
+            barcodeFileParser=None,
+            barcodeFileAlias=None,
+            indexFileParser=None,
+            indexFileAlias='illumina_merged_ThruPlex48S_RP',
+            random_primer_read=None,
+            random_primer_length=6,
+            random_primer_end=False, # True for end, False for start
+            **kwargs):
+        self.description = ''
+        self.barcodeFileAlias = barcodeFileAlias
+        self.barcodeFileParser = barcodeFileParser
+        IlluminaBaseDemultiplexer.__init__(
+            self,
+            indexFileParser=indexFileParser,
+            indexFileAlias=indexFileAlias)
+        self.barcodeSummary = self.barcodeFileAlias
+        self.umiRead = umiRead  # 0:Read 1, 1: Read 2 etc
+        self.umiStart = umiStart  # First base
+        self.umiLength = umiLength
+
+        self.barcodeRead = barcodeRead
+        self.barcodeStart = barcodeStart
+        self.barcodeLength = barcodeLength
+        self.autoDetectable = False
+
+        self.random_primer_read = random_primer_read
+        self.random_primer_length = random_primer_length
+        self.random_primer_end = random_primer_end
+
+        # ranges to capture for read 1 and read 2
+        self.sequenceCapture = [slice(None), slice(None)]
+        if umiLength == 0:
+            # Barcode only
+            if barcodeStart != 0:
+                raise NotImplementedError(
+                    'Complicated slice where we need to capture around a region')
+            self.sequenceCapture[barcodeRead] = slice(barcodeLength, None)
+        else:
+            if umiRead != barcodeRead:
+                raise NotImplementedError()
+            if not(umiStart == 0 or barcodeStart == 0):
+                raise NotImplementedError(
+                    'Complicated slice where we need to capture around a region')
+            self.sequenceCapture[barcodeRead] = slice(
+                barcodeLength + umiLength, None)
+
+        if random_primer_read is not None:
+
+            if self.sequenceCapture[random_primer_read].stop is not None:
+                raise NotImplementedError()
+            if random_primer_end:
+
+                self.sequenceCapture[random_primer_read] = slice(
+                    self.sequenceCapture[random_primer_read].start,
+                    -random_primer_length,
+                    self.sequenceCapture[random_primer_read].step
+                )
+                self.random_primer_slice = slice(-random_primer_length, None, None)
+            else:
+                self.sequenceCapture[random_primer_read] = slice(
+                    random_primer_length,
+                    None,
+                    self.sequenceCapture[random_primer_read].step
+                )
+                self.random_primer_slice = slice(0, random_primer_length, None)
+
+    def __repr__(self):
+        try:
+            return f'{self.longName} bc: {self.barcodeStart}:{self.barcodeLength}, umi: {self.umiStart}:{self.umiLength} {self.description}'
+        except AttributeError: # Happens when this class is inherited and some of the attributes might have not been set
+            return f'{self.longName}, {self.description}'
+    
+
+    def demultiplex(self, records, **kwargs):
+
+        # Check if the supplied reads are mate-pair or single end
+        if len(records) not in (1, 2):
+            raise NonMultiplexable('Not mate pair or single end')
+
+
+        # Perform first pass demultiplexing of the illumina fragments:
+        try:
+            taggedRecords = IlluminaBaseDemultiplexer.demultiplex(
+                self, records, inherited=True, **kwargs)
+        except NonMultiplexable:
+            raise
+
+        rawBarcode = records[self.barcodeRead].sequence[self.barcodeStart:
+                                                        self.barcodeStart + self.barcodeLength]
+        barcodeQual = records[self.barcodeRead].qual[self.barcodeStart:
+                                                     self.barcodeStart + self.barcodeLength]
+
+        barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance(
+            alias=self.barcodeFileAlias, barcode=rawBarcode)
+        #print(self.barcodeFileParser,self.barcodeFileAlias,rawBarcode,barcodeIdentifier, barcode, hammingDistance)
+        if barcodeIdentifier is None:
+            raise NonMultiplexable(
+                f'bc:{rawBarcode}_not_matching_{self.barcodeFileAlias}')
+
+        random_primer = None
+        if self.random_primer_read is not None:
+            random_primer = records[self.random_primer_read].sequence[self.random_primer_slice]
+        if self.umiLength != 0:
+            umi = records[self.umiRead].sequence[self.umiStart:self.umiStart + self.umiLength]
+            umiQual = records[self.umiRead].qual[self.umiStart:self.umiStart + self.umiLength]
+
+        for tr in taggedRecords:
+            #tr.addTagByTag('uL', self.umiLength, isPhred=False)
+            if self.umiLength == 0:
+                #tr.addTagByTag('MI', barcode, isPhred=False)
+                #tr.addTagByTag('QM', barcodeQual, isPhred=True)
+                pass
+            else:
+                tr.tags['RX'] =  umi
+                tr.addTagByTag('RQ', umiQual, isPhred=True,cast_type=None)
+                #tr.addTagByTag('MI', barcode+umi, isPhred=False)
+                #tr.addTagByTag('QM', barcodeQual+umiQual, isPhred=True)
+
+            """ These can be updated at once
+            tr.addTagByTag('bi', barcodeIdentifier, isPhred=False)
+            tr.addTagByTag('bc', rawBarcode, isPhred=False)
+            tr.addTagByTag('MX', self.shortName, isPhred=False)
+            tr.addTagByTag('BC', barcode, isPhred=False )
+            """
+            tr.tags.update({
+                'bi': barcodeIdentifier,
+                'bc': rawBarcode,
+                'MX': self.shortName,
+                'BC': barcode
+            })
+            #tr.addTagByTag('hd', hammingDistance, isPhred=False)
+            if random_primer is not None:
+                tr.addTagByTag('rS',
+                               random_primer,
+                               isPhred=False,
+                               make_safe=False)
+
+            #tr.addTagByTag('QT', barcodeQual, isPhred=True)
+
+            if len(barcode) != len(barcodeQual):
+                raise ValueError()
+
+        for rid, (record, taggedRecord) in enumerate(
+                zip(records, taggedRecords)):
+            taggedRecord.sequence = record.sequence[self.sequenceCapture[rid]]
+            taggedRecord.qualities = record.qual[self.sequenceCapture[rid]]
+            taggedRecord.plus = record.plus
+
+        return taggedRecords
+        # return [ tr.asFastq(record.sequence[self.sequenceCapture[rid]], record.plus, record.qual[self.sequenceCapture[rid]]) for rid,(tr,record) in enumerate(zip(taggedRecords, records))]
+        # Add information and rebuild header
+        #header = f'@UMI:{umi};UMIQ:{umiQual};CBI:{barcodeIdentifier};CB:{barcode};CBQ:{barcodeQual};'
+
+        # return fastqIterator.FastqRecord(header, records[1].sequence,
+        # records[1].plus,  records[1].qual )
+
+
+# Base strategy for read pairs which have both an umi and sample barcode, but are not contiguous, 
+# for example UMI-BCA-UMI_BCB
+
+def apply_slices_seq(record, slices):
+    if len(slices)==0 or record is None:
+        return ''
+    return ''.join( (record.sequence[sc] for sc in slices))
+
+def apply_slices_qual(record, slices):
+    if len(slices)==0 or record is None:
+        return ''
+    return ''.join( (record.qual[sc] for sc in slices))
+
+
+class ScatteredUmiBarcodeDemuxMethod(IlluminaBaseDemultiplexer):
+
+    def __init__(
+            self,
+            barcode_slices = None, # 2-len Tuple of List of slices where the cell barcode is present in the read (slice(), slice(), ...], [ slice())
+            umi_slices = None, # 2-len List of slices where the UMI is present in the read [slice(), slice(), ...], [slice()] # 
+            capture_slices = None, # Slices which indicate what region of read 1 and read2 to store in the final read (required) (slice, slice)
+            # Note that there is only one slice per read for the capture
+            barcodeFileParser=None,
+            barcodeFileAlias=None,
+            indexFileParser=None,
+            indexFileAlias='illumina_merged_ThruPlex48S_RP',
+            random_primer_read=None,
+            random_primer_length=6,
+            random_primer_end=False, # True for end, False for start
+            **kwargs):
+        self.description = ''
+        self.barcodeFileAlias = barcodeFileAlias
+        self.barcodeFileParser = barcodeFileParser
+        IlluminaBaseDemultiplexer.__init__(
+            self,
+            indexFileParser=indexFileParser,
+            indexFileAlias=indexFileAlias)
+        self.barcodeSummary = self.barcodeFileAlias
+        self.barcode_slices = barcode_slices
+        self.umi_slices = umi_slices
+        self.capture_slices = capture_slices
+        self.random_primer_read = random_primer_read
+        self.random_primer_length = random_primer_length
+        self.random_primer_end = random_primer_end
+
+        
+
+    def demultiplex(self, records, **kwargs):
+
+        # Check if the supplied reads are mate-pair or single end
+        if len(records) not in (1, 2):
+            raise NonMultiplexable('Not mate pair or single end')
+
+
+        # Perform first pass demultiplexing of the illumina fragments:
+        try:
+            taggedRecords = IlluminaBaseDemultiplexer.demultiplex(
+                self, records, inherited=True, **kwargs)
+        except NonMultiplexable:
+            raise
+        
+
+        # Extract the UMI barcode_slices
+        umi, umi_qual = ''.join( (apply_slices_seq(record, slicer) for record, slicer in zip(records, self.umi_slices)) ), ''.join( (apply_slices_qual(record, slicer) for record, slicer in zip(records, self.umi_slices)) )
+        raw_barcode, raw_barcode_qual = ''.join( (apply_slices_seq(record, slicer) for record, slicer in zip(records, self.barcode_slices)) ), ''.join( (apply_slices_qual(record, slicer) for record, slicer in zip(records, self.barcode_slices)) )
+
+        barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance(
+            alias=self.barcodeFileAlias, barcode=raw_barcode)
+        
+        if barcodeIdentifier is None:
+            raise NonMultiplexable(
+                f'bc:{raw_barcode}_not_matching_{self.barcodeFileAlias}')
+
+        random_primer = None
+        if self.random_primer_read is not None:
+            random_primer = records[self.random_primer_read].sequence[self.random_primer_slice]
+        
+
+        for tr in taggedRecords:
+            #tr.addTagByTag('uL', self.umiLength, isPhred=False)
+            if len(umi)>0:
+                tr.tags['RX'] =  umi
+                tr.addTagByTag('RQ', umi_qual, isPhred=True,cast_type=None)
+               
+            tr.tags.update({
+                'bi': barcodeIdentifier,
+                'bc': raw_barcode,
+                'MX': self.shortName,
+                'BC': barcode
+            })
+            if random_primer is not None:
+                tr.addTagByTag('rS',
+                               random_primer,
+                               isPhred=False,
+                               make_safe=False)
+
+        for rid, (record, taggedRecord) in enumerate(
+                zip(records, taggedRecords)):
+            taggedRecord.sequence = record.sequence[self.capture_slices[rid]]
+            taggedRecord.qualities = record.qual[self.capture_slices[rid]]
+            taggedRecord.plus = record.plus
+
+        return taggedRecords
\ No newline at end of file