a b/singlecellmultiomics/modularDemultiplexer/demultiplexModules/restrictionbisulfite.py
1
import sys
2
# sys.path.append("/hpc/hub_oudenaarden/group_scripts/")
3
# sys.path.append("/hpc/hub_oudenaarden/group_scripts/modularDemultiplexer/")
4
5
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import UmiBarcodeDemuxMethod, IlluminaBaseDemultiplexer
6
7
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import IlluminaBaseDemultiplexer
8
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import NonMultiplexable
9
10
# TO do, currently not implemented:
11
# enzyme ID correction based on Hamming Distance
12
# same for ISPCR sequence
13
14
15
class Base_RestrictionBisulfiteDemuxMethod(UmiBarcodeDemuxMethod):
16
17
    def __init__(self,
18
                 umiRead=0, umiStart=0, umiLength=8,  # default settings UMI
19
                 barcodeRead=0, barcodeStart=8, barcodeLength=8,  # default settings Barcode
20
                 enzymeRead=0, enzymeStart=16, enzymeLength=3,  # default settings Enzyme ID
21
                 ispcrRead=0, ispcrStart=19, ispcrLength=15,  # default settings ISPCR
22
                 ispcrSeq="CAGTGGTATCAGAGT",
23
                 barcodeFileParser=None,  # compatible, no need to change
24
                 barcodeFileAlias=None,  # passed from lower-level Classes, e.g. "reBS_nla384w"
25
                 indexFileParser=None,  # compatible, no need to change
26
                 **kwargs):  # additional arguments
27
        self.description = 'base class for restriction bisulfite'
28
        self.barcodeFileAlias = barcodeFileAlias  # description , e.g. "maya_384NLA"
29
        self.barcodeFileParser = barcodeFileParser  # Namespace for barcode file parse
30
        UmiBarcodeDemuxMethod.__init__(
31
            self,
32
            umiRead=umiRead,
33
            umiStart=umiStart,
34
            umiLength=umiLength,
35
            barcodeRead=barcodeRead,
36
            barcodeStart=barcodeStart,
37
            barcodeLength=barcodeLength,
38
            barcodeFileAlias=self.barcodeFileAlias,
39
            barcodeFileParser=barcodeFileParser,
40
            **kwargs)
41
42
        self.barcodeSummary = self.barcodeFileAlias
43
        self.umiRead = umiRead  # 0:Read 1, 1: Read 2 etc
44
        self.umiStart = umiStart  # First base
45
        self.umiLength = umiLength
46
        self.shortName = 'RB'
47
        self.longName = 'base class for restriction bisulfite'
48
        self.illumina_mux = IlluminaBaseDemultiplexer(
49
            indexFileParser=indexFileParser,
50
            indexFileAlias='illumina_merged_ThruPlex48S_RP')
51
52
        self.barcodeRead = barcodeRead
53
        self.barcodeStart = barcodeStart
54
        self.barcodeLength = barcodeLength
55
56
        self.enzymeRead = enzymeRead
57
        self.enzymeStart = enzymeStart
58
        self.enzymeLength = enzymeLength
59
60
        self.ispcrRead = ispcrRead
61
        self.ispcrStart = ispcrStart
62
        self.ispcrLength = ispcrLength
63
64
        self.autoDetectable = False
65
66
        self.sequenceCapture = [slice(None), slice(None)]  # ranges
67
        # TAKE OUT IF STATEMENT
68
        if umiLength == 0:
69
            # if there is a barcode only
70
            if barcodeStart != 0:
71
                raise NotImplementedError(
72
                    'Complicated slice where we need to capture around a region')
73
            self.sequenceCapture[barcodeRead] = slice(barcodeLength, None)
74
        else:
75
            if umiRead != barcodeRead:
76
                raise NotImplementedError()
77
            if not(umiStart == 0 or barcodeStart == 0):
78
                raise NotImplementedError(
79
                    'Complicated slice where we need to capture around a region')
80
            self.sequenceCapture[barcodeRead] = slice(
81
                barcodeLength + umiLength + enzymeLength + ispcrLength, None)
82
83
    def __repr__(self):
84
        return f'{self.longName} {self.description}'
85
86
    def demultiplex(self, records, **kwargs):
87
88
        # Check if the supplied reads are mate-pair:
89
        if len(records) != 2:
90
            raise NonMultiplexable('Not mate pair')
91
92
        # Perform first pass demultiplexing of the illumina fragments:
93
        try:
94
            taggedRecords = self.illumina_mux.demultiplex(
95
                records, inherited=True, **kwargs)
96
        except NonMultiplexable:
97
            raise
98
        rawBarcode = records[self.barcodeRead].sequence[self.barcodeStart:
99
                                                        self.barcodeStart + self.barcodeLength]
100
        barcodeQual = records[self.barcodeRead].qual[self.barcodeStart:
101
                                                     self.barcodeStart + self.barcodeLength]
102
103
        barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance(
104
            alias=self.barcodeFileAlias, barcode=rawBarcode)
105
        #print(barcodeIdentifier, barcode, hammingDistance)
106
        if barcodeIdentifier is None:
107
            raise NonMultiplexable('Illumina barcode not set')
108
109
        if self.umiLength != 0:
110
            umi = records[self.umiRead].sequence[self.umiStart:self.umiStart + self.umiLength]
111
            umiQual = records[self.umiRead].qual[self.umiStart:self.umiStart + self.umiLength]
112
113
        if self.enzymeLength != 0:
114
            enz = records[self.enzymeRead].sequence[self.enzymeStart:
115
                                                    self.enzymeStart + self.enzymeLength]
116
            enzQual = records[self.enzymeRead].qual[self.enzymeStart:
117
                                                    self.enzymeStart + self.enzymeLength]
118
119
        if self.ispcrLength != 0:
120
            ispcr = records[self.ispcrRead].sequence[self.ispcrStart:
121
                                                     self.ispcrStart + self.ispcrLength]
122
            ispcrQual = records[self.ispcrRead].qual[self.ispcrStart:
123
                                                     self.ispcrStart + self.ispcrLength]
124
125
        for tr in taggedRecords:
126
            #tr.addTagByTag('uL', self.umiLength, isPhred=False)
127
            if self.umiLength == 0:
128
                #tr.addTagByTag('MI', barcode, isPhred=False)
129
                #tr.addTagByTag('QM', barcodeQual, isPhred=True)
130
                pass
131
            else:
132
                tr.addTagByTag('RX', umi, isPhred=False, make_safe=False)
133
                tr.addTagByTag('RQ', umiQual, isPhred=True)
134
                #tr.addTagByTag('MI', barcode+umi, isPhred=False)
135
                #tr.addTagByTag('QM', barcodeQual+umiQual, isPhred=True)
136
137
            tr.addTagByTag(
138
                'bi',
139
                barcodeIdentifier,
140
                isPhred=False,
141
                make_safe=False)
142
            tr.addTagByTag('bc', rawBarcode, isPhred=False, make_safe=False)
143
            #tr.addTagByTag('hd', hammingDistance, isPhred=False)
144
145
            tr.addTagByTag('BC', barcode, isPhred=False, make_safe=False)
146
            tr.addTagByTag('QT', barcodeQual, isPhred=True)
147
148
            if len(barcode) != len(barcodeQual):
149
                raise ValueError()
150
151
         # Enzyme sequence
152
            tr.addTagByTag(
153
                'ES',
154
                enz,
155
                isPhred=False,
156
                make_safe=False)  # Enzyme sequence
157
         # Enzyme quality
158
            tr.addTagByTag('eq', enzQual, isPhred=True)
159
160
            # ISPCR sequence
161
            tr.addTagByTag('IS', ispcr, isPhred=False, make_safe=False)
162
            # ISPCR quality
163
            #tr.addTagByTag('is', ispcrQual, isPhred=True)
164
165
            tr.addTagByTag(
166
                'MX',
167
                self.shortName,
168
                make_safe=False,
169
                isPhred=False)
170
171
        for rid, (record, taggedRecord) in enumerate(
172
                zip(records, taggedRecords)):
173
            taggedRecord.sequence = record.sequence[self.sequenceCapture[rid]]
174
            taggedRecord.qualities = record.qual[self.sequenceCapture[rid]]
175
            taggedRecord.plus = record.plus
176
177
        return taggedRecords
178
        # Add information and rebuild header
179
        #header = f'@UMI:{umi};UMIQ:{umiQual};CBI:{barcodeIdentifier};CB:{barcode};CBQ:{barcodeQual};'
180
181
        # return fastqIterator.FastqRecord(header, records[1].sequence,
182
        # records[1].plus,  records[1].qual )
183
184
185
# NLAIII, 384 well format with 3bp UMI
186
class Nla_384w_u8_c8_ad3_is15(Base_RestrictionBisulfiteDemuxMethod):
187
    def __init__(self, barcodeFileParser, **kwargs):
188
        self.barcodeFileAlias = 'nla_bisulfite'
189
        Base_RestrictionBisulfiteDemuxMethod.__init__(
190
            self,
191
            barcodeFileAlias=self.barcodeFileAlias,
192
            barcodeFileParser=barcodeFileParser,
193
            **kwargs)
194
        self.shortName = 'RBSN' #'ReBsNla384C8U8E3I15'
195
        self.longName = 'Restriction-BS, NlaIII; 384w; UMI: 8 bp, CB: 8bp, Enz. ID: 3bp, ISPCR: 15 bp'
196
        self.autoDetectable = False
197
        self.description = 'Bisulfite-compatible barcoded Nla Adapters (384w). R1 contains UMI, BC, Enzyme ID and ISPCR.'