|
a |
|
b/singlecellmultiomics/modularDemultiplexer/demultiplexModules/restrictionbisulfite.py |
|
|
1 |
import sys |
|
|
2 |
# sys.path.append("/hpc/hub_oudenaarden/group_scripts/") |
|
|
3 |
# sys.path.append("/hpc/hub_oudenaarden/group_scripts/modularDemultiplexer/") |
|
|
4 |
|
|
|
5 |
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import UmiBarcodeDemuxMethod, IlluminaBaseDemultiplexer |
|
|
6 |
|
|
|
7 |
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import IlluminaBaseDemultiplexer |
|
|
8 |
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import NonMultiplexable |
|
|
9 |
|
|
|
10 |
# TO do, currently not implemented: |
|
|
11 |
# enzyme ID correction based on Hamming Distance |
|
|
12 |
# same for ISPCR sequence |
|
|
13 |
|
|
|
14 |
|
|
|
15 |
class Base_RestrictionBisulfiteDemuxMethod(UmiBarcodeDemuxMethod): |
|
|
16 |
|
|
|
17 |
def __init__(self, |
|
|
18 |
umiRead=0, umiStart=0, umiLength=8, # default settings UMI |
|
|
19 |
barcodeRead=0, barcodeStart=8, barcodeLength=8, # default settings Barcode |
|
|
20 |
enzymeRead=0, enzymeStart=16, enzymeLength=3, # default settings Enzyme ID |
|
|
21 |
ispcrRead=0, ispcrStart=19, ispcrLength=15, # default settings ISPCR |
|
|
22 |
ispcrSeq="CAGTGGTATCAGAGT", |
|
|
23 |
barcodeFileParser=None, # compatible, no need to change |
|
|
24 |
barcodeFileAlias=None, # passed from lower-level Classes, e.g. "reBS_nla384w" |
|
|
25 |
indexFileParser=None, # compatible, no need to change |
|
|
26 |
**kwargs): # additional arguments |
|
|
27 |
self.description = 'base class for restriction bisulfite' |
|
|
28 |
self.barcodeFileAlias = barcodeFileAlias # description , e.g. "maya_384NLA" |
|
|
29 |
self.barcodeFileParser = barcodeFileParser # Namespace for barcode file parse |
|
|
30 |
UmiBarcodeDemuxMethod.__init__( |
|
|
31 |
self, |
|
|
32 |
umiRead=umiRead, |
|
|
33 |
umiStart=umiStart, |
|
|
34 |
umiLength=umiLength, |
|
|
35 |
barcodeRead=barcodeRead, |
|
|
36 |
barcodeStart=barcodeStart, |
|
|
37 |
barcodeLength=barcodeLength, |
|
|
38 |
barcodeFileAlias=self.barcodeFileAlias, |
|
|
39 |
barcodeFileParser=barcodeFileParser, |
|
|
40 |
**kwargs) |
|
|
41 |
|
|
|
42 |
self.barcodeSummary = self.barcodeFileAlias |
|
|
43 |
self.umiRead = umiRead # 0:Read 1, 1: Read 2 etc |
|
|
44 |
self.umiStart = umiStart # First base |
|
|
45 |
self.umiLength = umiLength |
|
|
46 |
self.shortName = 'RB' |
|
|
47 |
self.longName = 'base class for restriction bisulfite' |
|
|
48 |
self.illumina_mux = IlluminaBaseDemultiplexer( |
|
|
49 |
indexFileParser=indexFileParser, |
|
|
50 |
indexFileAlias='illumina_merged_ThruPlex48S_RP') |
|
|
51 |
|
|
|
52 |
self.barcodeRead = barcodeRead |
|
|
53 |
self.barcodeStart = barcodeStart |
|
|
54 |
self.barcodeLength = barcodeLength |
|
|
55 |
|
|
|
56 |
self.enzymeRead = enzymeRead |
|
|
57 |
self.enzymeStart = enzymeStart |
|
|
58 |
self.enzymeLength = enzymeLength |
|
|
59 |
|
|
|
60 |
self.ispcrRead = ispcrRead |
|
|
61 |
self.ispcrStart = ispcrStart |
|
|
62 |
self.ispcrLength = ispcrLength |
|
|
63 |
|
|
|
64 |
self.autoDetectable = False |
|
|
65 |
|
|
|
66 |
self.sequenceCapture = [slice(None), slice(None)] # ranges |
|
|
67 |
# TAKE OUT IF STATEMENT |
|
|
68 |
if umiLength == 0: |
|
|
69 |
# if there is a barcode only |
|
|
70 |
if barcodeStart != 0: |
|
|
71 |
raise NotImplementedError( |
|
|
72 |
'Complicated slice where we need to capture around a region') |
|
|
73 |
self.sequenceCapture[barcodeRead] = slice(barcodeLength, None) |
|
|
74 |
else: |
|
|
75 |
if umiRead != barcodeRead: |
|
|
76 |
raise NotImplementedError() |
|
|
77 |
if not(umiStart == 0 or barcodeStart == 0): |
|
|
78 |
raise NotImplementedError( |
|
|
79 |
'Complicated slice where we need to capture around a region') |
|
|
80 |
self.sequenceCapture[barcodeRead] = slice( |
|
|
81 |
barcodeLength + umiLength + enzymeLength + ispcrLength, None) |
|
|
82 |
|
|
|
83 |
def __repr__(self): |
|
|
84 |
return f'{self.longName} {self.description}' |
|
|
85 |
|
|
|
86 |
def demultiplex(self, records, **kwargs): |
|
|
87 |
|
|
|
88 |
# Check if the supplied reads are mate-pair: |
|
|
89 |
if len(records) != 2: |
|
|
90 |
raise NonMultiplexable('Not mate pair') |
|
|
91 |
|
|
|
92 |
# Perform first pass demultiplexing of the illumina fragments: |
|
|
93 |
try: |
|
|
94 |
taggedRecords = self.illumina_mux.demultiplex( |
|
|
95 |
records, inherited=True, **kwargs) |
|
|
96 |
except NonMultiplexable: |
|
|
97 |
raise |
|
|
98 |
rawBarcode = records[self.barcodeRead].sequence[self.barcodeStart: |
|
|
99 |
self.barcodeStart + self.barcodeLength] |
|
|
100 |
barcodeQual = records[self.barcodeRead].qual[self.barcodeStart: |
|
|
101 |
self.barcodeStart + self.barcodeLength] |
|
|
102 |
|
|
|
103 |
barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance( |
|
|
104 |
alias=self.barcodeFileAlias, barcode=rawBarcode) |
|
|
105 |
#print(barcodeIdentifier, barcode, hammingDistance) |
|
|
106 |
if barcodeIdentifier is None: |
|
|
107 |
raise NonMultiplexable('Illumina barcode not set') |
|
|
108 |
|
|
|
109 |
if self.umiLength != 0: |
|
|
110 |
umi = records[self.umiRead].sequence[self.umiStart:self.umiStart + self.umiLength] |
|
|
111 |
umiQual = records[self.umiRead].qual[self.umiStart:self.umiStart + self.umiLength] |
|
|
112 |
|
|
|
113 |
if self.enzymeLength != 0: |
|
|
114 |
enz = records[self.enzymeRead].sequence[self.enzymeStart: |
|
|
115 |
self.enzymeStart + self.enzymeLength] |
|
|
116 |
enzQual = records[self.enzymeRead].qual[self.enzymeStart: |
|
|
117 |
self.enzymeStart + self.enzymeLength] |
|
|
118 |
|
|
|
119 |
if self.ispcrLength != 0: |
|
|
120 |
ispcr = records[self.ispcrRead].sequence[self.ispcrStart: |
|
|
121 |
self.ispcrStart + self.ispcrLength] |
|
|
122 |
ispcrQual = records[self.ispcrRead].qual[self.ispcrStart: |
|
|
123 |
self.ispcrStart + self.ispcrLength] |
|
|
124 |
|
|
|
125 |
for tr in taggedRecords: |
|
|
126 |
#tr.addTagByTag('uL', self.umiLength, isPhred=False) |
|
|
127 |
if self.umiLength == 0: |
|
|
128 |
#tr.addTagByTag('MI', barcode, isPhred=False) |
|
|
129 |
#tr.addTagByTag('QM', barcodeQual, isPhred=True) |
|
|
130 |
pass |
|
|
131 |
else: |
|
|
132 |
tr.addTagByTag('RX', umi, isPhred=False, make_safe=False) |
|
|
133 |
tr.addTagByTag('RQ', umiQual, isPhred=True) |
|
|
134 |
#tr.addTagByTag('MI', barcode+umi, isPhred=False) |
|
|
135 |
#tr.addTagByTag('QM', barcodeQual+umiQual, isPhred=True) |
|
|
136 |
|
|
|
137 |
tr.addTagByTag( |
|
|
138 |
'bi', |
|
|
139 |
barcodeIdentifier, |
|
|
140 |
isPhred=False, |
|
|
141 |
make_safe=False) |
|
|
142 |
tr.addTagByTag('bc', rawBarcode, isPhred=False, make_safe=False) |
|
|
143 |
#tr.addTagByTag('hd', hammingDistance, isPhred=False) |
|
|
144 |
|
|
|
145 |
tr.addTagByTag('BC', barcode, isPhred=False, make_safe=False) |
|
|
146 |
tr.addTagByTag('QT', barcodeQual, isPhred=True) |
|
|
147 |
|
|
|
148 |
if len(barcode) != len(barcodeQual): |
|
|
149 |
raise ValueError() |
|
|
150 |
|
|
|
151 |
# Enzyme sequence |
|
|
152 |
tr.addTagByTag( |
|
|
153 |
'ES', |
|
|
154 |
enz, |
|
|
155 |
isPhred=False, |
|
|
156 |
make_safe=False) # Enzyme sequence |
|
|
157 |
# Enzyme quality |
|
|
158 |
tr.addTagByTag('eq', enzQual, isPhred=True) |
|
|
159 |
|
|
|
160 |
# ISPCR sequence |
|
|
161 |
tr.addTagByTag('IS', ispcr, isPhred=False, make_safe=False) |
|
|
162 |
# ISPCR quality |
|
|
163 |
#tr.addTagByTag('is', ispcrQual, isPhred=True) |
|
|
164 |
|
|
|
165 |
tr.addTagByTag( |
|
|
166 |
'MX', |
|
|
167 |
self.shortName, |
|
|
168 |
make_safe=False, |
|
|
169 |
isPhred=False) |
|
|
170 |
|
|
|
171 |
for rid, (record, taggedRecord) in enumerate( |
|
|
172 |
zip(records, taggedRecords)): |
|
|
173 |
taggedRecord.sequence = record.sequence[self.sequenceCapture[rid]] |
|
|
174 |
taggedRecord.qualities = record.qual[self.sequenceCapture[rid]] |
|
|
175 |
taggedRecord.plus = record.plus |
|
|
176 |
|
|
|
177 |
return taggedRecords |
|
|
178 |
# Add information and rebuild header |
|
|
179 |
#header = f'@UMI:{umi};UMIQ:{umiQual};CBI:{barcodeIdentifier};CB:{barcode};CBQ:{barcodeQual};' |
|
|
180 |
|
|
|
181 |
# return fastqIterator.FastqRecord(header, records[1].sequence, |
|
|
182 |
# records[1].plus, records[1].qual ) |
|
|
183 |
|
|
|
184 |
|
|
|
185 |
# NLAIII, 384 well format with 3bp UMI |
|
|
186 |
class Nla_384w_u8_c8_ad3_is15(Base_RestrictionBisulfiteDemuxMethod): |
|
|
187 |
def __init__(self, barcodeFileParser, **kwargs): |
|
|
188 |
self.barcodeFileAlias = 'nla_bisulfite' |
|
|
189 |
Base_RestrictionBisulfiteDemuxMethod.__init__( |
|
|
190 |
self, |
|
|
191 |
barcodeFileAlias=self.barcodeFileAlias, |
|
|
192 |
barcodeFileParser=barcodeFileParser, |
|
|
193 |
**kwargs) |
|
|
194 |
self.shortName = 'RBSN' #'ReBsNla384C8U8E3I15' |
|
|
195 |
self.longName = 'Restriction-BS, NlaIII; 384w; UMI: 8 bp, CB: 8bp, Enz. ID: 3bp, ISPCR: 15 bp' |
|
|
196 |
self.autoDetectable = False |
|
|
197 |
self.description = 'Bisulfite-compatible barcoded Nla Adapters (384w). R1 contains UMI, BC, Enzyme ID and ISPCR.' |