|
a |
|
b/singlecellmultiomics/modularDemultiplexer/demultiplexingStrategyLoader.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
import os |
|
|
4 |
import collections |
|
|
5 |
import glob |
|
|
6 |
import sys |
|
|
7 |
from colorama import Fore |
|
|
8 |
from colorama import Back |
|
|
9 |
from colorama import Style |
|
|
10 |
import importlib |
|
|
11 |
import inspect |
|
|
12 |
import traceback |
|
|
13 |
import singlecellmultiomics.modularDemultiplexer.demultiplexModules as dm |
|
|
14 |
import singlecellmultiomics.fastqProcessing.fastqIterator as fastqIterator |
|
|
15 |
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import NonMultiplexable, IlluminaBaseDemultiplexer |
|
|
16 |
import logging |
|
|
17 |
|
|
|
18 |
|
|
|
19 |
class DemultiplexingStrategyLoader: |
|
|
20 |
def __init__( |
|
|
21 |
self, |
|
|
22 |
barcodeParser, |
|
|
23 |
moduleSearchDir='demultiplexModules', |
|
|
24 |
indexParser=None, |
|
|
25 |
ignoreMethods=None, |
|
|
26 |
only_detect_methods=None, # |
|
|
27 |
indexFileAlias=None): |
|
|
28 |
package = f'singlecellmultiomics.modularDemultiplexer.{moduleSearchDir}' |
|
|
29 |
moduleSearchPath = os.path.join( |
|
|
30 |
os.path.dirname( |
|
|
31 |
os.path.realpath(__file__)), |
|
|
32 |
moduleSearchDir).replace( |
|
|
33 |
'/./', |
|
|
34 |
'/') |
|
|
35 |
self.barcodeParser = barcodeParser |
|
|
36 |
self.indexParser = indexParser |
|
|
37 |
self.only_detect_methods = only_detect_methods |
|
|
38 |
moduleSearchPath = moduleSearchPath |
|
|
39 |
|
|
|
40 |
#print(f'{Style.DIM}Current script location: {__file__}') |
|
|
41 |
#print(f'Searchdir: {moduleSearchDir}') |
|
|
42 |
#print(f'Looking for modules in {moduleSearchPath}{Style.RESET_ALL}') |
|
|
43 |
self.demultiplexingStrategies = [] |
|
|
44 |
self.demux_classes = [ |
|
|
45 |
IlluminaBaseDemultiplexer, |
|
|
46 |
|
|
|
47 |
dm.CELSeq1_c8_u4, |
|
|
48 |
dm.CELSeq2_c8_u6, |
|
|
49 |
dm.CELSeq2_c8_u6_NH, |
|
|
50 |
dm.CELSeq2_c8_u8, |
|
|
51 |
dm.CELSeq2_c8_u8_NNLAIII, |
|
|
52 |
|
|
|
53 |
dm.CELSeq2_c8_u6_swapped_reads, |
|
|
54 |
|
|
|
55 |
dm.NLAIII_384w_c8_u3, |
|
|
56 |
dm.NLAIII_96w_c8_u3, |
|
|
57 |
dm.Nla_384w_u8_c8_ad3_is15, |
|
|
58 |
dm.NLAIII_384w_c8_u3_SINGLE_END, |
|
|
59 |
dm.NLAIII_96w_c8_u3_SINGLE_END, |
|
|
60 |
|
|
|
61 |
dm.SCCHIC_384w_c8_u3, |
|
|
62 |
dm.SCCHIC_384w_c8_u3_cs2, |
|
|
63 |
dm.SCCHIC_384w_c8_u3_pdt, |
|
|
64 |
dm.SCCHIC_384w_c8_u3_direct_ligation, |
|
|
65 |
dm.SCCHIC_384w_c8_u3_direct_ligation_SINGLE_END, |
|
|
66 |
|
|
|
67 |
dm.MSPJI_c8_u3, |
|
|
68 |
dm.ScartraceR2, |
|
|
69 |
dm.ScartraceR1, |
|
|
70 |
dm.ScartraceR2RP4, |
|
|
71 |
|
|
|
72 |
dm.chrom10x_c16_u12, |
|
|
73 |
|
|
|
74 |
dm.DamID2, |
|
|
75 |
dm.DamID2_c8_u3_cs2, |
|
|
76 |
dm.DamID2_SCA, |
|
|
77 |
dm.DamID2andT_SCA, |
|
|
78 |
dm.DamID2andT_SCA6, |
|
|
79 |
dm.DamID2_NO_OVERHANG |
|
|
80 |
|
|
|
81 |
|
|
|
82 |
] |
|
|
83 |
for c in self.demux_classes: |
|
|
84 |
|
|
|
85 |
|
|
|
86 |
initialised_demux = c( |
|
|
87 |
barcodeFileParser=barcodeParser, |
|
|
88 |
indexFileParser=indexParser, |
|
|
89 |
indexFileAlias=indexFileAlias) |
|
|
90 |
|
|
|
91 |
if self.only_detect_methods is not None: |
|
|
92 |
|
|
|
93 |
if initialised_demux.shortName in self.only_detect_methods: |
|
|
94 |
print(f'Only loading {initialised_demux.shortName}') |
|
|
95 |
else: |
|
|
96 |
continue |
|
|
97 |
|
|
|
98 |
self.demultiplexingStrategies.append(initialised_demux) |
|
|
99 |
|
|
|
100 |
|
|
|
101 |
def getSelectedStrategiesFromStringList(self, strList, verbose=True): |
|
|
102 |
selectedStrategies = [] |
|
|
103 |
|
|
|
104 |
resolved = {part: False for part in strList} |
|
|
105 |
for strategy in self.demultiplexingStrategies: |
|
|
106 |
if strategy.shortName in strList: |
|
|
107 |
selectedStrategies.append(strategy) |
|
|
108 |
if verbose: |
|
|
109 |
print('Selected strategy %s' % strategy) |
|
|
110 |
resolved[strategy.shortName] = True |
|
|
111 |
|
|
|
112 |
if any([v is False for v in resolved.values()]): |
|
|
113 |
for strat in strList: |
|
|
114 |
if resolved[strat] is False: |
|
|
115 |
print(f'{Fore.RED}Could not resolve {strat}{Style.RESET_ALL}') |
|
|
116 |
print('Available:') |
|
|
117 |
for s in self.demultiplexingStrategies: |
|
|
118 |
print(s.shortName) |
|
|
119 |
raise ValueError(f'Strategy {strat} not found') |
|
|
120 |
|
|
|
121 |
raise ValueError() |
|
|
122 |
return selectedStrategies |
|
|
123 |
|
|
|
124 |
def list(self): |
|
|
125 |
print(f"{Style.BRIGHT}Available demultiplexing strategies:{Style.RESET_ALL}") |
|
|
126 |
#print('Name, alias, will be auto detected, description') |
|
|
127 |
for strategy in self.demultiplexingStrategies: |
|
|
128 |
|
|
|
129 |
try: |
|
|
130 |
print( |
|
|
131 |
f'{Style.BRIGHT}{strategy.shortName}{Style.RESET_ALL}\t{strategy.longName}\t' + |
|
|
132 |
( |
|
|
133 |
f'{Fore.GREEN}Will be autodetected' if strategy.autoDetectable else f'{Fore.RED}Will not be autodetected') + |
|
|
134 |
Style.RESET_ALL + |
|
|
135 |
Style.DIM + |
|
|
136 |
f' {strategy.barcodeFileParser.getTargetCount(strategy.barcodeFileAlias) if hasattr(strategy,"barcodeFileParser") else "NA"} targets\n ' + |
|
|
137 |
Style.DIM + |
|
|
138 |
strategy.description + |
|
|
139 |
'\n' + |
|
|
140 |
strategy.getParserSummary() + |
|
|
141 |
Style.RESET_ALL + |
|
|
142 |
'\n') |
|
|
143 |
except Exception as e: |
|
|
144 |
print( |
|
|
145 |
f"{Fore.RED}{Style.BRIGHT}Error in: {strategy.shortName}\nException: {e}{Style.RESET_ALL}\nTraceback for the error:\n") |
|
|
146 |
import traceback |
|
|
147 |
traceback.print_exc() |
|
|
148 |
from os import stat |
|
|
149 |
from pwd import getpwuid |
|
|
150 |
try: |
|
|
151 |
modulePath = sys.modules[strategy.__module__].__file__ |
|
|
152 |
|
|
|
153 |
print( |
|
|
154 |
f'Contact {Style.BRIGHT}%s{Style.RESET_ALL} for help\n' % |
|
|
155 |
getpwuid( |
|
|
156 |
stat(modulePath).st_uid).pw_name) |
|
|
157 |
print( |
|
|
158 |
'The error only affects this module.\nProceeding to load more modules...\n') |
|
|
159 |
except Exception as e: |
|
|
160 |
pass |
|
|
161 |
|
|
|
162 |
def getAutodetectStrategies(self): |
|
|
163 |
return [ |
|
|
164 |
strategy for strategy in self.demultiplexingStrategies if strategy.autoDetectable] |
|
|
165 |
|
|
|
166 |
def getDemultiplexingSelectedStrategies(self): |
|
|
167 |
if self.selectedStrategies is None: |
|
|
168 |
raise ValueError('No strategies selected') |
|
|
169 |
return self.selectedStrategies |
|
|
170 |
|
|
|
171 |
def demultiplex( |
|
|
172 |
self, |
|
|
173 |
fastqfiles, |
|
|
174 |
maxReadPairs=None, |
|
|
175 |
strategies=None, |
|
|
176 |
library=None, |
|
|
177 |
targetFile=None, |
|
|
178 |
rejectHandle=None, |
|
|
179 |
log_handle=None, |
|
|
180 |
probe=None |
|
|
181 |
): |
|
|
182 |
|
|
|
183 |
useStrategies = strategies if strategies is not None else self.getAutodetectStrategies() |
|
|
184 |
strategyYields = collections.Counter() |
|
|
185 |
processedReadPairs = 0 |
|
|
186 |
baseDemux = IlluminaBaseDemultiplexer( |
|
|
187 |
indexFileParser=self.indexParser, |
|
|
188 |
barcodeParser=self.barcodeParser, |
|
|
189 |
probe=probe) |
|
|
190 |
|
|
|
191 |
for p, reads in enumerate( |
|
|
192 |
fastqIterator.FastqIterator(*fastqfiles)): |
|
|
193 |
processedReadPairs = p+1 |
|
|
194 |
|
|
|
195 |
for strategy in useStrategies: |
|
|
196 |
try: |
|
|
197 |
recodedRecords = strategy.demultiplex( |
|
|
198 |
reads, library=library, probe=probe) |
|
|
199 |
|
|
|
200 |
if targetFile is not None: |
|
|
201 |
targetFile.write(recodedRecords) |
|
|
202 |
|
|
|
203 |
except NonMultiplexable as reason: |
|
|
204 |
# print('NonMultiplexable') |
|
|
205 |
if rejectHandle is not None: |
|
|
206 |
|
|
|
207 |
try: |
|
|
208 |
to_write = baseDemux.demultiplex( |
|
|
209 |
reads, library=library, reason=reason) |
|
|
210 |
rejectHandle.write(to_write) |
|
|
211 |
|
|
|
212 |
except NonMultiplexable as e: |
|
|
213 |
# we cannot read the header of the read.. |
|
|
214 |
reads = [ |
|
|
215 |
'\n'.join( |
|
|
216 |
(read.header + |
|
|
217 |
f';RR:{reason};Rr:{e}', |
|
|
218 |
read.sequence, |
|
|
219 |
read.plus, |
|
|
220 |
read.qual)) for read in reads] |
|
|
221 |
rejectHandle.write(reads) |
|
|
222 |
|
|
|
223 |
continue |
|
|
224 |
except Exception as e: |
|
|
225 |
if probe: |
|
|
226 |
continue |
|
|
227 |
traceback_str = str(traceback.format_exc()) |
|
|
228 |
print(traceback_str) |
|
|
229 |
print( |
|
|
230 |
f'{Fore.RED}Fatal error. While demultiplexing strategy {strategy.longName} yielded an error, the error message was: {e}') |
|
|
231 |
print('The read(s) causing the error looked like this:') |
|
|
232 |
for read in reads: |
|
|
233 |
print(str(read)) |
|
|
234 |
print(Style.RESET_ALL) |
|
|
235 |
if log_handle is not None: |
|
|
236 |
log_handle.write( |
|
|
237 |
f"Error occured using {strategy.longName}\n{traceback_str}\n") |
|
|
238 |
# print(recodedRecord) |
|
|
239 |
strategyYields[strategy.shortName] += 1 |
|
|
240 |
if (maxReadPairs is not None and ( |
|
|
241 |
processedReadPairs) >= maxReadPairs): |
|
|
242 |
break |
|
|
243 |
# write yields to log file if applicable: |
|
|
244 |
if log_handle is not None: |
|
|
245 |
log_handle.write(f'processed {processedReadPairs} read pairs\n') |
|
|
246 |
log_handle.write(f'Reads obtained per protocol\n') |
|
|
247 |
log_handle.write(f'Strategy\tReads\n') |
|
|
248 |
for strategy, used_reads in strategyYields.items(): |
|
|
249 |
log_handle.write(f'{strategy}\t{used_reads}\n') |
|
|
250 |
return processedReadPairs, strategyYields |
|
|
251 |
|
|
|
252 |
def detectLibYields( |
|
|
253 |
self, |
|
|
254 |
libraries, |
|
|
255 |
strategies=None, |
|
|
256 |
testReads=100000, |
|
|
257 |
maxAutoDetectMethods=1, |
|
|
258 |
minAutoDetectPct=5, |
|
|
259 |
verbose=False): |
|
|
260 |
libYields = dict() |
|
|
261 |
|
|
|
262 |
for lib, lanes in libraries.items(): |
|
|
263 |
for lane, readPairs in lanes.items(): |
|
|
264 |
|
|
|
265 |
for readPair in readPairs: |
|
|
266 |
if len(readPairs) == 1: |
|
|
267 |
processedReadPairs, strategyYields = self.demultiplex( |
|
|
268 |
[ readPairs[ |
|
|
269 |
list(readPairs.keys())[0] |
|
|
270 |
][0] ], maxReadPairs=testReads, strategies=strategies, probe=True) |
|
|
271 |
elif len(readPairs) == 2: |
|
|
272 |
processedReadPairs, strategyYields = self.demultiplex( |
|
|
273 |
(readPairs['R1'][0], readPairs['R2'][0]), maxReadPairs=testReads, strategies=strategies, probe=True) |
|
|
274 |
else: |
|
|
275 |
raise ValueError('Error: %s' % readPairs.keys()) |
|
|
276 |
|
|
|
277 |
if verbose: |
|
|
278 |
print(f'Report for {lib}:') |
|
|
279 |
self.strategyYieldsToFormattedReport( |
|
|
280 |
processedReadPairs, |
|
|
281 |
strategyYields, |
|
|
282 |
maxAutoDetectMethods=maxAutoDetectMethods, |
|
|
283 |
minAutoDetectPct=minAutoDetectPct) |
|
|
284 |
libYields[lib] = { |
|
|
285 |
'processedReadPairs': processedReadPairs, |
|
|
286 |
'strategyYields': strategyYields} |
|
|
287 |
break |
|
|
288 |
return processedReadPairs, libYields |
|
|
289 |
|
|
|
290 |
def strategyYieldsToFormattedReport( |
|
|
291 |
self, |
|
|
292 |
processedReadPairs, |
|
|
293 |
strategyYields, |
|
|
294 |
selectedStrategies=None, |
|
|
295 |
maxAutoDetectMethods=1, |
|
|
296 |
minAutoDetectPct=5): |
|
|
297 |
print( |
|
|
298 |
f'Analysed {Style.BRIGHT}{processedReadPairs}{Style.RESET_ALL} read pairs') |
|
|
299 |
|
|
|
300 |
if selectedStrategies is None: |
|
|
301 |
selectedStrategies = {} |
|
|
302 |
#selectedStrategies = self.selectedStrategiesBasedOnYield(processedReadPairs, strategyYields) |
|
|
303 |
for i, (strategy, strategyYield) in enumerate( |
|
|
304 |
strategyYields.most_common()): |
|
|
305 |
yieldRatio = strategyYield / (0.001 + processedReadPairs) |
|
|
306 |
print( |
|
|
307 |
( |
|
|
308 |
Style.BRIGHT + |
|
|
309 |
Fore.GREEN if ( |
|
|
310 |
(strategy in selectedStrategies) or i < maxAutoDetectMethods) else ( |
|
|
311 |
Fore.YELLOW if yieldRatio * |
|
|
312 |
100 >= minAutoDetectPct else Style.DIM)) + |
|
|
313 |
f'\t {strategy}:%.2f%%{Style.RESET_ALL}' % |
|
|
314 |
(100.0 * |
|
|
315 |
yieldRatio)) |
|
|
316 |
|
|
|
317 |
def selectedStrategiesBasedOnYield( |
|
|
318 |
self, |
|
|
319 |
processedReadPairs, |
|
|
320 |
strategyYields, |
|
|
321 |
maxAutoDetectMethods=1, |
|
|
322 |
minAutoDetectPct=0.05): |
|
|
323 |
selectedStrategies = [] |
|
|
324 |
for strategy, strategyYield in strategyYields.most_common( |
|
|
325 |
maxAutoDetectMethods): |
|
|
326 |
yieldRatio = strategyYield / (0.001 + processedReadPairs) * 100.0 |
|
|
327 |
if yieldRatio >= minAutoDetectPct: |
|
|
328 |
selectedStrategies.append(strategy) |
|
|
329 |
return selectedStrategies |