[45ad7e]: / singlecellmultiomics / barcodeFileParser / barcodeFileParser.py

Download this file

240 lines (205 with data), 10.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import glob
import logging
from colorama import Fore
from colorama import Back
from colorama import Style
import os
import collections
import itertools
import gzip
#logging.basicConfig(level=logging.DEBUG)
# http://codereview.stackexchange.com/questions/88912/create-a-list-of-all-strings-within-hamming-distance-of-a-reference-string-with
def hamming_circle(s, n, alphabet):
for positions in itertools.combinations(range(len(s)), n):
for replacements in itertools.product(
range(len(alphabet) - 1), repeat=n):
cousin = list(s)
for p, r in zip(positions, replacements):
if cousin[p] == alphabet[r]:
cousin[p] = alphabet[-1]
else:
cousin[p] = alphabet[r]
yield ''.join(cousin)
class BarcodeParser():
def path_to_barcode_alias(self, path):
barcodeFileAlias = os.path.splitext(
os.path.basename(path))[0].replace('.gz','').replace('.bc','')
return barcodeFileAlias
def parse_pending_barcode_file_of_alias(self, alias):
if not alias in self.pending_files:
raise ValueError(f"trying to load {alias}, but barcode file not found")
logging.info(f"Loading promised barcode file {alias}")
self.parse_barcode_file( self.pending_files[alias] )
logging.info(f"Performing hamming extension for {alias}")
self.expand(self.hammingDistanceExpansion, alias=alias)
del self.pending_files[alias]
def parse_barcode_file(self, barcodeFile):
barcodeFileAlias = self.path_to_barcode_alias(barcodeFile)
# Decide the file type (index first or name first)
indexNotFirst = False
with gzip.open(barcodeFile,'rt') if barcodeFile.endswith('.gz') else open(barcodeFile) as f :
for i, line in enumerate(f):
parts = line.strip().split()
if len(parts) == 1 and ' ' in line:
parts = line.strip().split(' ')
if len(parts) == 1:
pass
elif len(parts) == 2:
indexFirst = not all((c in 'ATCGNX') for c in parts[0])
if not indexFirst:
indexNotFirst = True
# print(parts[1],indexFirst)
nospec=False
i=0
with gzip.open(barcodeFile,'rt') if barcodeFile.endswith('.gz') else open(barcodeFile) as f :
for i, line in enumerate(f):
parts = line.strip().split()
if len(parts) == 1 and ' ' in line:
parts = line.strip().split(' ')
if len(parts) == 1:
self.addBarcode(
barcodeFileAlias, barcode=parts[0], index=i+1)
#self.barcodes[barcodeFileAlias][parts[0]] = i
if not nospec: # only show this once:
logging.info(
f"\t{parts[0]}:{i} (No index specified in file)")
nospec=True
elif len(parts) == 2:
if indexNotFirst:
barcode, index = parts
else:
index, barcode = parts
#self.barcodes[barcodeFileAlias][barcode] = index
# When the index is only digits, convert to integer
try:
if int(index)==int(str(int(index))):
index = int(index)
else:
pass
except Exception as e:
pass
self.addBarcode(
barcodeFileAlias, barcode=barcode, index=index)
if not nospec: # only show this once:
logging.info(
f"\t{barcode}:{index} (index was specified in file, {'index' if indexFirst else 'barcode'} on first column)")
nospec=True
else:
e = f'The barcode file {barcodeFile} contains more than two columns. Failed to parse!'
logging.error(e)
raise ValueError(e)
logging.info(f'done loading {i} barcodes')
def __init__(
self,
barcodeDirectory='barcodes',
hammingDistanceExpansion=0,
spaceFill=False,
lazyLoad=None # these aliases wiill not be loaded until requested, '*' matches all files
):
self.hammingDistanceExpansion = hammingDistanceExpansion
barcodeDirectory = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
barcodeDirectory)
barcode_files = list(glob.glob(f'{barcodeDirectory}/*'))
self.spaceFill = spaceFill
self.hammingDistanceExpansion = hammingDistanceExpansion
self.barcodes = collections.defaultdict(
dict) # alias -> barcode -> index
# alias -> barcode -> (index, hammingDistance)
self.extendedBarcodes = collections.defaultdict(dict)
self.pending_files = dict()
for barcodeFile in barcode_files:
barcodeFileAlias = self.path_to_barcode_alias(barcodeFile)
if lazyLoad is not None and (barcodeFileAlias in lazyLoad or lazyLoad =='*'):
logging.info(f"Lazy loading {barcodeFile}, alias {barcodeFileAlias}")
self.pending_files[barcodeFileAlias] = barcodeFile
continue
logging.info(f"Parsing {barcodeFile}, alias {barcodeFileAlias}")
self.parse_barcode_file(barcodeFile)
if hammingDistanceExpansion > 0:
self.expand(hammingDistanceExpansion, alias=barcodeFileAlias)
def getTargetCount(self, barcodeFileAlias):
return(len(self.barcodes[barcodeFileAlias]), len(self.extendedBarcodes[barcodeFileAlias]))
def expand(
self,
hammingDistanceExpansion,
alias,
reportCollisions=True,
spaceFill=None):
barcodes = self.barcodes[alias]
# hammingBarcode -> ( ( distance,origin) )
hammingSpace = collections.defaultdict(list)
for barcode in barcodes:
for hammingDistance in range(0, hammingDistanceExpansion + 1):
for hammingInstance in hamming_circle(
barcode, hammingDistance, 'ACTGN'):
hammingSpace[hammingInstance].append(
(hammingDistance, barcode))
# Resolve all
for hammingBarcode in hammingSpace:
# Check if there is a closest origin:
sortedDistances = sorted(hammingSpace[hammingBarcode])
if len(sortedDistances) > 1 and (
sortedDistances[0][0] == sortedDistances[1][0]):
# We cannot resolve this, two or more origins are at the same distance:
#print('Cannot resolve %s' % hammingBarcode)
continue
hammingDistance, origin = sortedDistances[0]
self.addBarcode(
alias,
barcode=hammingBarcode,
index=self.barcodes[alias][origin],
hammingDistance=hammingDistance,
originBarcode=origin)
def addBarcode(
self,
barcodeFileAlias,
barcode,
index,
hammingDistance=0,
originBarcode=None):
if hammingDistance == 0:
self.barcodes[barcodeFileAlias][barcode] = index
else:
if originBarcode is None:
raise ValueError()
self.extendedBarcodes[barcodeFileAlias][barcode] = (
index, originBarcode, hammingDistance)
# get index and hamming distance to barcode returns none if not Available
def getIndexCorrectedBarcodeAndHammingDistance(self, barcode, alias, try_lazy_load_pending=True):
# Check if the alias still needs to be loaded:
if barcode in self.barcodes[alias]:
return (self.barcodes[alias][barcode], barcode, 0)
if barcode in self.extendedBarcodes[alias]:
return self.extendedBarcodes[alias][barcode]
if alias in self.pending_files:
if not try_lazy_load_pending:
raise RecursionError()
self.parse_pending_barcode_file_of_alias(alias)
return self.getIndexCorrectedBarcodeAndHammingDistance(barcode, alias, try_lazy_load_pending=False)
return (None, None, None)
def list(self, showBarcodes=5): # showBarcodes=None show all
for barcodeAlias, mapping in self.barcodes.items():
print(
f'{len(mapping)} barcodes{Style.DIM} obtained from {Style.RESET_ALL}{barcodeAlias}')
if len(mapping):
for bcId in list(mapping.keys())[:showBarcodes]:
try:
print(('%s%s%s%s%s%s' % (Fore.GREEN, bcId,
Fore.WHITE, '→', mapping[bcId], Style.RESET_ALL)))
except Exception as e:
print(('%s%s%s%s%s%s' % (Fore.GREEN, bcId,
Fore.WHITE, '->', mapping[bcId], Style.RESET_ALL)))
if showBarcodes is not None and len(mapping) > showBarcodes:
print(
f'{Style.DIM} %s more ...\n{Style.RESET_ALL}' %
(len(mapping) - showBarcodes))
def getBarcodeMapping(self):
return self.barcodes
def __getitem__(self, alias):
if alias in self.pending_files:
self.parse_pending_barcode_file_of_alias(alias)
return self.barcodes.get(alias)