[45ad7e]: / singlecellmultiomics / alleleTools / alleleTools.py

Download this file

375 lines (308 with data), 15.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pysam
import argparse
import collections
import functools
import gzip
import os
from singlecellmultiomics.utils import Prefetcher
def set_defaultdict():
    """Factory for a defaultdict of sets (base -> set of allele names)."""
    return collections.defaultdict(set)


def nested_set_defaultdict():
    """Factory for a two-level mapping: position -> base -> set of allele names."""
    return collections.defaultdict(set_defaultdict)


def get_allele_dict():
    """Create the three-level variant table: chrom -> pos -> base -> set of allele names.

    These factories are named module-level functions rather than lambdas,
    presumably so the resulting structure stays picklable — confirm if relied on.
    """
    return collections.defaultdict(nested_set_defaultdict)
class AlleleResolver(Prefetcher):
def clean_vcf_name(self, vcffile):
    """Normalise a VCF reference to a plain ``str`` path.

    Args:
        vcffile: a path (``str`` or ``bytes``) or an opened
            ``pysam.VariantFile``.

    Returns:
        str: the file path as an ascii-decoded string.
    """
    # isinstance is the idiomatic (and subclass-safe) type check,
    # replacing the original `type(x) == T` comparisons.
    if isinstance(vcffile, pysam.libcbcf.VariantFile):
        # Extract the underlying filename from the opened handle
        vcffile = vcffile.filename.decode('ascii')
    if isinstance(vcffile, bytes):
        vcffile = vcffile.decode('ascii')
    return vcffile
def __init__(self, vcffile=None,
             chrom=None,
             phased=True,
             uglyMode=False,
             lazyLoad=False,
             select_samples=None,
             use_cache=False,
             ignore_conversions=None,
             verbose=False,
             region_start=None,
             region_end=None
             # When this flag is true a cache file is generated containing
             # usable SNPs for every chromosome in gzipped format
             ):
    """Initialise AlleleResolver

    Args:
        vcffile (str): path of vcf file
        chrom (str): contig/chromosome to prepare variants for
        phased(bool) : the variants in the vcf file are phased (every sample is one haplotype)
        uglyMode (bool) : the vcf file is invalid (not indexed) and has to be loaded to memory
        verbose (bool) : Print debug information
        lazyLoad (bool) : the vcf file is valid and indexed and does not need to be loaded to memory
        select_samples (list) : Use only these samples from the VCF file
        use_cache (bool) : When this flag is true a cache file is generated containing usable SNPs for every chromosome in gzipped format
        ignore_conversions(set) : conversions to ignore {(ref, alt), ..} , for example set( ('C','T'), ('G','A') )
        region_start(int) : only load variants within this range (region_start-region_end) when reading a cached variant file
        region_end(int) : only load variants within this range (region_start-region_end) when reading a cached variant file
    """
    # Snapshot the constructor arguments so instance()/prefetch() can build
    # clones. NOTE: locals().copy() must stay the FIRST statement — any
    # local variable defined before it would leak into self.args.
    self.args = locals().copy()
    del self.args['self']
    self.ignore_conversions = ignore_conversions
    self.phased = phased
    self.verbose = verbose
    self.locationToAllele = get_allele_dict()  # chrom -> pos-> base -> sample(s)
    self.select_samples = select_samples
    self.region_start = region_start
    self.region_end = region_end
    self.lazyLoad = lazyLoad
    # Without a VCF the resolver stays empty; alleles can still be added
    # manually via addAlleleInfoOneBased.
    if vcffile is None:
        return
    self.vcffile = self.clean_vcf_name(vcffile)
    self.use_cache = use_cache
    if use_cache:
        # Caching is only implemented for the lazy (per-contig) path
        lazyLoad = True
    # Probe whether pysam can open the file; on failure fall back to the
    # line-based "ugly" parser below.
    try:
        with pysam.VariantFile(vcffile) as f:
            pass
    except Exception as e:
        print(e)
        uglyMode = True
        if self.verbose:
            print(
                'Pysam cannot read the VCF, rolling back to slower homebrew parser')
    if lazyLoad and uglyMode:
        if self.verbose:
            print(
                'Lazy loading is not supported for non proper VCFs. Loading all variants to memory.')
        if self.select_samples is not None:
            raise NotImplementedError(
                "Sample selection is not implemented for non proper VCF")
        lazyLoad = False
    # collections.defaultdict(set) ) #(chrom, pos)-> base -> sample(s)
    if uglyMode:
        # Homebrew parser: expects whitespace-separated columns
        # CHROM POS ID REF ALT; assumes the file is grouped by chromosome
        # so reading can stop once the target chrom has been passed.
        foundIt = False  # If the target chromosome was found
        with open(vcffile, 'r') as f:
            for line in f:
                if line[0] != '#':
                    chromosome, location, id, ref, alt = line.strip().split()
                    if chrom is None or chromosome == chrom:
                        pos = int(location)
                        foundIt = True
                        # VCF positions are 1-based; stored keys are 0-based
                        self.locationToAllele[chromosome][pos - 1][ref].add("REF")
                        self.locationToAllele[chromosome][pos - 1][alt].add("ALT")
                    elif chrom is not None and foundIt:
                        print('Stopping for speed')
                        break
        if self.verbose:
            print('Finished loading dirty vcf file')
    else:
        if not lazyLoad:
            # Eagerly load the requested contig (or everything if chrom is None)
            self.fetchChromosome(vcffile, chrom)
        else:
            if self.verbose:
                print('Lazy loading allele file')
            pass
def addAlleleInfoOneBased(self, chromosome, location, base, alleleName):
    """Register that `base` at (chromosome, location) supports allele `alleleName`.

    NOTE(review): despite the "OneBased" name, `location` is stored as-is,
    while the VCF loaders store pos - 1 — confirm callers use the intended
    coordinate convention.
    """
    allele_names = self.locationToAllele[chromosome][location][base]
    allele_names.add(alleleName)
def write_cache(self, path, chrom):
    """Dump the variants known for `chrom` to a gzipped TSV cache file.

    Later lookups for the same contig can then be served from the cache,
    which is faster than re-parsing the VCF.

    Args:
        path (str): destination path of the cache file
        chrom (str): contig/chromosome to dump (every contig has its own cache)
    """
    # Write to a scratch file and rename at the end so readers never
    # observe a half-written cache.
    scratch_path = f'{path}.unfinished'
    contig_table = self.locationToAllele[chrom]
    with gzip.open(scratch_path, 'wt') as handle:
        for position in sorted(contig_table):
            for base, samples in contig_table[position].items():
                sample_field = ','.join(sorted(samples))
                handle.write(f'{position}\t{base}\t{sample_field}\n')
    os.rename(scratch_path, path)
def read_cached(self, path, chrom):
    """Load a gzipped cache file written by write_cache.

    When region_start/region_end are set, only records inside the window
    are kept; records are assumed position-sorted, so reading stops at the
    first position past region_end.

    Args:
        path (str): path of the cache file
        chrom (str): contig/chromosome the cache belongs to
    """
    lower, upper = self.region_start, self.region_end
    with gzip.open(path, 'rt') as handle:
        for record in handle:
            position_field, base, samples = record.strip().split('\t', 3)
            position = int(position_field)
            if lower is not None and position < lower:
                continue
            if upper is not None and position > upper:
                break
            self.locationToAllele[chrom][position][base] = set(samples.split(','))
def instance(self, arg_update):
    """Build a new AlleleResolver configured like this one, with overrides.

    Args:
        arg_update (dict): constructor-argument overrides applied on top of
            the arguments this resolver was originally built with.

    Returns:
        AlleleResolver: the newly constructed clone.
    """
    # Work on a copy so self.args is never mutated.
    args = dict(self.args)
    args.pop('self', None)  # defensive: locals() snapshot may contain self
    # BUGFIX: the overrides were previously ignored, so prefetch()'s
    # region_start/region_end restriction never reached the clone.
    args.update(arg_update)
    return AlleleResolver(**args)
def prefetch(self, contig, start, end):
    """Return a clone of this resolver restricted to contig:start-end.

    Args:
        contig (str): contig/chromosome to load variants for
        start (int): window start coordinate
        end (int): window end coordinate

    Returns:
        AlleleResolver: clone holding only the requested window's variants
        (empty when the contig is absent from the VCF).
    """
    clone = self.instance({'region_start': start, 'region_end': end})
    try:
        # BUGFIX: fetch into the clone — previously this fetched into
        # self, so the returned clone contained no variants.
        clone.fetchChromosome(self.vcffile, contig, True)
    except ValueError:
        # This means the chromosome is not available
        pass
    return clone
def fetchChromosome(self, vcffile, chrom, clear=False):
    """Load usable variants for `chrom` from the VCF into self.locationToAllele.

    Serves from (and writes to) a per-contig gzipped cache when use_cache
    is enabled. Positions are stored 0-based (rec.pos - 1).

    Args:
        vcffile: VCF path or opened pysam.VariantFile
        chrom (str): contig/chromosome to load
        clear (bool): drop all previously loaded variants first
    """
    if clear:
        self.locationToAllele = get_allele_dict()  # chrom -> pos-> base -> sample(s)
    vcffile = self.clean_vcf_name(vcffile)
    # allocate:
    # Decide whether this contig's variants should be cached: scaffold /
    # unplaced contigs and ERCC spike-ins (presumably) are excluded.
    write_cache_file_flag = False
    if self.use_cache:
        if (chrom.startswith('KN') or chrom.startswith('KZ') or chrom.startswith(
                'chrUn') or chrom.endswith('_random') or 'ERCC' in chrom):
            write_cache_file_flag = False
        else:
            write_cache_file_flag = True
    if self.use_cache and write_cache_file_flag:
        allele_dir = f'{os.path.abspath(vcffile)}_allele_cache/'
        if not os.path.exists(allele_dir):
            os.makedirs(allele_dir)
        cache_file_name = f'{allele_dir}/{chrom}'
        if self.select_samples is not None:
            # Sample selection changes the cache contents, so it is part
            # of the cache-file name.
            sample_list_id = '-'.join(sorted(list(self.select_samples)))
            cache_file_name = cache_file_name + '_' + sample_list_id
        cache_file_name += '.tsv.gz'
        if os.path.exists(cache_file_name):
            if self.verbose:
                print(f"Cached file exists at {cache_file_name}")
            self.read_cached(cache_file_name, chrom)
            return
        if self.verbose:
            print(
                f"Cache enabled, but file is not available, creating cache file at {cache_file_name}")
    # Sentinel entry at position -1 marks this contig as fetched, so lazy
    # loading does not retry it even when no real variants are found.
    self.locationToAllele[chrom][-1]['N'].add('Nop')
    unTrusted = []  # NOTE(review): populated nowhere; pruning code below is commented out
    added = 0
    if self.verbose:
        print(f'Reading variants for {chrom} ', end='')
    with pysam.VariantFile(vcffile) as v:
        try:
            for rec in v.fetch(chrom, start=self.region_start, stop=self.region_end):
                used = False  # at least one base was assigned to a sample/allele
                bad = False   # site rejected (indel, uninformative, banned conversion)
                bases_to_alleles = collections.defaultdict(
                    set)  # base -> samples
                if self.phased:  # variants are phased, assign a random allele
                    if len(rec.samples) == 0:  # File without samples
                        # Assign fixed labels 'r' (ref) and 'a' (alt).
                        # NOTE(review): set('r') relies on 1-char strings.
                        bases_to_alleles[rec.ref] = set('r')
                        bases_to_alleles[rec.alts[0]] = set('a')
                        self.locationToAllele[rec.chrom][rec.pos - 1] = bases_to_alleles
                    else:
                        samples_assigned = set()
                        most_assigned_base = 0  # NOTE(review): unused
                        monomorphic = False
                        for sample, sampleData in rec.samples.items():
                            if self.select_samples is not None and sample not in self.select_samples:
                                continue
                            for base in sampleData.alleles:
                                if base is None:
                                    # This site is monomorphic:
                                    monomorphic = True
                                    continue
                                if len(base) == 1:
                                    bases_to_alleles[base].add(sample)
                                    used = True
                                    samples_assigned.add(sample)
                                else:  # This location cannot be trusted:
                                    bad = True
                        # We can prune this site if all samples are associated
                        # with the same base
                        if self.select_samples is not None and used:
                            if len(samples_assigned) != len(
                                    self.select_samples):
                                # The site is not informative
                                bad = True
                        if monomorphic and len(bases_to_alleles) > 0:
                            bad = False
                        elif len(bases_to_alleles) < 2:
                            bad = True
                            # The site is not informative
                else:  # not phased
                    if not all(
                            len(allele) == 1 for allele in rec.alleles):  # only select SNVs
                        bad = True
                    else:
                        bad = False
                        # Label alleles with arbitrary letters U..Z
                        for allele, base in zip('UVWXYZ', rec.alleles):
                            bases_to_alleles[base].add(allele)
                            used = True
                if not bad and self.ignore_conversions is not None:  # prune conversions which are banned
                    bad = any(
                        ((rec.ref, base) in self.ignore_conversions for base in bases_to_alleles))
                if used and not bad:
                    self.locationToAllele[rec.chrom][rec.pos - 1] = bases_to_alleles
                    added += 1
        except Exception as e:
            raise
    # for t in unTrusted:
    #    if t in self.locationToAllele:
    #        del self.locationToAllele[t[0]][t[1]]
    # del unTrusted
    if self.verbose:
        print(f'{added} variants [OK]')
    if self.use_cache and write_cache_file_flag:
        if self.verbose:
            print("writing cache file")
        try:
            self.write_cache(cache_file_name, chrom)
        except Exception as e:
            # Best-effort: a failed cache write must not abort the fetch
            if self.verbose:
                print(f"Exception writing cache: {e}")
            pass  # @todo
def getAllele(self, reads):
    """Collect the alleles supported by the aligned bases of `reads`.

    Args:
        reads: iterable of pysam aligned segments; None entries and
            unmapped reads are skipped.

    Returns:
        set: allele names for which a base gave unambiguous
        (single-allele) evidence.
    """
    supported = set()
    for read in reads:
        if read is None or read.is_unmapped:
            continue
        contig = read.reference_name
        for query_pos, ref_pos in read.get_aligned_pairs(matches_only=True):
            observed_base = read.query_sequence[query_pos]
            hits = self.getAllelesAt(contig, ref_pos, observed_base)
            # Only bases mapping to exactly one allele are informative
            if hits is not None and len(hits) == 1:
                supported.update(hits)
    return supported
# @functools.lru_cache(maxsize=1000) not necessary anymore... complete data is already saved in dict
def has_location(self, chrom, pos):
    """Return True when a variant is known at chrom:pos (0-based).

    Lazily fetches the contig from the VCF on first access when lazyLoad
    is enabled. An 'invalid contig' fetch error yields True, mirroring
    the original behaviour — presumably "do not treat as missing";
    confirm callers rely on this.
    """
    if self.lazyLoad and chrom not in self.locationToAllele:
        try:
            self.fetchChromosome(self.vcffile, chrom, clear=True)
        except Exception as e:
            message = str(e)
            if 'invalid contig' in message:
                # The contig does not exists
                return True
            if 'fetch requires an index' in message:
                raise Exception('The variant file used for allele resolving does not have an index file. Use bcftools index, or vcftools index to generate an index')
            print('ERROR, in has_location (Allele Resolver):', e)
    return chrom in self.locationToAllele and pos in self.locationToAllele[chrom]
def getAllelesAt(self, chrom, pos, base):
    """Look up which allele(s) `base` at chrom:pos (0-based) belongs to.

    Lazily fetches the contig from the VCF on first access when lazyLoad
    is enabled.

    Returns:
        set or None: allele names supporting `base`, or None when the
        site or base is unknown.
    """
    if self.lazyLoad and chrom not in self.locationToAllele:
        try:
            self.fetchChromosome(self.vcffile, chrom, clear=True)
        except Exception as e:
            if 'fetch requires an index' in str(e):
                raise Exception('The variant file used for allele resolving does not have an index file. Use bcftools index, or vcftools index to generate an index')
            print('ERROR, in getAllelesAt:', e)
    positions = self.locationToAllele.get(chrom)
    if positions is None or pos not in positions:
        return None
    # dict.get returns None for an unknown base without inserting a key
    return positions[pos].get(base)
# FWD_-13_C REV_-16_C
# FWD_-16_C REV_+12_C