Download this file

469 lines (365 with data), 18.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
import itertools
from singlecellmultiomics.molecule import Molecule
from singlecellmultiomics.molecule.nlaIII import NlaIIIMolecule
from singlecellmultiomics.molecule.nlaIII import AnnotatedNLAIIIMolecule
from singlecellmultiomics.molecule.chic import CHICMolecule
from singlecellmultiomics.molecule.chic import AnnotatedCHICMolecule
from collections import Counter
from singlecellmultiomics.utils.sequtils import complement
from itertools import product
from matplotlib.pyplot import get_cmap
from copy import copy
import numpy as np
complement_trans = str.maketrans('ATGC', 'TACG')
class TAPS:
# Methylated Cs get converted into T readout
def __init__(self, reference=None, reference_variants=None, taps_strand='F', **kwargs):
"""
Intialise the TAPS class
Args:
reference (pysam.FastaFile) : reference fasta file
reference_variants(pysam.VariantFile) : variants in comparison to supplied reference. @todo
"""
assert reference is None, 'reference is now supplied by the molecule'
if reference_variants is not None:
raise NotImplementedError()
self.overlap_tag = 'XM'
"""
z unmethylated C in CpG context (CG)
Z methylated C in CpG context (CG)
x unmethylated C in CHG context ( C[ACT]G )
X methylated C in CHG context ( C[ACT]G )
h unmethylated C in CHH context ( C[ACT][ACT] )
H methylated C in CHH context ( C[ACT][ACT] )
"""
self.context_mapping = dict()
self.context_mapping[False] = {
'CGA': 'z',
'CGT': 'z',
'CGC': 'z',
'CGG': 'z',
'CAG': 'x',
'CCG': 'x',
'CTG': 'x',
}
self.context_mapping[True] = {
context: letter.upper() for context,
letter in self.context_mapping[False].items()}
self.taps_strand = taps_strand
for x in list(itertools.product('ACT', repeat=2)):
self.context_mapping[True][''.join(['C'] + list(x))] = 'H'
self.context_mapping[False][''.join(['C'] + list(x))] = 'h'
self.colormap = copy(get_cmap('RdYlBu_r')) # Make a copy
self.colormap.set_bad((0,0,0)) # For reads without C's
def position_to_context(
self,
chromosome,
position,
ref_base,
observed_base='N',
strand=None,
reference=None
):
"""Extract bismark call letter from a chromosomal location given the observed base
Args:
chromosome (str) : chromosome / contig of location to test
position (int) : genomic location of location to test
observed_base(str) : base observed in the read
strand(int) : mapping strand, False: forward, True: Reverse
Returns:
context(str) : 3 basepair context
bismark_letter(str) : bismark call
"""
assert reference is not None
assert strand is not None
qbase = observed_base.upper()
#ref_base = reference.fetch( chromosome, position, position + 1).upper()
# if a vcf file is supplied we can extract the possible reference bases
# @todo
context = None
methylated = None
try:
if ref_base == 'C':
context = reference.fetch(chromosome, position, position + 3).upper()
if qbase == 'T':
methylated = True
elif qbase=='C':
methylated = False
elif ref_base == 'G':
origin = reference.fetch( chromosome, position - 2, position + 1).upper()
context = origin.translate(complement_trans)[::-1]
if qbase == 'A':
methylated = True
elif qbase == 'G':
methylated = False
else:
raise ValueError('Only supply reference C or G')
except ValueError: # Happens when the coordinates are outstide the reference:
context = None
methylated = None
except FileNotFoundError:
raise ValueError('Got a file not found error. This is probably caused by the reference handle being shared between processes. Stop doing that (quick solution: disable multithreading/multiprocess).')
#print(ref_base, qbase, strand, position, chromosome,context, '?' if methylated is None else ('methylated' if methylated else 'not methylated'))
if methylated is None:
symbol='.'
else:
symbol = self.context_mapping[methylated].get(context, '.')
return context, symbol
def molecule_to_context_call_dict(self, molecule):
"""Extract bismark call_string dictionary from a molecule
Args:
molecule(singlecellmultiomics.molecule.Molecule) : molecule to extract calls from
Returns:
context_call_dict(dict) : {(chrom,pos):bismark call letter}
"""
call_dict = {} # (chrom,pos)-> "bismark" call_string
for (chrom, pos), base in molecule.get_consensus().items():
context, letter = self.position_to_context(chromosome=molecule.chromosome,
position=pos,
reference=molecule.reference,
observed_base=base,
strand=molecule.strand)
if letter is not None:
call_dict[(chrom, pos)] = letter
return call_dict
class TAPSMolecule(Molecule):
def __init__(self, fragments=None, taps=None, classifier=None, taps_strand='F', allow_unsafe_base_calls=False, methylation_consensus_kwargs=None, **kwargs):
""" TAPSMolecule
Args:
fragments(list) : list of fragments to associate with the Molecule
taps(singlecellmultiomics.molecule.TAPS) : TAPS class which performs the methylation calling
classifier : fitted sklearn classifier, when supplied this classifier is used to obtain a consensus from which the methylation calls are generated.
"""
Molecule.__init__(self, fragments=fragments, **kwargs)
if taps is None:
raise ValueError("""Supply initialised TAPS class
taps = singlecellmultiomics.molecule.TAPS( )
""")
self.taps = taps # initialised TAPS class**self.
self.methylation_call_dict = None
self.classifier = classifier
self.taps_strand = taps_strand
self.allow_unsafe_base_calls = allow_unsafe_base_calls
self.get_consensus_dictionaries_kwargs = methylation_consensus_kwargs if methylation_consensus_kwargs is not None else dict()
def add_cpg_color_tag_to_read(self, read):
try:
CpG_methylation_rate = read.get_tag('sZ') / (read.get_tag('sZ') + read.get_tag('sz'))
cfloat = self.taps.colormap(CpG_methylation_rate)[:3]
except Exception as e:
CpG_methylation_rate = None
cfloat = self.taps.colormap._rgba_bad[:3]
read.set_tag('YC', '%s,%s,%s' % tuple((int(x * 255) for x in cfloat)))
def __finalise__(self):
super().__finalise__()
self.obtain_methylation_calls(**self.get_consensus_dictionaries_kwargs)
for read in self.iter_reads():
try:
CpG_methylation_rate = read.get_tag('sZ')/(read.get_tag('sZ')+read.get_tag('sz'))
cfloat = self.taps.colormap(CpG_methylation_rate)[:3]
except Exception as e:
CpG_methylation_rate = None
cfloat = self.taps.colormap._rgba_bad[:3]
read.set_tag('YC', '%s,%s,%s' % tuple( (int(x*255) for x in cfloat)))
def write_tags_to_psuedoreads(self, reads, call_super=True):
if call_super:
Molecule.write_tags_to_psuedoreads(self,reads)
for read in reads:
self.add_cpg_color_tag_to_read(read)
def is_valid(self, set_rejection_reasons=False):
if not super().is_valid(set_rejection_reasons=set_rejection_reasons):
return False
"""
try:
consensus = self.get_consensus(allow_unsafe=self.allow_unsafe_base_calls)
except ValueError:
if set_rejection_reasons:
self.set_rejection_reason('no_consensus')
return False
except TypeError:
if set_rejection_reasons:
self.set_rejection_reason('getPairGenomicLocations_failed')
return False
"""
if self.methylation_call_dict is None:
if set_rejection_reasons:
self.set_rejection_reason('methylation_calls_failed')
return False
return True
#def obtain_methylation_calls_experimental(self):
def obtain_methylation_calls(self, **get_consensus_dictionaries_kwargs):
""" This methods returns a methylation call dictionary
returns:
mcalls(dict) : (chrom,pos) : {'consensus': consensusBase, 'reference': referenceBase, 'call': call}
"""
expected_base_to_be_converted = ('G' if self.strand else 'C') \
if self.taps_strand=='F' else ('C' if self.strand else 'G')
try:
if True:
c_pos_consensus, phreds, coverage = self.get_consensus(dove_safe = not self.allow_unsafe_base_calls,
only_include_refbase=expected_base_to_be_converted,
with_probs_and_obs=True, # Include phred scores and coverage
**get_consensus_dictionaries_kwargs)
else:
c_pos_consensus = self.get_consensus(dove_safe = not self.allow_unsafe_base_calls,
only_include_refbase=expected_base_to_be_converted,
with_probs_and_obs=False, # Include phred scores and coverage
**get_consensus_dictionaries_kwargs)
except ValueError as e:
if 'MD tag not present' in str(e):
self.set_rejection_reason("MD_TAG_MISSING")
return None
raise
# obtain the context of the conversions:
conversion_contexts = {
(contig, position):
{'consensus': base_call,
'reference_base': expected_base_to_be_converted,
'cov': len(phreds[(contig, position)][base_call]),
'qual': np.mean( phreds[(contig, position)][base_call] ),
'context': self.taps.position_to_context(
chromosome=contig,
position=position,
reference=self.reference,
observed_base=base_call,
ref_base=expected_base_to_be_converted,
strand=self.strand)[1]}
for (contig, position), base_call in c_pos_consensus.items()}
# Write bismark tags:
self.set_methylation_call_tags(conversion_contexts)
return conversion_contexts
class TAPSNlaIIIMolecule(NlaIIIMolecule, TAPSMolecule):
"""Molecule class for combined TAPS and NLAIII """
def __init__(self, fragments=None, taps=None, taps_strand='R', **kwargs):
NlaIIIMolecule.__init__(self, fragments, **kwargs)
TAPSMolecule.__init__(self, fragments=fragments, taps_strand=taps_strand, taps=taps, **kwargs)
def write_tags(self):
NlaIIIMolecule.write_tags(self)
TAPSMolecule.write_tags(self)
def __finalise__(self):
NlaIIIMolecule.__finalise__(self)
TAPSMolecule.__finalise__(self)
def is_valid(self, set_rejection_reasons=False):
return NlaIIIMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons) and TAPSMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons)
def write_tags_to_psuedoreads(self, reads):
NlaIIIMolecule.write_tags_to_psuedoreads(self, reads)
TAPSMolecule.write_tags_to_psuedoreads(self, reads, call_super=False)
class AnnotatedTAPSNlaIIIMolecule(AnnotatedNLAIIIMolecule, TAPSMolecule):
"""Molecule class for combined TAPS, NLAIII and transcriptome """
def __init__(self, fragments=None, features=None, taps=None, **kwargs):
assert features is not None, "Supply features!"
AnnotatedNLAIIIMolecule.__init__(
self, fragments, features=features, **kwargs)
TAPSMolecule.__init__(self, fragments=fragments, taps=taps, **kwargs)
def write_tags(self):
AnnotatedNLAIIIMolecule.write_tags(self)
TAPSMolecule.write_tags(self)
def is_valid(self, set_rejection_reasons=False):
return AnnotatedNLAIIIMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons) and TAPSMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons)
def write_tags_to_psuedoreads(self, reads):
AnnotatedNLAIIIMolecule.write_tags_to_psuedoreads(self, reads)
TAPSMolecule.write_tags_to_psuedoreads(self, reads, call_super=False)
class TAPSCHICMolecule(CHICMolecule, TAPSMolecule):
"""Molecule class for combined TAPS and CHIC """
def __init__(self, fragments=None, taps=None, taps_strand='R', **kwargs):
CHICMolecule.__init__(self, fragments, **kwargs)
TAPSMolecule.__init__(self, fragments=fragments, taps_strand=taps_strand,taps=taps, **kwargs)
def write_tags(self):
CHICMolecule.write_tags(self)
TAPSMolecule.write_tags(self)
def is_valid(self, set_rejection_reasons=False):
return CHICMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons) and TAPSMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons)
def __finalise__(self):
CHICMolecule.__finalise__(self)
TAPSMolecule.__finalise__(self)
def write_tags_to_psuedoreads(self, reads):
CHICMolecule.write_tags_to_psuedoreads(self, reads)
TAPSMolecule.write_tags_to_psuedoreads(self, reads, call_super=False)
class AnnotatedTAPSCHICMolecule(AnnotatedCHICMolecule, TAPSMolecule):
"""Molecule class for combined TAPS, CHIC and transcriptome """
def __init__(self, fragments=None, features=None, taps_strand='R', taps=None, **kwargs):
assert features is not None, "Supply features!"
AnnotatedCHICMolecule.__init__(
self, fragments, features=features, **kwargs)
TAPSMolecule.__init__(self, fragments=fragments, taps_strand=taps_strand,taps=taps, **kwargs)
def write_tags(self):
AnnotatedCHICMolecule.write_tags(self)
TAPSMolecule.write_tags(self)
def __finalise__(self):
AnnotatedCHICMolecule.__finalise__(self)
TAPSMolecule.__finalise__(self)
def is_valid(self, set_rejection_reasons=False):
return AnnotatedCHICMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons) and TAPSMolecule.is_valid(
self, set_rejection_reasons=set_rejection_reasons)
def write_tags_to_psuedoreads(self, reads):
AnnotatedCHICMolecule.write_tags_to_psuedoreads(self, reads)
TAPSMolecule.write_tags_to_psuedoreads(self, reads, call_super=False)
def strip_array_tags(molecule):
for read in molecule.iter_reads():
read.set_tag('jM',None)
read.set_tag('jI',None)
return molecule
idmap = [''.join(p) for p in product('0123','0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')]
context_ids = {''.join(ctx):idmap[i] for i,ctx in enumerate(product('ACTG',repeat=3))}
class TAPSPTaggedMolecule(AnnotatedTAPSNlaIIIMolecule):
def __finalise__(self):
AnnotatedTAPSNlaIIIMolecule.__finalise__(self)
# Additional finalisation steps:
conversions_count = 0
forward_counts = 0
rev_counts = 0
conversion_locations = []
mismatches = 0
ambig_count = 0
molecule_variants=set()
# Obtain consensus call for the molecule
consensus = self.get_consensus(allow_unsafe=True)
context_obs = Counter()
for read in self.iter_reads():
for qpos, refpos, reference_base in read.get_aligned_pairs(with_seq=True, matches_only=True):
location = (read.reference_name, refpos)
#if (location[0], location[1]) in variant_locations:
# skipped+=1
# continue
query = read.seq[qpos]
context = self.reference.fetch(self.chromosome, refpos-1, refpos+2).upper()
# Check if call is the same as the consensus call:
if query!=consensus.get((read.reference_name,refpos), 'X'):
continue
reference_base = reference_base.upper()
if read.is_reverse: # forward, we sequence the other side
reference_base = complement(reference_base)
query = complement(query)
context=complement(context)
if query!=reference_base:
mismatches += 1
molecule_variants.add(refpos)
if (reference_base, query) in ( ('A','G'), ('G','A'), ('T','C'), ('C','T')):
conversion_locations.append(f'{location[0]}:{location[1]+1} {reference_base}>{query}' )
conversions_count+=1
context_obs[context_ids.get(context,'99')]+=1
if (reference_base, query) in ( ('A','G'), ('G','A') ):
forward_counts+=1
elif (reference_base, query) in (('T','C'), ('C','T')):
rev_counts+=1
self.set_meta('mM',mismatches)
self.set_meta('pP',conversions_count)
self.set_meta('pR',rev_counts)
self.set_meta('pF',forward_counts)
self.set_meta('pE',','.join(conversion_locations))
for context,obs in context_obs.most_common():
self.set_meta(context,obs)
self.mismatches = mismatches
self.conversions_count = conversions_count
self.context_obs = context_obs
self.rev_counts = rev_counts
self.forward_counts = forward_counts
self.conversion_locations=conversion_locations
def write_tags_to_psuedoreads(self, reads):
AnnotatedTAPSNlaIIIMolecule.write_tags_to_psuedoreads(self,reads)