[9b26b7]: / deepvariant / pileup_image.py

Download this file

610 lines (535 with data), 23.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
# Copyright 2020 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Encodes reference and read data into a PileupImage for DeepVariant."""
import itertools
from typing import Iterable, List, Optional, Tuple
from absl import logging
import numpy as np
from deepvariant import dv_constants
from deepvariant import sample as sample_lib
from deepvariant.protos import deepvariant_pb2
from deepvariant.python import pileup_image_native
from third_party.nucleus.io import fasta
from third_party.nucleus.io import sam
from third_party.nucleus.protos import reads_pb2
from third_party.nucleus.protos import variants_pb2
from third_party.nucleus.util import ranges
def default_options(read_requirements=None):
"""Creates a PileupImageOptions populated with good default values."""
if not read_requirements:
read_requirements = reads_pb2.ReadRequirements(
min_base_quality=10,
min_mapping_quality=10,
min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT,
)
return deepvariant_pb2.PileupImageOptions(
reference_band_height=5,
base_color_offset_a_and_g=40,
base_color_offset_t_and_c=30,
base_color_stride=70,
allele_supporting_read_alpha=1.0,
allele_unsupporting_read_alpha=0.6,
other_allele_supporting_read_alpha=0.6,
reference_matching_read_alpha=0.2,
reference_mismatching_read_alpha=1.0,
indel_anchoring_base_char='*',
reference_alpha=0.4,
reference_base_quality=60,
positive_strand_color=70,
negative_strand_color=240,
base_quality_cap=40,
mapping_quality_cap=60,
height=dv_constants.PILEUP_DEFAULT_HEIGHT,
width=dv_constants.PILEUP_DEFAULT_WIDTH,
num_channels=dv_constants.PILEUP_NUM_CHANNELS,
read_overlap_buffer_bp=5,
read_requirements=read_requirements,
multi_allelic_mode=deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
# Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
random_seed=2101079370,
sequencing_type=deepvariant_pb2.PileupImageOptions.UNSPECIFIED_SEQ_TYPE,
alt_aligned_pileup='none',
types_to_alt_align='indels',
min_non_zero_allele_frequency=0.00001,
use_allele_frequency=False,
)
def _compute_half_width(width):
return int((width - 1) / 2)
def _represent_alt_aligned_pileups(representation, ref_image, alt_images):
"""Combines ref and alt-aligned pileup images according to the representation.
Args:
representation: string, one of "rows", "base_channels", "diff_channels".
ref_image: 3D numpy array. The original pileup image.
alt_images: list of either one or two 3D numpy arrays, both of the same
dimensions as ref_image. Pileup image(s) of the same reads aligned to the
alternate haplotype(s).
Returns:
One 3D numpy array containing a selection of data from the input arrays.
"""
# If there is only one alt, duplicate it to make all pileups the same size.
if len(alt_images) == 1:
alt_images = alt_images + alt_images
if len(alt_images) != 2:
raise ValueError('alt_images must contain exactly one or two arrays.')
# Ensure that all three pileups have the same shape.
if not ref_image.shape == alt_images[0].shape == alt_images[1].shape:
raise ValueError(
'Pileup images must be the same shape to be combined. '
'ref_image.shape is {}. alt_images[0].shape is {}. '
'alt_images[1].shape is {}.'.format(
ref_image.shape, alt_images[0].shape, alt_images[1].shape
)
)
if representation == 'rows':
# Combine all images: [ref, alt1, alt2].
return np.concatenate([ref_image] + alt_images, axis=0)
elif representation == 'base_channels':
channels = [ref_image[:, :, c] for c in range(ref_image.shape[2])]
# Add channel 0 (bases ATCG) of both alts as channels.
channels.append(alt_images[0][:, :, 0])
channels.append(alt_images[1][:, :, 0])
return np.stack(channels, axis=2)
elif representation == 'diff_channels':
channels = [ref_image[:, :, c] for c in range(ref_image.shape[2])]
# Add channel 5 (base differs from ref) of both alts as channels.
channels.append(alt_images[0][:, :, 5])
channels.append(alt_images[1][:, :, 5])
return np.stack(channels, axis=2)
else:
raise ValueError(
'_represent_alt_aligned_pileups received invalid value for '
'representation: "{}". Must be one of '
'rows, base_channels, or diff_channels.'.format(representation)
)
class PileupImageCreator(object):
"""High-level API for creating images of pileups of reads and reference bases.
This class provides a higher-level and more natural API for constructing
images at a candidate variant call site. Given a DeepVariantCall, which
contains the candidate variant call along with key supplementary information,
this class provides create_pileup_images() that will do all of the necessary
fetching of reads and reference bases from readers and pass those off to the
lower-level PileupImageEncoder to construct the image Tensor.
for dv_call in candidates:
allele_and_images = pic.create_pileup_images(dv_call)
...
A quick note on how we deal with multiple alt alleles:
Suppose variant has ref and two alt alleles. Assuming the sample is diploid,
we have the following six possible genotypes:
ref/ref => 0/0
ref/alt1 => 0/1
alt1/alt1 => 1/1
ref/alt2 => 0/2
alt1/alt2 => 1/2
alt2/alt2 => 2/2
In DeepVariant we predict the genotype count (0, 1, 2) for a specific set of
alternate alleles. If we only had a single alt, we'd construct an image for
ref vs. alt1:
image1 => ref vs. alt1 => determine if we are 0/0, 0/1, 1/1
If we add a second image for alt2, we get:
image2 => ref vs. alt2 => determine if we are 0/0, 0/2, 2/2
but the problem here is that we don't have a good estimate for the het-alt
state 1/2. So we construct a third image contrasting ref vs. either alt1 or
alt2:
image3 => ref vs. alt1 or alt2 => determines 0/0, 0/{1,2}, {1,2}/{1,2}
Given the predictions for each image:
image1 => p00, p01, p11
image2 => p00, p02, p22
image3 => p00, p0x, pxx where x is {1,2}
we calculate our six genotype likelihoods as:
0/0 => p00 [from any image]
0/1 => p01 [image1]
1/1 => p11 [image1]
0/2 => p02 [image2]
2/2 => p22 [image2]
1/2 => pxx [image3]
The function create_pileup_images() returns all of the necessary images, along
with the alt alleles used for each image.
"""
def __init__(
self,
options: deepvariant_pb2.PileupImageOptions,
ref_reader: fasta.IndexedFastaReader,
samples: List[sample_lib.Sample],
):
self._options = options
self._encoder = pileup_image_native.PileupImageEncoderNative(self._options)
self._channels_enum = self._encoder.all_channels_enum(
options.alt_aligned_pileup
)
self._ref_reader = ref_reader
self._samples = samples
def __getattr__(self, attr):
"""Gets attributes from self._options as though they are our attributes."""
return self._options.__getattribute__(attr)
@property
def half_width(self):
return _compute_half_width(self._options.width)
def get_channels(self):
return self._channels_enum
def get_reads(
self, variant: variants_pb2.Variant, sam_reader: sam.SamReader
) -> List[reads_pb2.Read]:
"""Gets the reads used to construct the pileup image around variant.
Args:
variant: A third_party.nucleus.protos.Variant proto describing the variant
we are creating the pileup image of.
sam_reader: Nucleus sam_reader from which to query.
Returns:
A list of third_party.nucleus.protos.Read protos.
"""
query_start = variant.start - self._options.read_overlap_buffer_bp
query_end = variant.end + self._options.read_overlap_buffer_bp
region = ranges.make_range(variant.reference_name, query_start, query_end)
return list(sam_reader.query(region))
def get_reference_bases(self, variant: variants_pb2.Variant) -> Optional[str]:
"""Gets the reference bases used to make the pileup image around variant.
Args:
variant: A third_party.nucleus.protos.Variant proto describing the variant
we are creating the pileup image of.
Returns:
A string of reference bases or None. Returns None if the reference
interval for variant isn't valid for some reason.
"""
start = variant.start - self.half_width
end = start + self._options.width
region = ranges.make_range(variant.reference_name, start, end)
if self._ref_reader.is_valid(region):
return self._ref_reader.query(region)
else:
return None
def _alt_allele_combinations(
self, variant: variants_pb2.Variant
) -> Iterable[List[str]]:
"""Yields the set of all alt_alleles for variant.
This function computes the sets of alt_alleles we want to use to cover all
genotype likelihood calculations we need for n alt alleles (see class docs
for background). The easiest way to do this is to calculate all combinations
of 2 alleles from ref + alts and then strip away the reference alleles,
leaving us with the set of alts for the pileup image encoder. The sets are
converted to sorted lists at the end for downstream consistency.
Args:
variant: third_party.nucleus.protos.Variant to generate the alt allele
combinations for.
Yields:
A series of lists containing the alt alleles we want to use for a single
pileup image. The entire series covers all combinations of alt alleles
needed for variant.
Raises:
ValueError: if options.multi_allelic_mode is UNSPECIFIED.
"""
ref = variant.reference_bases
alts = list(variant.alternate_bases)
if (
self.multi_allelic_mode
== deepvariant_pb2.PileupImageOptions.UNSPECIFIED
):
raise ValueError('multi_allelic_mode cannot be UNSPECIFIED')
elif (
self.multi_allelic_mode
== deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES
):
for alt in alts:
yield sorted([alt])
else:
for combination in itertools.combinations([ref] + alts, 2):
yield sorted(list(set(combination) - {ref}))
def build_pileup(
self,
dv_call: deepvariant_pb2.DeepVariantCall,
refbases: str,
reads_for_samples: List[List[reads_pb2.Read]],
alt_alleles: List[str],
sample_order: Optional[List[int]] = None,
custom_ref: bool = False,
):
"""Creates a pileup tensor for dv_call.
Args:
dv_call: learning.genomics.deepvariant.DeepVariantCall object with
information on our candidate call and allele support information.
refbases: A string options.width in length containing the reference base
sequence to encode. The middle base of this string should be at the
start of the variant in dv_call.
reads_for_samples: list by sample of Iterable of
third_party.nucleus.protos.Read objects that we'll use to encode the
read information supporting our call. Assumes each read is aligned and
is well-formed (e.g., has bases and quality scores, cigar). Rows of the
image are encoded in the same order as reads.
alt_alleles: A collection of alternative_bases from dv_call.variant that
we are treating as "alt" when constructing this pileup image. A read
will be considered supporting the "alt" allele if it occurs in the
support list for any alt_allele in this collection.
sample_order: A list of indices representing the order in which samples
should be represented in the pileup image. Example: [1,0,2] to swap the
first two samples out of three. This is None by default which puts the
samples in order.
custom_ref: True if refbases should not be checked for matching against
variant's reference_bases.
Returns:
A uint8 Tensor image of shape
[self.width, <sum of sample pileup heights>, DEFAULT_NUM_CHANNEL]
Raises:
ValueError: if any arguments are invalid.
"""
if len(refbases) != self.width:
raise ValueError(
'refbases is {} long but width is {}'.format(
len(refbases), self.width
)
)
if not alt_alleles:
raise ValueError('alt_alleles cannot be empty')
if any(alt not in dv_call.variant.alternate_bases for alt in alt_alleles):
raise ValueError(
(
'all elements of alt_alleles must be the alternate bases'
' of dv_call.variant'
),
alt_alleles,
dv_call.variant,
)
if len(self._samples) != len(reads_for_samples):
raise ValueError(
'The number of self._samples ({}) must be the same as the number of '
'reads_for_samples ({}).'.format(
len(self._samples), len(reads_for_samples)
)
)
image_start_pos = dv_call.variant.start - self.half_width
if not custom_ref and (
refbases[self.half_width] != dv_call.variant.reference_bases[0]
):
raise ValueError(
'The middle base of reference sequence in the window '
"({} at base {}) doesn't match first "
'character of variant.reference_bases ({}).'.format(
refbases[self.half_width],
self.half_width,
dv_call.variant.reference_bases,
)
)
def build_pileup_for_one_sample(
reads: List[reads_pb2.Read], sample: sample_lib.Sample
) -> List[np.ndarray]:
"""Create read pileup image section for one sample."""
# We start with n copies of our encoded reference bases.
rows = [
self._encoder.encode_reference(refbases)
] * self.reference_band_height
def _update_hap_index(
read: reads_pb2.Read, hp_tag_for_assembly_polishing: int
) -> int:
default_hap_idx = 0 # By default, reads with no HP is set to 0.
if 'HP' not in read.info:
return default_hap_idx
read_info_hp = read.info.get('HP')
if read_info_hp is None or not read_info_hp.values:
return default_hap_idx
hp_field = next(iter(read_info_hp.values))
if not hp_field.HasField('int_value'):
return default_hap_idx
hp_value = hp_field.int_value
if (
hp_tag_for_assembly_polishing > 0
and hp_value == hp_tag_for_assembly_polishing
):
# For the target HP tag, set it to -1 so it will be sorted on
# top of the pileup image.
return -1
elif hp_value < 0:
return 0 # For reads with HP < 0, assume it is not tagged.
else:
return hp_value
# Returns tuples of the form (haplotype, position, row),
# if the read can be encoded as a valid row to be used in the pileup
# image.
def _row_helper(
read: reads_pb2.Read,
) -> Optional[Tuple[int, int, np.ndarray]]:
"""A function that returns tuples of (haplotype, position, row)."""
read_row = self._encoder.encode_read(
dv_call, refbases, read, image_start_pos, alt_alleles
)
if read_row is None:
return None
hap_idx = 0 # By default, reads with no HP is set to 0.
if self._options.sort_by_haplotypes:
hap_idx = _update_hap_index(
read, self._options.hp_tag_for_assembly_polishing
)
return hap_idx, read.alignment.position.position, read_row
# We add a row for each read in order, down-sampling if the number of
# reads is greater than the max reads for each sample. Sort the reads by
# their alignment position.
random_for_image = np.random.RandomState(self._options.random_seed)
# Use sample height or default to pic height.
if sample.options.pileup_height != 0:
pileup_height = sample.options.pileup_height
else:
pileup_height = self.height
max_reads = pileup_height - self.reference_band_height
reads_indices = list(range(len(reads)))
if len(reads) > max_reads:
# Shuffle the indices instead of the reads, so that we won't change
# the order of the reads list.
random_for_image.shuffle(reads_indices)
pileup_of_reads = []
for reads_index in reads_indices:
read = reads[reads_index]
if len(pileup_of_reads) == max_reads:
break
row = _row_helper(read)
if row is None:
continue
pileup_of_reads.append(row)
pileup_of_reads = sorted(pileup_of_reads, key=lambda x: (x[0], x[1]))
rows += [read_row for _, _, read_row in pileup_of_reads]
# Finally, fill in any missing rows to bring our image to pileup_height
# rows with empty (all black) pixels.
n_missing_rows = pileup_height - len(rows)
if n_missing_rows > 0:
# Add values to rows to fill it out with zeros.
rows += [self._empty_image_row()] * n_missing_rows
return rows
sample_sections = []
if sample_order is None:
sample_order = range(len(self._samples))
for i in sample_order:
sample = self._samples[i]
sample_sections.extend(
build_pileup_for_one_sample(reads_for_samples[i], sample)
)
# Vertically stack the image rows to create a single
# h x w x DEFAULT_NUM_CHANNEL image.
return np.vstack(sample_sections)
def _empty_image_row(self) -> np.ndarray:
"""Creates an empty image row as an uint8 np.array."""
return np.zeros((1, self.width, self.num_channels), dtype=np.uint8)
def create_pileup_images(
self,
dv_call,
reads_for_samples,
sample_order=None,
haplotype_alignments_for_samples=None,
haplotype_sequences=None,
):
"""Creates a DeepVariant TF.Example for the DeepVariant call dv_call.
See class documents for more details.
Args:
dv_call: A learning.genomics.deepvariant.DeepVariantCall proto that we
want to create a TF.Example pileup image of.
reads_for_samples: list of read generators, one for each sample.
sample_order: A list of indices representing the order in which samples
should be represented in the pileup image. Example: [1,0,2] to swap the
first two samples out of three. This is None by default which puts the
samples in order.
haplotype_alignments_for_samples: list with a dict for each sample of read
alignments keyed by haplotype.
haplotype_sequences: dict of sequences keyed by haplotype.
Returns:
A list of tuples. The first element of the tuple is a set of alternate
alleles used as 'alt' when encoding this image. The second element is a
[w, h, DEFAULT_NUM_CHANNEL] uint8 Tensor of the pileup image for those
alt alleles.
"""
variant = dv_call.variant
# Ref bases to show at the top of the pileup:
ref_bases = self.get_reference_bases(variant)
if not ref_bases:
# This interval isn't valid => we are off the edge of the chromosome, so
# return None to indicate we couldn't process this variant.
return None
alt_aligned_representation = self._options.alt_aligned_pileup
def _pileup_for_pair_of_alts(alt_alleles):
"""Create pileup image for one combination of alt alleles."""
# Always create the ref-aligned pileup image.
ref_image = self.build_pileup(
dv_call=dv_call,
refbases=ref_bases,
reads_for_samples=reads_for_samples,
alt_alleles=alt_alleles,
sample_order=sample_order,
)
# Optionally also create pileup images with reads aligned to alts.
if alt_aligned_representation != 'none':
if (
haplotype_alignments_for_samples is None
or haplotype_sequences is None
):
# Use sample height or default to pic height.
sample_heights = [
sample.options.pileup_height for sample in self._samples
]
if None not in sample_heights:
pileup_height = sum(sample_heights)
else:
pileup_height = self.height
pileup_shape = (pileup_height, self.width, self.num_channels)
alt_images = [
np.zeros(pileup_shape, dtype=np.uint8) for alt in alt_alleles
]
else:
alt_images = []
for alt in alt_alleles:
if len(haplotype_sequences[alt]) != self.width:
logging.warning(
(
'haplotype_sequences[alt] is %d long but pileup '
'image width is %d. Giving up on this image'
),
len(haplotype_sequences[alt]),
self.width,
)
return None
alt_image = self.build_pileup(
dv_call=dv_call,
refbases=haplotype_sequences[alt],
reads_for_samples=[
sample[alt] for sample in haplotype_alignments_for_samples
],
alt_alleles=alt_alleles,
sample_order=sample_order,
custom_ref=True,
)
alt_images.append(alt_image)
composite_image = _represent_alt_aligned_pileups(
alt_aligned_representation, ref_image, alt_images
)
return composite_image
else:
return ref_image
retval = []
for alts in self._alt_allele_combinations(variant):
pileup = _pileup_for_pair_of_alts(alts)
# If the pileup is None, this can mean that we're near the edge of the
# contig, so one pileup width is invalid.
# Return None to indicate we couldn't process this variant.
if pileup is None:
return None
retval.append((alts, pileup))
return retval