[9b26b7]: / deepvariant / dv_utils_using_clif.py

Download this file

136 lines (116 with data), 5.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Copyright 2023 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Utility functions that uses dependencies with CLIF under the hood."""
import enum
from deepvariant import dv_utils
from deepvariant.protos import deepvariant_pb2
from third_party.nucleus.util import ranges
from third_party.nucleus.util import variant_utils
from tensorflow.core.example import example_pb2
@enum.unique
class EncodedVariantType(enum.Enum):
    """Enum capturing the int64 values we encode for different variant types.

    TPUs really like fixed length features, which makes it very difficult to
    extract the type of a variant for an example from an encoded Variant
    protobuf or even a string value like "snp". The current best option appears
    to be to encode the type of a variant directly in an example as an int64.
    This enum provides a mapping between those raw int64 values in the example
    and a human-meaningful name for that type.

    The enum is marked unique so that every raw int64 value maps back to
    exactly one member; adding a member that reuses an existing value fails at
    import time instead of silently creating an alias.
    """

    UNKNOWN = 0  # A variant of unknown type.
    SNP = 1  # The variant is a SNP.
    INDEL = 2  # The variant is an indel.
def encoded_variant_type(variant):
    """Returns the EncodedVariantType that classifies `variant`.

    The variant is tested with `variant_utils.is_snp` first and then with
    `variant_utils.is_indel`; the first check that succeeds decides the result.
    A variant that passes neither check is reported as UNKNOWN.

    Args:
      variant: nucleus.Variant proto. The variant whose EncodedVariantType we
        want to get.

    Returns:
      EncodedVariantType enum value: SNP, INDEL, or UNKNOWN.
    """
    # Guard-clause form: the SNP check takes precedence over the indel check.
    if variant_utils.is_snp(variant):
        return EncodedVariantType.SNP
    if variant_utils.is_indel(variant):
        return EncodedVariantType.INDEL
    return EncodedVariantType.UNKNOWN
def make_example(
    variant,
    alt_alleles,
    encoded_image,
    shape,
    second_image=None,
    sequencing_type=0,
):
    """Creates a new tf.Example suitable for use with DeepVariant.

    The returned example carries these features: 'locus', the variant itself
    (written by `dv_utils.example_set_variant`), 'variant_type',
    'alt_allele_indices/encoded', 'image/encoded', 'image/shape',
    'sequencing_type' and, only when `second_image` is given,
    'second_image/encoded' and 'second_image/shape'.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf containing
        information about a candidate variant call.
      alt_alleles: A set of strings. Indicates the alternate alleles used as
        "alt" when constructing the image. Every allele must occur in
        `variant.alternate_bases`.
      encoded_image: the encoded image of the reference and read data
        supporting variant. It is appended to a bytes feature unchanged.
      shape: a list of (width, height, channel).
      second_image: optional encoded image with read data from another DNA
        sample. May be given either as bytes (stored unchanged, like
        `encoded_image`) or as a str (stored latin-1 encoded). It is recorded
        with the same `shape` as the primary image.
      sequencing_type: int. The sequencing type of the input image.

    Returns:
      A tf.Example proto containing the standard DeepVariant features.

    Raises:
      ValueError: if an allele in `alt_alleles` is not one of
        `variant.alternate_bases`.
    """
    example = example_pb2.Example()
    features = example.features

    # Human-readable locus literal, stored as bytes.
    locus = ranges.to_literal(
        ranges.make_range(variant.reference_name, variant.start, variant.end)
    )
    features.feature['locus'].bytes_list.value.append(locus.encode('latin-1'))

    dv_utils.example_set_variant(example, variant)
    features.feature['variant_type'].int64_list.value.append(
        encoded_variant_type(variant).value
    )

    # Translate the chosen alt alleles into their sorted positions within the
    # variant's alternate_bases; list.index raises ValueError on a missing one.
    all_alts = list(variant.alternate_bases)
    alt_indices = sorted(all_alts.index(alt) for alt in alt_alleles)
    features.feature['alt_allele_indices/encoded'].bytes_list.value.append(
        deepvariant_pb2.CallVariantsOutput.AltAlleleIndices(
            indices=alt_indices
        ).SerializeToString()
    )

    features.feature['image/encoded'].bytes_list.value.append(encoded_image)
    features.feature['image/shape'].int64_list.value.extend(shape)

    if second_image is not None:
        # Only text needs encoding. Bytes are stored as-is, consistent with
        # how encoded_image is handled above; calling .encode() on bytes
        # would raise AttributeError.
        if isinstance(second_image, str):
            second_image = second_image.encode('latin-1')
        features.feature['second_image/encoded'].bytes_list.value.append(
            second_image
        )
        features.feature['second_image/shape'].int64_list.value.extend(shape)

    features.feature['sequencing_type'].int64_list.value.append(
        sequencing_type
    )
    return example