[9b26b7]: / third_party / nucleus / util / variantcall_utils.py

Download this file

275 lines (208 with data), 9.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# Copyright 2018 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""VariantCall utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from third_party.nucleus.protos import struct_pb2
from third_party.nucleus.protos import variants_pb2
from third_party.nucleus.util import struct_utils
from third_party.nucleus.util import vcf_constants
# Special-cased FORMAT fields that are first-class fields in the VariantCall.
_GL = 'GL'
_GT = 'GT'
def set_format(variant_call, field_name, value, vcf_object=None):
"""Sets a field of the info map of the `VariantCall` to the given value(s).
`variant_call.info` is analogous to the FORMAT field of a VCF call.
Example usage:
with vcf.VcfReader('/path/to/my.vcf') as vcf_reader:
for variant in vcf_reader:
first_call = variant.calls[0]
# Type can be inferred for reserved VCF fields.
set_format(first_call, 'AD', 25)
# Specify the reader explicitly for unknown fields.
set_format(first_call, 'MYFIELD', 30, vcf_reader)
Args:
variant_call: VariantCall proto. The VariantCall to modify.
field_name: str. The name of the field to set.
value: A single value or list of values to update the VariantCall with.
The type of the value is determined by the `vcf_object` if one is given,
otherwise is looked up based on the reserved FORMAT fields in the VCF
specification.
vcf_object: (Optional) A VcfReader or VcfWriter object. If not None, the
type of the field is inferred from the associated VcfReader or VcfWriter
based on its name. Otherwise, the type is inferred if it is a reserved
field.
"""
if field_name == _GL:
set_gl(variant_call, value)
return
if field_name == _GT:
set_gt(variant_call, value)
return
if vcf_object is None:
set_field_fn = vcf_constants.reserved_format_field_set_fn(field_name)
else:
set_field_fn = vcf_object.field_access_cache.format_field_set_fn(field_name)
set_field_fn(variant_call.info, field_name, value)
def get_format(variant_call, field_name, vcf_object=None):
"""Returns the value of the `field_name` FORMAT field.
The `vcf_object` is used to determine the type of the resulting value. If it
is a single value or a Flag, that single value will be returned. Otherwise,
the list of values is returned.
Args:
variant_call: VariantCall proto. The VariantCall of interest.
field_name: str. The name of the field to retrieve values from.
vcf_object: (Optional) A VcfReader or VcfWriter object. If not None, the
type of the field is inferred from the associated VcfReader or VcfWriter
based on its name. Otherwise, the type is inferred if it is a reserved
field.
"""
if field_name == _GL:
return get_gl(variant_call)
if field_name == _GT:
return get_gt(variant_call)
if vcf_object is None:
get_field_fn = vcf_constants.reserved_format_field_get_fn(field_name)
else:
get_field_fn = vcf_object.field_access_cache.format_field_get_fn(field_name)
return get_field_fn(variant_call.info, field_name)
# The following functions are convenience methods for getting/setting some
# reserved FORMAT fields of a VariantCall as well as some non-reserved FORMAT
# fields used by DeepVariant. Note that these functions will use the types of
# each field as defined by the VCF 4.3 specification, mirrored in
# vcf_constants.py, so if you have redefined any of these fields to have
# different types these functions will not do what you want.
def set_ad(variant_call, ad):
"""Sets the allele depth of the VariantCall."""
set_format(variant_call, 'AD', ad)
def get_ad(variant_call):
"""Gets the allele depth of the VariantCall."""
return get_format(variant_call, 'AD')
def set_gl(variant_call, gl):
"""Sets the genotype likelihoods of the VariantCall.
Args:
variant_call: VariantCall proto. The VariantCall to modify.
gl: list(float). The list of genotype likelihoods for the VariantCall.
"""
# Note: genotype_likelihood is extracted to a first-class field within
# VariantCall. Consequently, we just set its value directly here.
variant_call.genotype_likelihood[:] = gl
def get_gl(variant_call):
"""Returns the genotype likelihoods of the VariantCall.
Args:
variant_call: VariantCall proto. The VariantCall for which to return GLs.
Returns:
A list of floats representing the genotype likelihoods of this call.
"""
return variant_call.genotype_likelihood
def set_gt(variant_call, gt):
"""Sets the genotypes of the VariantCall.
Args:
variant_call: VariantCall proto. The VariantCall to modify.
gt: list(int). The list of genotypes for the VariantCall.
"""
# Note: genotype is extracted to a first-class field within
# VariantCall. Consequently, we just set its value directly here.
variant_call.genotype[:] = gt
def get_gt(variant_call):
"""Returns the genotypes of the VariantCall.
Args:
variant_call: VariantCall proto. The VariantCall for which to return GTs.
Returns:
A list of ints representing the genotype indices of this call.
"""
return variant_call.genotype
def set_gq(variant_call, gq):
"""Sets the genotype quality of the VariantCall."""
set_format(variant_call, 'GQ', gq)
def get_gq(variant_call):
"""Gets the genotype quality of the VariantCall."""
return get_format(variant_call, 'GQ')
def set_med_dp(variant_call, med_dp):
"""Sets the 'MED_DP' field of the VariantCall."""
struct_utils.set_int_field(variant_call.info, 'MED_DP', med_dp)
def get_med_dp(variant_call):
"""Gets the 'MED_DP' field of the VariantCall."""
return struct_utils.get_int_field(
variant_call.info, 'MED_DP', is_single_field=True)
def set_min_dp(variant_call, min_dp):
"""Sets the 'MIN_DP' field of the VariantCall."""
struct_utils.set_int_field(variant_call.info, 'MIN_DP', min_dp)
def get_min_dp(variant_call):
"""Gets the 'MIN_DP' field of the VariantCall."""
return struct_utils.get_int_field(
variant_call.info, 'MIN_DP', is_single_field=True)
def set_bam_fname(variant_call, bam_fname):
"""Sets 'BAM_FNAME' field of the VariantCall."""
return struct_utils.set_string_field(variant_call.info, 'BAM_FNAME',
bam_fname)
def has_genotypes(variant_call):
"""Returns True iff the VariantCall has one or more called genotypes.
Args:
variant_call: VariantCall proto. The VariantCall to evaluate.
Returns:
True if the VariantCall has one or more called genotypes, False otherwise.
"""
return any(gt >= 0 for gt in variant_call.genotype)
def has_full_genotypes(variant_call):
"""Returns True iff the VariantCall has only known genotypes.
Args:
variant_call: VariantCall proto. The VariantCall to evaluate.
Returns:
True if all `genotype` fields are known genotypes.
"""
return all(gt >= 0 for gt in variant_call.genotype)
def ploidy(variant_call):
"""Returns the ploidy of the VariantCall.
Args:
variant_call: VariantCall proto. The VariantCall to evaluate.
Returns:
The ploidy of the call (a non-negative integer).
"""
# Unknown genotypes are represented as -1 in VariantCall protos. When
# a VCF is parsed that contains multiple ploidies in different samples,
# a separate padding value of -2**30 - 1 is inserted into the calls.
return sum(gt >= -1 for gt in variant_call.genotype)
def has_variation(variant_call):
"""Returns True if and only if the call has a non-reference genotype.
Args:
variant_call: VariantCall proto. The VariantCall to evaluate.
Returns:
True if and only if the call has a non-reference genotype.
"""
return any(gt > 0 for gt in variant_call.genotype)
def is_heterozygous(variant_call):
"""Returns True if and only if the call is heterozygous.
Args:
variant_call: VariantCall proto. The VariantCall to evaluate.
Returns:
True if and only if the call is heterozygous.
"""
return len({gt for gt in variant_call.genotype if gt >= 0}) >= 2