[9b26b7]: / deepvariant / dv_vcf_constants.py

Download this file

150 lines (136 with data), 5.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Copyright 2017 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Library for generating VCF information created by DeepVariant."""
from third_party.nucleus.protos import variants_pb2
from third_party.nucleus.util import vcf_constants
# Release version of DeepVariant. deepvariant_header() records this value in
# the VCF header under the `DeepVariant_version` extra key.
DEEP_VARIANT_VERSION = '1.6.1'

# IDs for the VCF FILTER column.
DEEP_VARIANT_PASS = 'PASS'
DEEP_VARIANT_REF_FILTER = 'RefCall'
DEEP_VARIANT_QUAL_FILTER = 'LowQual'
DEEP_VARIANT_NO_CALL = 'NoCall'
# NOTE(review): GERMLINE is not registered as a FILTER by deepvariant_header()
# in this module; presumably it is consumed elsewhere -- confirm with callers.
DEEP_VARIANT_GERMLINE = 'GERMLINE'

# IDs for DeepVariant-specific per-sample FORMAT fields.
DEEP_VARIANT_MIN_DP_FORMAT = 'MIN_DP'
DEEP_VARIANT_MED_DP_FORMAT = 'MED_DP'
DEEP_VARIANT_VAF_FORMAT = 'VAF'

# Genotype code denoting an uncalled genotype.
UNCALLED_GENOTYPE = -1
def deepvariant_header(
    contigs, sample_names, add_info_candidates=False, include_med_dp=True
):
    """Builds the VcfHeader that accompanies DeepVariant's VCF output.

    The FILTER, INFO, FORMAT, and extra header entries are the fixed set that
    the DeepVariant pipeline emits. Only `contigs` and `sample_names` depend on
    the input data, which is why the caller must supply them.

    Args:
      contigs: list(ContigInfo). The contigs on which variants were called.
      sample_names: list(str). The samples present in the run.
      add_info_candidates: If True, the 'CANDIDATES' INFO field (a debugging
        aid) is declared in the header.
      include_med_dp: boolean. If True, the MED_DP FORMAT field is declared in
        the header.

    Returns:
      A nucleus.genomics.v1.VcfHeader proto with the known fixed headers and
      the given samples and contigs populated.
    """
    reserved_format = vcf_constants.reserved_format_field

    # INFO: END is always declared; CANDIDATES is appended only on request.
    infos = [vcf_constants.reserved_info_field('END')]
    if add_info_candidates:
        candidates_info = variants_pb2.VcfInfo(
            id='CANDIDATES',
            number='1',
            type=vcf_constants.STRING_TYPE,
            description='pipe-delimited candidate alleles.',
        )
        infos.append(candidates_info)

    # FORMAT: DeepVariant-specific fields are interleaved with the reserved
    # ones. The list order below is the order in which they are declared.
    min_dp_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_MIN_DP_FORMAT,
        number='1',
        type='Integer',
        description='Minimum DP observed within the GVCF block.',
    )
    vaf_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_VAF_FORMAT,
        number='A',
        type='Float',
        description='Variant allele fractions.',
    )
    format_fields = [
        reserved_format('GT'),
        reserved_format('GQ'),
        reserved_format('DP'),
        min_dp_format,
        reserved_format('AD'),
        vaf_format,
        reserved_format('PL'),
    ]
    if include_med_dp:
        # MED_DP, when enabled, always comes after the fixed FORMAT fields.
        med_dp_format = variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_MED_DP_FORMAT,
            number='1',
            type='Integer',
            description=(
                'Median DP observed within the GVCF block '
                'rounded to the nearest integer.'
            ),
        )
        format_fields.append(med_dp_format)

    # FILTER: the reserved PASS entry followed by DeepVariant's own filters.
    filter_fields = [
        vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_REF_FILTER,
            description='Genotyping model thinks this site is reference.',
        ),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_QUAL_FILTER,
            description=(
                'Confidence in this variant being real is below '
                'calling threshold.'
            ),
        ),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_NO_CALL,
            description='Site has depth=0 resulting in no call.',
        ),
    ]

    # Record the DeepVariant release as an extra header line.
    version_extra = variants_pb2.VcfExtra(
        key='DeepVariant_version', value=DEEP_VARIANT_VERSION
    )

    return variants_pb2.VcfHeader(
        fileformat='VCFv4.2',
        filters=filter_fields,
        infos=infos,
        formats=format_fields,
        contigs=contigs,
        sample_names=sample_names,
        extras=[version_extra],
    )