|
a |
|
b/third_party/nucleus/protos/variants.proto |
|
|
1 |
// Copyright 2018 Google LLC. |
|
|
2 |
// |
|
|
3 |
// Redistribution and use in source and binary forms, with or without |
|
|
4 |
// modification, are permitted provided that the following conditions |
|
|
5 |
// are met: |
|
|
6 |
// |
|
|
7 |
// 1. Redistributions of source code must retain the above copyright notice, |
|
|
8 |
// this list of conditions and the following disclaimer. |
|
|
9 |
// |
|
|
10 |
// 2. Redistributions in binary form must reproduce the above copyright |
|
|
11 |
// notice, this list of conditions and the following disclaimer in the |
|
|
12 |
// documentation and/or other materials provided with the distribution. |
|
|
13 |
// |
|
|
14 |
// 3. Neither the name of the copyright holder nor the names of its |
|
|
15 |
// contributors may be used to endorse or promote products derived from this |
|
|
16 |
// software without specific prior written permission. |
|
|
17 |
// |
|
|
18 |
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
|
19 |
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
|
20 |
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
|
21 |
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
|
|
22 |
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
|
23 |
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
|
24 |
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
|
25 |
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
|
26 |
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
|
27 |
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
|
28 |
// POSSIBILITY OF SUCH DAMAGE. |
|
|
29 |
syntax = "proto3"; |
|
|
30 |
|
|
|
31 |
package nucleus.genomics.v1; |
|
|
32 |
|
|
|
33 |
import "third_party/nucleus/protos/reference.proto"; |
|
|
34 |
import "third_party/nucleus/protos/struct.proto"; |
|
|
35 |
|
|
|
36 |
// Describes a change in DNA sequence relative to a reference sequence, such
// as a SNP or an insertion.
//
// The layout of this message closely follows the common VCF variant
// representation.
//
// Every entry in `calls` is a determination of genotype with respect to this
// variant. For example, a call might assign probability of 0.32 to the
// occurrence of a SNP named rs1234 in a sample named NA12345.
// NextID: 17
message Variant {
  reserved 1, 4, 5;

  // Name of the reference on which this variant occurs (such as `chr20` or
  // `X`). Corresponds to the "CHROM" field of VCF 4.3.
  string reference_name = 14;

  // Position at which this variant occurs (0-based inclusive). This is the
  // position of the first base of the string of reference bases.
  int64 start = 16;

  // End position (0-based exclusive) of this variant, i.e. the first base
  // after the last base in the reference allele. The length of the reference
  // allele is therefore (end - start). Useful for variants that do not
  // explicitly give alternate bases, for example large deletions.
  int64 end = 13;

  // Names for the variant, for example a dbSNP ID.
  // Corresponds to the "ID" field of VCF 4.3.
  repeated string names = 3;

  // Reference bases for this variant, beginning at position `start`.
  string reference_bases = 6;

  // Bases that appear in place of the reference bases.
  repeated string alternate_bases = 7;

  // Measure of how likely this variant is to be real; higher is better.
  // The value is a Phred-scaled probability (i.e. -10 * log_10(p) for some p,
  // which depends on whether this is a variant or non-variant site), so it is
  // guaranteed to be non-negative. -1 represents the `unset` value.
  double quality = 8;

  // Filters (normally quality filters) that this variant has failed.
  // `PASS` indicates this variant has passed all filters.
  repeated string filter = 9;

  // Additional variant information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, ListValue> info = 10;

  // Variant calls for this particular variant. Each one represents the
  // determination of genotype with respect to this variant.
  repeated VariantCall calls = 11;

  /////////////////////////////////////////////////////////////////////////
  // DEPRECATED or unused fields of the Variant proto below.
  // These are relics of the Google Genomics API and/or are used to support
  // GA4GH specs.
  //
  // ID of the variant set this variant belongs to.
  // DEPRECATED.
  string variant_set_id = 15;

  // Server-generated variant ID, unique across all variants.
  // DEPRECATED.
  string id = 2;

  // Date this variant was created, in milliseconds from the epoch.
  // (-- GA4GH also specifies an "updated" timestamp. --)
  // DEPRECATED.
  int64 created = 12;
}
|
|
112 |
|
|
|
113 |
// A call represents the determination of genotype with respect to a particular
// variant. It may include associated information such as quality and phasing.
// For example, a call might assign a probability of 0.32 to the occurrence of
// a SNP named rs1234 in a call set with the name NA12345.
// NextID: 11
message VariantCall {
  reserved 1, 3, 4;

  // The name of the call set this variant call belongs to. Also known as
  // "sample".
  string call_set_name = 9;

  // The genotype of this variant call. Each value represents either the value
  // of the variant's `reference_bases` field (0, as in the example below) or a
  // 1-based index into its `alternate_bases`. If a variant had a
  // `reference_bases` value of `T` and an `alternate_bases` value of
  // `["A", "C"]`, and the `genotype` was `[2, 1]`, that would mean the call
  // represented the heterozygous value `CA` for this variant. If the
  // `genotype` was instead `[0, 1]`, the represented value would be `TA`.
  // Ordering of the genotype values is important if the phaseset is present
  // ('PS' field in the call.info map). Uncalled genotypes (represented as `.`
  // in the GT string) are represented by -1 in this array.
  repeated int32 genotype = 7;

  // If true, this variant call's genotype ordering implies the phase of the
  // bases and is consistent with any other variant calls in the same reference
  // sequence which have the same phaseset value (the integer 'PS' field in the
  // call.info map). If this is true but the 'PS' field is not set, the call is
  // assumed to be phased with all other calls for which the same state applies.
  bool is_phased = 10;

  // DEPRECATED. This previously was used as a special-cased field for capturing
  // phasing information. This field should no longer be set, in favor of using
  // the 'PS' field in the call.info map and the `is_phased` boolean attribute.
  string phaseset = 5;  // DEPRECATED.

  // The genotype log10-likelihoods for this variant call. Each array entry
  // represents how likely a specific genotype is for this call. The value
  // ordering is defined by the GL tag in the VCF spec.
  // If Phred-scaled genotype likelihood scores (PL) are available and
  // log10(P) genotype likelihood scores (GL) are not, PL scores are converted
  // to GL scores. If both are available, the GL scores are stored here and
  // PL scores are omitted (as they are just a lower-resolution representation
  // of GL scores).
  repeated double genotype_likelihood = 6;

  // A map of additional variant call information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, ListValue> info = 2;

  /////////////////////////////////////////////////////////////////////////
  // DEPRECATED or unused fields of the VariantCall proto below.
  // These are relics of the Google Genomics API and/or are used to support
  // GA4GH specs.
  //
  // The ID of the call set this variant call belongs to.
  string call_set_id = 8;
}
|
|
170 |
|
|
|
171 |
// Mirrors a VCF header. The spec is described in detail at
// https://samtools.github.io/hts-specs/VCFv4.3.pdf.
message VcfHeader {
  // Required first line of the VCF, e.g. "VCFv4.3".
  string fileformat = 1;

  // Contigs used to produce this VCF. Every variant within the VCF must lie
  // on a contig listed here, and all contigs must have distinct IDs.
  repeated ContigInfo contigs = 2;

  // Filters used to produce this VCF. Variants within the VCF must only use
  // filters listed here, and all filters must have distinct IDs.
  repeated VcfFilterInfo filters = 3;

  // Info tags used to annotate variants within the VCF. Info fields present
  // in Variants must only use infos whose IDs are listed here, and all infos
  // must have distinct IDs.
  repeated VcfInfo infos = 4;

  // Format fields used to annotate genotypes within the VCF. Fields present
  // in VariantCalls must only use formats whose IDs are listed here, and all
  // formats must have distinct IDs.
  repeated VcfFormatInfo formats = 5;

  // Ordered list of all the sample names present in the VCF. Every Variant
  // within the VCF must contain `len(sample_names)` VariantCalls, in the same
  // order. I.e. for any Variant v,
  //   v.calls[i].call_set_name == sample_names[i] for all i.
  repeated string sample_names = 6;

  // Header lines not covered by the fields above, represented in a key=value
  // format. The key within the extras may be duplicated.
  repeated VcfStructuredExtra structured_extras = 8;

  // Header lines not covered by the fields above, represented in an
  // unstructured format. The key within the extras may be duplicated.
  repeated VcfExtra extras = 7;
}
|
|
209 |
|
|
|
210 |
// The below messages are sub-messages of the VCF header. They are not nested |
|
|
211 |
// within VcfHeader simply to avoid verbosity. |
|
|
212 |
// |
|
|
213 |
// We comment fields in one of three states: |
|
|
214 |
// "Required": Required by both the VCF file format and for downstream users of |
|
|
215 |
// Variant and VariantCall protos. |
|
|
216 |
// "Required by VCF": Required by the VCF file format, unused otherwise. |
|
|
217 |
// "Optional": Optional within the VCF file format, unused otherwise. |
|
|
218 |
// |
|
|
219 |
// Mirrors a VCF "FILTER" header line.
message VcfFilterInfo {
  // Required. Unique ID of the filter, e.g. "PASS" or "RefCall".
  string id = 1;

  // Required by VCF. Description of the filter.
  string description = 2;
}
|
|
227 |
|
|
|
228 |
// Mirrors a VCF "INFO" header line.
message VcfInfo {
  // Required. Unique ID of the INFO field, e.g. "MQ0" or "END".
  string id = 1;

  // Required. Number of values included with the info field, given as the
  // string representation of the number: "1" for a single entry, "2" for a
  // pair of entries, etc. When the number of entries depends on attributes of
  // the Variant or is unknown in advance, one of these special values is used:
  //   "A": The field has one value per alternate allele.
  //   "R": The field has one value per allele (including the reference).
  //   "G": The field has one value for each possible genotype.
  //   ".": The number of values varies, is unknown, or is unbounded.
  string number = 2;

  // Required. Type of the INFO field. Valid values are "Integer", "Float",
  // "Flag", "Character", and "String".
  string type = 3;

  // Required by VCF. Description of the field.
  string description = 4;

  // Optional. Annotation source used to generate the field.
  string source = 5;

  // Optional. Version of the annotation source used to generate the field.
  string version = 6;
}
|
|
256 |
|
|
|
257 |
// Mirrors a VCF "FORMAT" header line.
message VcfFormatInfo {
  // Required. Unique ID of the FORMAT field, e.g. "GT" or "PL".
  string id = 1;

  // Required. Number of entries expected. The accepted values are described
  // on the `number` field of the VcfInfo message.
  string number = 2;

  // Required. Type of the field. Valid values are "Integer", "Float",
  // "Character", and "String" (same as INFO except "Flag" is not supported).
  string type = 3;

  // Required by VCF. Description of the field.
  string description = 4;
}
|
|
273 |
|
|
|
274 |
// Catch-all for other headers that contain multiple key-value pairs. For
// example, headers may have META lines that provide metadata about the VCF as
// a whole, e.g.
//   ##META=<ID=Assay,Type=String,Number=.,Values=[WholeGenome, Exome]>
// A VcfStructuredExtra message would represent this with key="META", and
// fields mapping "ID" -> "Assay", "Type" -> "String", etc.
message VcfStructuredExtra {
  // Required by VCF. Key of the extra header field. This key does not have to
  // be unique within a VcfHeader.
  string key = 1;

  // Required by VCF. The key=value pairs contained in the structure.
  repeated VcfExtra fields = 2;
}
|
|
288 |
|
|
|
289 |
// Catch-all for other types of headers. For example,
//   ##pedigreeDB=http://url_of_pedigrees
// A VcfExtra message would represent this with key="pedigreeDB",
// value="http://url_of_pedigrees".
message VcfExtra {
  // Required by VCF. Key of the extra header field. This key does not have to
  // be unique within a VcfHeader.
  string key = 1;

  // Required by VCF. Value of the extra header field.
  string value = 2;
}
|
|
301 |
|
|
|
302 |
/////////////////////////////////////////////////////////////////////////////// |
|
|
303 |
// I/O-related messages. |
|
|
304 |
/////////////////////////////////////////////////////////////////////////////// |
|
|
305 |
|
|
|
306 |
// The Vcf{Reader,Writer}Options messages are used to alter the properties of
// reading and writing variants. They enable certain fields to be omitted from
// parsing.
message VcfReaderOptions {
  reserved 1, 2;

  // IDs of all INFO fields that should be excluded from parsing.
  repeated string excluded_info_fields = 3;

  // IDs of all FORMAT fields that should be excluded from parsing.
  repeated string excluded_format_fields = 4;

  // If true, the GL and PL format tags are stored in the VariantCall.info map
  // with the type and number as specified in the VCF header, like other FORMAT
  // fields. Otherwise, the GL and PL tags are special-cased and made available
  // in the VariantCall.genotype_likelihood field, with the enforcement that
  // each is of type=Float and Number=G.
  bool store_gl_and_pl_in_info_map = 5;
}
|
|
325 |
|
|
|
326 |
// Options that alter the properties of writing variants, such as which INFO
// and FORMAT fields are excluded from writing.
message VcfWriterOptions {
  reserved 1, 2, 3, 4, 5;

  // IDs of all INFO fields that should be excluded from writing.
  repeated string excluded_info_fields = 7;

  // IDs of all FORMAT fields that should be excluded from writing.
  repeated string excluded_format_fields = 8;

  // Whether QUAL field values should be rounded to one point past the decimal.
  bool round_qual_values = 6;

  // If true, the GL and PL format tags are written from the VariantCall.info
  // map with the type and number as specified in the VCF header. In this case,
  // any values set in the VariantCall.genotype_likelihood field are ignored.
  // Otherwise, the GL and PL tags are retrieved from the
  // VariantCall.genotype_likelihood field, with the enforcement that each is
  // of type=Float and Number=G, and neither GL nor PL should be present in the
  // VariantCall.info map.
  bool retrieve_gl_and_pl_from_info_map = 9;

  // If true, the writer will skip writing the VcfHeader.
  bool exclude_header = 10;
}