// Copyright 2018 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
//    contributors may be used to endorse or promote products derived from this
//    software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";

package nucleus.genomics.v1;

import "third_party/nucleus/protos/reference.proto";
import "third_party/nucleus/protos/struct.proto";

// A variant represents a change in DNA sequence relative to a reference
// sequence. For example, a variant could represent a SNP or an insertion.
//
// The definition of the Variant message closely follows the common VCF variant
// representation.
//
// Each of the calls on a variant represents a determination of genotype with
// respect to that variant. For example, a call might assign a probability of
// 0.32 to the occurrence of a SNP named rs1234 in a sample named NA12345.
// NextID: 17
message Variant {
  reserved 1, 4, 5;

  // The reference on which this variant occurs (such as `chr20` or `X`).
  // Corresponds to the "CHROM" field of VCF 4.3.
  string reference_name = 14;

  // The position at which this variant occurs (0-based inclusive).
  // This corresponds to the first base of the string of reference bases.
  int64 start = 16;

  // The end position (0-based exclusive) of this variant. This corresponds to
  // the first base after the last base in the reference allele. So, the length
  // of the reference allele is (end - start). This is useful for variants
  // that don't explicitly give alternate bases, for example large deletions.
  int64 end = 13;

  // Names for the variant, for example a dbSNP ID.
  // Corresponds to the "ID" field of VCF 4.3.
  repeated string names = 3;

  // The reference bases for this variant. They start at the given
  // position.
  string reference_bases = 6;

  // The bases that appear instead of the reference bases.
  repeated string alternate_bases = 7;

  // A measure of how likely this variant is to be real.
  // A higher value is better.
  // Since this is a Phred-scaled probability (i.e. is -10 * log_10(p) for some
  // p, which depends on whether this is a variant or non-variant site) it is
  // guaranteed to be non-negative. We use -1 to represent the `unset` value.
  double quality = 8;

  // A list of filters (normally quality filters) this variant has failed.
  // `PASS` indicates this variant has passed all filters.
  repeated string filter = 9;

  // A map of additional variant information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, ListValue> info = 10;

  // The variant calls for this particular variant. Each one represents the
  // determination of genotype with respect to this variant.
  repeated VariantCall calls = 11;

  /////////////////////////////////////////////////////////////////////////
  // DEPRECATED or unused fields of the Variant proto below.
  // These are relics of the Google Genomics API and/or are used to support
  // GA4GH specs.

  // The ID of the variant set this variant belongs to.
  // DEPRECATED.
  string variant_set_id = 15 [deprecated = true];

  // The server-generated variant ID, unique across all variants.
  // DEPRECATED.
  string id = 2 [deprecated = true];

  // The date this variant was created, in milliseconds from the epoch.
  // (-- GA4GH also specifies an "updated" timestamp. --)
  // DEPRECATED.
  int64 created = 12 [deprecated = true];
}

// A call represents the determination of genotype with respect to a particular
// variant. It may include associated information such as quality and phasing.
// For example, a call might assign a probability of 0.32 to the occurrence of
// a SNP named rs1234 in a call set with the name NA12345.
// NextID: 11
message VariantCall {
  reserved 1, 3, 4;

  // The name of the call set this variant call belongs to. Also known as
  // "sample".
  string call_set_name = 9;

  // The genotype of this variant call. Each value represents either the value
  // of the `reference_bases` field or a 1-based index into `alternate_bases`.
  // If a variant had a `reference_bases` value of `T` and an `alternate_bases`
  // value of `["A", "C"]`, and the `genotype` was `[2, 1]`, that would mean the
  // call represented the heterozygous value `CA` for this variant. If the
  // `genotype` was instead `[0, 1]`, the represented value would be `TA`.
  // Ordering of the genotype values is important if the `phaseset` is present
  // ('PS' field in the call.info map). Uncalled genotypes (represented as `.`
  // in the GT string) are represented by -1 in this array.
  repeated int32 genotype = 7;

  // If true, this variant call's genotype ordering implies the phase of the
  // bases and is consistent with any other variant calls in the same reference
  // sequence which have the same phaseset value (the integer 'PS' field in the
  // call.info map). If this is true but the 'PS' field is not set, the call is
  // assumed to be phased with all other calls for which the same state applies.
  bool is_phased = 10;

  // DEPRECATED. This previously was used as a special-cased field for capturing
  // phasing information. This field should no longer be set, in favor of using
  // the 'PS' field in the call.info map and the `is_phased` boolean attribute.
  string phaseset = 5 [deprecated = true];

  // The genotype log10-likelihoods for this variant call. Each array entry
  // represents how likely a specific genotype is for this call. The value
  // ordering is defined by the GL tag in the VCF spec.
  // If Phred-scaled genotype likelihood scores (PL) are available and
  // log10(P) genotype likelihood scores (GL) are not, PL scores are converted
  // to GL scores.  If both are available, the GL scores are stored here and
  // PL scores are omitted (as they are just a lower-resolution representation
  // of GL scores).
  repeated double genotype_likelihood = 6;

  // A map of additional variant call information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, ListValue> info = 2;

  /////////////////////////////////////////////////////////////////////////
  // DEPRECATED or unused fields of the VariantCall proto below.
  // These are relics of the Google Genomics API and/or are used to support
  // GA4GH specs.

  // The ID of the call set this variant call belongs to.
  string call_set_id = 8;
}

// This record type mirrors a VCF header. See
// https://samtools.github.io/hts-specs/VCFv4.3.pdf for details on the spec.
message VcfHeader {
  // The required first line of the VCF. Values e.g. "VCFv4.3".
  string fileformat = 1;

  // The list of contigs used to produce this VCF. All variants within the VCF
  // must lie on a contig represented here. All contigs must have distinct IDs.
  repeated ContigInfo contigs = 2;

  // A list of all filters used to produce this VCF. All variants within the VCF
  // must only use filters represented here. All filters must have distinct IDs.
  repeated VcfFilterInfo filters = 3;

  // A list of all info tags used to annotate variants within the VCF. All info
  // fields present in Variants must only use infos with IDs represented here.
  // All infos must have distinct IDs.
  repeated VcfInfo infos = 4;

  // A list of all format fields used to annotate genotypes within the VCF. All
  // fields present in VariantCalls must only use formats with IDs represented
  // here. All formats must have distinct IDs.
  repeated VcfFormatInfo formats = 5;

  // An ordered list of all the sample names present in the VCF. All Variants
  // within the VCF must contain `len(sample_names)` VariantCalls and must be
  // in the same order. I.e. for any Variant v,
  // v.calls[i].call_set_name == sample_names[i] for all i.
  repeated string sample_names = 6;

  // A list of all header lines that are not represented by one of the fields
  // above, stored in a key=value format. The key within the extras may be
  // duplicated.
  repeated VcfStructuredExtra structured_extras = 8;

  // A list of all header lines that are not represented by one of the fields
  // above, stored in an unstructured format. The key within the extras may be
  // duplicated.
  repeated VcfExtra extras = 7;
}

// The below messages are sub-messages of the VCF header. They are not nested
// within VcfHeader simply to avoid verbosity.
//
// We comment fields in one of three states:
// "Required": Required by both the VCF file format and for downstream users of
//             Variant and VariantCall protos.
// "Required by VCF": Required by the VCF file format, unused otherwise.
// "Optional": Optional within the VCF file format, unused otherwise.

// This record type mirrors a VCF "FILTER" header.
message VcfFilterInfo {
  // Required. The unique ID of the filter. Examples include "PASS", "RefCall".
  string id = 1;

  // Required by VCF. The description of the filter.
  string description = 2;
}

// This message type mirrors a VCF "INFO" header.
message VcfInfo {
  // Required. The unique ID of the INFO field. Examples include "MQ0" or "END".
  string id = 1;

  // Required. The number of values included with the info field. This should be
  // the string representation of the number, e.g. "1" for a single entry, "2"
  // for a pair of entries, etc. Special cases arise when the number of entries
  // depends on attributes of the Variant or is unknown in advance, and include:
  // "A": The field has one value per alternate allele.
  // "R": The field has one value per allele (including the reference).
  // "G": The field has one value for each possible genotype.
  // ".": The number of values varies, is unknown, or is unbounded.
  string number = 2;

  // Required. The type of the INFO field. Valid values are "Integer", "Float",
  // "Flag", "Character", and "String".
  string type = 3;

  // Required by VCF. The description of the field.
  string description = 4;

  // Optional. The annotation source used to generate the field.
  string source = 5;

  // Optional. The version of the annotation source used to generate the field.
  string version = 6;
}

// This record type mirrors a VCF "FORMAT" header.
message VcfFormatInfo {
  // Required. The unique ID of the FORMAT field. Examples include "GT", "PL".
  string id = 1;

  // Required. The number of entries expected. See the description of the
  // `number` field in the VcfInfo message.
  string number = 2;

  // Required. The type of the field. Valid values are "Integer", "Float",
  // "Character", and "String" (same as INFO except "Flag" is not supported).
  string type = 3;

  // Required by VCF. The description of the field.
  string description = 4;
}

// This record type is a catch-all for other headers containing multiple
// key-value pairs. For example, headers may have META lines that provide
// metadata about the VCF as a whole, e.g.
// ##META=<ID=Assay,Type=String,Number=.,Values=[WholeGenome, Exome]>
// The VcfStructuredExtra message would represent this with key="META",
// and fields mapping "ID" -> "Assay", "Type" -> "String", etc.
message VcfStructuredExtra {
  // Required by VCF. The key of the extra header field. Note that this key does
  // not have to be unique within a VcfHeader.
  string key = 1;

  // Required by VCF. The key=value pairs contained in the structure.
  repeated VcfExtra fields = 2;
}

// This record type is a catch-all for other types of headers. For example,
// ##pedigreeDB=http://url_of_pedigrees
// The VcfExtra message would represent this with key="pedigreeDB",
// value="http://url_of_pedigrees".
message VcfExtra {
  // Required by VCF. The key of the extra header field. Note that this key does
  // not have to be unique within a VcfHeader.
  string key = 1;

  // Required by VCF. The value of the extra header field.
  string value = 2;
}

///////////////////////////////////////////////////////////////////////////////
// I/O-related messages.
///////////////////////////////////////////////////////////////////////////////

// The Vcf{Reader,Writer}Options messages are used to alter the properties of
// reading and writing variants. They enable certain fields to be omitted from
// parsing.
message VcfReaderOptions {
  reserved 1, 2;

  // A list of all INFO field IDs that should be excluded from parsing.
  repeated string excluded_info_fields = 3;

  // A list of all FORMAT field IDs that should be excluded from parsing.
  repeated string excluded_format_fields = 4;

  // If true, the GL and PL format tags are stored in the VariantCall.info map
  // with the type and number as specified in the VCF header, similar to other
  // FORMAT fields. Otherwise, the GL and PL tags are special-cased and
  // available in the VariantCall.genotype_likelihood field, with the
  // enforcement that each is of type=Float and Number=G.
  bool store_gl_and_pl_in_info_map = 5;
}

// Options that alter how variants are written to a VCF. They allow certain
// fields, and the header itself, to be omitted from the output.
message VcfWriterOptions {
  reserved 1 to 5;

  // A list of all INFO field IDs that should be excluded from writing.
  repeated string excluded_info_fields = 7;

  // A list of all FORMAT field IDs that should be excluded from writing.
  repeated string excluded_format_fields = 8;

  // If true, QUAL field values are rounded to one point past the decimal.
  bool round_qual_values = 6;

  // If true, the GL and PL format tags are written from the VariantCall.info
  // map with the type and number as specified in the VCF header. In this case,
  // any values set in the VariantCall.genotype_likelihood field are ignored.
  // Otherwise, the GL and PL tags are retrieved from the
  // VariantCall.genotype_likelihood field, with the enforcement that each is of
  // type=Float and Number=G, and neither GL nor PL should be present in the
  // VariantCall.info map.
  bool retrieve_gl_and_pl_from_info_map = 9;

  // If true, the writer will skip writing the VcfHeader.
  bool exclude_header = 10;
}