Switch to unified view

a b/third_party/nucleus/protos/reads.proto
1
// Copyright 2018 Google LLC.
2
//
3
// Redistribution and use in source and binary forms, with or without
4
// modification, are permitted provided that the following conditions
5
// are met:
6
//
7
// 1. Redistributions of source code must retain the above copyright notice,
8
//    this list of conditions and the following disclaimer.
9
//
10
// 2. Redistributions in binary form must reproduce the above copyright
11
//    notice, this list of conditions and the following disclaimer in the
12
//    documentation and/or other materials provided with the distribution.
13
//
14
// 3. Neither the name of the copyright holder nor the names of its
15
//    contributors may be used to endorse or promote products derived from this
16
//    software without specific prior written permission.
17
//
18
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
// POSSIBILITY OF SUCH DAMAGE.
29
syntax = "proto3";

package nucleus.genomics.v1;

import "third_party/nucleus/protos/cigar.proto";
import "third_party/nucleus/protos/position.proto";
import "third_party/nucleus/protos/reference.proto";
import "third_party/nucleus/protos/struct.proto";
37
38
// A linear alignment can be represented by one CIGAR string. Describes the
// mapped position and local alignment of the read to the reference.
message LinearAlignment {
  // The position of this alignment.
  Position position = 1;

  // The mapping quality of this alignment. Represents how likely
  // the read maps to this position as opposed to other locations.
  //
  // Specifically, this is -10 * log10(Pr(mapping position is wrong)), rounded
  // to the nearest integer.
  int32 mapping_quality = 2;

  // Represents the local alignment of this sequence (alignment matches,
  // indels, etc.) against the reference.
  repeated CigarUnit cigar = 3;
}
55
56
// A read alignment describes a linear alignment of a string of DNA to a
57
// [reference sequence][learning.genomics.v1.Reference], in addition to metadata
58
// about the fragment (the molecule of DNA sequenced) and the read (the bases
59
// which were read by the sequencer). A read is equivalent to a line in a SAM
60
// file. A read belongs to exactly one read group and exactly one
61
// [read group set][learning.genomics.v1.ReadGroupSet].
62
//
63
// For more genomics resource definitions, see [Fundamentals of Google
64
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
65
//
66
// ### Reverse-stranded reads
67
//
68
// Mapped reads (reads having a non-null `alignment`) can be aligned to either
69
// the forward or the reverse strand of their associated reference. Strandedness
70
// of a mapped read is encoded by `alignment.position.reverseStrand`.
71
//
72
// If we consider the reference to be a forward-stranded coordinate space of
73
// `[0, reference.length)` with `0` as the left-most position and
74
// `reference.length` as the right-most position, reads are always aligned left
75
// to right. That is, `alignment.position.position` always refers to the
76
// left-most reference coordinate and `alignment.cigar` describes the alignment
77
// of this read to the reference from left to right. All per-base fields such as
78
// `alignedSequence` and `alignedQuality` share this same left-to-right
79
// orientation; this is true of reads which are aligned to either strand. For
80
// reverse-stranded reads, this means that `alignedSequence` is the reverse
81
// complement of the bases that were originally reported by the sequencing
82
// machine.
83
//
84
// ### Generating a reference-aligned sequence string
85
//
86
// When interacting with mapped reads, it's often useful to produce a string
87
// representing the local alignment of the read to reference. The following
88
// pseudocode demonstrates one way of doing this:
89
//
90
//     out = ""
91
//     offset = 0
92
//     for c in read.alignment.cigar {
93
//       switch c.operation {
94
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
95
//         out += read.alignedSequence[offset:offset+c.operationLength]
96
//         offset += c.operationLength
97
//         break
98
//       case "CLIP_SOFT", "INSERT":
99
//         offset += c.operationLength
100
//         break
101
//       case "PAD":
102
//         out += repeat("*", c.operationLength)
103
//         break
104
//       case "DELETE":
105
//         out += repeat("-", c.operationLength)
106
//         break
107
//       case "SKIP":
108
//         out += repeat(" ", c.operationLength)
109
//         break
110
//       case "CLIP_HARD":
111
//         break
112
//       }
113
//     }
114
//     return out
115
//
116
// ### Converting to SAM's CIGAR string
117
//
118
// The following pseudocode generates a SAM CIGAR string from the
119
// `cigar` field. Note that this is a lossy conversion
120
// (`cigar.referenceSequence` is lost).
121
//
122
//     cigarMap = {
123
//       "ALIGNMENT_MATCH": "M",
124
//       "INSERT": "I",
125
//       "DELETE": "D",
126
//       "SKIP": "N",
127
//       "CLIP_SOFT": "S",
128
//       "CLIP_HARD": "H",
129
//       "PAD": "P",
130
//       "SEQUENCE_MATCH": "=",
131
//       "SEQUENCE_MISMATCH": "X",
132
//     }
133
//     cigarStr = ""
134
//     for c in read.alignment.cigar {
135
//       cigarStr += c.operationLength + cigarMap[c.operation]
136
//     }
137
//     return cigarStr
138
//
139
// (== resource_for v1.reads ==)
140
message Read {
  // The server-generated read ID, unique across all reads. This is different
  // from the `fragmentName`.
  string id = 1;

  // The ID of the read group this read belongs to. A read belongs to exactly
  // one read group. This is a server-generated ID which is distinct from SAM's
  // RG tag (for that value, see
  // [ReadGroup.name][nucleus.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // The ID of the read group set this read belongs to. A read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // The fragment name. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // The orientation and the distance between reads from the fragment are
  // consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // The fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // The observed length of the fragment, equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // The read number in sequencing. 0-based and less than numberReads. This
  // field replaces SAM flag 0x40 and 0x80.
  int32 read_number = 8;

  // The number of reads in the fragment (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Whether this read did not pass filters, such as platform or vendor quality
  // controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // The linear alignment for this alignment record. This field is null for
  // unmapped reads.
  LinearAlignment alignment = 11;

  // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
  // A secondary alignment represents an alternative to the primary alignment
  // for this read. Aligners may return secondary alignments if a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each read
  // has one and only one alignment where both `secondaryAlignment`
  // and `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
  // Supplementary alignments are used in the representation of a chimeric
  // alignment. In a chimeric alignment, a read is split into multiple
  // linear alignments that map to different reference contigs. The first
  // linear alignment in the read will be designated as the representative
  // alignment; the remaining linear alignments will be designated as
  // supplementary alignments. These alignments may have different mapping
  // quality scores. In each linear alignment in a chimeric alignment, the read
  // will be hard clipped. The `alignedSequence` and
  // `alignedQuality` fields in the alignment record will only
  // represent the bases for its respective linear alignment.
  bool supplementary_alignment = 13;

  // The bases of the read sequence contained in this alignment record,
  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
  // `alignedSequence` and `alignedQuality` may be
  // shorter than the full read sequence and quality. This will occur if the
  // alignment is part of a chimeric alignment, or if the read was trimmed. When
  // this occurs, the CIGAR for this read will begin/end with a hard clip
  // operator that will indicate the length of the excised sequence.
  string aligned_sequence = 14;

  // The quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM). Optionally can be read from OQ tag. See
  // `SamReaderOptions` proto for more details.
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
  // for this read will begin/end with a hard clip operator that will indicate
  // the length of the excised sequence.
  repeated int32 aligned_quality = 15;

  // The mapping of the primary alignment of the
  // `(readNumber+1)%numberReads` read in the fragment. It replaces
  // mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // A map of additional read alignment information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, ListValue> info = 17;
}
232
233
// The SamHeader message represents the metadata present in the header of a
// SAM/BAM file.
message SamHeader {
  // The VN field from the @HD line. Empty if not present (valid formats
  // will match /^[0-9]+\.[0-9]+$/).
  string format_version = 1;

  // The SO field from the @HD line.
  enum SortingOrder {
    UNKNOWN = 0;
    UNSORTED = 1;
    QUERYNAME = 2;
    COORDINATE = 3;
  }
  SortingOrder sorting_order = 2;

  // The GO field from the @HD line.
  enum AlignmentGrouping {
    NONE = 0;
    QUERY = 1;
    REFERENCE = 2;
  }
  AlignmentGrouping alignment_grouping = 3;

  // @SQ header field in SAM spec.
  // The order of the contigs defines the sorting order.
  repeated nucleus.genomics.v1.ContigInfo contigs = 4;

  // @RG header field in SAM spec.
  // Read groups.
  repeated ReadGroup read_groups = 5;

  // @PG header field in SAM spec.
  // A program run to generate the alignment data.
  repeated Program programs = 6;

  // @CO header field in SAM spec.
  // One-line text comments.
  repeated string comments = 7;
}
273
274
// A read group is all the data that's processed the same way by the sequencer.
// This is a sub-message of SamHeader, at the same scope to reduce verbosity.
message ReadGroup {
  // @RG ID field in SAM spec.
  // The read group name.
  string name = 1;

  // @RG CN field in SAM spec.
  // The name of the sequencing center producing the read.
  string sequencing_center = 2;

  // @RG DS field in SAM spec.
  // A free-form text description of this read group.
  string description = 3;

  // @RG DT field in SAM spec.
  string date = 4;

  // @RG FO field in SAM spec.
  string flow_order = 5;

  // @RG KS field in SAM spec.
  string key_sequence = 6;

  // @RG LB field in SAM spec.
  // A library is a collection of DNA fragments which have been prepared for
  // sequencing from a sample. This field is important for quality control as
  // error or bias can be introduced during sample preparation.
  string library_id = 7;

  // @RG PG field in SAM spec.
  repeated string program_ids = 8;

  // @RG PI field in SAM spec.
  // The predicted insert size of this read group. The insert size is the length
  // of the sequenced DNA fragment from end-to-end, not including the adapters.
  int32 predicted_insert_size = 9;

  // @RG PL field in SAM spec.
  // The platform/technology used to produce the reads.
  string platform = 10;

  // @RG PM field in SAM spec.
  // The platform model used as part of this run.
  string platform_model = 11;

  // @RG PU field in SAM spec.
  // The platform unit used as part of this experiment, for example
  // flowcell-barcode.lane for Illumina or slide for SOLiD. A unique identifier.
  string platform_unit = 12;

  // @RG SM field in SAM spec.
  // A client-supplied sample identifier for the reads in this read group.
  string sample_id = 13;
}
329
330
// A Program is used in the SAM header to track how alignment data is generated.
// This is a sub-message of SamHeader, at the same scope to reduce verbosity.
//
// Note: the field numbers below are not in declaration order. Field numbers
// identify fields on the wire, so they must not be renumbered.
message Program {
  // @PG ID field in SAM spec.
  // The locally unique ID of the program. Used along with
  // `prev_program_id` to define an ordering between programs.
  string id = 2;

  // @PG PN field in SAM spec.
  // The display name of the program. This is typically the colloquial name of
  // the tool used, for example 'bwa' or 'picard'.
  string name = 3;

  // @PG CL field in SAM spec.
  // The command line used to run this program.
  string command_line = 1;

  // @PG PP field in SAM spec.
  // The ID of the program run before this one.
  string prev_program_id = 4;

  // @PG DS field in SAM spec.
  // The description of the program.
  string description = 6;

  // @PG VN field in SAM spec.
  // The version of the program run.
  string version = 5;
}
359
360
///////////////////////////////////////////////////////////////////////////////
361
// I/O-related messages.
362
///////////////////////////////////////////////////////////////////////////////
363
364
// The SamReaderOptions message is used to alter the properties of a SamReader.
// It enables reads to be omitted from parsing based on their attributes, as
// well as more fine-grained handling of particular fields within the SAM
// records.
// Next ID: 12.
//
// NOTE(review): field numbers 2 and 7-9 are not used by any field below. If
// they belonged to removed fields they should be declared `reserved` so they
// cannot be reused -- confirm against the file history.
message SamReaderOptions {
  // Read requirements that must be satisfied before our reader will return
  // a read to use.
  ReadRequirements read_requirements = 1;

  // How should we handle the aux fields in the SAM record?
  enum AuxFieldHandling {
    UNSPECIFIED = 0;
    SKIP_AUX_FIELDS = 1;
    PARSE_ALL_AUX_FIELDS = 2;
  }
  AuxFieldHandling aux_field_handling = 3;

  // Block size to use in htslib, in reading the SAM/BAM. Value <=0 will use the
  // default htslib block size.
  int64 hts_block_size = 4;

  // Controls if, and at what rate, we discard reads from the input stream.
  //
  // This option allows the user to efficiently remove a random fraction of
  // reads from the source SAM/BAM file. The reads are discarded on the fly
  // before being parsed into protos, so the downsampling is reasonably
  // efficient.
  //
  // If 0.0 (the default protobuf value), this field is ignored. If != 0.0, then
  // this must be a value in the interval (0.0, 1.0] indicating the probability
  // p that a read should be kept, or equivalently the probability (1 - p) that
  // a read will be discarded. For example, if downsample_fraction is 0.25, then
  // each read has a 25% chance of being included in the output reads.
  float downsample_fraction = 5;

  // Random seed to use with downsampling fraction.
  int64 random_seed = 6;

  // By default aligned_quality field is read from QUAL in SAM. If flag is set,
  // aligned_quality field is read from OQ tag in SAM.
  bool use_original_base_quality_scores = 10;

  // By default, this field is empty. If empty, we keep all aux fields if they
  // are parsed. If set, we only keep the aux fields with the names in this
  // list.
  repeated string aux_fields_to_keep = 11;
}
412
413
// Describes requirements for a read for it to be returned by a SamReader.
message ReadRequirements {
  // By default, duplicate reads will not be kept. Set this flag to keep them.
  bool keep_duplicates = 1;

  // By default, reads that failed the vendor quality checks will not be kept.
  // Set this flag to keep them.
  bool keep_failed_vendor_quality_checks = 2;

  // By default, reads that are marked as secondary alignments will not be kept.
  // Set this flag to keep them.
  bool keep_secondary_alignments = 3;

  // By default, reads that are marked as supplementary alignments will not be
  // kept. Set this flag to keep them.
  bool keep_supplementary_alignments = 4;

  // By default, reads that aren't aligned are not kept. Set this flag to keep
  // them.
  bool keep_unaligned = 5;

  // Paired (or greater) reads that are improperly placed are not kept by
  // default. Set this flag to keep them. We define improperly placed to mean
  // reads whose (next) mate is mapped to a different contig.
  bool keep_improperly_placed = 6;

  // By default, reads with any mapping quality are kept. Setting this field
  // to a positive integer i will only keep reads that have a MAPQ >= i. Note
  // this only applies to aligned reads. If keep_unaligned is set, unaligned
  // reads, which by definition do not have a mapping quality, will still be
  // kept.
  int32 min_mapping_quality = 7;

  // Minimum base quality. This field indicates that we are enforcing a minimum
  // base quality score for a read to be used. How this field is enforced,
  // though, depends on the enum field min_base_quality_mode, as there are
  // multiple ways for this requirement to be interpreted.
  int32 min_base_quality = 8;

  // How should we enforce the min_base_quality requirement?
  enum MinBaseQualityMode {
    // If UNSPECIFIED, there are no guarantees on whether and how
    // min_base_quality would be enforced. By default we recommend
    // implementations ignore min_base_quality if this is set to UNSPECIFIED.
    UNSPECIFIED = 0;
    // The min_base_quality requirement is being enforced not by the reader but
    // by the client itself. This is commonly used when the algorithm for
    // computing whether a read satisfies the min_base_quality requirement is
    // too complex or too specific for the reader.
    ENFORCED_BY_CLIENT = 1;
  }
  MinBaseQualityMode min_base_quality_mode = 9;
}