--- a
+++ b/third_party/nucleus/protos/reads.proto
@@ -0,0 +1,459 @@
+// Copyright 2018 Google LLC.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from this
+//    software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+syntax = "proto3";
+
+package nucleus.genomics.v1;
+
+import "third_party/nucleus/protos/cigar.proto";
+import "third_party/nucleus/protos/position.proto";
+import "third_party/nucleus/protos/reference.proto";
+import "third_party/nucleus/protos/struct.proto";
+
+// A linear alignment can be represented by one CIGAR string. Describes the
+// mapped position and local alignment of the read to the reference.
+message LinearAlignment {
+  // The mapped position (left-most reference coordinate) of this alignment.
+  Position position = 1;
+
+  // The mapping quality (MAPQ) of this alignment. Represents how likely
+  // the read maps to this position as opposed to other locations.
+  //
+  // Specifically, this is -10 * log10(Pr(mapping position is wrong)), rounded
+  // to the nearest integer.
+  int32 mapping_quality = 2;
+
+  // The local alignment of this sequence (alignment matches, indels, etc.)
+  // against the reference, listed from left to right along the reference.
+  repeated CigarUnit cigar = 3;
+}
+
+// A read alignment describes a linear alignment of a string of DNA to a
+// [reference sequence][learning.genomics.v1.Reference], in addition to metadata
+// about the fragment (the molecule of DNA sequenced) and the read (the bases
+// which were read by the sequencer). A read is equivalent to a line in a SAM
+// file. A read belongs to exactly one read group and exactly one
+// [read group set][learning.genomics.v1.ReadGroupSet].
+//
+// For more genomics resource definitions, see [Fundamentals of Google
+// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
+//
+// ### Reverse-stranded reads
+//
+// Mapped reads (reads having a non-null `alignment`) can be aligned to either
+// the forward or the reverse strand of their associated reference. Strandedness
+// of a mapped read is encoded by `alignment.position.reverseStrand`.
+//
+// If we consider the reference to be a forward-stranded coordinate space of
+// `[0, reference.length)` with `0` as the left-most position and
+// `reference.length` as the right-most position, reads are always aligned left
+// to right. That is, `alignment.position.position` always refers to the
+// left-most reference coordinate and `alignment.cigar` describes the alignment
+// of this read to the reference from left to right. All per-base fields such as
+// `alignedSequence` and `alignedQuality` share this same left-to-right
+// orientation; this is true of reads which are aligned to either strand. For
+// reverse-stranded reads, this means that `alignedSequence` is the reverse
+// complement of the bases that were originally reported by the sequencing
+// machine.
+//
+// ### Generating a reference-aligned sequence string
+//
+// When interacting with mapped reads, it's often useful to produce a string
+// representing the local alignment of the read to reference. The following
+// pseudocode demonstrates one way of doing this:
+//
+//     out = ""
+//     offset = 0
+//     for c in read.alignment.cigar {
+//       switch c.operation {
+//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
+//         out += read.alignedSequence[offset:offset+c.operationLength]
+//         offset += c.operationLength
+//         break
+//       case "CLIP_SOFT", "INSERT":
+//         offset += c.operationLength
+//         break
+//       case "PAD":
+//         out += repeat("*", c.operationLength)
+//         break
+//       case "DELETE":
+//         out += repeat("-", c.operationLength)
+//         break
+//       case "SKIP":
+//         out += repeat(" ", c.operationLength)
+//         break
+//       case "CLIP_HARD":
+//         break
+//       }
+//     }
+//     return out
+//
+// ### Converting to SAM's CIGAR string
+//
+// The following pseudocode generates a SAM CIGAR string from the
+// `cigar` field. Note that this is a lossy conversion
+// (`cigar.referenceSequence` is lost).
+//
+//     cigarMap = {
+//       "ALIGNMENT_MATCH": "M",
+//       "INSERT": "I",
+//       "DELETE": "D",
+//       "SKIP": "N",
+//       "CLIP_SOFT": "S",
+//       "CLIP_HARD": "H",
+//       "PAD": "P",
+//       "SEQUENCE_MATCH": "=",
+//       "SEQUENCE_MISMATCH": "X",
+//     }
+//     cigarStr = ""
+//     for c in read.alignment.cigar {
+//       cigarStr += c.operationLength + cigarMap[c.operation]
+//     }
+//     return cigarStr
+//
+// (== resource_for v1.reads ==)
+message Read {
+  // The server-generated read ID, unique across all reads. This is different
+  // from the `fragmentName`.
+  string id = 1;
+
+  // The ID of the read group this read belongs to. A read belongs to exactly
+  // one read group. This is a server-generated ID which is distinct from SAM's
+  // RG tag (for that value, see
+  // [ReadGroup.name][nucleus.genomics.v1.ReadGroup.name]).
+  string read_group_id = 2;
+
+  // The ID of the read group set this read belongs to. A read belongs to
+  // exactly one read group set.
+  string read_group_set_id = 3;
+
+  // The fragment name. Equivalent to QNAME (query template name) in SAM.
+  string fragment_name = 4;
+
+  // The orientation and the distance between reads from the fragment are
+  // consistent with the sequencing protocol (SAM flag 0x2).
+  bool proper_placement = 5;
+
+  // The fragment is a PCR or optical duplicate (SAM flag 0x400).
+  bool duplicate_fragment = 6;
+
+  // The observed length of the fragment, equivalent to TLEN in SAM.
+  int32 fragment_length = 7;
+
+  // The read number in sequencing. 0-based and less than numberReads. This
+  // field replaces SAM flags 0x40 and 0x80.
+  int32 read_number = 8;
+
+  // The number of reads in the fragment (extension to SAM flag 0x1).
+  int32 number_reads = 9;
+
+  // Whether this read did not pass filters, such as platform or vendor quality
+  // controls (SAM flag 0x200).
+  bool failed_vendor_quality_checks = 10;
+
+  // The linear alignment for this alignment record. This field is null for
+  // unmapped reads.
+  LinearAlignment alignment = 11;
+
+  // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
+  // A secondary alignment represents an alternative to the primary alignment
+  // for this read. Aligners may return secondary alignments if a read can map
+  // ambiguously to multiple coordinates in the genome. By convention, each read
+  // has one and only one alignment where both `secondaryAlignment`
+  // and `supplementaryAlignment` are false.
+  bool secondary_alignment = 12;
+
+  // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
+  // Supplementary alignments are used in the representation of a chimeric
+  // alignment. In a chimeric alignment, a read is split into multiple
+  // linear alignments that map to different reference contigs. The first
+  // linear alignment in the read will be designated as the representative
+  // alignment; the remaining linear alignments will be designated as
+  // supplementary alignments. These alignments may have different mapping
+  // quality scores. In each linear alignment in a chimeric alignment, the read
+  // will be hard clipped. The `alignedSequence` and
+  // `alignedQuality` fields in the alignment record will only
+  // represent the bases for its respective linear alignment.
+  bool supplementary_alignment = 13;
+
+  // The bases of the read sequence contained in this alignment record,
+  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
+  // `alignedSequence` and `alignedQuality` may be
+  // shorter than the full read sequence and quality. This will occur if the
+  // alignment is part of a chimeric alignment, or if the read was trimmed. When
+  // this occurs, the CIGAR for this read will begin/end with a hard clip
+  // operator that will indicate the length of the excised sequence.
+  string aligned_sequence = 14;
+
+  // The quality of the read sequence contained in this alignment record
+  // (equivalent to QUAL in SAM). Can optionally be read from the OQ tag; see
+  // `SamReaderOptions.use_original_base_quality_scores` for more details.
+  // `alignedSequence` and `alignedQuality` may be shorter than the full read
+  // sequence and quality. This will occur if the alignment is part of a
+  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
+  // for this read will begin/end with a hard clip operator that will indicate
+  // the length of the excised sequence.
+  repeated int32 aligned_quality = 15;
+
+  // The mapping of the primary alignment of the
+  // `(readNumber+1)%numberReads` read in the fragment. It replaces
+  // mate position and mate strand in SAM.
+  Position next_mate_position = 16;
+
+  // A map of additional read alignment information. This must be of the form
+  // map<string, string[]> (string key mapping to a list of string values).
+  map<string, ListValue> info = 17;
+}
+
+// The SamHeader message represents the metadata present in the header of a
+// SAM/BAM file.
+message SamHeader {
+  // The VN (format version) field from the @HD line. Empty if not present
+  // (valid formats will match /^[0-9]+\.[0-9]+$/).
+  string format_version = 1;
+
+  // The SO (sorting order) field from the @HD line.
+  enum SortingOrder {
+    UNKNOWN = 0;
+    UNSORTED = 1;
+    QUERYNAME = 2;
+    COORDINATE = 3;
+  }
+  SortingOrder sorting_order = 2;
+
+  // The GO (alignment grouping) field from the @HD line.
+  enum AlignmentGrouping {
+    NONE = 0;
+    QUERY = 1;
+    REFERENCE = 2;
+  }
+  AlignmentGrouping alignment_grouping = 3;
+
+  // @SQ header field in SAM spec.
+  // The order of the contigs defines the sorting order.
+  repeated nucleus.genomics.v1.ContigInfo contigs = 4;
+
+  // @RG header field in SAM spec.
+  // The read groups declared in the header.
+  repeated ReadGroup read_groups = 5;
+
+  // @PG header field in SAM spec.
+  // The programs run to generate the alignment data.
+  repeated Program programs = 6;
+
+  // @CO header field in SAM spec.
+  // One-line text comments.
+  repeated string comments = 7;
+}
+
+// A read group is all the data that's processed the same way by the sequencer.
+// Used as a sub-message of SamHeader; declared at file scope for brevity.
+message ReadGroup {
+  // @RG ID field in SAM spec.
+  // The read group name.
+  string name = 1;
+
+  // @RG CN field in SAM spec.
+  // The name of the sequencing center producing the read.
+  string sequencing_center = 2;
+
+  // @RG DS field in SAM spec.
+  // A free-form text description of this read group.
+  string description = 3;
+
+  // @RG DT field in SAM spec.
+  string date = 4;
+
+  // @RG FO field in SAM spec.
+  string flow_order = 5;
+
+  // @RG KS field in SAM spec.
+  string key_sequence = 6;
+
+  // @RG LB field in SAM spec.
+  // A library is a collection of DNA fragments which have been prepared for
+  // sequencing from a sample. This field is important for quality control as
+  // error or bias can be introduced during sample preparation.
+  string library_id = 7;
+
+  // @RG PG field in SAM spec.
+  repeated string program_ids = 8;
+
+  // @RG PI field in SAM spec.
+  // The predicted insert size of this read group. The insert size is the length
+  // of the sequenced DNA fragment from end-to-end, not including the adapters.
+  int32 predicted_insert_size = 9;
+
+  // @RG PL field in SAM spec.
+  // The platform/technology used to produce the reads.
+  string platform = 10;
+
+  // @RG PM field in SAM spec.
+  // The platform model used as part of this run.
+  string platform_model = 11;
+
+  // @RG PU field in SAM spec.
+  // The platform unit used as part of this experiment, for example
+  // flowcell-barcode.lane for Illumina or slide for SOLiD. A unique identifier.
+  string platform_unit = 12;
+
+  // @RG SM field in SAM spec.
+  // A client-supplied sample identifier for the reads in this read group.
+  string sample_id = 13;
+}
+
+// A Program is used in the SAM header to track how alignment data is generated.
+// Used as a sub-message of SamHeader; declared at file scope for brevity.
+message Program {
+  // @PG ID field in SAM spec.
+  // The locally unique ID of the program. Used along with
+  // `prev_program_id` to define an ordering between programs.
+  string id = 2;
+
+  // @PG PN field in SAM spec.
+  // The display name of the program. This is typically the colloquial name of
+  // the tool used, for example 'bwa' or 'picard'.
+  string name = 3;
+
+  // @PG CL field in SAM spec. The command line used to run this program.
+  // NOTE: field numbers in this message do not follow declaration order.
+  string command_line = 1;
+
+  // @PG PP field in SAM spec.
+  // The `id` of the program run before this one.
+  string prev_program_id = 4;
+
+  // @PG DS field in SAM spec.
+  // The description of the program.
+  string description = 6;
+
+  // @PG VN field in SAM spec.
+  // The version of the program run.
+  string version = 5;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// I/O-related messages.
+///////////////////////////////////////////////////////////////////////////////
+
+// The SamReaderOptions message is used to alter the properties of a SamReader.
+// It enables reads to be omitted from parsing based on their attributes, as
+// well as more fine-grained handling of particular fields within the SAM
+// records.
+// Next ID: 12.
+message SamReaderOptions {
+  // Read requirements that must be satisfied before our reader will return
+  // a read to use.
+  ReadRequirements read_requirements = 1;
+
+  // Controls how the aux fields in the SAM record are handled.
+  enum AuxFieldHandling {
+    UNSPECIFIED = 0;
+    SKIP_AUX_FIELDS = 1;
+    PARSE_ALL_AUX_FIELDS = 2;
+  }
+  AuxFieldHandling aux_field_handling = 3;
+
+  // Block size to use in htslib when reading the SAM/BAM. A value <= 0 will use
+  // the default htslib block size.
+  int64 hts_block_size = 4;
+
+  // Controls if, and at what rate, we discard reads from the input stream.
+  //
+  // This option allows the user to efficiently remove a random fraction of
+  // reads from the source SAM/BAM file. The reads are discarded on the fly
+  // before being parsed into protos, so the downsampling is reasonably
+  // efficient.
+  //
+  // If 0.0 (the default protobuf value), this field is ignored. If != 0.0, then
+  // this must be a value in (0.0, 1.0] giving the probability p that a read is
+  // kept, or equivalently (1 - p) that a read is discarded. For example, if
+  // downsample_fraction is 0.25, then each read has a 25% chance of being
+  // included in the output reads.
+  float downsample_fraction = 5;
+
+  // Random seed to use with downsample_fraction.
+  int64 random_seed = 6;
+
+  // By default the aligned_quality field is read from QUAL in SAM. If this flag
+  // is set, the aligned_quality field is read from the OQ tag in SAM instead.
+  bool use_original_base_quality_scores = 10;
+
+  // By default, this field is empty. If empty, we keep all aux fields if they
+  // are parsed. If set, we only keep the aux fields with the names in this
+  // list.
+  repeated string aux_fields_to_keep = 11;
+}
+
+// Describes requirements for a read for it to be returned by a SamReader.
+message ReadRequirements {
+  // By default, duplicate reads will not be kept. Set this flag to keep them.
+  bool keep_duplicates = 1;
+  // By default, reads that failed the vendor quality checks will not be kept.
+  // Set this flag to keep them.
+  bool keep_failed_vendor_quality_checks = 2;
+  // By default, reads that are marked as secondary alignments will not be kept.
+  // Set this flag to keep them.
+  bool keep_secondary_alignments = 3;
+  // By default, reads that are marked as supplementary alignments will not be
+  // kept. Set this flag to keep them.
+  bool keep_supplementary_alignments = 4;
+  // By default, reads that aren't aligned are not kept. Set this flag to keep
+  // them.
+  bool keep_unaligned = 5;
+  // Paired (or greater) reads that are improperly placed are not kept by
+  // default. Set this flag to keep them. We define improperly placed to mean
+  // reads whose (next) mate is mapped to a different contig.
+  bool keep_improperly_placed = 6;
+  // By default, reads with any mapping quality are kept. Setting this field
+  // to a positive integer i will only keep reads that have a MAPQ >= i. Note
+  // this only applies to aligned reads. If keep_unaligned is set, unaligned
+  // reads, which by definition do not have a mapping quality, will still be
+  // kept.
+  int32 min_mapping_quality = 7;
+
+  // Minimum base quality. This field indicates that we are enforcing a minimum
+  // base quality score for a read to be used. How this field is enforced,
+  // though, depends on the enum field min_base_quality_mode, as there are
+  // multiple ways for this requirement to be interpreted.
+  int32 min_base_quality = 8;
+
+  // Controls how the min_base_quality requirement is enforced.
+  enum MinBaseQualityMode {
+    // If UNSPECIFIED, there are no guarantees on whether and how
+    // min_base_quality would be enforced. By default we recommend
+    // implementations ignore min_base_quality if this is set to UNSPECIFIED.
+    UNSPECIFIED = 0;
+    // The min_base_quality requirement is enforced not by the reader but by the
+    // client itself. This is commonly used when the algorithm for deciding
+    // whether a read satisfies the min_base_quality requirement is too complex
+    // or too specific for the reader.
+    ENFORCED_BY_CLIENT = 1;
+  }
+  MinBaseQualityMode min_base_quality_mode = 9;
+}