// Copyright 2018 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";
package nucleus.genomics.v1;
import "third_party/nucleus/protos/cigar.proto";
import "third_party/nucleus/protos/position.proto";
import "third_party/nucleus/protos/reference.proto";
import "third_party/nucleus/protos/struct.proto";
// Describes the mapped position of a read and its local alignment to the
// reference. A linear alignment is one that can be represented by a single
// CIGAR string.
message LinearAlignment {
  // Position at which this alignment is placed.
  Position position = 1;

  // Mapping quality of this alignment: how likely it is that the read maps to
  // this position rather than to some other location.
  //
  // Concretely, this is -10 log10 Pr(mapping position is wrong), rounded to
  // the nearest integer.
  int32 mapping_quality = 2;

  // Local alignment of this sequence against the reference (alignment
  // matches, indels, etc).
  repeated CigarUnit cigar = 3;
}
// A read alignment describes a linear alignment of a string of DNA to a
// [reference sequence][learning.genomics.v1.Reference], in addition to metadata
// about the fragment (the molecule of DNA sequenced) and the read (the bases
// which were read by the sequencer). A read is equivalent to a line in a SAM
// file. A read belongs to exactly one read group and exactly one
// [read group set][learning.genomics.v1.ReadGroupSet].
//
// For more genomics resource definitions, see [Fundamentals of Google
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
//
// ### Reverse-stranded reads
//
// Mapped reads (reads having a non-null `alignment`) can be aligned to either
// the forward or the reverse strand of their associated reference. Strandedness
// of a mapped read is encoded by `alignment.position.reverseStrand`.
//
// If we consider the reference to be a forward-stranded coordinate space of
// `[0, reference.length)` with `0` as the left-most position and
// `reference.length` as the right-most position, reads are always aligned left
// to right. That is, `alignment.position.position` always refers to the
// left-most reference coordinate and `alignment.cigar` describes the alignment
// of this read to the reference from left to right. All per-base fields such as
// `alignedSequence` and `alignedQuality` share this same left-to-right
// orientation; this is true of reads which are aligned to either strand. For
// reverse-stranded reads, this means that `alignedSequence` is the reverse
// complement of the bases that were originally reported by the sequencing
// machine.
//
// ### Generating a reference-aligned sequence string
//
// When interacting with mapped reads, it's often useful to produce a string
// representing the local alignment of the read to reference. The following
// pseudocode demonstrates one way of doing this:
//
//     out = ""
//     offset = 0
//     for c in read.alignment.cigar {
//       switch c.operation {
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
//         out += read.alignedSequence[offset:offset+c.operationLength]
//         offset += c.operationLength
//         break
//       case "CLIP_SOFT", "INSERT":
//         offset += c.operationLength
//         break
//       case "PAD":
//         out += repeat("*", c.operationLength)
//         break
//       case "DELETE":
//         out += repeat("-", c.operationLength)
//         break
//       case "SKIP":
//         out += repeat(" ", c.operationLength)
//         break
//       case "CLIP_HARD":
//         break
//       }
//     }
//     return out
//
// ### Converting to SAM's CIGAR string
//
// The following pseudocode generates a SAM CIGAR string from the
// `cigar` field. Note that this is a lossy conversion
// (`cigar.referenceSequence` is lost).
//
//     cigarMap = {
//       "ALIGNMENT_MATCH": "M",
//       "INSERT": "I",
//       "DELETE": "D",
//       "SKIP": "N",
//       "CLIP_SOFT": "S",
//       "CLIP_HARD": "H",
//       "PAD": "P",
//       "SEQUENCE_MATCH": "=",
//       "SEQUENCE_MISMATCH": "X",
//     }
//     cigarStr = ""
//     for c in read.alignment.cigar {
//       cigarStr += c.operationLength + cigarMap[c.operation]
//     }
//     return cigarStr
//
// (== resource_for v1.reads ==)
message Read {
  // Server-generated identifier of this read, unique across all reads. Not
  // the same thing as the `fragmentName`.
  string id = 1;

  // ID of the read group that contains this read; a read belongs to exactly
  // one read group. The ID is server-generated and is distinct from SAM's RG
  // tag (for that value, see
  // [ReadGroup.name][nucleus.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // ID of the read group set that contains this read; a read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // Name of the fragment. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // Set when the orientation of, and the distance between, the reads from the
  // fragment are consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // Set when the fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // Observed length of the fragment, equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // Read number in sequencing. 0-based and less than numberReads. This field
  // replaces SAM flag 0x40 and 0x80.
  int32 read_number = 8;

  // Number of reads in the fragment (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Set when this read did not pass filters, such as platform or vendor
  // quality controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // Linear alignment for this alignment record. This field is null for
  // unmapped reads.
  LinearAlignment alignment = 11;

  // Set when this alignment is secondary. Equivalent to SAM flag 0x100.
  //
  // A secondary alignment represents an alternative to the primary alignment
  // for this read. Aligners may return secondary alignments if a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each
  // read has one and only one alignment where both `secondaryAlignment` and
  // `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Set when this alignment is supplementary. Equivalent to SAM flag 0x800.
  //
  // Supplementary alignments are used in the representation of a chimeric
  // alignment. In a chimeric alignment, a read is split into multiple linear
  // alignments that map to different reference contigs. The first linear
  // alignment in the read will be designated as the representative alignment;
  // the remaining linear alignments will be designated as supplementary
  // alignments. These alignments may have different mapping quality scores.
  // In each linear alignment in a chimeric alignment, the read will be hard
  // clipped. The `alignedSequence` and `alignedQuality` fields in the
  // alignment record will only represent the bases for its respective linear
  // alignment.
  bool supplementary_alignment = 13;

  // Bases of the read sequence contained in this alignment record, **without
  // CIGAR operations applied** (equivalent to SEQ in SAM).
  //
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the
  // CIGAR for this read will begin/end with a hard clip operator that will
  // indicate the length of the excised sequence.
  string aligned_sequence = 14;

  // Quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM). Optionally can be read from OQ tag. See
  // `SamReaderOptions` proto for more details.
  //
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the
  // CIGAR for this read will begin/end with a hard clip operator that will
  // indicate the length of the excised sequence.
  repeated int32 aligned_quality = 15;

  // Mapping of the primary alignment of the `(readNumber+1)%numberReads` read
  // in the fragment. It replaces mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // Additional read alignment information, keyed by name. This must be of the
  // form map<string, string[]> (string key mapping to a list of string
  // values).
  map<string, ListValue> info = 17;
}
// Metadata present in the header of a SAM/BAM file.
message SamHeader {
  // VN field from the HD line. Empty if not present (valid formats will match
  // /^[0-9]+\.[0-9]+$/).
  string format_version = 1;

  // Values of the SO field from the HD line.
  enum SortingOrder {
    UNKNOWN = 0;
    UNSORTED = 1;
    QUERYNAME = 2;
    COORDINATE = 3;
  }
  // SO field from the HD line.
  SortingOrder sorting_order = 2;

  // Values of the GO field from the HD line.
  enum AlignmentGrouping {
    NONE = 0;
    QUERY = 1;
    REFERENCE = 2;
  }
  // GO field from the HD line.
  AlignmentGrouping alignment_grouping = 3;

  // @SQ header field in SAM spec.
  // Contigs; their order defines the sorting order.
  repeated nucleus.genomics.v1.ContigInfo contigs = 4;

  // @RG header field in SAM spec.
  // Read groups.
  repeated ReadGroup read_groups = 5;

  // @PG header field in SAM spec.
  // Programs run to generate the alignment data.
  repeated Program programs = 6;

  // @CO header field in SAM spec.
  // One-line text comments.
  repeated string comments = 7;
}
// A read group is all the data that's processed the same way by the
// sequencer. This is a sub-message of SamHeader, declared at the same scope to
// reduce verbosity.
message ReadGroup {
  // @RG ID field in SAM spec.
  // Name of the read group.
  string name = 1;

  // @RG CN field in SAM spec.
  // Name of the sequencing center producing the read.
  string sequencing_center = 2;

  // @RG DS field in SAM spec.
  // Free-form text description of this read group.
  string description = 3;

  // @RG DT field in SAM spec.
  string date = 4;

  // @RG FO field in SAM spec.
  string flow_order = 5;

  // @RG KS field in SAM spec.
  string key_sequence = 6;

  // @RG LB field in SAM spec.
  // A library is a collection of DNA fragments which have been prepared for
  // sequencing from a sample. This field is important for quality control, as
  // error or bias can be introduced during sample preparation.
  string library_id = 7;

  // @RG PG field in SAM spec.
  repeated string program_ids = 8;

  // @RG PI field in SAM spec.
  // Predicted insert size of this read group. The insert size is the length
  // of the sequenced DNA fragment from end-to-end, not including the
  // adapters.
  int32 predicted_insert_size = 9;

  // @RG PL field in SAM spec.
  // Platform/technology used to produce the reads.
  string platform = 10;

  // @RG PM field in SAM spec.
  // Platform model used as part of this run.
  string platform_model = 11;

  // @RG PU field in SAM spec.
  // Platform unit used as part of this experiment, for example
  // flowcell-barcode.lane for Illumina or slide for SOLiD. A unique
  // identifier.
  string platform_unit = 12;

  // @RG SM field in SAM spec.
  // Client-supplied sample identifier for the reads in this read group.
  string sample_id = 13;
}
// A Program is used in the SAM header to track how alignment data is
// generated. This is a sub-message of SamHeader, declared at the same scope to
// reduce verbosity.
message Program {
  // @PG ID field in SAM spec.
  // Locally unique ID of the program. Used together with `prev_program_id` to
  // define an ordering between programs.
  string id = 2;

  // @PG PN field in SAM spec.
  // Display name of the program. This is typically the colloquial name of the
  // tool used, for example 'bwa' or 'picard'.
  string name = 3;

  // @PG CL field in SAM spec.
  // Command line used to run this program.
  string command_line = 1;

  // @PG PP field in SAM spec.
  // ID of the program run before this one.
  string prev_program_id = 4;

  // @PG DS field in SAM spec.
  // Description of the program.
  string description = 6;

  // @PG VN field in SAM spec.
  // Version of the program run.
  string version = 5;
}
///////////////////////////////////////////////////////////////////////////////
// I/O-related messages.
///////////////////////////////////////////////////////////////////////////////
// The SamReaderOptions message is used to alter the properties of a SamReader.
// It enables reads to be omitted from parsing based on their attributes, as
// well as more fine-grained handling of particular fields within the SAM
// records.
// Next ID: 12.
message SamReaderOptions {
  // Field numbers below the "Next ID" watermark that no field in this message
  // uses. NOTE(review): presumably these belonged to since-removed fields --
  // confirm. They are reserved so that they cannot be reused by accident,
  // which would silently reinterpret previously serialized data.
  reserved 2, 7 to 9;

  // Read requirements that must be satisfied before our reader will return
  // a read to use.
  ReadRequirements read_requirements = 1;

  // How should we handle the aux fields in the SAM record?
  enum AuxFieldHandling {
    UNSPECIFIED = 0;
    SKIP_AUX_FIELDS = 1;
    PARSE_ALL_AUX_FIELDS = 2;
  }
  AuxFieldHandling aux_field_handling = 3;

  // Block size to use in htslib, in reading the SAM/BAM. Value <=0 will use
  // the default htslib block size.
  int64 hts_block_size = 4;

  // Controls if, and at what rate, we discard reads from the input stream.
  //
  // This option allows the user to efficiently remove a random fraction of
  // reads from the source SAM/BAM file. The reads are discarded on the fly
  // before being parsed into protos, so the downsampling is reasonably
  // efficient.
  //
  // If 0.0 (the default protobuf value), this field is ignored. If != 0.0,
  // then this must be a value between (0.0, 1.0] indicating the probability p
  // that a read should be kept, or equivalently (1 - p) that a read will be
  // discarded. For example, if downsample_fraction is 0.25, then each read has
  // a 25% chance of being included in the output reads.
  float downsample_fraction = 5;

  // Random seed to use with downsampling fraction.
  int64 random_seed = 6;

  // By default aligned_quality field is read from QUAL in SAM. If flag is set,
  // aligned_quality field is read from OQ tag in SAM.
  bool use_original_base_quality_scores = 10;

  // By default, this field is empty. If empty, we keep all aux fields if they
  // are parsed. If set, we only keep the aux fields with the names in this
  // list.
  repeated string aux_fields_to_keep = 11;
}
// Requirements a read has to meet in order to be returned by a SamReader.
message ReadRequirements {
  // Duplicate reads are not kept by default. Set this flag to keep them.
  bool keep_duplicates = 1;

  // Reads that failed the vendor quality checks are not kept by default. Set
  // this flag to keep them.
  bool keep_failed_vendor_quality_checks = 2;

  // Reads that are marked as secondary alignments are not kept by default.
  // Set this flag to keep them.
  bool keep_secondary_alignments = 3;

  // Reads that are marked as supplementary alignments are not kept by
  // default. Set this flag to keep them.
  bool keep_supplementary_alignments = 4;

  // Reads that aren't aligned are not kept by default. Set this flag to keep
  // them.
  bool keep_unaligned = 5;

  // Paired (or greater) reads that are improperly placed are not kept by
  // default. Set this flag to keep them. We define improperly placed to mean
  // reads whose (next) mate is mapped to a different contig.
  bool keep_improperly_placed = 6;

  // By default, reads with any mapping quality are kept. Setting this field
  // to a positive integer i will only keep reads that have a MAPQ >= i. Note
  // this only applies to aligned reads. If keep_unaligned is set, unaligned
  // reads, which by definition do not have a mapping quality, will still be
  // kept.
  int32 min_mapping_quality = 7;

  // Minimum base quality. This field indicates that we are enforcing a
  // minimum base quality score for a read to be used. How this field is
  // enforced, though, depends on the enum field min_base_quality_mode, as
  // there are multiple ways for this requirement to be interpreted.
  int32 min_base_quality = 8;

  // How should we enforce the min_base_quality requirement?
  enum MinBaseQualityMode {
    // If UNSPECIFIED, there are no guarantees on whether and how
    // min_base_quality would be enforced. By default we recommend
    // implementations ignore min_base_quality if this is set to UNSPECIFIED.
    UNSPECIFIED = 0;

    // The min_base_quality requirement is being enforced not by the reader
    // but by the client itself. This is commonly used when the algorithm for
    // computing whether a read satisfies the min_base_quality requirement is
    // too complex or too specific for the reader.
    ENFORCED_BY_CLIENT = 1;
  }
  MinBaseQualityMode min_base_quality_mode = 9;
}