--- a +++ b/third_party/nucleus/protos/cigar.proto @@ -0,0 +1,94 @@ +// Copyright 2018 Google LLC. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +syntax = "proto3"; + +package nucleus.genomics.v1; + +// A single CIGAR operation. +message CigarUnit { + // Describes the different types of CIGAR alignment operations that exist. + // Used wherever CIGAR alignments are used. + enum Operation { + OPERATION_UNSPECIFIED = 0; + // An alignment match indicates that a sequence can be aligned to the + // reference without evidence of an INDEL. Unlike the + // `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, + // the `ALIGNMENT_MATCH` operator does not indicate whether the + // reference and read sequences are an exact match. This operator is + // equivalent to SAM's `M`. + ALIGNMENT_MATCH = 1; + // The insert operator indicates that the read contains evidence of bases + // being inserted into the reference. This operator is equivalent to SAM's + // `I`. + INSERT = 2; + // The delete operator indicates that the read contains evidence of bases + // being deleted from the reference. This operator is equivalent to SAM's + // `D`. + DELETE = 3; + // The skip operator indicates that this read skips a long segment of the + // reference, but the bases have not been deleted. This operator is commonly + // used when working with RNA-seq data, where reads may skip long segments + // of the reference between exons. This operator is equivalent to SAM's + // `N`. + SKIP = 4; + // The soft clip operator indicates that bases at the start/end of a read + // have not been considered during alignment. This may occur if the majority + // of a read maps, except for low quality bases at the start/end of a read. + // This operator is equivalent to SAM's `S`. Bases that are soft + // clipped will still be stored in the read. + CLIP_SOFT = 5; + // The hard clip operator indicates that bases at the start/end of a read + // have been omitted from this alignment. This may occur if this linear + // alignment is part of a chimeric alignment, or if the read has been + // trimmed (for example, during error correction or to trim poly-A tails for + // RNA-seq). This operator is equivalent to SAM's `H`. + CLIP_HARD = 6; + // The pad operator indicates that there is padding in an alignment. This + // operator is equivalent to SAM's `P`. + PAD = 7; + // This operator indicates that this portion of the aligned sequence exactly + // matches the reference. This operator is equivalent to SAM's `=`. + SEQUENCE_MATCH = 8; + // This operator indicates that this portion of the aligned sequence is an + // alignment match to the reference, but a sequence mismatch. This can + // indicate a SNP or a read error. This operator is equivalent to SAM's + // `X`. + SEQUENCE_MISMATCH = 9; + } + Operation operation = 1; + + // The number of genomic bases that the operation runs for. Required. + int64 operation_length = 2; + + // `referenceSequence` is only used at mismatches + // (`SEQUENCE_MISMATCH`) and deletions (`DELETE`). + // Filling this field replaces SAM's MD tag. If the relevant information is + // not available, this field is unset. + string reference_sequence = 3; +}