// Copyright 2018 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";
package nucleus.genomics.v1;
import "third_party/nucleus/protos/range.proto";
// This message represents a single GFF3 record. See
// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
// for details on the file format; that document is quoted below.
//
// TODO: deal with %-encoding.
message GffRecord {
// `range` field, reflecting the genomic location of the feature.
// NOTE: this is a 0-based, end-exclusive interval, in contrast to the 1-based
// encoding used natively in the GFF text format. For reference, consult the
// documentation for `Range`. This field is required.
Range range = 1;
// `source` designation, as defined by GFF spec:
// "The source is a free text qualifier intended to describe the algorithm or
// operating procedure that generated this feature. Typically this is the name
// of a piece of software, such as "Genescan" or a database name, such as
// "Genbank." In effect, the source is used to extend the feature ontology by
// adding a qualifier to the type creating a new composite type that is a
// subclass of the type in the type column."
//
// Missingness of this field is encoded as "".
string source = 2;
// `type` designation, as defined by GFF spec:
// "The type of the feature (previously called the "method"). This is
// constrained to be either a term from the Sequence Ontology or an SO
// accession number. The latter alternative is distinguished using the syntax
// SO:000000. In either case, it must be sequence_feature (SO:0000110) or an
// is_a child of it."
//
// Missingness of this field is encoded as "".
string type = 3;
// `score` designation, as defined by GFF spec:
// "The score of the feature, a floating point number. As in earlier versions
// of the format, the semantics of the score are ill-defined. It is strongly
// recommended that E-values be used for sequence similarity features, and
// that P-values be used for ab initio gene prediction features."
//
// Missingness of this field is encoded by -infinity.
double score = 4;
// TODO: factor this out (here and BED, at least)
enum Strand {
// The strand is unspecified, unknown, or not meaningful.
UNSPECIFIED_STRAND = 0;
FORWARD_STRAND = 1;
REVERSE_STRAND = 2;
}
// The strand if the feature, if relevant.
Strand strand = 5;
// `phase` designation, as defined by the GFF spec:
// "For features of type "CDS", the phase indicates where the feature begins
// with reference to the reading frame. The phase is one of the integers 0, 1,
// or 2, indicating the number of bases that should be removed from the
// beginning of this feature to reach the first base of the next codon. In
// other words, a phase of "0" indicates that the next codon begins at the
// first base of the region described by the current line, a phase of "1"
// indicates that the next codon begins at the second base of this region, and
// a phase of "2" indicates that the codon begins at the third base of this
// region. This is NOT to be confused with the frame, which is simply start
// modulo 3.
//
// For forward strand features, phase is counted from the start field. For
// reverse strand features, phase is counted from the end field.
//
// The phase is REQUIRED for all CDS features."
//
// Missingness of this field is encoded by -1.
int32 phase = 6;
// `attributes`, a free-form map of keys to string values, corresponding to
// the semi-colon separated attributes field in the GFF text format.
map<string, string> attributes = 7;
}
// A message encoding the directives contained in a GFF3 file header.
// Consult the file format reference for detailed descriptions of
// these directives. Note that we do NOT handle the FASTA directive
// (a rarely-used method to bundle reference sequences within a GFF
// file.)
message GffHeader {
// `gff_version`, a string of the format "gff-version 3.#.#"
// encoding the exact GFF spec version used.
string gff_version = 1;
// `sequence_regions` is a list of the sequence regions that are
// referenced by GFF records.
repeated Range sequence_regions = 2;
// An OntologyDirective holds the URI to a sequence ontology
// database, reflecting the ontology over the entities in the
// `type`, `source`, and `attributes` fields of a GffRecord.
message OntologyDirective {
string uri = 1;
}
repeated OntologyDirective feature_ontologies = 3;
repeated OntologyDirective attribute_ontologies = 4;
repeated OntologyDirective source_ontologies = 5;
// A string name for the biological species analyzed.
// "This directive indicates the species that the annotations apply
// to. The preferred format is a NCBI URL that points to the
// relevant species page in either of the following formats:
// http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6239
// http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=Caenorhabditis+elegans"
string species = 6;
message GenomeBuildDirective {
string source = 1;
string name = 2;
}
GenomeBuildDirective genome_build = 7;
}
message GffReaderOptions {
}
message GffWriterOptions {
}