// Copyright 2018 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
//    contributors may be used to endorse or promote products derived from this
//    software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";

package nucleus.genomics.v1;

import "third_party/nucleus/protos/range.proto";

// This message represents a single GFF3 record.  See
// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
// for details on the file format; that document is quoted below.
//
// TODO: deal with %-encoding.
message GffRecord {
  // `range` field, reflecting the genomic location of the feature.
  // NOTE: this is a 0-based, end-exclusive interval, in contrast to the 1-based
  // encoding used natively in the GFF text format.  For reference, consult the
  // documentation for `Range`.  This field is required.
  Range range = 1;

  // `source` designation, as defined by GFF spec:
  // "The source is a free text qualifier intended to describe the algorithm or
  // operating procedure that generated this feature. Typically this is the name
  // of a piece of software, such as "Genescan" or a database name, such as
  // "Genbank." In effect, the source is used to extend the feature ontology by
  // adding a qualifier to the type creating a new composite type that is a
  // subclass of the type in the type column."
  //
  // Missingness of this field is encoded as "".
  string source = 2;

  // `type` designation, as defined by GFF spec:
  // "The type of the feature (previously called the "method"). This is
  // constrained to be either a term from the Sequence Ontology or an SO
  // accession number. The latter alternative is distinguished using the syntax
  // SO:000000. In either case, it must be sequence_feature (SO:0000110) or an
  // is_a child of it."
  //
  // Missingness of this field is encoded as "".
  string type = 3;

  // `score` designation, as defined by GFF spec:
  // "The score of the feature, a floating point number. As in earlier versions
  // of the format, the semantics of the score are ill-defined. It is strongly
  // recommended that E-values be used for sequence similarity features, and
  // that P-values be used for ab initio gene prediction features."
  //
  // Missingness of this field is encoded by -infinity.  NOTE: the proto3
  // default for an unset double is 0, which is a valid score, so writers must
  // set -infinity explicitly to mark the score as missing.
  double score = 4;

  // The strand a feature lies on.
  //
  // TODO: factor this out (here and BED, at least)
  enum Strand {
    // The strand is unspecified, unknown, or not meaningful.
    UNSPECIFIED_STRAND = 0;
    // The feature is on the forward strand.
    FORWARD_STRAND = 1;
    // The feature is on the reverse strand.
    REVERSE_STRAND = 2;
  }

  // The strand of the feature, if relevant.
  Strand strand = 5;

  // `phase` designation, as defined by the GFF spec:
  // "For features of type "CDS", the phase indicates where the feature begins
  // with reference to the reading frame. The phase is one of the integers 0, 1,
  // or 2, indicating the number of bases that should be removed from the
  // beginning of this feature to reach the first base of the next codon. In
  // other words, a phase of "0" indicates that the next codon begins at the
  // first base of the region described by the current line, a phase of "1"
  // indicates that the next codon begins at the second base of this region, and
  // a phase of "2" indicates that the codon begins at the third base of this
  // region. This is NOT to be confused with the frame, which is simply start
  // modulo 3.
  //
  // For forward strand features, phase is counted from the start field. For
  // reverse strand features, phase is counted from the end field.
  //
  // The phase is REQUIRED for all CDS features."
  //
  // Missingness of this field is encoded by -1.  NOTE: the proto3 default for
  // an unset int32 is 0, which is a valid phase, so writers must set -1
  // explicitly to mark the phase as missing.
  int32 phase = 6;

  // `attributes`, a free-form map of keys to string values, corresponding to
  // the semi-colon separated attributes field in the GFF text format.
  // Map iteration order is undefined, so the original ordering of attributes
  // in the text record is not preserved by this field.
  map<string, string> attributes = 7;
}

// A message encoding the directives contained in a GFF3 file header.
// Consult the file format reference for detailed descriptions of
// these directives.  Note that we do NOT handle the FASTA directive
// (a rarely-used method to bundle reference sequences within a GFF
// file.)
message GffHeader {
  // `gff_version`, a string of the format "gff-version 3.#.#"
  // encoding the exact GFF spec version used.
  string gff_version = 1;

  // `sequence_regions` is a list of the sequence regions that are
  // referenced by GFF records.
  repeated Range sequence_regions = 2;

  // An OntologyDirective holds the URI to a sequence ontology
  // database, reflecting the ontology over the entities in the
  // `type`, `source`, and `attributes` fields of a GffRecord.
  message OntologyDirective {
    // URI of the ontology database.
    string uri = 1;
  }

  // Ontologies for feature types (the `type` field of a GffRecord).
  // NOTE(review): presumably populated from `##feature-ontology` directives;
  // confirm against the reader implementation.
  repeated OntologyDirective feature_ontologies = 3;

  // Ontologies for the `attributes` field of a GffRecord.
  // NOTE(review): presumably populated from `##attribute-ontology` directives;
  // confirm against the reader implementation.
  repeated OntologyDirective attribute_ontologies = 4;

  // Ontologies for the `source` field of a GffRecord.
  // NOTE(review): presumably populated from `##source-ontology` directives;
  // confirm against the reader implementation.
  repeated OntologyDirective source_ontologies = 5;

  // A string name for the biological species analyzed.
  // "This directive indicates the species that the annotations apply
  // to. The preferred format is a NCBI URL that points to the
  // relevant species page in either of the following formats:
  //   http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6239
  //   http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=Caenorhabditis+elegans"
  string species = 6;

  // A GenomeBuildDirective identifies the genome build that the
  // coordinates in this file refer to.
  // NOTE(review): presumably mirrors the GFF3 `##genome-build source buildName`
  // directive; confirm against the file format reference.
  message GenomeBuildDirective {
    // The source of the genome build.
    string source = 1;
    // The name of the genome build.
    string name = 2;
  }

  // The genome build used for the coordinates in this file, if given.
  GenomeBuildDirective genome_build = 7;
}

// Options message for GFF readers.  It currently defines no fields; it exists
// so that options can be added later without changing reader signatures.
message GffReaderOptions {
}

// Options message for GFF writers.  It currently defines no fields; it exists
// so that options can be added later without changing writer signatures.
message GffWriterOptions {
}