|
a |
|
b/third_party/nucleus/protos/gff.proto |
|
|
1 |
// Copyright 2018 Google LLC. |
|
|
2 |
// |
|
|
3 |
// Redistribution and use in source and binary forms, with or without |
|
|
4 |
// modification, are permitted provided that the following conditions |
|
|
5 |
// are met: |
|
|
6 |
// |
|
|
7 |
// 1. Redistributions of source code must retain the above copyright notice, |
|
|
8 |
// this list of conditions and the following disclaimer. |
|
|
9 |
// |
|
|
10 |
// 2. Redistributions in binary form must reproduce the above copyright |
|
|
11 |
// notice, this list of conditions and the following disclaimer in the |
|
|
12 |
// documentation and/or other materials provided with the distribution. |
|
|
13 |
// |
|
|
14 |
// 3. Neither the name of the copyright holder nor the names of its |
|
|
15 |
// contributors may be used to endorse or promote products derived from this |
|
|
16 |
// software without specific prior written permission. |
|
|
17 |
// |
|
|
18 |
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
|
19 |
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
|
20 |
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
|
21 |
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
|
|
22 |
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
|
23 |
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
|
24 |
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
|
25 |
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
|
26 |
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
|
27 |
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
|
28 |
// POSSIBILITY OF SUCH DAMAGE. |
|
|
29 |
syntax = "proto3"; |
|
|
30 |
|
|
|
31 |
package nucleus.genomics.v1; |
|
|
32 |
|
|
|
33 |
import "third_party/nucleus/protos/range.proto"; |
|
|
34 |
|
|
|
35 |
// This message represents a single GFF3 record. See
// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
// for details on the file format; that document is quoted below.
//
// TODO: deal with %-encoding.
message GffRecord {
  // `range` field, reflecting the genomic location of the feature.
  // NOTE: this is a 0-based, end-exclusive interval, in contrast to the 1-based
  // encoding used natively in the GFF text format. For reference, consult the
  // documentation for `Range`. This field is required.
  Range range = 1;

  // `source` designation, as defined by GFF spec:
  // "The source is a free text qualifier intended to describe the algorithm or
  // operating procedure that generated this feature. Typically this is the name
  // of a piece of software, such as "Genescan" or a database name, such as
  // "Genbank." In effect, the source is used to extend the feature ontology by
  // adding a qualifier to the type creating a new composite type that is a
  // subclass of the type in the type column."
  //
  // Missingness of this field is encoded as "".
  string source = 2;

  // `type` designation, as defined by GFF spec:
  // "The type of the feature (previously called the "method"). This is
  // constrained to be either a term from the Sequence Ontology or an SO
  // accession number. The latter alternative is distinguished using the syntax
  // SO:000000. In either case, it must be sequence_feature (SO:0000110) or an
  // is_a child of it."
  //
  // Missingness of this field is encoded as "".
  string type = 3;

  // `score` designation, as defined by GFF spec:
  // "The score of the feature, a floating point number. As in earlier versions
  // of the format, the semantics of the score are ill-defined. It is strongly
  // recommended that E-values be used for sequence similarity features, and
  // that P-values be used for ab initio gene prediction features."
  //
  // Missingness of this field is encoded by -infinity.
  // NOTE: an unset proto3 double reads back as 0, which is NOT the missing
  // sentinel; writers must set -infinity explicitly when no score is present.
  double score = 4;

  // The strand on which a feature lies.
  // TODO: factor this out (here and BED, at least)
  enum Strand {
    // The strand is unspecified, unknown, or not meaningful.
    UNSPECIFIED_STRAND = 0;
    // The feature lies on the forward strand ("+" in the GFF text format, per
    // the GFF spec).
    FORWARD_STRAND = 1;
    // The feature lies on the reverse strand ("-" in the GFF text format, per
    // the GFF spec).
    REVERSE_STRAND = 2;
  }

  // The strand of the feature, if relevant.
  Strand strand = 5;

  // `phase` designation, as defined by the GFF spec:
  // "For features of type "CDS", the phase indicates where the feature begins
  // with reference to the reading frame. The phase is one of the integers 0, 1,
  // or 2, indicating the number of bases that should be removed from the
  // beginning of this feature to reach the first base of the next codon. In
  // other words, a phase of "0" indicates that the next codon begins at the
  // first base of the region described by the current line, a phase of "1"
  // indicates that the next codon begins at the second base of this region, and
  // a phase of "2" indicates that the codon begins at the third base of this
  // region. This is NOT to be confused with the frame, which is simply start
  // modulo 3.
  //
  // For forward strand features, phase is counted from the start field. For
  // reverse strand features, phase is counted from the end field.
  //
  // The phase is REQUIRED for all CDS features."
  //
  // Missingness of this field is encoded by -1.
  // NOTE: an unset proto3 int32 reads back as 0, which is a valid phase;
  // writers must set -1 explicitly when no phase is present.
  int32 phase = 6;

  // `attributes`, a free-form map of keys to string values, corresponding to
  // the semi-colon separated attributes field in the GFF text format.
  map<string, string> attributes = 7;
}
|
|
112 |
|
|
|
113 |
// A message encoding the directives contained in a GFF3 file header.
// Consult the file format reference for detailed descriptions of
// these directives. Note that we do NOT handle the FASTA directive
// (a rarely-used method to bundle reference sequences within a GFF
// file).
message GffHeader {
  // `gff_version`, a string of the format "gff-version 3.#.#"
  // encoding the exact GFF spec version used.
  string gff_version = 1;

  // `sequence_regions` is a list of the sequence regions that are
  // referenced by GFF records.
  repeated Range sequence_regions = 2;

  // An OntologyDirective holds the URI to a sequence ontology
  // database, reflecting the ontology over the entities in the
  // `type`, `source`, and `attributes` fields of a GffRecord.
  message OntologyDirective {
    // Location of the ontology database.
    string uri = 1;
  }

  // Ontologies declared for feature types; presumably these govern the
  // `type` field of a GffRecord (the spec's feature-ontology directive).
  repeated OntologyDirective feature_ontologies = 3;

  // Ontologies declared for attributes; presumably these govern the
  // `attributes` field of a GffRecord (the spec's attribute-ontology
  // directive).
  repeated OntologyDirective attribute_ontologies = 4;

  // Ontologies declared for sources; presumably these govern the
  // `source` field of a GffRecord (the spec's source-ontology directive).
  repeated OntologyDirective source_ontologies = 5;

  // A string name for the biological species analyzed.
  // "This directive indicates the species that the annotations apply
  // to. The preferred format is a NCBI URL that points to the
  // relevant species page in either of the following formats:
  // http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6239
  // http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=Caenorhabditis+elegans"
  string species = 6;

  // A GenomeBuildDirective identifies a genome assembly build. In the GFF
  // spec the corresponding directive has the form
  // "genome-build source buildName".
  message GenomeBuildDirective {
    // The source of the assembly (the first token of the directive).
    string source = 1;
    // The name of the build (the second token of the directive).
    string name = 2;
  }

  // The genome build declared in the file header, if any. As a message-typed
  // field it is simply absent when no build has been set.
  GenomeBuildDirective genome_build = 7;
}
|
|
151 |
|
|
|
152 |
// Options for reading GFF data. This message currently declares no fields;
// presumably it exists so that options can be added later without changing
// the signatures of code that accepts it.
message GffReaderOptions {}
|
|
154 |
|
|
|
155 |
// Options for writing GFF data. This message currently declares no fields;
// presumably it exists so that options can be added later without changing
// the signatures of code that accepts it.
message GffWriterOptions {}