[9b26b7]: / deepvariant / protos / realigner.proto

Download this file

250 lines (203 with data), 9.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
// Copyright 2017 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";
package learning.genomics.deepvariant;
import "third_party/nucleus/protos/range.proto";
// A set of candidate haplotype sequences proposed for one genomic region.
message CandidateHaplotypes {
  // Genomic region that the candidate haplotypes cover.
  nucleus.genomics.v1.Range span = 1;

  // Candidate haplotypes, one entry per haplotype, each given as its
  // nucleotide sequence.
  repeated string haplotypes = 2;
}
// Configuration for choosing candidate locations during the
// "window selector (ws)" phase.
message WindowSelectorModel {
  // Supported selection models:
  //   - VARIANT_READS: uses the number of SNPs, INDELs and SOFT_CLIPs at a
  //     location.
  //   - ALLELE_COUNT_LINEAR: a linear model over the AlleleCount at each
  //     location.
  enum ModelType {
    UNDEFINED = 0;
    VARIANT_READS = 1;
    ALLELE_COUNT_LINEAR = 2;
  }

  // Threshold model: requires #reads > min_num_supporting_reads and
  // #reads < max_num_supporting_reads.
  message VariantReadsThresholdModel {
    // Lower bound on the number of supporting reads needed to call a
    // reference position for local assembly.
    int32 min_num_supporting_reads = 1;

    // Upper bound on the number of supporting reads allowed to call a
    // reference position for local assembly.
    int32 max_num_supporting_reads = 2;
  }

  // Linear model over the kinds of reads observed at each locus.
  // NOTE(review): presumably `bias` is the intercept and each `coeff_*`
  // weights the matching kind of read evidence -- confirm against the
  // window selector implementation.
  message AlleleCountLinearModel {
    float bias = 1;
    float coeff_soft_clip = 2;
    float coeff_substitution = 3;
    float coeff_insertion = 4;
    float coeff_deletion = 5;
    float coeff_reference = 6;

    // Realignment threshold; raising it lowers recall.
    float decision_boundary = 7;
  }

  // Which window selection algorithm to use.
  ModelType model_type = 1;

  // Settings for the selected algorithm.
  oneof model {
    VariantReadsThresholdModel variant_reads_model = 2;
    AlleleCountLinearModel allele_count_linear_model = 3;
  }
}
// Config parameters for "window selector (ws)" phase.
// Next ID: 10.
message WindowSelectorOptions {
  // Minimum number of supporting reads to call a reference position for
  // local assembly.
  // DEPRECATED: Use VariantReadsThresholdModel.min_num_supporting_reads
  // instead.
  int32 min_num_supporting_reads = 1 [deprecated = true];

  // Maximum number of supporting reads to call a reference position for local
  // assembly.
  // DEPRECATED: Use VariantReadsThresholdModel.max_num_supporting_reads
  // instead.
  int32 max_num_supporting_reads = 2 [deprecated = true];

  // Minimum read alignment quality to consider in calling a reference
  // position for local assembly.
  int32 min_mapq = 3;

  // Minimum base quality to consider in calling a reference position for
  // local assembly.
  int32 min_base_quality = 4;

  // Minimum distance between candidate windows for local assembly.
  int32 min_windows_distance = 5;

  // Maximum window size to consider for local assembly. Large noisy regions
  // are skipped for realignment.
  int32 max_window_size = 6;

  // How far to expand the region in which candidate positions are computed.
  // This is needed because we want variants near, but not within, our actual
  // window region to contribute evidence towards our window sites. Larger
  // values allow larger events (e.g., a 50 bp deletion) 49 bp away from the
  // region to contribute. However, larger values also mean greater
  // computation overhead as we are processing extra positions that aren't
  // themselves directly used.
  int32 region_expansion_in_bp = 7;

  // Config for the '_candidates_from_reads' phase.
  WindowSelectorModel window_selector_model = 8;

  // If True, the behavior in this commit is reverted:
  // https://github.com/google/deepvariant/commit/fbde0674639a28cb9e8004c7a01bbe25240c7d46
  bool keep_legacy_behavior = 9;
}
// Settings for the "de-Bruijn graph (dbg)" phase.
message DeBruijnGraphOptions {
  // k-mer size the graph is first built with.
  int32 min_k = 1;

  // Largest k-mer size to try. Larger k-mer sizes are used to resolve graph
  // cycles.
  int32 max_k = 2;

  // Amount by which k is incremented when trying to resolve graph cycles.
  int32 step_k = 3;

  // Reads below this alignment quality are not considered when building the
  // graph.
  int32 min_mapq = 4;

  // Minimum base quality in a k-mer sequence for it to be considered when
  // building the graph.
  int32 min_base_quality = 5;

  // An edge is kept only if it has at least this many supporting reads.
  int32 min_edge_weight = 6;

  // Upper limit on the number of paths within a graph considered for
  // realignment. A value of 0 means the number of paths is unlimited.
  int32 max_num_paths = 7;
}
// Config parameters for "alignment (aln)" phase.
message AlignerOptions {
  // Field number 7 is not assigned to any field in this message (presumably
  // it belonged to a field that was removed -- confirm against history).
  // Reserve it so it is never reused with a different meaning.
  reserved 7;

  // Match score (expected to be a non-negative score).
  int32 match = 1;

  // Mismatch score (expected to be a non-positive score).
  int32 mismatch = 2;

  // Gap open score (expected to be a non-positive score).
  // Score for a gap of length g is (gap_open + (g - 1) * gap_extend).
  int32 gap_open = 3;

  // Gap extend score (expected to be a non-positive score).
  // Score for a gap of length g is (gap_open + (g - 1) * gap_extend).
  int32 gap_extend = 4;

  // k-mer size used to index target sequence.
  // TODO This parameter is not used in fast_pass_aligner. Since we no
  // longer use the python realigner this parameter is obsolete.
  int32 k = 5;

  // Estimated sequencing error rate.
  // TODO This parameter is not used in fast_pass_aligner. We need to
  // remove it.
  float error_rate = 6;

  // Average read size. This parameter is used to calculate
  // ssw_alignment_score_threshold_ - the threshold to filter out reads
  // aligned with SSW library. Not all the reads may be the same size.
  // This parameter needs to be set to a value close enough to the average
  // read size.
  int32 read_size = 8;

  // K-mer size in read index used in Fast Pass Aligner.
  int32 kmer_size = 9;

  // Maximum number of mismatches allowed for quick read to haplotype
  // alignment.
  int32 max_num_of_mismatches = 10;

  // Similarity threshold used to filter out bad read alignments made with
  // Smith-Waterman alignment. Alignment is discarded if read is aligned to
  // a haplotype with too many mismatches.
  double realignment_similarity_threshold = 11;

  // Force realignment so the original alignment is never returned, defaulting
  // instead to computing a new SSW alignment against the reference. This is
  // used for alt-aligned pileups where reads are aligned to a new "reference",
  // making the original read alignments invalid.
  bool force_alignment = 12;
}
// Config parameters for diagnostic outputs.
message Diagnostics {
  // Turns runtime diagnostic outputs on.
  bool enabled = 1;

  // Root location under which diagnostic outputs are written.
  string output_root = 2;

  // When true, the realigned reads themselves are emitted as well.
  bool emit_realigned_reads = 3;
}
// Top-level options bundling the per-phase configs, the diagnostics options
// and the read-handling flags below.
message RealignerOptions {
  // Settings for the "window selector (ws)" phase.
  WindowSelectorOptions ws_config = 1;

  // Settings for the "de-Bruijn graph (dbg)" phase.
  DeBruijnGraphOptions dbg_config = 2;

  // Settings for the "alignment (aln)" phase.
  AlignerOptions aln_config = 3;

  // Options for diagnostic outputs.
  Diagnostics diagnostics = 4;

  // Split reads with large SKIP regions (i.e. RNA-seq).
  bool split_skip_reads = 5;

  // Must match the value in AlleleCounterOptions; both are derived from the
  // --normalize_reads flag. Realigner might act differently based on whether
  // normalize_reads is set.
  bool normalize_reads = 6;
}