// Copyright 2017 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";
package learning.genomics.deepvariant;
import "third_party/nucleus/protos/range.proto";
// A list of candidate haplotype sequences for a single genomic region.
message CandidateHaplotypes {
  // Genomic region containing the candidate haplotypes.
  nucleus.genomics.v1.Range span = 1;

  // Candidate haplotypes; each entry is the nucleotide sequence of one
  // haplotype.
  repeated string haplotypes = 2;
}
// Configuration of the model used to select candidate locations in the
// "window selector (ws)" phase.
message WindowSelectorModel {
  // Supported model types:
  //   - VARIANT_READS: based on the number of SNPs, INDELs and SOFT_CLIPs at
  //     a location.
  //   - ALLELE_COUNT_LINEAR: linear model based on the AlleleCount at each
  //     location.
  enum ModelType {
    UNDEFINED = 0;
    VARIANT_READS = 1;
    ALLELE_COUNT_LINEAR = 2;
  }

  // Threshold model requiring #reads > min_num_supporting_reads and
  // #reads < max_num_supporting_reads.
  message VariantReadsThresholdModel {
    // Minimum number of supporting reads needed to call a reference position
    // for local assembly.
    int32 min_num_supporting_reads = 1;

    // Maximum number of supporting reads allowed to call a reference position
    // for local assembly.
    int32 max_num_supporting_reads = 2;
  }

  // Linear model based on the type of reads at each locus.
  message AlleleCountLinearModel {
    // Bias term followed by the model coefficients.
    // NOTE(review): presumably each coeff_* weights the count of the read type
    // named in the field -- confirm against the model implementation.
    float bias = 1;
    float coeff_soft_clip = 2;
    float coeff_substitution = 3;
    float coeff_insertion = 4;
    float coeff_deletion = 5;
    float coeff_reference = 6;

    // Threshold for realignment; the higher it is, the lower the recall.
    float decision_boundary = 7;
  }

  // Window selection algorithm to use.
  ModelType model_type = 1;

  // Configuration associated with the selected algorithm.
  oneof model {
    VariantReadsThresholdModel variant_reads_model = 2;
    AlleleCountLinearModel allele_count_linear_model = 3;
  }
}
// Config parameters for "window selector (ws)" phase.
// Next ID: 10.
message WindowSelectorOptions {
  // Minimum number of supporting reads to call a reference position for
  // local assembly.
  // DEPRECATED: Use VariantReadsThresholdModel.min_num_supporting_reads
  // instead.
  int32 min_num_supporting_reads = 1 [deprecated = true];

  // Maximum number of supporting reads to call a reference position for local
  // assembly.
  // DEPRECATED: Use VariantReadsThresholdModel.max_num_supporting_reads
  // instead.
  int32 max_num_supporting_reads = 2 [deprecated = true];

  // Minimum read alignment quality to consider in calling a reference
  // position for local assembly.
  int32 min_mapq = 3;

  // Minimum base quality to consider in calling a reference position for
  // local assembly.
  int32 min_base_quality = 4;

  // Minimum distance between candidate windows for local assembly.
  int32 min_windows_distance = 5;

  // Maximum window size to consider for local assembly. Large noisy regions
  // are skipped for realignment.
  int32 max_window_size = 6;

  // How far (in bp) to expand the region over which candidate positions are
  // computed. This is needed because we want variants near, but not within,
  // our actual window region to contribute evidence towards our window sites.
  // Larger values allow larger events (e.g., a 50 bp deletion) 49 bp away
  // from the region to contribute. However, larger values also mean greater
  // computation overhead, as we are processing extra positions that aren't
  // themselves directly used.
  int32 region_expansion_in_bp = 7;

  // Config for the '_candidates_from_reads' phase.
  WindowSelectorModel window_selector_model = 8;

  // If true, the behavior in this commit is reverted:
  // https://github.com/google/deepvariant/commit/fbde0674639a28cb9e8004c7a01bbe25240c7d46
  bool keep_legacy_behavior = 9;
}
// Settings for the "de-Bruijn graph (dbg)" phase.
message DeBruijnGraphOptions {
  // Initial k-mer size used to build the graph.
  int32 min_k = 1;

  // Maximum k-mer size. Larger k-mer sizes are used to resolve graph cycles.
  int32 max_k = 2;

  // Increment applied to k when trying to resolve graph cycles.
  int32 step_k = 3;

  // Minimum read alignment quality to consider when building the graph.
  int32 min_mapq = 4;

  // Minimum base quality in a k-mer sequence to consider when building the
  // graph.
  int32 min_base_quality = 5;

  // Minimum number of supporting reads required to keep an edge.
  int32 min_edge_weight = 6;

  // Maximum number of paths within a graph to consider for realignment.
  // Set max_num_paths to 0 to allow an unlimited number of paths.
  int32 max_num_paths = 7;
}
// Config parameters for "alignment (aln)" phase.
message AlignerOptions {
  // Field number 7 is not assigned to any field in this message. It is
  // reserved so that it cannot be picked up by a new field.
  // NOTE(review): presumably it belonged to a removed field -- confirm against
  // the file history.
  reserved 7;

  // Match score (expected to be a non-negative score).
  int32 match = 1;

  // Mismatch score (expected to be a non-positive score).
  int32 mismatch = 2;

  // Gap open score (expected to be a non-positive score).
  // Score for a gap of length g is (gap_open + (g - 1) * gap_extend).
  int32 gap_open = 3;

  // Gap extend score (expected to be a non-positive score).
  // Score for a gap of length g is (gap_open + (g - 1) * gap_extend).
  int32 gap_extend = 4;

  // k-mer size used to index target sequence.
  // TODO: This parameter is not used in fast_pass_aligner. Since we no
  // longer use the python realigner, this parameter is obsolete.
  int32 k = 5;

  // Estimated sequencing error rate.
  // TODO: This parameter is not used in fast_pass_aligner. We need to
  // remove it.
  float error_rate = 6;

  // Average read size. This parameter is used to calculate
  // ssw_alignment_score_threshold_, the threshold used to filter out reads
  // aligned with the SSW library. Not all the reads may be the same size, so
  // this parameter needs to be set to a value close enough to the average
  // read size.
  int32 read_size = 8;

  // K-mer size in the read index used in Fast Pass Aligner.
  int32 kmer_size = 9;

  // Maximum number of mismatches allowed for quick read-to-haplotype
  // alignment.
  int32 max_num_of_mismatches = 10;

  // Similarity threshold used to filter out bad read alignments made with
  // Smith-Waterman alignment. An alignment is discarded if the read is
  // aligned to a haplotype with too many mismatches.
  double realignment_similarity_threshold = 11;

  // Force realignment so the original alignment is never returned, defaulting
  // instead to computing a new SSW alignment against the reference. This is
  // used for alt-aligned pileups where reads are aligned to a new "reference",
  // making the original read alignments invalid.
  bool force_alignment = 12;
}
// Options controlling runtime diagnostic outputs.
message Diagnostics {
  // Whether runtime diagnostic outputs are enabled.
  bool enabled = 1;

  // Root location under which diagnostic outputs are written.
  string output_root = 2;

  // True if the realigned reads themselves should also be emitted.
  bool emit_realigned_reads = 3;
}
// Top-level realigner options: the per-phase configs plus diagnostics and
// read-handling flags.
message RealignerOptions {
  // Settings for the "window selector (ws)" phase.
  WindowSelectorOptions ws_config = 1;

  // Settings for the "de-Bruijn graph (dbg)" phase.
  DeBruijnGraphOptions dbg_config = 2;

  // Settings for the "alignment (aln)" phase.
  AlignerOptions aln_config = 3;

  // Diagnostics options.
  Diagnostics diagnostics = 4;

  // Split reads with large SKIP regions (e.g. RNA-seq).
  bool split_skip_reads = 5;

  // Must match the value in AlleleCounterOptions; both come from the
  // --normalize_reads flag. The realigner might act differently depending on
  // whether normalize_reads is set.
  bool normalize_reads = 6;
}