// Copyright 2020 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";
package learning.genomics.deepvariant;
import "deepvariant/protos/realigner.proto";
import "deepvariant/protos/resources.proto";
import "third_party/nucleus/protos/position.proto";
import "third_party/nucleus/protos/reads.proto";
import "third_party/nucleus/protos/variants.proto";
// The type of an Allele.
//
// An allele type indicates what kind of event would have produced
// this allele. An allele can be the reference sequence, a substitution
// of bases, insertion of bases, or deletion of bases. Allele types need not be
// real genetic variants: for example, the SOFT_CLIP type indicates that a read
// contained bases SOFT_CLIPPED away (similar to an insertion), which is often
// indicative of some large event near the start or end of the read.
enum AlleleType {
// Default should be unspecified (this is the zero value, which proto3 uses
// when the field is not set):
// https://docs.google.com/document/d/1oavZD9XB_147ti93MCBoR5HrFKoBh1xkZcTxjInYf0M/edit#heading=h.8ylxmf942vui
UNSPECIFIED = 0;
// The allele corresponding to that found in the genome sequence.
REFERENCE = 1;
// A substitution of bases that are different from the genome sequence.
SUBSTITUTION = 2;
// An insertion of bases w.r.t. the reference genome.
INSERTION = 3;
// A deletion of bases w.r.t. the reference genome.
DELETION = 4;
// An allele type produced by a SOFT_CLIP operation during alignment.
// May be indicative of a real genetic event occurring at this position,
// or may be a data quality / alignment artifact.
SOFT_CLIP = 5;
}
// An Allele observed in some type of NGS read data.
//
// Conceptually, an Allele is a sequence of bases that represent a type
// of change relative to a reference genome sequence, along with a discrete
// count of the number of times that allele was observed in the NGS data.
message Allele {
// The string of bases that make up this Allele. Should not be empty.
// A simple reference allele might have a single base "A", while a complex
// insertion of the bases "CTG" following that base "A" would have a
// bases sequence of "ACTG".
string bases = 1;
// The type of this allele, i.e. the kind of event (reference, substitution,
// insertion, deletion, soft clip) that would have produced it.
AlleleType type = 2;
// The number of times this Allele was seen in the NGS data. The count
// should be >= 0, where 0 indicates that the allele was never observed
// (which can happen if you want to record that you checked for some allele
// in the data and never saw any evidence for it).
int32 count = 3;
// Set to true if allele contains low quality bases.
bool is_low_quality = 4;
}
// An AlleleCount summarizes the NGS data observed at a position in the genome.
//
// An AlleleCount proto is a key intermediate data structure in DeepVariant
// summarizing the NGS read data covering a site in the genome. It is
// intended to be relatively simple but keep track of the key pieces of
// information about the observed reads and their associated alleles at this
// position so that downstream tools can reconstruct the read coverage, do
// variant calling, and compute reference confidence. It is conceptually similar
// to a samtools read pileup (http://samtools.sourceforge.net/pileup.shtml) but
// without detailed information about bases or their qualities.
//
// The AlleleCount at its core tracks the Alleles observed in reads that overlap
// this position in the genome. Consider a read that has a base, X, aligned to
// the position of interest. If X is a non-reference allele, the AlleleCount
// proto adds a new read name ==> Allele key-value entry to the read_alleles
// map field. If X is the reference allele, the AlleleCount proto increments
// either the ref_supporting_read_count or the ref_nonconfident_read_count
// counter field, depending on read alignment confidence as defined by
// pipeline-specific parameters.
//
// The complexity here is introduced by following the VCF convention of
// representing indel and complex substitution alleles as occurring at the
// preceding base in the genome. So if in fact our base X is followed by a 3 bp
// insertion of acg, then we would in fact not have a count for X at all but
// would see an allele Xacg with a count of 1 (or more if other reads have the
// same allele). The primary contract here is that each aligned base at this
// site goes into a +1 for exactly one allele. A concrete example might
// clarify this logic. Consider the following alignment of two reads to the
// reference genome:
//
// Position: 123 4567
// Ref: ATT---TGCT
// Read1: ATT---TGCT
// Read2: ATTCCCTG-T
//
// The 'T' base in position 3 of Read 1 matches the reference and so the
// AlleleCount's ref_supporting_read_count is incremented. The 'T' base in
// position 3 of Read 2 also matches the reference, but it is the anchor base
// preceding the 'CCC' insertion. The 'TCCC' INSERTION Allele is therefore added
// to the AlleleCount proto for this position. A deletion occurs in read 2 at
// position 6 as well, which produces a DELETION allele 'GC' at position 5.
// Additionally, because only read 1 has a C base at position 6, the AlleleCount
// at 6 would have no entries in its read_alleles and ref_supporting_read_count
// would be 1.
//
// Another design choice is that spanning deletions don't count as coverage
// under bases, so there's an actual drop in coverage under regions of the
// genome with spanning deletions. This is classically the difference between
// physical coverage and sequence coverage:
//
// https://en.wikipedia.org/wiki/Shotgun_sequencing#Coverage
//
// so it's safe to think of an AlleleCount as representing the sequence coverage
// of a position, not its physical coverage.
//
// What this means is that it's very straightforward to inspect the reference
// counters and read_alleles in an AlleleCount and determine the corresponding
// alleles for a Variant as well as compute the depth of coverage. This enables
// us to write algorithms to call SNPs, indels, CNVs as well as identify regions
// for assembly using a series of AlleleCount objects rather than the underlying
// read data.
//
// An AlleleCount is a lossy transformation of the raw read data. Fundamentally,
// the digestion of a read into its corresponding AlleleCount components loses
// some of the contiguity information provided by reads spanning across multiple
// positions on the genome. None of the specifics of base quality, mapping
// quality, read names, etc. are preserved. Furthermore, an AlleleCount can be
// constructed using only a subset of all of the raw reads (e.g., those that
// pass minimum quality criteria) and even only parts of each read (e.g., if
// the read contains Ns or low quality bases). The data used to compute an
// AlleleCount isn't specified as part of the proto, but is left up to the
// implementation details and runtime parameters of the generating program.
//
// For usability and performance reasons we track reference and alternate allele
// supporting reads in slightly different ways. The number of reads that
// confidently carry the reference allele at this position is stored in
// ref_supporting_read_count. Confidence here means that the read's alignment to
// the reference is reliable. See the Base Alignment Quality (BAQ) paper:
//
// http://bioinformatics.oxfordjournals.org/content/early/2011/02/13/bioinformatics.btr076.full.pdf
//
// For background and motivation. For reads that would have been counted as
// reference supporting but don't have a reliable alignment we instead tally
// those in ref_nonconfident_read_count. Finally, reads that don't have the
// reference allele are stored in a map from the string
// "fragment_name/read_number" to the allele it supports, which by construction
// will always have a count of 1. This allows for more detailed downstream
// analyses of the alt allele containing reads.
//
// Consequently, the total (usable) coverage at this location is:
//
// coverage =
// // reads supporting an observed alternate allele.
// sum(read_allele_i.count) // [also equal to read_alleles_size()]
// // All of the reads confidently asserting reference.
// + ref_supporting_read_count
// // All of the reads supporting ref without a confident alignment.
// + ref_nonconfident_read_count
//
message AlleleCount {
// The position on the genome of this AlleleCount.
nucleus.genomics.v1.Position position = 1;
// The reference bases of this AlleleCount. Since AlleleCount currently
// only represents a single location, this field should always be a
// single base.
string ref_base = 2;
// The number of reads that confidently carry the reference allele at this
// position.
int32 ref_supporting_read_count = 3;
// A map from a read's key to an Allele message containing information about
// the allele supported by that read at this position. There will be one
// binding for each usable read spanning this position that supports a
// non-reference allele. The read's key is a unique string that identifies
// the read, currently "fragment_name/read_number".
map<string, Allele> read_alleles = 4;
// A count of the number of reads that supported the reference allele but
// whose alignment to the reference genome isn't 100% certain.
int32 ref_nonconfident_read_count = 5;
// Wrapper around a list of alleles so that the list can be used as the value
// type of the sample_alleles map (a map value cannot itself be repeated).
message Alleles {
repeated Allele alleles = 1;
}
// A map where key is a sample name and value is a list of alt alleles that
// are supported by reads from this sample.
map<string, Alleles> sample_alleles = 6;
// If true, reads supporting ref are tracked (read ids are saved).
bool track_ref_reads = 7;
}
// A lighter-weight version of AlleleCount.
//
// The only material difference with this proto is that we don't store the map
// from read names to Alleles, but instead have the total number of reads we've
// seen at this position.
message AlleleCountSummary {
// The Position field of AlleleCount with values inlined here: the name of the
// reference sequence and the position on it.
string reference_name = 1;
int64 position = 2;
// Same as in AlleleCount.
string ref_base = 3;
// Same as in AlleleCount.
int32 ref_supporting_read_count = 4;
// This is the total number of reads observed at this position. It takes the
// place of AlleleCount's map from read names to Alleles.
int32 total_read_count = 5;
// Same as in AlleleCount.
int32 ref_nonconfident_read_count = 6;
}
// A message encapsulating all of the information about a Variant call site for
// consumption by further stages of the DeepVariant data processing workflow.
message DeepVariantCall {
// A Variant call based on the information in allele_count. Will always be a
// non-reference variant call (no gVCF or reference records).
nucleus.genomics.v1.Variant variant = 1;
// A list of read keys; used as the value type of the allele_support map.
message SupportingReads {
repeated string read_names = 1;
}
// A map from an alt allele in Variant to the Read keys that support that
// allele. Every alt allele in the variant will have an entry. Reference
// supporting reads aren't listed. There may be a special key
// "UNCALLED_ALLELE" for reads that don't support either the reference allele
// or any alt allele in the variant. This can happen when the read supports an
// allele that didn't pass our calling thresholds. The read's key is a unique
// string that identifies the read constructed as
// "fragment_name/read_number".
map<string, SupportingReads> allele_support = 2;
// NOTE(review): undocumented in the original; presumably maps an allele to
// its observed frequency -- confirm.
map<string, float> allele_frequency = 3;
// List of Read keys supporting the ref allele.
repeated string ref_support = 4;
// Information about a single supporting read: its key and whether it is
// flagged as low quality.
message ReadSupport {
string read_name = 1;
bool is_low_quality = 2;
}
// A list of ReadSupport entries, used as the value type of
// allele_support_ext. This structure is meant to replace SupportingReads, but
// the old one is kept for backward compatibility.
message SupportingReadsExt {
repeated ReadSupport read_infos = 1;
}
// A map from alt allele in Variant to its SupportingReadsExt. This is to
// replace allele_support.
map<string, SupportingReadsExt> allele_support_ext = 5;
// NOTE(review): undocumented in the original; presumably the ReadSupport-based
// counterpart of ref_support -- confirm.
SupportingReadsExt ref_support_ext = 6;
}
// Options to control how our AlleleCounter code works.
message AlleleCounterOptions {
// The number of basepairs to include in each partition of the reference
// genome. This determines how many map/reduce jobs are used to compute the
// AlleleCounts. Using a too small value (below 10000 for example) results
// in having many many intervals to process which may be a performance problem
// for the tool. Using too large of a value will result in difficulty
// parallelizing the computation as there will be too few work units to
// parallelize and each unit will use a lot of memory.
int32 partition_size = 1;
// The requirements for reads to be used when counting alleles.
nucleus.genomics.v1.ReadRequirements read_requirements = 2;
// Determines how the allele counter keeps track of ref reads. If True then
// allele_counter stores read IDs of ref reads, otherwise just a counter is
// used for ref reads. Default value is False.
bool track_ref_reads = 3;
// Option to left align INDELs for each read.
bool normalize_reads = 4;
// If True, the behavior in this commit is reverted:
// https://github.com/google/deepvariant/commit/fbde0674639a28cb9e8004c7a01bbe25240c7d46
bool keep_legacy_behavior = 5;
}
// Variant call for a single site, in a pseudo-biallelic manner. This is an
// intermediate format for call_variants.py that needs to be merged if there
// are multiallelics.
// The `variant` here likely doesn't have fully filled information for output to
// a VCF file yet.
message CallVariantsOutput {
// The variant this output refers to. It likely doesn't have fully filled
// information for output to a VCF file yet.
nucleus.genomics.v1.Variant variant = 1;
// The alt allele indices are represented as a sub-message so that it's easier
// to re-use as a standalone proto for encoding+decoding.
message AltAlleleIndices {
repeated int32 indices = 1;
}
AltAlleleIndices alt_allele_indices = 2;
// NOTE(review): undocumented in the original; presumably one probability per
// possible genotype of this pseudo-biallelic call -- confirm ordering and
// length against call_variants.py.
repeated double genotype_probabilities = 3;
// Debugging information attached to this output (see debug_info below).
// The "Next ID" below refers to the fields of DebugInfo.
// Next ID: 11
message DebugInfo {
// NOTE(review): fields 1-7 are undocumented in the original; their exact
// semantics (label encoding, which model tensors are stored) should be
// confirmed against the code that fills them in.
int32 predicted_label = 1;
bool has_insertion = 2;
bool has_deletion = 3;
bool is_snp = 4;
int32 true_label = 5;
repeated double logits = 6;
repeated double prelogits = 7;
// The encoded image used for inference.
bytes image_encoded = 8;
// Key-value pairs of layer names of call variant models and the encoded
// layers' outputs.
map<string, bytes> layer_output_encoded = 9;
message PileupCuration {
// The following fields hold integer values of enums that are defined in
// nucleus/util/vis.py.
int32 diff_category = 1;
int32 base_quality = 2;
int32 mapping_quality = 3;
int32 strand_bias = 4;
int32 read_support = 5;
}
PileupCuration pileup_curation = 10;
}
DebugInfo debug_info = 4;
}
// Options to control how our candidate VariantCaller works.
// Next ID: 18
message VariantCallerOptions {
// Alleles occurring at least this many times in our AlleleCount are
// considered candidate variants. There is one threshold for SNPs and one for
// indels.
int32 min_count_snps = 1;
int32 min_count_indels = 2;
// Alleles that have counts at least this fraction of all the counts in an
// AlleleCount are considered candidate variants. There is one threshold for
// SNPs and one for indels.
float min_fraction_snps = 3;
float min_fraction_indels = 4;
// In candidate generation, this multiplier is applied to the minimum allele
// fraction thresholds (vsc_min_fraction_snps and vsc_min_fraction_indels)
// to adapt thresholds for multi-sample calling.
// NOTE(review): the vsc_* names presumably refer to the settings that
// populate min_fraction_snps / min_fraction_indels above -- confirm.
float min_fraction_multiplier = 12;
// In candidate generation, this threshold is used to exclude a variant when
// the allele frequency is above this threshold from a non-target sample.
// This is designed for the somatic case - where we want to avoid generating
// a candidate if the AF is high in any of the non-target samples.
float max_fraction_snps_for_non_target_sample = 16;
float max_fraction_indels_for_non_target_sample = 17;
// If provided, we will emit "candidate" variant records at a random fraction
// of otherwise non-candidate sites. Useful for training.
float fraction_reference_sites_to_emit = 5;
// The random seed to use in our variant caller. If not provided, a truly
// random seed will be used.
uint32 random_seed = 6;
// The name of the sample we will put in our VariantCall field of constructed
// variants.
string sample_name = 7;
// The probability that a non-reference allele is actually an error.
float p_error = 8;
// The maximum genotype quality we'll emit for a reference site.
int32 max_gq = 9;
// The width of a GQ bin used to quantize the raw double GQ
// values into coarser-grained bins than just 1 integer unit. See QuantizeGQ
// for more information about the quantization process.
int32 gq_resolution = 10;
// The ploidy of this sample. For humans, this is 2 (diploid). Currently the
// code makes implicit assumptions that the ploidy is 2, but this value is
// used in calculations directly involving ploidy so when we generalize the
// caller to handle other ploidy values we don't have to update all of those
// constants.
int32 ploidy = 11;
// Skip uncalled genotypes. This is used during training so that
// uncalled ./. genotypes are not used to generate and label examples.
bool skip_uncalled_genotypes = 13;
// NOTE(review): undocumented in the original; presumably mirrors
// AlleleCounterOptions.track_ref_reads -- confirm.
bool track_ref_reads = 14;
// NOTE(review): undocumented in the original; presumably mirrors
// MakeExamplesOptions.phase_reads_region_padding_pct -- confirm.
int32 phase_reads_region_padding_pct = 15;
}
// Options to control how we label variant calls.
message VariantLabelerOptions {
// Currently there are no options for VariantLabeler. The message is still
// referenced by MakeExamplesOptions.labeler_options.
}
// Options to control how we construct pileup images.
// Next ID: 41.
message PileupImageOptions {
// NOTE(review): field numbers 28 and 29 are not used by any field in this
// message. If they belonged to fields that were removed, consider adding a
// `reserved` statement for them -- confirm against the file history.

// The height, in pixels, of the pileup image we'll construct.
int32 height = 1;
// The width, in pixels, of the pileup image we'll construct.
int32 width = 2;
// We include at the top of each image a band of reference pixels with
// this specified height.
int32 reference_band_height = 3;
// Controls how bases are encoded as red pixel values.
//
// A is base_color_offset_a_and_g + base_color_stride * 3
// G is base_color_offset_a_and_g + base_color_stride * 2
// T is base_color_offset_t_and_c + base_color_stride * 1
// C is base_color_offset_t_and_c + base_color_stride * 0
//
// The offset in red color space for A and G bases.
int32 base_color_offset_a_and_g = 4;
// The offset in red color space for T and C bases.
int32 base_color_offset_t_and_c = 5;
// Each base color is offset from each other by this stride.
int32 base_color_stride = 6;
// The alpha value applied to pixels in the reference genome band.
float reference_alpha = 7;
// The base quality we assume for the reference genome bases.
int32 reference_base_quality = 8;
// The alpha to apply to reads that support our alt alleles.
float allele_supporting_read_alpha = 9;
// The alpha to apply to reads that support the other alt allele.
float other_allele_supporting_read_alpha = 32;
// The alpha to apply to reads that do not support our alt alleles.
float allele_unsupporting_read_alpha = 10;
// The alpha to apply to a base that matches the reference sequence.
float reference_matching_read_alpha = 11;
// The alpha to apply to a base that doesn't match the reference sequence.
float reference_mismatching_read_alpha = 12;
// The character we'll use when encoding insertion/deletion anchor bases.
string indel_anchoring_base_char = 13;
// The color value to use for reads on the positive strand.
int32 positive_strand_color = 14;
// The color value to use for reads on the negative strand.
int32 negative_strand_color = 15;
// The maximum base quality we'll allow in PIC (pileup image creation). Base
// qualities above this value are treated as being base_quality_cap.
int32 base_quality_cap = 16;
// Extend read windows by a small amount when calculating overlap with calls.
// This is important to include all the reads involved in deletions in a
// pileup image.
int32 read_overlap_buffer_bp = 17;
// The requirements for reads to be used when creating pileup images.
nucleus.genomics.v1.ReadRequirements read_requirements = 18;
// NOTE(review): undocumented in the original; judging by the value names this
// presumably controls whether additional images are created for
// heterozygous-alt (multi-allelic) sites -- confirm.
enum MultiAllelicMode {
UNSPECIFIED = 0;
ADD_HET_ALT_IMAGES = 1;
NO_HET_ALT_IMAGES = 2;
}
MultiAllelicMode multi_allelic_mode = 19;
// The maximum mapping quality we'll allow in PIC. Mapping qualities above
// this value are treated as being mapping_quality_cap.
int32 mapping_quality_cap = 20;
// The random seed to use in our Pileup Image Creation.
uint32 random_seed = 21;
// The number of data channels in our pileup images.
int32 num_channels = 22;
// (Experimental feature that was removed.)
// The character we'll use when encoding insertion anchor bases.
string unused_insert_base_char = 23;
// (Experimental feature that was removed.)
// The character we'll use when encoding deletion anchor bases.
string unused_delete_base_char = 24;
// (Experimental feature that was removed.)
// Include custom pileup image feature.
bool unused_custom_pileup_image = 25;
// (Experimental feature that was removed.)
// Include sequencing type image feature.
bool unused_sequencing_type_image = 26;
// Sequencing type of input bam file.
enum SequencingType {
UNSPECIFIED_SEQ_TYPE = 0;
WGS = 1;
WES = 2;
TRIO = 3;
}
SequencingType sequencing_type = 27;
// Whether and how to include alt-aligned pileup images (experimental).
string alt_aligned_pileup = 30;
// If set, reads are sorted by haplotype tag (HP tag) and then by alignment
// position.
bool sort_by_haplotypes = 31;
// NOTE(review): undocumented in the original; presumably reverses the
// haplotype ordering used by sort_by_haplotypes -- confirm.
bool reverse_haplotypes = 40;
// Minimal non-zero allele frequency. This is used when normalizing color
// intensities for the allele frequency channel.
float min_non_zero_allele_frequency = 33;
// Whether to consider allele frequencies.
bool use_allele_frequency = 34;
// Which variant types to use alt-align on.
string types_to_alt_align = 36;
// If true, add an additional channel where the color information per-read
// indicates the HP value.
bool add_hp_channel = 37;
// For assembly polishing, specifies the HP tag we're calling for.
int32 hp_tag_for_assembly_polishing = 38;
// The set of channels to collect.
repeated string channels = 39;
// Deprecated fields.
int32 sort_by_haplotypes_sample_hp_tag = 35 [deprecated = true];
}
// Options that may differ by sample.
// Next ID: 11.
message SampleOptions {
// A string to identify the role of this sample in the analysis, e.g. in
// trios 'child', 'parent1', or 'parent2'. Importantly, `role` strings should
// not be checked inside make_examples_core.py. For example, instead of
// checking whether sample.role == "child" to set pileup height, add the
// pileup height to the sample, adding new properties to this proto if
// needed. This keeps make_examples_core.py functioning for multiple samples
// without it having to reason about sample roles that belong to each
// application. This role is used to keep track of sample identities
// throughout the analysis, and for debugging.
string role = 6;
// Sample name, e.g. HG002. Often given by a --sample_name flag or inferred
// from input files.
string name = 7;
// Paths to files with read alignments, e.g. BAM or CRAM files.
repeated string reads_filenames = 1;
// Should we downsample our reads and if so, by how much? If == 0.0 (default),
// no downsampling occurs. But if set, must be between 0.0 and 1.0 and
// indicates the probability that a read will be kept (randomly) when read
// from the input. This option makes it easy to simulate lower coverage data.
float downsample_fraction = 2;
// Options for finding candidate variants.
VariantCallerOptions variant_caller_options = 3;
// Height of the pileup image for this sample.
int32 pileup_height = 4;
// A list of integers indicating the order in which samples should be shown
// in the pileup image when calling on this sample. The indices refer to the
// list of samples in the regionprocessor.
repeated int32 order = 5;
// Path to the variants for vcf_candidate_importer.
string proposed_variants_filename = 8;
// Path to binary file containing candidate positions.
string candidate_positions = 9;
// If true, skip any output generation for this sample.
bool skip_output_generation = 10;
}
// High-level options that encapsulates all of the parameters needed to run
// DeepVariant end-to-end.
// Next ID: 60.
message MakeExamplesOptions {
// A list of contig names we never want to call variants on. For example,
// chrM in humans is the mitocondrial genome and the caller isn't trained to
// call variants on that genome.
repeated string exclude_contigs = 1;
// List of regions where we want to call variants. If missing, we will call
// variants throughout the entire genome.
repeated string calling_regions = 2;
// Fixed random seed to use for DeepVariant itself.
uint32 random_seed = 3;
// The number of cores to use when running DeepVariant. Must be >= 1.
int32 n_cores = 4;
// Options to control how we run the AlleleCounter.
AlleleCounterOptions allele_counter_options = 5;
// Deprecated. Use sample_options instead.
VariantCallerOptions deprecated_variant_caller_options = 6;
// Options to control how we generate pileup images.
PileupImageOptions pic_options = 7;
// Options to control how we label our examples.
VariantLabelerOptions labeler_options = 8;
// Only reads satisfying these requirements will be used in DeepVariant.
// This parameters are propagated as appropriate to read_requirement fields
// in our tool-specific options.
nucleus.genomics.v1.ReadRequirements read_requirements = 9;
// Options to control out input data sources and output data sinks.
// Path to our genome reference.
string reference_filename = 10;
// Deprecated. Use sample_options instead.
repeated string deprecated_reads_filenames = 32;
// Deprecated.
string deprecated_reads_filename = 11;
// Path where we'll write out our candidate variants.
string candidates_filename = 12;
// Path to examples.
string examples_filename = 13;
// Path to a list of regions we are confident in, for determining which
// candidate variants get labels.
string confident_regions_filename = 14;
// Path to the truth variants, for use in labeling our examples.
string truth_variants_filename = 15;
// Path to the variants for vcf_candidate_importer.
string deprecated_proposed_variants_filename = 33 [deprecated = true];
// Path where we should put our gVCF records.
string gvcf_filename = 16;
// Whether to generate MED_DP in gVCF records or not.
bool include_med_dp = 43;
// The name of the deep learning model to use with DeepVariant.
string model_name = 17;
enum Mode {
UNSPECIFIED = 0;
CALLING = 1;
TRAINING = 2;
CANDIDATE_SWEEP = 3;
}
Mode mode = 18;
// The minimum fraction of basepairs that must be shared by all contigs common
// to DeepVariant inputs and the reference contigs alone. If the common
// contigs cover less than min_shared_contig_basepairs of the reference genome
// contigs DeepVariant will signal an error that the input datasets aren't
// from compatible genomes.
float min_shared_contigs_basepairs = 19;
// The task identifier, as an integer, of this task. If we are running with
// multiple tasks processing the same inputs into sharded outputs, this id
// should be set to a number from 0 (master) to N - 1 to indicate which of the
// tasks we are currently processing.
int32 task_id = 20;
// When running in sharded output mode (i.e., writing outputs to foo@N), this
// field captures the number of sharded outputs (i.e., N). When not running in
// sharded output mode, this field should be 0.
int32 num_shards = 21;
// Options to control realigner module.
// Whether the realigner should be enabled.
bool realigner_enabled = 22;
// If True, realign reads from all samples together. If False, realign
// per sample.
bool joint_realignment = 54;
// Settings for the realigner module.
learning.genomics.deepvariant.RealignerOptions realigner_options = 23;
// The maximum number of reads per partition that we consider before following
// processing such as sampling and realigner.
int32 max_reads_per_partition = 24;
// Similar to `max_reads_per_partition`, we want to we add another field
// to constrain the number of reads to downsample. Even with
// `max_reads_per_partition`, the memory usage can sometimes still be too
// large, especially when the reads are very long.
// When this field is set, we'll multiple it by the region we're in. And then
// we will only sample the reads up to a point where the number of bases in
// the region are larger than
// (max_reads_for_dynamic_bases_per_region * region length).
int32 max_reads_for_dynamic_bases_per_region = 57;
// Deprecated. Use sample_options instead.
float deprecated_downsample_fraction = 25;
// List of regions where we DON'T want to call variants. If missing, no
// regions will be excluded from calling.
repeated string exclude_calling_regions = 26;
// An enumeration of all of the labeler algorithms we support in DeepVariant.
enum LabelerAlgorithm {
UNSPECIFIED_LABELER_ALGORITHM = 0;
// The labeling algorithm used with DeepVariant 0.4-0.5, which does position
// matching to find truth variant to label our candidates.
POSITIONAL_LABELER = 1;
// A haplotype-aware labeling algorithm, similar to hap.py xcmp, that looks
// for genotypes for candidate variants that produce haplotypes that match
// those implied by the genotypes of our truth variants. Produces more
// accurate labels than the POSITIONAL_LABELER labeling algorithm.
HAPLOTYPE_LABELER = 2;
// The labeling algorithm which labels the variants into customized classes
// specified in the specified INFO field in the VCF file.
CUSTOMIZED_CLASSES_LABELER = 3;
}
// The labeling algorithm we are using in this DeepVariant run. Only needed
// when in CALLING mode.
LabelerAlgorithm labeler_algorithm = 27;
string run_info_filename = 28;
// By default aligned_quality field is read from QUAL in SAM. If flag is set,
// aligned_quality field is read from OQ tag in SAM.
bool use_original_quality_scores = 29;
// A list of variant types that we want to restrict our examples to. E.g.,
// select_variant_types = ['snps'] would indicate that we only want to
// generate SNP candidate variants.
repeated string select_variant_types = 30;
enum VariantCaller {
UNSPECIFIED_CALLER = 0;
// The default very sensitive caller.
VERY_SENSITIVE_CALLER = 1;
// An advanced caller that uses an input VCF to call variants.
VCF_CANDIDATE_IMPORTER = 2;
}
VariantCaller variant_caller = 31;
// If flag is set, consider allele frequency.
bool use_allele_frequency = 34;
// A list of VCF or VCF.gz files that specify allele frequency information.
repeated string population_vcf_filenames = 35;
// Path to output optional runtime profiling by region.
string runtime_by_region = 36;
// Use --ref argument as the reference file for the CRAM.
bool use_ref_for_cram = 37;
// Parse aux fields from BAM -- needed for some features like using HP tags.
bool parse_sam_aux_fields = 38;
// This field is used to pass into SamReaderOptions.
// By default, this field is empty. If empty, we keep all aux fields if they
// are parsed. If set, we only keep the aux fields with the names in this
// list.
repeated string aux_fields_to_keep = 50;
// Size of blocks to read from BAM.
int32 hts_block_size = 39;
// How often to show log messages.
int32 logging_every_n_candidates = 40;
string customized_classes_labeler_classes_list = 41;
string customized_classes_labeler_info_field_name = 42;
// The index of the sample to focus on within the list of samples.
int32 main_sample_index = 44;
// Name used for the BAM file in output filenames (as a prefix), e.g. when
// outputting per-sample statistics for training examples.
string bam_fname = 45;
// Samples, e.g. a list of 3 for DeepTrio, list of 1 for DeepVariant.
repeated SampleOptions sample_options = 46;
// Sample role to focus on for training. This can be different from the sample
// indicated by the main_sample_index.
string sample_role_to_train = 47;
// DirectPhasing related options.
bool phase_reads = 51;
int32 phase_reads_region_padding_pct = 52;
int32 phase_max_candidates = 53;
string read_phases_output = 55;
bool discard_non_dna_regions = 56;
bool output_sitelist = 58;
// Related to de novo variants labeling
string denovo_regions_filename = 59;
}
// Configuration describing the information needed for a dataset that can be
// used for training, validation, or testing.
message DeepVariantDatasetConfig {
// A human-readable name of the dataset.
string name = 1;
// Full path of the tensorflow.Example TFRecord file.
string tfrecord_path = 2;
// Number of examples in this dataset. Right now this needs to be filled in
// manually; it is used to compute how the learning rate decays, and is also
// used in make_training_batches.
uint32 num_examples = 3;
}
// Metrics on the labeling of candidate / truth variants when running
// DeepVariant's make_examples in training mode.
// Next ID: 17.
message LabelingMetrics {
// Notes on counting by site or by allele:
//
// Throughout this proto we often measure the same quantity (e.g., false
// positives) by site and by allele. This reflects two different ways of
// counting errors in genomes where the number of chromosomes > 1. We can give
// a concrete example:
//
//   Candidate: chr20:10 with A/C
//   Truth: chr20:10 with A/C alleles and with genotype (0, 1)
//
// Since we have the same variant with the same alleles at the same position
// in both the candidates and the truth, the matching is trivial. For this
// variant, we'd update our counts as follows:
//
//   # We have only a single site, so we +1 to each truth and candidates.
//   n_truth_variant_sites += 1
//   n_candidate_variant_sites += 1
//
//   # Both the candidate and truth variants only have 1 alternative allele,
//   # so we increment the allele counts by 1 as well.
//   n_truth_variant_alleles += 1
//   n_candidate_variant_alleles += 1
//
// A similar logic would apply to counting true positives (+1), false
// negatives (+0), and false positives (+0) for both sites and alleles.
//
// Now let's take a more complex example where candidates and truth differ
// in their alleles:
//
//   Candidate: chr20:20 with A/C/T
//   Truth: chr20:20 with A/C/G alleles and with genotype (1, 2)
//
// Here we have a candidate and truth at the same position but they can only
// be partially matched, since the truth variant includes a G allele (i.e.,
// genotype == 2) that isn't even present in the candidate. And the candidate
// has an extra allele T that isn't real. Matching these variants produces a
// genotype of (0, 1) for the candidate, since we have one copy of the C
// allele (i.e., genotype == 1) but we cannot match the true G allele, so we
// are forced to say one allele is reference (i.e., genotype == 0).
//
// Now let's update our counts:
//
//   # We have only a single site, so we +1 to each truth and candidates, even
//   # though they both have multiple alt alleles, as this is the sites-level
//   # metric.
//   n_truth_variant_sites += 1
//   n_candidate_variant_sites += 1
//
//   # Both variants have 2 alternative alleles, so we add 2 to each of the
//   # allele counts.
//   n_truth_variant_alleles += 2
//   n_candidate_variant_alleles += 2
//
//   # Now for the complex TP/FN/FP counts.
//   # We found a candidate for this true site, even though we didn't get all
//   # of the alleles, so we increment the n_true_positive_sites by 1 and get
//   # +0 for each of the FN and FP sites metrics:
//   n_true_positive_sites += 1
//   n_false_negative_sites += 0
//   n_false_positive_sites += 0
//
//   # However, we only got one of the two true positive alleles, so we get
//   # one TP allele (the C), one FN allele (the missed G), and one FP allele
//   # (the bad T allele in the candidate).
//   n_true_positive_alleles += 1
//   n_false_negative_alleles += 1
//   n_false_positive_alleles += 1
//
// Notes on deriving higher-level metrics from the counts here:
//
// This proto only contains counts, not ratios like PPV, since we want to make
// it easy to sum up the metrics in these protos across sharded make_examples
// runs. That said, it's valuable to note here how to compute common
// ratiometric statistics from the counts in this proto since there are many
// fields to choose from. The sites-level metrics are given here, but the
// allele-level values can be computed similarly.
//
// See https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values
// for a general description of these metrics.
//
// PPV is TP / (TP + FP):
//   n_true_positive_sites / (n_true_positive_sites + n_false_positive_sites)
//
// Sensitivity is TP / (TP + FN):
//   n_true_positive_sites / (n_true_positive_sites + n_false_negative_sites)
//
// Counts of candidate and truth variants by site and by allele.
//
// The sites metrics are essentially the number of records seen of each type,
// after removing non-PASS truth variants, regardless of the number of alleles
// in the records. The allele metrics are like the sites metrics, but instead
// of getting a +1 for each record we get +N for each site where N is the
// number of alternative alleles.
//
// See above for more information on the difference between sites and alleles
// counting.
int32 n_truth_variant_sites = 1;
// Allele-level counterpart of n_truth_variant_sites: +N per truth record,
// where N is its number of alternative alleles.
int32 n_truth_variant_alleles = 2;
// Number of candidate variant records, +1 per record.
int32 n_candidate_variant_sites = 3;
// Allele-level counterpart of n_candidate_variant_sites: +N per candidate
// record, where N is its number of alternative alleles.
int32 n_candidate_variant_alleles = 4;
// NOTE(review): presumably the number of candidate sites that fall outside
// the confident regions -- confirm against the code that fills this in.
int32 n_non_confident_candidate_variant_sites = 5;
// Counts of TP, FN, and FP implied by the labeling of candidates against
// truth.
//
// See above for more information on the difference between sites and alleles
// counting.
//
// Note that we don't track true negatives since true negatives are poorly
// defined in the variant calling problem.
//
// TPs:
//
// The number of true variants assigned a non-ref genotype in the match for
// the sites calculation. For the allele calculation, the number of true
// variant alleles with a gt > 0 in the assignment.
int32 n_true_positive_sites = 6;
// Allele-level TP count; see the TPs description on n_true_positive_sites.
int32 n_true_positive_alleles = 7;
// FNs:
//
// The number of true variants assigned 0/0 genotypes for the sites
// calculation. For the allele calculation, the number of true variant alleles
// without a gt > 0 in the assignment.
int32 n_false_negative_sites = 8;
// Allele-level FN count; see the FNs description on n_false_negative_sites.
int32 n_false_negative_alleles = 9;
// FPs:
//
// The number of candidate variants assigned a 0/0 genotype for the sites
// calculation. For the allele calculation, the number of candidate variant
// alternative alleles without a genotype > 0 in the assignment.
int32 n_false_positive_sites = 10;
// Allele-level FP count; see the FPs description on n_false_positive_sites.
int32 n_false_positive_alleles = 11;
// A few more complex metrics that aren't derivable from the above metrics as
// their calculation requires access to the underlying matched variants
// themselves.
//
// The number of candidate variants (counted by site) that are assigned a
// non-reference genotype (e.g., != 0/0) but that don't have an exact
// position match in truth.
int32 n_inexact_position_matches = 12;
// A count of the number of sites where the candidate and truth variants occur
// at the same position, with increasingly strict additional matching
// criteria. These metrics are all computed over sites, not alleles.
//
// The number of sites where candidate and truth have the same start position.
int32 n_exact_position_matches = 13;
// Same criteria as above but with the additional requirement that all of the
// alleles be exactly the same between candidate and truth.
int32 n_exact_position_and_allele_matches = 14;
// Same criteria as above but with the additional requirement that the matched
// genotypes be identical as well.
int32 n_exact_position_and_allele_and_genotype_matches = 15;
// Number of truth variants (counted by site) with more than one alternative
// allele where at least one alternative allele was missed (i.e., was assigned
// a genotype of 0 in the match).
int32 n_truth_multiallelics_sites_with_missed_alleles = 16;
}
// Statistics about MakeExamples.
// NOTE(review): the per-field descriptions below are inferred from the field
// names only; confirm them against the code that fills in this message.
// Next ID: 9.
message MakeExamplesStats {
// Presumably the total number of examples counted.
int32 num_examples = 1;
// Presumably the number of examples whose variant is an indel.
int32 num_indels = 2;
// Presumably the number of examples whose variant is a SNP.
int32 num_snps = 3;
// Presumably the number of examples in each label class (0, 1 and 2) --
// TODO confirm what each class index denotes.
int32 num_class_0 = 4;
int32 num_class_1 = 5;
int32 num_class_2 = 6;
// Presumably the number of examples labeled as de novo and as not de novo,
// respectively -- TODO confirm.
int32 num_denovo = 7;
int32 num_nondenovo = 8;
}
// Configuration and runtime information about a MakeExamples run in
// DeepVariant.
// Next ID: 5.
message MakeExamplesRunInfo {
// The configuration (MakeExamplesOptions) associated with the run.
MakeExamplesOptions options = 1;
// Labeling metrics for the run. Per the LabelingMetrics description these
// describe labeling done when make_examples runs in training mode.
LabelingMetrics labeling_metrics = 2;
// Resource metrics for the run. ResourceMetrics is not defined in this file;
// presumably it comes from deepvariant/protos/resources.proto -- confirm.
ResourceMetrics resource_metrics = 3;
// Statistics about the run; see MakeExamplesStats.
MakeExamplesStats stats = 4;
}
// Identifiers for the channels DeepVariant knows about.
// NOTE(review): presumably these are the pileup channels referenced by
// deepvariant/pileup_channel_lib.h -- confirm.
//
// Values are not declared in numeric order: 20 and 21 sit next to the related
// diff_channels entries, ahead of 11-19. Use the "Next ID" below, not the last
// declared value, when adding a channel.
// Next ID: 22.
enum DeepVariantChannelEnum {
// Default should be unspecified.
CH_UNSPECIFIED = 0;
// 6 channels that exist in all DeepVariant production models.
CH_READ_BASE = 1;
CH_BASE_QUALITY = 2;
CH_MAPPING_QUALITY = 3;
CH_STRAND = 4;
CH_READ_SUPPORTS_VARIANT = 5;
CH_BASE_DIFFERS_FROM_REF = 6;
// "Improving Variant Calling using Haplotype Information"
// https://google.github.io/deepvariant/posts/2021-02-08-the-haplotype-channel/
CH_HAPLOTYPE_TAG = 7;
// "Improving variant calling using population data and deep learning"
// https://doi.org/10.1101/2021.01.06.425550
CH_ALLELE_FREQUENCY = 8;
// Two extra channels for diff_channels:
CH_DIFF_CHANNELS_ALTERNATE_ALLELE_1 = 9;
CH_DIFF_CHANNELS_ALTERNATE_ALLELE_2 = 10;
// Two extra channels for base_channels:
CH_BASE_CHANNELS_ALTERNATE_ALLELE_1 = 20;
CH_BASE_CHANNELS_ALTERNATE_ALLELE_2 = 21;
// The following channels correspond to the "Opt Channels" defined in
// deepvariant/pileup_channel_lib.h:
CH_READ_MAPPING_PERCENT = 11;
CH_AVG_BASE_QUALITY = 12;
CH_IDENTITY = 13;
CH_GAP_COMPRESSED_IDENTITY = 14;
CH_GC_CONTENT = 15;
CH_IS_HOMOPOLYMER = 16;
CH_HOMOPOLYMER_WEIGHTED = 17;
CH_BLANK = 18;
CH_INSERT_SIZE = 19;
}