deepvariant-r1.6.1 / Git / [9b26b7] /deepvariant/protos/deepvariant.proto

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / deepvariant / protos / deepvariant.proto
History
Download this file
1081 lines (918 with data), 43.8 kB

// Copyright 2020 Google LLC.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
//    contributors may be used to endorse or promote products derived from this
//    software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

syntax = "proto3";

package learning.genomics.deepvariant;

import "deepvariant/protos/realigner.proto";
import "deepvariant/protos/resources.proto";
import "third_party/nucleus/protos/position.proto";
import "third_party/nucleus/protos/reads.proto";
import "third_party/nucleus/protos/variants.proto";

// The type of an Allele.
//
// An allele type indicates what kind of event would have produced
// this allele.  An allele can be the reference sequence, a substitution
// of bases, insertion of bases, or deletion of bases. Allele types need not be
// real genetic variants: for example, the SOFT_CLIP type indicates that a read
// contained bases SOFT_CLIPPED away (similar to an insertion), which is often
// indicative of some large event near the start or end of the read.
enum AlleleType {
  // Zero value, i.e. what an unset AlleleType field reads as. The default
  // should be unspecified:
  // https://docs.google.com/document/d/1oavZD9XB_147ti93MCBoR5HrFKoBh1xkZcTxjInYf0M/edit#heading=h.8ylxmf942vui
  UNSPECIFIED = 0;

  // The allele corresponding to that found in the genome sequence.
  REFERENCE = 1;

  // A substitution of bases that are different from the genome sequence.
  SUBSTITUTION = 2;

  // An insertion of bases w.r.t. the reference genome.
  INSERTION = 3;

  // A deletion of bases w.r.t. the reference genome.
  DELETION = 4;

  // An allele type produced by a SOFT_CLIP operation during alignment.
  // May be indicative of a real genetic event occurring at this position,
  // or may be a data quality / alignment artifact.
  SOFT_CLIP = 5;
}

// An Allele observed in some type of NGS read data.
//
// Conceptually, an Allele is a sequence of bases that represent a type
// of change relative to a reference genome sequence, along with a discrete
// count of the number of times that allele was observed in the NGS data.
message Allele {
  // The string of bases that make up this Allele. Should not be empty.
  // A simple reference allele might have a single base "A", while a complex
  // insertion of the bases "CTG" following that base "A" would have a
  // bases sequence of "ACTG" (i.e., the anchor base is included).
  string bases = 1;

  // The type of this allele (reference, substitution, insertion, ...).
  AlleleType type = 2;

  // The number of times this Allele was seen in the NGS data. The count
  // should be >= 0, where 0 indicates that the allele was never observed
  // (which can happen if you want to record that you checked for some allele
  // in the data and never saw any evidence for it).
  int32 count = 3;

  // Set to true if the allele contains low quality bases.
  bool is_low_quality = 4;
}

// An AlleleCount summarizes the NGS data observed at a position in the genome.
//
// An AlleleCount proto is a key intermediate data structure in DeepVariant
// summarizing the NGS read data covering a site in the genome. It is
// intended to be relatively simple but keep track of the key pieces of
// information about the observed reads and their associated alleles at this
// position so that downstream tools can reconstruct the read coverage, do
// variant calling, and compute reference confidence. It is conceptually similar
// to a samtools read pileup (http://samtools.sourceforge.net/pileup.shtml) but
// without detailed information about bases or their qualities.
//
// The AlleleCount at its core tracks the Alleles observed in reads that overlap
// this position in the genome. Consider a read that has a base, X, aligned to
// the position of interest. If X is a non-reference allele, the AlleleCount
// proto adds a new read name ==> Allele key-value entry to the read_alleles
// map field. If X is the reference allele, the AlleleCount proto increments
// either the ref_supporting_read_count or the ref_nonconfident_read_count
// counter field, depending on read alignment confidence as defined by
// pipeline-specific parameters.
//
// The complexity here is introduced by following the VCF convention of
// representing indel and complex substitution alleles as occurring at the
// preceding base in the genome. So if in fact our base X is followed by a 3 bp
// insertion of acg, then we would in fact not have a count for X at all but
// would see an allele Xacg with a count of 1 (or more if other reads have the
// same allele). The primary contract here is that each aligned base at this
// site goes into a +1 for exactly one allele. A concrete example might
// clarify this logic.  Consider the following alignment of two reads to the
// reference genome:
//
// Position: 123   4567
// Ref:      ATT---TGCT
// Read1:    ATT---TGCT
// Read2:    ATTCCCTG-T
//
// The 'T' base in position 3 of Read 1 matches the reference and so the
// AlleleCount's ref_supporting_read_count is incremented. The 'T' base in
// position 3 of Read 2 also matches the reference, but it is the anchor base
// preceding the 'CCC' insertion. The 'TCCC' INSERTION Allele is therefore added
// to the AlleleCount proto for this position. A deletion occurs in read 2 at
// position 6 as well, which produces a DELETION allele 'GC' at position 5.
// Additionally, because only read 1 has a C base at position 6, the AlleleCount
// at 6 would have no entries in its read_alleles and ref_supporting_read_count
// would be 1.
//
// Another design choice is that spanning deletions don't count as coverage
// under bases, so there's an actual drop in coverage under regions of the
// genome with spanning deletions.  This is classically the difference between
// physical coverage and sequence coverage:
//
//      https://en.wikipedia.org/wiki/Shotgun_sequencing#Coverage
//
// so it's safe to think of an AlleleCount as representing the sequence coverage
// of a position, not its physical coverage.
//
// What this means is that it's very straightforward to inspect the reference
// counters and read_alleles in an AlleleCount and determine the corresponding
// alleles for a Variant as well as compute the depth of coverage. This enables
// us to write algorithms to call SNPs, indels, CNVs as well as identify regions
// for assembly using a series of AlleleCount objects rather than the underlying
// read data.
//
// An AlleleCount is a lossy transformation of the raw read data. Fundamentally,
// the digestion of a read into its corresponding AlleleCount components loses some
// of this contiguity information provided by reads spanning across multiple
// positions on the genome. None of the specifics of base quality, mapping
// quality, read names, etc. are preserved. Furthermore, an AlleleCount can be
// constructed using only a subset of all of the raw reads (e.g., those that
// pass minimum quality criteria) and even only parts of each read (e.g., if
// the read contains Ns or low quality bases). The data used to compute an
// AlleleCount isn't specified as part of the proto, but is left up to the
// implementation details and runtime parameters of the generating program.
//
// For usability and performance reasons we track reference and alternate allele
// supporting reads in slightly different ways. The number of reads that
// confidently carry the reference allele at this position is stored in
// ref_supporting_read_count. Confidence here means that the read's alignment to
// the reference is reliable. For background and motivation, see the Base
// Alignment Quality (BAQ) paper:
//
// http://bioinformatics.oxfordjournals.org/content/early/2011/02/13/bioinformatics.btr076.full.pdf
//
// For reads that would have been counted as
// reference supporting but don't have a reliable alignment we instead tally
// those in ref_nonconfident_read_count. Finally, reads that don't have the
// reference allele are stored in a map from the string
// "fragment_name/read_number" to the allele it supports, which by construction
// will always have a count of 1. This allows for more detailed downstream
// analyses of the alt allele containing reads.
//
// Consequently, the total (usable) coverage at this location is:
//
// coverage =
//    // reads supporting an observed alternate allele.
//    sum(read_allele_i.count)      // [also equal to read_alleles_size()]
//    // All of the reads confidently asserting reference.
//    + ref_supporting_read_count
//    // All of the reads supporting ref without a confident alignment.
//    + ref_nonconfident_read_count
//
message AlleleCount {
  // The position on the genome of this AlleleCount.
  nucleus.genomics.v1.Position position = 1;

  // The reference base of this AlleleCount. Since AlleleCount currently
  // only represents a single location, this field should always be a
  // single base.
  string ref_base = 2;

  // The number of reads that confidently carry the reference allele at this
  // position.
  int32 ref_supporting_read_count = 3;

  // A map from a read's key to an Allele message containing information about
  // the allele supported by that read at this position. There will be one
  // binding for each usable read spanning this position that supports a
  // non-reference allele. The read's key is a unique string that identifies
  // the read, currently "fragment_name/read_number".
  map<string, Allele> read_alleles = 4;

  // A count of the number of reads that supported the reference allele but
  // whose alignment to the reference genome isn't 100% certain.
  int32 ref_nonconfident_read_count = 5;

  // A list of Allele messages. Wrapper needed because map values cannot be
  // repeated; used as the value type of sample_alleles below.
  message Alleles {
    repeated Allele alleles = 1;
  }
  // A map where the key is a sample name and the value is the list of alt
  // alleles that are supported by reads from this sample.
  map<string, Alleles> sample_alleles = 6;

  // If true, reads supporting ref are tracked (read ids are saved).
  bool track_ref_reads = 7;
}

// A lighter-weight version of AlleleCount.
//
// The only material difference with this proto is that we don't store the map
// from read names to Alleles, but instead have the total number of reads we've
// seen at this position.
message AlleleCountSummary {
  // The Position field of AlleleCount with its values inlined here as
  // reference_name and position.
  string reference_name = 1;
  int64 position = 2;
  // Same as in AlleleCount: the reference base at this position.
  string ref_base = 3;
  // Same as in AlleleCount: reads confidently carrying the reference allele.
  int32 ref_supporting_read_count = 4;
  // The total number of reads observed at this position. This replaces the
  // read_alleles map of AlleleCount.
  int32 total_read_count = 5;
  // Same as in AlleleCount: reads supporting the reference allele without a
  // confident alignment.
  int32 ref_nonconfident_read_count = 6;
}

// A message encapsulating all of the information about a Variant call site for
// consumption by further stages of the DeepVariant data processing workflow.
message DeepVariantCall {
  // A Variant call based on the information in allele_count. Will always be a
  // non-reference variant call (no gVCF or reference records).
  nucleus.genomics.v1.Variant variant = 1;

  // A list of read keys; used as the value type of allele_support below.
  message SupportingReads {
    repeated string read_names = 1;
  }
  // A map from an alt allele in Variant to the Read keys that support that
  // allele. Every alt allele in the variant will have an entry. Reference
  // supporting reads aren't listed. There may be a special key
  // "UNCALLED_ALLELE" for reads that don't support either the reference allele
  // or any alt allele in the variant. This can happen when the read supports
  // an allele that didn't pass our calling thresholds. The read's key is a
  // unique string that identifies the read constructed as
  // "fragment_name/read_number".
  map<string, SupportingReads> allele_support = 2;
  // NOTE(review): undocumented in the original; presumably a map from an alt
  // allele in Variant to its allele frequency -- confirm against the producer.
  map<string, float> allele_frequency = 3;

  // List of Read keys supporting the ref allele.
  repeated string ref_support = 4;

  // Per-read support record: the read's key plus a low-quality flag.
  message ReadSupport {
    string read_name = 1;
    bool is_low_quality = 2;
  }
  // A list of ReadSupport records; used as the value type of
  // allele_support_ext. This structure is meant to replace SupportingReads,
  // but the old one is kept for backward compatibility.
  message SupportingReadsExt {
    repeated ReadSupport read_infos = 1;
  }

  // A map from an alt allele in Variant to a SupportingReadsExt structure.
  // This is to replace allele_support.
  map<string, SupportingReadsExt> allele_support_ext = 5;

  // NOTE(review): undocumented in the original; presumably the
  // ReadSupport-based counterpart of ref_support -- confirm.
  SupportingReadsExt ref_support_ext = 6;
}

// Options to control how our AlleleCounter code works.
message AlleleCounterOptions {
  // The number of basepairs to include in each partition of the reference
  // genome. This determines how many map/reduce jobs are used to compute the
  // AlleleCounts. Using too small a value (below 10000 for example) results
  // in having very many intervals to process, which may be a performance
  // problem for the tool. Using too large a value will result in difficulty
  // parallelizing the computation as there will be too few work units to
  // parallelize and each unit will use a lot of memory.
  int32 partition_size = 1;

  // The requirements for reads to be used when counting alleles.
  nucleus.genomics.v1.ReadRequirements read_requirements = 2;

  // Determines how the allele counter keeps track of ref reads. If true, then
  // allele_counter stores read IDs of ref reads; otherwise just a counter is
  // used for ref reads. Default value is false.
  bool track_ref_reads = 3;

  // Option to left align INDELs for each read.
  bool normalize_reads = 4;

  // If true, the behavior in this commit is reverted:
  // https://github.com/google/deepvariant/commit/fbde0674639a28cb9e8004c7a01bbe25240c7d46
  bool keep_legacy_behavior = 5;
}

// Variant call for a single site, in a pseudo-biallelic manner. This is an
// intermediate format for call_variants.py that needs to be merged if there
// are multiallelics.
// The `variant` here likely doesn't have fully filled information for output to
// a VCF file yet.
message CallVariantsOutput {
  // The variant this output is for. It likely doesn't yet have all the
  // information needed for output to a VCF file.
  nucleus.genomics.v1.Variant variant = 1;

  // The alt allele indices are represented as a sub-message so that it's
  // easier to re-use as a standalone proto for encoding+decoding.
  message AltAlleleIndices {
    repeated int32 indices = 1;
  }
  AltAlleleIndices alt_allele_indices = 2;

  // NOTE(review): undocumented in the original; presumably one probability
  // per genotype class predicted by the model -- confirm against
  // call_variants.py.
  repeated double genotype_probabilities = 3;

  // Optional debugging information attached to a call.
  // Next ID: 11
  message DebugInfo {
    int32 predicted_label = 1;
    bool has_insertion = 2;
    bool has_deletion = 3;
    bool is_snp = 4;
    int32 true_label = 5;
    repeated double logits = 6;
    repeated double prelogits = 7;
    // The encoded image used for inference.
    bytes image_encoded = 8;
    // Key-value pairs of layer names of call variant models and the encoded
    // layers' outputs.
    map<string, bytes> layer_output_encoded = 9;

    message PileupCuration {
      // The fields below are stored as int32; the corresponding enums are
      // defined in nucleus/util/vis.py.
      int32 diff_category = 1;
      int32 base_quality = 2;
      int32 mapping_quality = 3;
      int32 strand_bias = 4;
      int32 read_support = 5;
    }
    PileupCuration pileup_curation = 10;
  }
  DebugInfo debug_info = 4;
}

// Options to control how our candidate VariantCaller works.
// Next ID: 18
message VariantCallerOptions {
  // Alleles occurring at least this many times in our AlleleCount are
  // considered candidate variants. Separate thresholds are kept for SNPs and
  // indels.
  int32 min_count_snps = 1;
  int32 min_count_indels = 2;

  // Alleles that have counts at least this fraction of all the counts in an
  // AlleleCount are considered candidate variants. Separate thresholds are
  // kept for SNPs and indels.
  float min_fraction_snps = 3;
  float min_fraction_indels = 4;
  // In candidate generation, this multiplier is applied to the minimum allele
  // fraction thresholds (vsc_min_fraction_snps and vsc_min_fraction_indels)
  // to adapt thresholds for multi-sample calling.
  float min_fraction_multiplier = 12;
  // In candidate generation, this threshold is used to exclude a variant when
  // the allele frequency is above this threshold from a non-target sample.
  // This is designed for the somatic case - where we want to avoid generating
  // a candidate if the AF is high in any of the non-target samples. Separate
  // thresholds are kept for SNPs and indels.
  float max_fraction_snps_for_non_target_sample = 16;
  float max_fraction_indels_for_non_target_sample = 17;

  // If provided, we will emit "candidate" variant records at a random fraction
  // of otherwise non-candidate sites. Useful for training.
  float fraction_reference_sites_to_emit = 5;

  // The random seed to use in our variant caller. If not provided, a truly
  // random seed will be used.
  uint32 random_seed = 6;

  // The name of the sample we will put in our VariantCall field of constructed
  // variants.
  string sample_name = 7;

  // The probability that a non-reference allele is actually an error.
  float p_error = 8;

  // The maximum genotype quality we'll emit for a reference site.
  int32 max_gq = 9;

  // The width of a GQ bin used to quantize the raw double GQ
  // values into coarser-grained bins than just 1 integer unit. See QuantizeGQ
  // for more information about the quantization process.
  int32 gq_resolution = 10;

  // The ploidy of this sample. For humans, this is 2 (diploid). Currently the
  // code makes implicit assumptions that the ploidy is 2, but this value is
  // used in calculations directly involving ploidy so when we generalize the
  // caller to handle other ploidy values we don't have to update all of those
  // constants.
  int32 ploidy = 11;

  // Skip uncalled genotypes. This is used during training so that
  // uncalled ./. genotypes are not used to generate and label examples.
  bool skip_uncalled_genotypes = 13;

  // NOTE(review): undocumented in the original; presumably mirrors
  // AlleleCounterOptions.track_ref_reads -- confirm.
  bool track_ref_reads = 14;

  // NOTE(review): undocumented in the original; a field with the same name
  // exists in MakeExamplesOptions under "DirectPhasing related options".
  // Judging by the "_pct" suffix this is presumably a percentage -- confirm.
  int32 phase_reads_region_padding_pct = 15;
}

// Options to control how we label variant calls.
message VariantLabelerOptions {
  // Currently there are no options for VariantLabeler. The message is still
  // referenced as the type of MakeExamplesOptions.labeler_options.
}

// Options to control how we construct pileup images.
// Next ID: 41.
message PileupImageOptions {
  // NOTE(review): field numbers 28 and 29 are not used by any field below and
  // are not declared `reserved`. If they belonged to deleted fields they
  // should be reserved to prevent accidental reuse -- confirm from history.

  // The height, in pixels, of the pileup image we'll construct.
  int32 height = 1;

  // The width, in pixels, of the pileup image we'll construct.
  int32 width = 2;

  // We include at the top of each image a band of reference pixels with
  // this specified height.
  int32 reference_band_height = 3;

  // Controls how bases are encoded as red pixel values.
  //
  // A is base_color_offset_a_and_g + base_color_stride * 3
  // G is base_color_offset_a_and_g + base_color_stride * 2
  // T is base_color_offset_t_and_c + base_color_stride * 1
  // C is base_color_offset_t_and_c + base_color_stride * 0
  //
  // The offset in red color space for A and G bases.
  int32 base_color_offset_a_and_g = 4;
  // The offset in red color space for T and C bases.
  int32 base_color_offset_t_and_c = 5;
  // Each base color is offset from each other by this stride.
  int32 base_color_stride = 6;

  // The alpha value applied to pixels in the reference genome band.
  float reference_alpha = 7;
  // The base quality we assume for the reference genome bases.
  int32 reference_base_quality = 8;

  // The alpha to apply to reads that support our alt alleles.
  float allele_supporting_read_alpha = 9;
  // The alpha to apply to reads that support the other alt allele.
  float other_allele_supporting_read_alpha = 32;
  // The alpha to apply to reads that do not support our alt alleles.
  float allele_unsupporting_read_alpha = 10;
  // The alpha to apply to a base that matches the reference sequence.
  float reference_matching_read_alpha = 11;
  // The alpha to apply to a base that doesn't match the reference sequence.
  float reference_mismatching_read_alpha = 12;

  // The character we'll use when encoding insertion/deletion anchor bases.
  string indel_anchoring_base_char = 13;

  // The color value to use for reads on the positive strand.
  int32 positive_strand_color = 14;
  // The color value to use for reads on the negative strand.
  int32 negative_strand_color = 15;

  // The maximum base quality we'll allow in PIC (Pileup Image Creation). Base
  // qualities above this value are treated as being base_quality_cap.
  int32 base_quality_cap = 16;

  // Extend read windows by a small amount when calculating overlap with calls.
  // This is important to include all the reads involved in deletions in a
  // pileup image.
  int32 read_overlap_buffer_bp = 17;

  // The requirements for reads to be used when creating pileup images.
  nucleus.genomics.v1.ReadRequirements read_requirements = 18;

  // NOTE(review): undocumented in the original; judging by the value names,
  // this selects whether het-alt images are added for multi-allelic sites --
  // confirm.
  enum MultiAllelicMode {
    UNSPECIFIED = 0;
    ADD_HET_ALT_IMAGES = 1;
    NO_HET_ALT_IMAGES = 2;
  }
  MultiAllelicMode multi_allelic_mode = 19;

  // The maximum mapping quality we'll allow in PIC. Mapping qualities above
  // this value are treated as being mapping_quality_cap.
  int32 mapping_quality_cap = 20;

  // The random seed to use in our Pileup Image Creation.
  uint32 random_seed = 21;

  // The number of data channels in our pileup images.
  int32 num_channels = 22;

  //  (Experimental feature that was removed.)
  // The character we'll use when encoding insertion anchor bases.
  string unused_insert_base_char = 23;

  //  (Experimental feature that was removed.)
  // The character we'll use when encoding deletion anchor bases.
  string unused_delete_base_char = 24;

  //  (Experimental feature that was removed.)
  // Include custom pileup image feature.
  bool unused_custom_pileup_image = 25;

  //  (Experimental feature that was removed.)
  // Include sequencing type image feature.
  bool unused_sequencing_type_image = 26;

  // Sequencing type of input bam file.
  enum SequencingType {
    UNSPECIFIED_SEQ_TYPE = 0;
    WGS = 1;
    WES = 2;
    TRIO = 3;
  }
  SequencingType sequencing_type = 27;

  // Whether and how to include alt-aligned pileup images (experimental).
  string alt_aligned_pileup = 30;

  // If set, reads are sorted by haplotype tag (HP tag) and then by alignment
  // position.
  bool sort_by_haplotypes = 31;

  // NOTE(review): undocumented in the original; presumably reverses the
  // haplotype ordering used by sort_by_haplotypes -- confirm.
  bool reverse_haplotypes = 40;

  // Minimal non-zero allele frequency. This is used when normalizing color
  // intensities for the allele frequency channel.
  float min_non_zero_allele_frequency = 33;

  // Whether to consider allele frequencies.
  bool use_allele_frequency = 34;

  // Which variant types to use alt-align on.
  string types_to_alt_align = 36;

  // If true, add an additional channel where the color information per-read
  // indicates the HP value.
  bool add_hp_channel = 37;

  // For assembly polishing, specifies the HP tag we're calling for.
  int32 hp_tag_for_assembly_polishing = 38;

  // The set of channels to collect.
  repeated string channels = 39;

  // Deprecated fields.
  int32 sort_by_haplotypes_sample_hp_tag = 35 [deprecated = true];
}

// Options that may differ by sample.
// Next ID: 11.
message SampleOptions {
  // A string to identify the role of this sample in the analysis, e.g. in
  // trios 'child', 'parent1', or 'parent2'. Importantly, `role` strings should
  // not be checked inside make_examples_core.py. For example, instead of
  // checking whether sample.role == "child" to set pileup height, add the
  // pileup height to the sample, adding new properties to this proto if
  // needed. This keeps make_examples_core.py functioning for multiple samples
  // without it having to reason about sample roles that belong to each
  // application. This role is used to keep track of sample identities
  // throughout the analysis, and for debugging.
  string role = 6;

  // Sample name, e.g. HG002. Often given by a --sample_name flag or inferred
  // from input files.
  string name = 7;

  // Paths to files with read alignments, e.g. BAM or CRAM files.
  repeated string reads_filenames = 1;

  // Should we downsample our reads and if so, by how much? If == 0.0 (default),
  // no downsampling occurs. But if set, must be between 0.0 and 1.0 and
  // indicates the probability that a read will be kept (randomly) when read
  // from the input. This option makes it easy to simulate lower coverage data.
  float downsample_fraction = 2;

  // Options for finding candidate variants.
  VariantCallerOptions variant_caller_options = 3;

  // Height of the pileup image for this sample.
  int32 pileup_height = 4;

  // A list of integers indicating the order in which samples should be shown
  // in the pileup image when calling on this sample. The indices refer to the
  // list of samples in the regionprocessor.
  repeated int32 order = 5;

  // Path to the variants for vcf_candidate_importer.
  string proposed_variants_filename = 8;

  // Path to binary file containing candidate positions.
  string candidate_positions = 9;

  // If true, skip any output generation for this sample.
  bool skip_output_generation = 10;
}

// High-level options that encapsulates all of the parameters needed to run
// DeepVariant end-to-end.
// Next ID: 60.
message MakeExamplesOptions {
  // NOTE(review): field numbers 48 and 49 are not used by any field below and
  // are not declared `reserved`. If they belonged to deleted fields they
  // should be reserved to prevent accidental reuse -- confirm from history.

  // A list of contig names we never want to call variants on. For example,
  // chrM in humans is the mitochondrial genome and the caller isn't trained to
  // call variants on that genome.
  repeated string exclude_contigs = 1;

  // List of regions where we want to call variants. If missing, we will call
  // variants throughout the entire genome.
  repeated string calling_regions = 2;

  // Fixed random seed to use for DeepVariant itself.
  uint32 random_seed = 3;

  // The number of cores to use when running DeepVariant. Must be >= 1.
  int32 n_cores = 4;

  // Options to control how we run the AlleleCounter.
  AlleleCounterOptions allele_counter_options = 5;

  // Deprecated. Use SampleOptions.variant_caller_options (via sample_options)
  // instead.
  VariantCallerOptions deprecated_variant_caller_options = 6
      [deprecated = true];

  // Options to control how we generate pileup images.
  PileupImageOptions pic_options = 7;

  // Options to control how we label our examples.
  VariantLabelerOptions labeler_options = 8;

  // Only reads satisfying these requirements will be used in DeepVariant.
  // These parameters are propagated as appropriate to read_requirements fields
  // in our tool-specific options.
  nucleus.genomics.v1.ReadRequirements read_requirements = 9;

  // Options to control our input data sources and output data sinks.
  // Path to our genome reference.
  string reference_filename = 10;

  // Deprecated. Use SampleOptions.reads_filenames (via sample_options)
  // instead.
  repeated string deprecated_reads_filenames = 32 [deprecated = true];
  // Deprecated. Use SampleOptions.reads_filenames (via sample_options)
  // instead.
  string deprecated_reads_filename = 11 [deprecated = true];

  // Path where we'll write out our candidate variants.
  string candidates_filename = 12;
  // Path to examples.
  string examples_filename = 13;
  // Path to a list of regions we are confident in, for determining which
  // candidate variants get labels.
  string confident_regions_filename = 14;
  // Path to the truth variants, for use in labeling our examples.
  string truth_variants_filename = 15;
  // Deprecated. Path to the variants for vcf_candidate_importer. Use
  // SampleOptions.proposed_variants_filename (via sample_options) instead.
  string deprecated_proposed_variants_filename = 33 [deprecated = true];
  // Path where we should put our gVCF records.
  string gvcf_filename = 16;
  // Whether to generate MED_DP in gVCF records or not.
  bool include_med_dp = 43;

  // The name of the deep learning model to use with DeepVariant.
  string model_name = 17;

  // The mode make_examples is running in.
  enum Mode {
    UNSPECIFIED = 0;
    CALLING = 1;
    TRAINING = 2;
    CANDIDATE_SWEEP = 3;
  }
  Mode mode = 18;

  // The minimum fraction of basepairs that must be shared by all contigs common
  // to DeepVariant inputs and the reference contigs alone. If the common
  // contigs cover less than min_shared_contigs_basepairs of the reference
  // genome contigs DeepVariant will signal an error that the input datasets
  // aren't from compatible genomes.
  float min_shared_contigs_basepairs = 19;

  // The task identifier, as an integer, of this task. If we are running with
  // multiple tasks processing the same inputs into sharded outputs, this id
  // should be set to a number from 0 (master) to N - 1 to indicate which of the
  // tasks we are currently processing.
  int32 task_id = 20;

  // When running in sharded output mode (i.e., writing outputs to foo@N), this
  // field captures the number of sharded outputs (i.e., N). When not running in
  // sharded output mode, this field should be 0.
  int32 num_shards = 21;

  // Options to control realigner module.

  // Whether the realigner should be enabled.
  bool realigner_enabled = 22;
  // If True, realign reads from all samples together. If False, realign
  // per sample.
  bool joint_realignment = 54;
  // Settings for the realigner module.
  learning.genomics.deepvariant.RealignerOptions realigner_options = 23;

  // The maximum number of reads per partition that we consider before following
  // processing such as sampling and realigner.
  int32 max_reads_per_partition = 24;
  // Similar to `max_reads_per_partition`, this is another field to constrain
  // the number of reads to downsample. Even with `max_reads_per_partition`,
  // the memory usage can sometimes still be too large, especially when the
  // reads are very long.
  // When this field is set, we'll multiply it by the length of the region
  // we're in. And then we will only sample the reads up to a point where the
  // number of bases in the region is larger than
  // (max_reads_for_dynamic_bases_per_region * region length).
  int32 max_reads_for_dynamic_bases_per_region = 57;

  // Deprecated. Use SampleOptions.downsample_fraction (via sample_options)
  // instead.
  float deprecated_downsample_fraction = 25 [deprecated = true];

  // List of regions where we DON'T want to call variants. If missing, no
  // regions will be excluded from calling.
  repeated string exclude_calling_regions = 26;

  // An enumeration of all of the labeler algorithms we support in DeepVariant.
  enum LabelerAlgorithm {
    UNSPECIFIED_LABELER_ALGORITHM = 0;
    // The labeling algorithm used with DeepVariant 0.4-0.5, which does position
    // matching to find truth variant to label our candidates.
    POSITIONAL_LABELER = 1;
    // A haplotype-aware labeling algorithm, similar to hap.py xcmp, that looks
    // for genotypes for candidate variants that produce haplotypes that match
    // those implied by the genotypes of our truth variants. Produces more
    // accurate labels than the POSITIONAL_LABELER labeling algorithm.
    HAPLOTYPE_LABELER = 2;
    // The labeling algorithm which labels the variants into customized classes
    // specified in the specified INFO field in the VCF file.
    CUSTOMIZED_CLASSES_LABELER = 3;
  }
  // The labeling algorithm we are using in this DeepVariant run. Only needed
  // when in CALLING mode.
  // NOTE(review): labeling uses truth variants, which elsewhere in this file
  // are associated with training; "CALLING" here may be meant to read
  // "TRAINING" -- confirm.
  LabelerAlgorithm labeler_algorithm = 27;
  // NOTE(review): undocumented in the original; presumably the path where
  // run information is written -- confirm.
  string run_info_filename = 28;

  // By default aligned_quality field is read from QUAL in SAM. If flag is set,
  // aligned_quality field is read from OQ tag in SAM.
  bool use_original_quality_scores = 29;

  // A list of variant types that we want to restrict our examples to. E.g.,
  // select_variant_types = ['snps'] would indicate that we only want to
  // generate SNP candidate variants.
  repeated string select_variant_types = 30;

  // The candidate variant caller implementation to use.
  enum VariantCaller {
    UNSPECIFIED_CALLER = 0;
    // The default very sensitive caller.
    VERY_SENSITIVE_CALLER = 1;
    // An advanced caller that uses an input VCF to call variants.
    VCF_CANDIDATE_IMPORTER = 2;
  }
  VariantCaller variant_caller = 31;

  // If flag is set, consider allele frequency.
  bool use_allele_frequency = 34;

  // A list of VCF or VCF.gz files that specify allele frequency information.
  repeated string population_vcf_filenames = 35;

  // Path to output optional runtime profiling by region.
  string runtime_by_region = 36;

  // Use --ref argument as the reference file for the CRAM.
  bool use_ref_for_cram = 37;

  // Parse aux fields from BAM -- needed for some features like using HP tags.
  bool parse_sam_aux_fields = 38;

  // This field is used to pass into SamReaderOptions.
  // By default, this field is empty. If empty, we keep all aux fields if they
  // are parsed. If set, we only keep the aux fields with the names in this
  // list.
  repeated string aux_fields_to_keep = 50;

  // Size of blocks to read from BAM.
  int32 hts_block_size = 39;

  // How often to show log messages.
  int32 logging_every_n_candidates = 40;

  // NOTE(review): undocumented in the original; presumably the list of class
  // names used by CUSTOMIZED_CLASSES_LABELER -- confirm.
  string customized_classes_labeler_classes_list = 41;

  // NOTE(review): undocumented in the original; presumably the name of the
  // VCF INFO field read by CUSTOMIZED_CLASSES_LABELER -- confirm.
  string customized_classes_labeler_info_field_name = 42;

  // The index of the sample to focus on within the list of samples.
  int32 main_sample_index = 44;

  // per-sample statistics for training examples.
  // NOTE(review): this description does not obviously match a field named
  // bam_fname -- confirm what this field holds.
  string bam_fname = 45;

  // Samples, e.g. a list of 3 for DeepTrio, list of 1 for DeepVariant.
  repeated SampleOptions sample_options = 46;

  // Sample role to focus on for training. This can be different from the sample
  // indicated by the main_sample_index.
  string sample_role_to_train = 47;

  // DirectPhasing related options.
  bool phase_reads = 51;
  int32 phase_reads_region_padding_pct = 52;

  int32 phase_max_candidates = 53;

  string read_phases_output = 55;

  bool discard_non_dna_regions = 56;

  bool output_sitelist = 58;

  // Related to de novo variants labeling.
  string denovo_regions_filename = 59;
}

// Configuration describing a dataset that can be used for training,
// validation, or testing.
message DeepVariantDatasetConfig {
  // A human-readable name of the dataset.
  string name = 1;

  // Full path of the tensorflow.Example TFRecord file.
  string tfrecord_path = 2;

  // Number of examples in this dataset. Right now this needs to be filled in
  // manually; it is used to compute how the learning rate decays, and is also
  // used in make_training_batches.
  uint32 num_examples = 3;
}


// Metrics on the labeling of candidate / truth variants when running
// DeepVariant's make_examples in training mode.
// Next ID: 17.
message LabelingMetrics {
  // Notes on counting by site or by allele:
  //
  // Throughout this proto we often measure the same quantity (e.g., false
  // positives) by site and by allele. This reflects two different ways of
  // counting errors in genomes where the number of chromosomes > 1. We can give
  // a concrete example:
  //
  //  Candidate: chr20:10 with A/C
  //  Truth: chr20:10 with A/C alleles and with genotype (0, 1)
  //
  // Since we have the same variant with the same alleles at the same position
  // in both the candidates and the truth, the matching is trivial. For this
  // variant, we'd update our counts as follows:
  //
  //   # We have only a single site, so we +1 to each truth and candidates.
  //   n_truth_variant_sites += 1
  //   n_candidate_variant_sites += 1
  //
  //   # Both the candidate and truth variants only have 1 alternative allele,
  //   # so we increment the allele counts by 1 as well.
  //   n_truth_variant_alleles += 1
  //   n_candidate_variant_alleles += 1
  //
  // A similar logic would apply to counting true positives (+1), false
  // negatives (+0), and false positives (+0) for both sites and alleles.
  //
  // Now let's take a more complex example where candidates and truth differ
  // in their alleles:
  //
  //  Candidate: chr20:20 with A/C/T
  //  Truth: chr20:20 with A/C/G alleles and with genotype (1, 2)
  //
  // Here we have a candidate and truth at the same position but they can only
  // be partially matched, since the truth variant includes a G allele (i.e.,
  // genotype == 2) that isn't even present in the candidate. And the candidate
  // has an extra allele T that isn't real. Matching these variants produces a
  // genotype of (0, 1) for the candidate, since we have one copy of the C
  // allele (i.e., genotype == 1) but we cannot match the true G allele, so we
  // are forced to say one allele is reference (i.e., genotype == 0).
  //
  // Now let's update our counts:
  //
  //   # We have only a single site, so we +1 to each truth and candidates, even
  //   # though they both have multiple alt alleles, as this is the sites-level
  //   # metric.
  //   n_truth_variant_sites += 1
  //   n_candidate_variant_sites += 1
  //
  //   # Both variants have 2 alternative alleles, so we add 2 to each of the
  //   # allele counts.
  //   n_truth_variant_alleles += 2
  //   n_candidate_variant_alleles += 2
  //
  //   # Now for the complex TP/FN/FP counts.
  //   # We found a candidate for this true site, even though we didn't get all
  //   # of the alleles, so we increment the n_true_positive_sites by 1 and get
  //   # +0 for each of the FN and FP sites metrics:
  //   n_true_positive_sites  += 1
  //   n_false_negative_sites += 0
  //   n_false_positive_sites += 0
  //
  //   # However, we only got one of the two true alleles, so we get one TP
  //   # allele (the C), one FN allele (the missed G), and one FP allele (the
  //   # bad T allele in the candidate).
  //   n_true_positive_alleles  += 1
  //   n_false_negative_alleles += 1
  //   n_false_positive_alleles += 1

  // Notes on deriving higher-level metrics from the counts here:
  //
  // This proto only contains counts, not ratios like PPV, since we want to make
  // it easy to sum up the metrics in this proto across shared make_examples
  // runs. That said, it's valuable to note here how to compute common
  // ratiometric statistics from the counts in this proto since there are many
  // fields to choose from. The sites-level formulas are given here; the
  // allele-level values can be computed similarly.
  //
  // See https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values
  // for a general description of these metrics.
  //
  // PPV: TP / (TP + FP)
  // n_true_positive_sites / (n_true_positive_sites + n_false_positive_sites)
  //
  // Sensitivity: TP / (TP + FN)
  // n_true_positive_sites / (n_true_positive_sites + n_false_negative_sites)

  // Counts of candidate and truth variants by site and by allele.
  //
  // The sites metrics are essentially the number of records seen of each type,
  // after removing non-PASS truth variants, regardless of the number of alleles
  // in the records. The allele metrics are like the sites metrics, but instead
  // of getting a +1 for each record we get +N for each site where N is the
  // number of alternative alleles.
  //
  // See above for more information on the difference between sites and alleles
  // counting.
  int32 n_truth_variant_sites = 1;
  int32 n_truth_variant_alleles = 2;
  int32 n_candidate_variant_sites = 3;
  int32 n_candidate_variant_alleles = 4;
  // NOTE(review): not described by the notes above. Judging by the name, this
  // counts candidate sites that are not in confident regions -- confirm
  // against the labeler code that fills it in.
  int32 n_non_confident_candidate_variant_sites = 5;

  // Counts of TP, FN, and FP implied by the labeling of candidates against
  // truth.
  //
  // See above for more information on the difference between sites and alleles
  // counting.
  //
  // Note that we don't track true negatives since true negatives are poorly
  // defined in the variant calling problem.

  // TPs:
  //
  // The number of true variants assigned a non-ref genotype in the match for
  // the sites calculation. For the allele calculation, the number of true
  // variant alleles with a gt > 0 in the assignment.
  int32 n_true_positive_sites = 6;
  int32 n_true_positive_alleles = 7;

  // FNs:
  //
  // The number of true variants assigned 0/0 genotypes for the sites
  // calculation. For the allele calculation, the number of true variant alleles
  // without a gt > 0 in the assignment.
  int32 n_false_negative_sites = 8;
  int32 n_false_negative_alleles = 9;

  // FPs:
  //
  // The number of candidate variants assigned a 0/0 genotype for the sites
  // calculation. For the allele calculation, the number of candidate variant
  // alternative alleles without a genotype > 0 in the assignment.
  int32 n_false_positive_sites = 10;
  int32 n_false_positive_alleles = 11;

  // A few more complex metrics that aren't derivable from the above metrics as
  // their calculation requires access to the underlying matched variants
  // themselves.

  // The number of candidate variants (counted by site) that are assigned a
  // non-reference genotype (i.e., != 0/0) but that don't have an exact
  // position match in truth.
  int32 n_inexact_position_matches = 12;

  // A count of the number of sites where the candidate and truth variants occur
  // at the same position, with increasingly strict additional matching
  // criteria. These metrics are all computed over sites, not alleles.
  //
  // The number of sites where candidate and truth have the same start position.
  int32 n_exact_position_matches = 13;
  // Same criteria as above but with the additional requirement that all of the
  // alleles be exactly the same between candidate and truth.
  int32 n_exact_position_and_allele_matches = 14;
  // Same criteria as above but with the additional requirement that the matched
  // genotypes be identical as well.
  int32 n_exact_position_and_allele_and_genotype_matches = 15;

  // Number of truth variants (counted by site) with more than one alternative
  // allele where at least one alternative allele was missed (i.e., was assigned
  // a genotype of 0 in the match).
  int32 n_truth_multiallelics_sites_with_missed_alleles = 16;
}

// Statistics about MakeExamples.
// All fields are plain counts. The descriptions below are inferred from the
// field names only -- confirm against the code that populates this message.
// Next ID: 9.
message MakeExamplesStats {
  // Number of examples (presumably the total for the run).
  int32 num_examples = 1;

  // Number of indels and number of SNPs, respectively (presumably counted
  // over the examples).
  int32 num_indels = 2;
  int32 num_snps = 3;

  // Per-class counts for classes 0, 1, and 2. NOTE(review): presumably these
  // are the label classes of the examples -- TODO confirm.
  int32 num_class_0 = 4;
  int32 num_class_1 = 5;
  int32 num_class_2 = 6;
  // Counts of de novo and non-de novo items, respectively. NOTE(review):
  // presumably examples labeled as de novo variants or not -- TODO confirm.
  int32 num_denovo = 7;
  int32 num_nondenovo = 8;
}

// Configuration and runtime information about a MakeExamples run in
// DeepVariant.
// Next ID: 5.
message MakeExamplesRunInfo {
  // The MakeExamplesOptions associated with the run.
  MakeExamplesOptions options = 1;
  // Candidate / truth labeling counts; see LabelingMetrics, which is
  // documented as applying to make_examples in training mode.
  LabelingMetrics labeling_metrics = 2;
  // Resource metrics for the run. ResourceMetrics is not defined in this
  // file section (presumably it comes from the imported resources.proto).
  ResourceMetrics resource_metrics = 3;
  // Summary counts for the run; see MakeExamplesStats.
  MakeExamplesStats stats = 4;
}

// Identifiers for DeepVariant channels.
// Note that numeric values do not follow declaration order below (20 and 21
// are declared before 11-19); use "Next ID" when adding a new value.
// Next ID: 22.
enum DeepVariantChannelEnum {
  // Default should be unspecified. As the zero value, this is also what an
  // unset proto3 field of this enum type reads as.
  CH_UNSPECIFIED = 0;

  // 6 channels that exist in all DeepVariant production models.
  CH_READ_BASE = 1;
  CH_BASE_QUALITY = 2;
  CH_MAPPING_QUALITY = 3;
  CH_STRAND = 4;
  CH_READ_SUPPORTS_VARIANT = 5;
  CH_BASE_DIFFERS_FROM_REF = 6;

  // "Improving Variant Calling using Haplotype Information"
  // https://google.github.io/deepvariant/posts/2021-02-08-the-haplotype-channel/
  CH_HAPLOTYPE_TAG = 7;

  // "Improving variant calling using population data and deep learning"
  // https://doi.org/10.1101/2021.01.06.425550
  CH_ALLELE_FREQUENCY = 8;

  // Two extra channels for diff_channels:
  CH_DIFF_CHANNELS_ALTERNATE_ALLELE_1 = 9;
  CH_DIFF_CHANNELS_ALTERNATE_ALLELE_2 = 10;

  // Two extra channels for base_channels (numbered 20 and 21, after the
  // "Opt Channels" values below):
  CH_BASE_CHANNELS_ALTERNATE_ALLELE_1 = 20;
  CH_BASE_CHANNELS_ALTERNATE_ALLELE_2 = 21;

  // The following channels correspond to the "Opt Channels" defined in
  // deepvariant/pileup_channel_lib.h:
  CH_READ_MAPPING_PERCENT = 11;
  CH_AVG_BASE_QUALITY = 12;
  CH_IDENTITY = 13;
  CH_GAP_COMPRESSED_IDENTITY = 14;
  CH_GC_CONTENT = 15;
  CH_IS_HOMOPOLYMER = 16;
  CH_HOMOPOLYMER_WEIGHTED = 17;
  CH_BLANK = 18;
  CH_INSERT_SIZE = 19;
}