Diff of /deeptrio/testdata.py [000000] .. [5a4941]

Switch to side-by-side view

--- a
+++ b/deeptrio/testdata.py
@@ -0,0 +1,234 @@
+# Copyright 2017 Google LLC.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from this
+#    software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+"""Utilities to help with testing DeepVariant code."""
+
+import os
+
+from third_party.nucleus.testing import test_utils as nucleus_test_utils
+
+GENOMICS_DIR = 'learning/genomics'
+
+
+def deeptrio_testdata(filename):
+  """Gets the path to filename in genomics/deepvariant/testdata.
+
+  These paths are only known at runtime, after flag parsing
+  has occurred.
+
+  Args:
+    filename: The name of a testdata file in the core genomics testdata
+      directory. For example, if you have a test file in
+      "learning/genomics/deepvariant/testdata/foo.txt", filename should be
+      "foo.txt" to get a path to it.
+
+  Returns:
+    The absolute path to a testdata file.
+  """
+  return nucleus_test_utils.genomics_testdata(
+      os.path.join('deeptrio/testdata', filename), GENOMICS_DIR
+  )
+
+
+CHR20_FASTA = None
+HG001_CHR20_BAM = None
+NA12891_CHR20_BAM = None
+NA12892_CHR20_BAM = None
+GOLDEN_TRAINING_EXAMPLES = None
+GOLDEN_CALLING_CANDIDATES = None
+GOLDEN_CANDIDATE_POSITIONS = None
+GOLDEN_CALLING_EXAMPLES = None
+CONFIDENT_REGIONS_BED = None
+TRUTH_VARIANTS_VCF = None
+TRUTH_VARIANTS_VCF_WITH_TYPES = None
+GOLDEN_POSTPROCESS_INPUT = None
+GOLDEN_POSTPROCESS_OUTPUT = None
+GOLDEN_POSTPROCESS_OUTPUT_COMPRESSED = None
+GOLDEN_POSTPROCESS_GVCF_INPUT = None
+GOLDEN_POSTPROCESS_GVCF_OUTPUT = None
+GOLDEN_POSTPROCESS_GVCF_OUTPUT_COMPRESSED = None
+GOLDEN_MAKE_EXAMPLES_RUN_INFO = None
+WS_ALLELE_COUNT_LINEAR_MODEL = None
+WS_ALLELE_COUNT_LINEAR_MODEL_PCKL = None
+WS_VARIANT_READS_THRESHOLD_MODEL = None
+# Test data for ONT
+GRCH38_CHR0_FASTA = None
+ONT_HG002_BAM = None
+ONT_HG003_BAM = None
+ONT_HG004_BAM = None
+HG002_HIGH_CONFIDENCE_VCF = None
+HG002_HIGH_CONFIDENCE_BED = None
+HG002_DENOVO_BED = None
+GOLDEN_ONT_MAKE_EXAMPLES_OUTPUT = None
+GOLDEN_ONT_DENOVO_MAKE_EXAMPLES_OUTPUT = None
+
+ONT_N_GOLDEN_TRAINING_EXAMPLES = 167
+N_GOLDEN_TRAINING_EXAMPLES = 50
+N_GOLDEN_CALLING_EXAMPLES = 103
+
+CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES = None
+ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES = None
+GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES = None
+GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD = None
+
+
+def init():
+  """Initialize global variables from flag values."""
+  global CHR20_FASTA
+  global HG001_CHR20_BAM
+  global NA12891_CHR20_BAM
+  global NA12892_CHR20_BAM
+  global GOLDEN_TRAINING_EXAMPLES
+  global GOLDEN_CANDIDATE_POSITIONS
+  global GOLDEN_CALLING_CANDIDATES
+  global GOLDEN_CALLING_EXAMPLES
+  global CONFIDENT_REGIONS_BED
+  global TRUTH_VARIANTS_VCF
+  global TRUTH_VARIANTS_VCF_WITH_TYPES
+  global GOLDEN_POSTPROCESS_INPUT
+  global GOLDEN_POSTPROCESS_OUTPUT
+  global GOLDEN_POSTPROCESS_OUTPUT_COMPRESSED
+  global GOLDEN_POSTPROCESS_GVCF_INPUT
+  global GOLDEN_POSTPROCESS_GVCF_OUTPUT
+  global GOLDEN_POSTPROCESS_GVCF_OUTPUT_COMPRESSED
+  global GOLDEN_MAKE_EXAMPLES_RUN_INFO
+  global WS_ALLELE_COUNT_LINEAR_MODEL
+  global WS_ALLELE_COUNT_LINEAR_MODEL_PCKL
+  global WS_VARIANT_READS_THRESHOLD_MODEL
+  global GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES
+  global GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD
+
+  global GRCH38_CHR0_FASTA
+  global ONT_HG002_BAM
+  global ONT_HG003_BAM
+  global ONT_HG004_BAM
+  global HG002_HIGH_CONFIDENCE_VCF
+  global HG002_HIGH_CONFIDENCE_BED
+  global HG002_DENOVO_BED
+  global GOLDEN_ONT_MAKE_EXAMPLES_OUTPUT
+  global GOLDEN_ONT_DENOVO_MAKE_EXAMPLES_OUTPUT
+
+  CHR20_FASTA = deeptrio_testdata('input/hs37d5.chr20.fa.gz')
+  HG001_CHR20_BAM = deeptrio_testdata('input/HG001.chr20.10_10p1mb_sorted.bam')
+  NA12891_CHR20_BAM = deeptrio_testdata(
+      'input/NA12891.chr20.10_10p1mb_sorted.bam'
+  )
+  NA12892_CHR20_BAM = deeptrio_testdata(
+      'input/NA12892.chr20.10_10p1mb_sorted.bam'
+  )
+
+  GOLDEN_TRAINING_EXAMPLES = deeptrio_testdata(
+      'golden.training_examples.tfrecord.gz'
+  )
+  GOLDEN_CANDIDATE_POSITIONS = deeptrio_testdata(
+      'golden_child.candidate_positions'
+  )
+  GOLDEN_CALLING_CANDIDATES = deeptrio_testdata(
+      'golden_child.calling_examples.tfrecord.gz'
+  )
+  GOLDEN_CALLING_EXAMPLES = deeptrio_testdata(
+      'golden_child.calling_examples.tfrecord.gz'
+  )
+  CONFIDENT_REGIONS_BED = deeptrio_testdata(
+      'input/test_giab.b37_chr20_100kbp_at_10mb.bed'
+  )
+  TRUTH_VARIANTS_VCF = deeptrio_testdata(
+      'input/HG001_chr20_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz'
+  )
+  TRUTH_VARIANTS_VCF_WITH_TYPES = deeptrio_testdata(
+      'input/with_types.test_nist.b37_chr20_4kbp_at_10mb.vcf.gz'
+  )
+  GOLDEN_POSTPROCESS_INPUT = deeptrio_testdata(
+      'golden.postprocess_single_site_input.tfrecord.gz'
+  )
+  GOLDEN_POSTPROCESS_OUTPUT = deeptrio_testdata(
+      'golden.postprocess_single_site_output.vcf'
+  )
+  GOLDEN_POSTPROCESS_OUTPUT_COMPRESSED = deeptrio_testdata(
+      'golden.postprocess_single_site_output.vcf.gz'
+  )
+  GOLDEN_POSTPROCESS_GVCF_INPUT = deeptrio_testdata(
+      'golden_child.postprocess_gvcf_input.tfrecord.gz'
+  )
+  GOLDEN_POSTPROCESS_GVCF_OUTPUT = deeptrio_testdata(
+      'golden.postprocess_gvcf_output.g.vcf'
+  )
+  GOLDEN_MAKE_EXAMPLES_RUN_INFO = deeptrio_testdata(
+      'golden.training_examples.tfrecord.gz.run_info.pbtxt'
+  )
+  WS_ALLELE_COUNT_LINEAR_MODEL = deeptrio_testdata(
+      'window_selector_allele_count_linear.pbtxt'
+  )
+  WS_ALLELE_COUNT_LINEAR_MODEL_PCKL = deeptrio_testdata(
+      'window_selector_allele_count_linear.pckl'
+  )
+  WS_VARIANT_READS_THRESHOLD_MODEL = deeptrio_testdata(
+      'window_selector_variant_read_threshold.pbtxt'
+  )
+
+  # For oxford nanopore
+  GRCH38_CHR0_FASTA = deeptrio_testdata(
+      'input/grch38.chr20_5050000_5075000.masked.fa.gz'
+  )
+  ONT_HG002_BAM = deeptrio_testdata('input/HG002_R10_chr20_5050000_5075000.bam')
+  ONT_HG003_BAM = deeptrio_testdata('input/HG003_R10_chr20_5050000_5075000.bam')
+  ONT_HG004_BAM = deeptrio_testdata('input/HG004_R10_chr20_5050000_5075000.bam')
+  HG002_HIGH_CONFIDENCE_VCF = deeptrio_testdata(
+      'input/HG002_GRCh38_1_22_v4.2.1_benchmark.chr20.vcf.gz'
+  )
+  HG002_HIGH_CONFIDENCE_BED = deeptrio_testdata(
+      'input/HG002_GRCh38_1_22_v4.2.1_benchmark.chr20.bed'
+  )
+  HG002_DENOVO_BED = deeptrio_testdata(
+      'input/HG002_GRCh38_1_22_v4.2.1_benchmark.chr20.denovo_regions.bed'
+  )
+  GOLDEN_ONT_MAKE_EXAMPLES_OUTPUT = deeptrio_testdata(
+      'HG002_ONT_deeptrio.examples.tfrecord.gz'
+  )
+  GOLDEN_ONT_DENOVO_MAKE_EXAMPLES_OUTPUT = deeptrio_testdata(
+      'HG002_ONT_deeptrio.denovo.examples.tfrecord.gz'
+  )
+
+  # For CustomizedClassesVariantLabeler.
+  global CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES
+  CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES = deeptrio_testdata(
+      'customized_classes.golden.training_examples.tfrecord.gz'
+  )
+
+  # For alt-aligned pileups
+  global ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES
+  ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES = deeptrio_testdata(
+      'alt_aligned_pileup.golden.training_examples.tfrecord.gz'
+  )
+
+  GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES = deeptrio_testdata(
+      'golden.vcf_candidate_importer.training_examples.tfrecord.gz'
+  )
+  GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD = deeptrio_testdata(
+      'golden_child.vcf_candidate_importer.calling_examples.tfrecord.gz'
+  )