Diff of /deeptrio/testdata.py [000000] .. [5a4941]

Switch to unified view

a b/deeptrio/testdata.py
1
# Copyright 2017 Google LLC.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions
5
# are met:
6
#
7
# 1. Redistributions of source code must retain the above copyright notice,
8
#    this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
#    notice, this list of conditions and the following disclaimer in the
12
#    documentation and/or other materials provided with the distribution.
13
#
14
# 3. Neither the name of the copyright holder nor the names of its
15
#    contributors may be used to endorse or promote products derived from this
16
#    software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
# POSSIBILITY OF SUCH DAMAGE.
29
"""Utilities to help with testing DeepVariant code."""
30
31
import os
32
33
from third_party.nucleus.testing import test_utils as nucleus_test_utils
34
35
# Passed by deeptrio_testdata() as the second argument to
# nucleus_test_utils.genomics_testdata().
GENOMICS_DIR = 'learning/genomics'
36
37
38
def deeptrio_testdata(filename):
  """Gets the path to filename under the deeptrio/testdata directory.

  The lookup is delegated to nucleus_test_utils.genomics_testdata(), which
  receives "deeptrio/testdata/<filename>" together with GENOMICS_DIR. These
  paths are only known at runtime, after flag parsing has occurred.

  Args:
    filename: The name of a testdata file, relative to the deeptrio testdata
      directory. For example, for a test file stored at
      "deeptrio/testdata/foo.txt", filename should be "foo.txt"; files in
      subdirectories are addressed as e.g. "input/foo.bam".

  Returns:
    The absolute path to the testdata file, as resolved by
    nucleus_test_utils.genomics_testdata().
  """
  relative_path = os.path.join('deeptrio/testdata', filename)
  return nucleus_test_utils.genomics_testdata(relative_path, GENOMICS_DIR)
56
57
58
# Testdata path placeholders. Each path constant below is None at import time
# and is filled in by init(), which calls deeptrio_testdata() for it.

# Reference FASTA and per-sample BAM inputs.
CHR20_FASTA = None
HG001_CHR20_BAM = None
NA12891_CHR20_BAM = None
NA12892_CHR20_BAM = None

# Confident regions and truth variants.
CONFIDENT_REGIONS_BED = None
TRUTH_VARIANTS_VCF = None
TRUTH_VARIANTS_VCF_WITH_TYPES = None

# Golden example files and their run info.
GOLDEN_TRAINING_EXAMPLES = None
GOLDEN_CALLING_CANDIDATES = None
GOLDEN_CANDIDATE_POSITIONS = None
GOLDEN_CALLING_EXAMPLES = None
GOLDEN_MAKE_EXAMPLES_RUN_INFO = None
CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES = None
ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES = None
GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES = None
GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD = None

# Golden postprocess inputs and outputs.
GOLDEN_POSTPROCESS_INPUT = None
GOLDEN_POSTPROCESS_OUTPUT = None
GOLDEN_POSTPROCESS_OUTPUT_COMPRESSED = None
GOLDEN_POSTPROCESS_GVCF_INPUT = None
GOLDEN_POSTPROCESS_GVCF_OUTPUT = None
# NOTE(review): init() declares this name global but never assigns it, so it
# stays None even after init() has run. Confirm whether that is intended.
GOLDEN_POSTPROCESS_GVCF_OUTPUT_COMPRESSED = None

# Window selector model files.
WS_ALLELE_COUNT_LINEAR_MODEL = None
WS_ALLELE_COUNT_LINEAR_MODEL_PCKL = None
WS_VARIANT_READS_THRESHOLD_MODEL = None

# Test data for ONT
GRCH38_CHR0_FASTA = None
ONT_HG002_BAM = None
ONT_HG003_BAM = None
ONT_HG004_BAM = None
HG002_HIGH_CONFIDENCE_VCF = None
HG002_HIGH_CONFIDENCE_BED = None
HG002_DENOVO_BED = None
GOLDEN_ONT_MAKE_EXAMPLES_OUTPUT = None
GOLDEN_ONT_DENOVO_MAKE_EXAMPLES_OUTPUT = None

# Fixed counts (not set by init()). Presumably the number of examples in the
# corresponding golden files -- confirm against the tests that use them.
ONT_N_GOLDEN_TRAINING_EXAMPLES = 167
N_GOLDEN_TRAINING_EXAMPLES = 50
N_GOLDEN_CALLING_EXAMPLES = 103
98
99
100
def init():
  """Populates the module-level testdata path globals.

  Every (name, relative path) pair in the table below is resolved with
  deeptrio_testdata() and stored in the module global of that name, in table
  order. Until init() is called those globals hold None.
  """
  # NOTE(review): GOLDEN_POSTPROCESS_GVCF_OUTPUT_COMPRESSED has no entry here
  # (it was never assigned by this function), so it remains None after init().
  testdata_files = (
      ('CHR20_FASTA', 'input/hs37d5.chr20.fa.gz'),
      ('HG001_CHR20_BAM', 'input/HG001.chr20.10_10p1mb_sorted.bam'),
      ('NA12891_CHR20_BAM', 'input/NA12891.chr20.10_10p1mb_sorted.bam'),
      ('NA12892_CHR20_BAM', 'input/NA12892.chr20.10_10p1mb_sorted.bam'),
      ('GOLDEN_TRAINING_EXAMPLES', 'golden.training_examples.tfrecord.gz'),
      ('GOLDEN_CANDIDATE_POSITIONS', 'golden_child.candidate_positions'),
      # The next two names resolve to the same file.
      (
          'GOLDEN_CALLING_CANDIDATES',
          'golden_child.calling_examples.tfrecord.gz',
      ),
      ('GOLDEN_CALLING_EXAMPLES', 'golden_child.calling_examples.tfrecord.gz'),
      (
          'CONFIDENT_REGIONS_BED',
          'input/test_giab.b37_chr20_100kbp_at_10mb.bed',
      ),
      (
          'TRUTH_VARIANTS_VCF',
          'input/HG001_chr20_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz',
      ),
      (
          'TRUTH_VARIANTS_VCF_WITH_TYPES',
          'input/with_types.test_nist.b37_chr20_4kbp_at_10mb.vcf.gz',
      ),
      (
          'GOLDEN_POSTPROCESS_INPUT',
          'golden.postprocess_single_site_input.tfrecord.gz',
      ),
      (
          'GOLDEN_POSTPROCESS_OUTPUT',
          'golden.postprocess_single_site_output.vcf',
      ),
      (
          'GOLDEN_POSTPROCESS_OUTPUT_COMPRESSED',
          'golden.postprocess_single_site_output.vcf.gz',
      ),
      (
          'GOLDEN_POSTPROCESS_GVCF_INPUT',
          'golden_child.postprocess_gvcf_input.tfrecord.gz',
      ),
      (
          'GOLDEN_POSTPROCESS_GVCF_OUTPUT',
          'golden.postprocess_gvcf_output.g.vcf',
      ),
      (
          'GOLDEN_MAKE_EXAMPLES_RUN_INFO',
          'golden.training_examples.tfrecord.gz.run_info.pbtxt',
      ),
      (
          'WS_ALLELE_COUNT_LINEAR_MODEL',
          'window_selector_allele_count_linear.pbtxt',
      ),
      (
          'WS_ALLELE_COUNT_LINEAR_MODEL_PCKL',
          'window_selector_allele_count_linear.pckl',
      ),
      (
          'WS_VARIANT_READS_THRESHOLD_MODEL',
          'window_selector_variant_read_threshold.pbtxt',
      ),
      # For oxford nanopore
      (
          'GRCH38_CHR0_FASTA',
          'input/grch38.chr20_5050000_5075000.masked.fa.gz',
      ),
      ('ONT_HG002_BAM', 'input/HG002_R10_chr20_5050000_5075000.bam'),
      ('ONT_HG003_BAM', 'input/HG003_R10_chr20_5050000_5075000.bam'),
      ('ONT_HG004_BAM', 'input/HG004_R10_chr20_5050000_5075000.bam'),
      (
          'HG002_HIGH_CONFIDENCE_VCF',
          'input/HG002_GRCh38_1_22_v4.2.1_benchmark.chr20.vcf.gz',
      ),
      (
          'HG002_HIGH_CONFIDENCE_BED',
          'input/HG002_GRCh38_1_22_v4.2.1_benchmark.chr20.bed',
      ),
      (
          'HG002_DENOVO_BED',
          'input/HG002_GRCh38_1_22_v4.2.1_benchmark.chr20.denovo_regions.bed',
      ),
      (
          'GOLDEN_ONT_MAKE_EXAMPLES_OUTPUT',
          'HG002_ONT_deeptrio.examples.tfrecord.gz',
      ),
      (
          'GOLDEN_ONT_DENOVO_MAKE_EXAMPLES_OUTPUT',
          'HG002_ONT_deeptrio.denovo.examples.tfrecord.gz',
      ),
      # For CustomizedClassesVariantLabeler.
      (
          'CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES',
          'customized_classes.golden.training_examples.tfrecord.gz',
      ),
      # For alt-aligned pileups
      (
          'ALT_ALIGNED_PILEUP_GOLDEN_TRAINING_EXAMPLES',
          'alt_aligned_pileup.golden.training_examples.tfrecord.gz',
      ),
      (
          'GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES',
          'golden.vcf_candidate_importer.training_examples.tfrecord.gz',
      ),
      (
          'GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES_CHILD',
          'golden_child.vcf_candidate_importer.calling_examples.tfrecord.gz',
      ),
  )

  # globals() is this module's namespace, so writing through it is equivalent
  # to declaring each name `global` and assigning to it.
  module_namespace = globals()
  for constant_name, relative_path in testdata_files:
    module_namespace[constant_name] = deeptrio_testdata(relative_path)