# Copyright 2020 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from this
#    software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Step one of DeepTrio: creates tf.Example protos for training/calling."""
30
31
import os
32
33
from absl import app
34
from absl import flags
35
36
from deeptrio import dt_constants
37
from deepvariant import logging_level
38
from deepvariant import make_examples_core
39
from deepvariant import make_examples_options
40
from deepvariant.protos import deepvariant_pb2
41
from third_party.nucleus.io.python import hts_verbose
42
from third_party.nucleus.util import errors
43
from third_party.nucleus.util import proto_utils
44
45
# Sentinel command line flag value indicating no downsampling should occur.
46
NO_DOWNSAMPLING = 0.0
47
48
MAIN_SAMPLE_INDEX = 1  # 1 is the child of the trio.
49
50
FLAGS = flags.FLAGS
51
52
# Adopt more general flags from make_examples_options.
53
flags.adopt_module_key_flags(make_examples_options)
54
55
# Flags related to samples in DeepTrio:
# NOTE: trio_samples_from_flags() accepts only the child (--sample_name) or
# parent1 (--sample_name_parent1) name for --sample_name_to_train and reports
# a CommandLineError for anything else, so the help text must not advertise
# --sample_name_parent2.
SAMPLE_NAME_TO_TRAIN_ = flags.DEFINE_string(
    'sample_name_to_train',
    None,
    (
        'Optional - if not set, default to the value in --sample_name, i.e. the'
        ' child. The default is set to be backward compatible. If set, it has'
        ' to match either --sample_name or --sample_name_parent1. Only used'
        ' for training. When run in calling mode, this is unused because'
        ' examples are generated for all 3 samples together.'
    ),
)
68
# --- Input alignments, one flag per trio member. ---
READS_ = flags.DEFINE_string(
    name='reads',
    default=None,
    help=(
        'Required. Aligned, sorted, indexed BAM file containing reads from'
        ' the child of the trio. Should be aligned to a reference genome'
        ' compatible with --ref. Can provide multiple BAMs (comma-separated).'
    ),
)
READS_PARENT1_ = flags.DEFINE_string(
    name='reads_parent1',
    default=None,
    help=(
        'Required. Aligned, sorted, indexed BAM file containing reads from '
        'parent 1 of the trio. Should be aligned to a reference genome '
        'compatible with --ref. Can provide multiple BAMs (comma-separated).'
    ),
)
READS_PARENT2_ = flags.DEFINE_string(
    name='reads_parent2',
    default=None,
    help=(
        'Aligned, sorted, indexed BAM file containing reads from parent 2 of '
        'the trio. Should be aligned to a reference genome compatible with '
        '--ref. Can provide multiple BAMs (comma-separated).'
    ),
)

# --- Downsampling; NO_DOWNSAMPLING (the default) disables it. ---
DOWNSAMPLE_FRACTION_CHILD_ = flags.DEFINE_float(
    name='downsample_fraction_child',
    default=NO_DOWNSAMPLING,
    help=(
        f'If not {NO_DOWNSAMPLING!s} must be a value between 0.0 and 1.0. '
        'Reads will be kept (randomly) with a probability of '
        'downsample_fraction from the input child BAM. This argument makes it '
        'easy to create examples as though the input BAM had less coverage.'
    ),
)
DOWNSAMPLE_FRACTION_PARENTS_ = flags.DEFINE_float(
    name='downsample_fraction_parents',
    default=NO_DOWNSAMPLING,
    help=(
        f'If not {NO_DOWNSAMPLING!s} must be a value between 0.0 and 1.0. '
        'Reads will be kept (randomly) with a probability of '
        'downsample_fraction from the input parent BAMs. This argument makes '
        'it easy to create examples as though the input BAMs had less '
        'coverage.'
    ),
)

# --- Sample names written to the output protos. ---
SAMPLE_NAME_ = flags.DEFINE_string(
    name='sample_name',
    default='',
    help=(
        'Child sample name to use for our sample_name in the output '
        'Variant/DeepVariantCall protos. If not specified, will be inferred '
        'from the header information from --reads.'
    ),
)
SAMPLE_NAME_PARENT1_ = flags.DEFINE_string(
    name='sample_name_parent1',
    default='',
    help=(
        'Parent1 Sample name to use for our sample_name in the output '
        'Variant/DeepVariantCall protos. If not specified, will be inferred '
        'from the header information from --reads_parent1.'
    ),
)
SAMPLE_NAME_PARENT2_ = flags.DEFINE_string(
    name='sample_name_parent2',
    default='',
    help=(
        'Parent2 Sample name to use for our sample_name in the output '
        'Variant/DeepVariantCall protos. If not specified, will be inferred '
        'from the header information from --reads_parent2.'
    ),
)

# --- Pileup image heights; 0 keeps the dt_constants default. ---
PILEUP_IMAGE_HEIGHT_PARENT_ = flags.DEFINE_integer(
    name='pileup_image_height_parent',
    default=0,
    help='Height for the parent pileup image. If 0, uses the default height',
)
PILEUP_IMAGE_HEIGHT_CHILD_ = flags.DEFINE_integer(
    name='pileup_image_height_child',
    default=0,
    help='Height for the child pileup image. If 0, uses the default height',
)

# --- Proposed variants, one VCF per trio member. ---
PROPOSED_VARIANTS_CHILD_ = flags.DEFINE_string(
    name='proposed_variants_child',
    default=None,
    help=(
        '(Only used when --variant_caller=vcf_candidate_importer.)'
        ' Tabix-indexed VCF file containing the proposed positions and alts'
        ' for `vcf_candidate_importer` for the child. The GTs will be ignored.'
    ),
)
PROPOSED_VARIANTS_PARENT1_ = flags.DEFINE_string(
    name='proposed_variants_parent1',
    default=None,
    help=(
        '(Only used when --variant_caller=vcf_candidate_importer.)'
        ' Tabix-indexed VCF file containing the proposed positions and alts'
        ' for `vcf_candidate_importer` for the parent 1. The GTs will be'
        ' ignored.'
    ),
)
PROPOSED_VARIANTS_PARENT2_ = flags.DEFINE_string(
    name='proposed_variants_parent2',
    default=None,
    help=(
        '(Only used when --variant_caller=vcf_candidate_importer.)'
        ' Tabix-indexed VCF file containing the proposed positions and alts'
        ' for `vcf_candidate_importer` for the parent 2. The GTs will be'
        ' ignored.'
    ),
)

# We are using this flag for determining intervals for both child and parent
# models. In the future, we can consider extending into 3 samples.
CANDIDATE_POSITIONS_ = flags.DEFINE_string(
    name='candidate_positions',
    default=None,
    help=(
        'Path to the binary file containing candidate positions used for'
        ' make_examples partitioning by candidates. Currently this is only'
        ' the child positions.'
    ),
)
_SKIP_PARENT_CALLING = flags.DEFINE_bool(
    name='skip_parent_calling',
    default=False,
    help='If True, parents will not be called. Default is False.',
)

# Change any flag defaults that differ for DeepTrio.
FLAGS.set_default('vsc_min_fraction_multiplier', 0.67)
199
200
201
def trio_samples_from_flags(add_flags=True, flags_obj=None):
  """Builds the per-sample options of the trio and picks the role to train.

  Args:
    add_flags: bool. If True, the sample-specific flag values (reads,
      candidate positions, proposed variants, downsampling fractions, pileup
      heights and --sample_name_to_train) are copied into the sample options.
      If False, only role, name, variant caller options, order, default
      pileup height and the skip-output setting are populated.
    flags_obj: Forwarded to make_examples_core.make_vc_options for each
      sample.

  Returns:
    A tuple (samples_in_order, sample_role_to_train). samples_in_order is the
    list [parent1, child, parent2] of SampleOptions protos and
    sample_role_to_train is the role string of the sample to train on
    ('child' unless --sample_name_to_train selects parent1).

  Raises:
    errors.CommandLineError: reported through errors.log_and_raise when
      --sample_name_to_train is set but equals neither --sample_name nor
      --sample_name_parent1.
  """
  # Resolve the three sample names: child first, then the two parents.
  name_child = make_examples_core.assign_sample_name(
      sample_name_flag=SAMPLE_NAME_.value, reads_filenames=READS_.value
  )
  name_parent1 = make_examples_core.assign_sample_name(
      sample_name_flag=SAMPLE_NAME_PARENT1_.value,
      reads_filenames=READS_PARENT1_.value,
  )
  name_parent2 = make_examples_core.assign_sample_name(
      sample_name_flag=SAMPLE_NAME_PARENT2_.value,
      reads_filenames=READS_PARENT2_.value,
  )

  # Both parents share the same skip setting and default pileup height.
  skip_parents = _SKIP_PARENT_CALLING.value
  parent_height = dt_constants.PILEUP_DEFAULT_HEIGHT_PARENT

  vc_parent1 = make_examples_core.make_vc_options(
      sample_name=name_parent1, flags_obj=flags_obj
  )
  parent1 = deepvariant_pb2.SampleOptions(
      role='parent1',
      name=name_parent1,
      variant_caller_options=vc_parent1,
      order=[0, 1, 2],
      pileup_height=parent_height,
      skip_output_generation=skip_parents,
  )

  vc_child = make_examples_core.make_vc_options(
      sample_name=name_child, flags_obj=flags_obj
  )
  child = deepvariant_pb2.SampleOptions(
      role='child',
      name=name_child,
      variant_caller_options=vc_child,
      order=[0, 1, 2],
      pileup_height=dt_constants.PILEUP_DEFAULT_HEIGHT_CHILD,
  )

  vc_parent2 = make_examples_core.make_vc_options(
      sample_name=name_parent2, flags_obj=flags_obj
  )
  parent2 = deepvariant_pb2.SampleOptions(
      role='parent2',
      name=name_parent2,
      variant_caller_options=vc_parent2,
      # Swap the two parents when calling on parent2.
      order=[2, 1, 0],
      pileup_height=parent_height,
      skip_output_generation=skip_parents,
  )

  # This list fixes the default sample order; any custom .order above is a
  # list of indices into it.
  samples_in_order = [parent1, child, parent2]

  # Without --sample_name_to_train the child is trained on, which keeps the
  # behavior backward compatible.
  sample_role_to_train = 'child'

  if not add_flags:
    return samples_in_order, sample_role_to_train

  # Comma-separated read files, per sample.
  for sample, reads_flag in (
      (child, READS_),
      (parent1, READS_PARENT1_),
      (parent2, READS_PARENT2_),
  ):
    if reads_flag.value:
      sample.reads_filenames.extend(reads_flag.value.split(','))

  # Candidate positions are only attached to the child.
  if CANDIDATE_POSITIONS_.value:
    child.candidate_positions = CANDIDATE_POSITIONS_.value

  for sample, proposed_flag in (
      (child, PROPOSED_VARIANTS_CHILD_),
      (parent1, PROPOSED_VARIANTS_PARENT1_),
      (parent2, PROPOSED_VARIANTS_PARENT2_),
  ):
    if proposed_flag.value:
      sample.proposed_variants_filename = proposed_flag.value

  child_fraction = DOWNSAMPLE_FRACTION_CHILD_.value
  if child_fraction != NO_DOWNSAMPLING:
    child.downsample_fraction = child_fraction
  parents_fraction = DOWNSAMPLE_FRACTION_PARENTS_.value
  if parents_fraction != NO_DOWNSAMPLING:
    for parent in (parent1, parent2):
      parent.downsample_fraction = parents_fraction

  child_height = PILEUP_IMAGE_HEIGHT_CHILD_.value
  if child_height:
    child.pileup_height = child_height
  parents_height = PILEUP_IMAGE_HEIGHT_PARENT_.value
  if parents_height:
    for parent in (parent1, parent2):
      parent.pileup_height = parents_height

  requested_name = SAMPLE_NAME_TO_TRAIN_.value
  if requested_name:
    if requested_name == SAMPLE_NAME_.value:
      sample_role_to_train = child.role
    elif requested_name == SAMPLE_NAME_PARENT1_.value:
      sample_role_to_train = parent1.role
    else:
      errors.log_and_raise(
          (
              '--sample_name_to_train must match either --sample_name or '
              '--sample_name_parent1, or it can be unset to default to '
              '--sample_name.'
          ),
          errors.CommandLineError,
      )

  return samples_in_order, sample_role_to_train
307
308
309
def default_options(add_flags=True, flags_obj=None):
  """Creates a MakeExamplesOptions proto populated with reasonable defaults.

  Args:
    add_flags: bool. defaults to True. If True, we will push the value of
      certain FLAGS into our options. If False, those option fields are left
      uninitialized.
    flags_obj: object.  If not None, use as the source of flags, else use global
      FLAGS.

  Returns:
    deepvariant_pb2.MakeExamplesOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values (presumably raised by the
      shared option helpers; nothing in this function raises it directly).
  """
  if not flags_obj:
    flags_obj = FLAGS

  samples_in_order, sample_role_to_train = trio_samples_from_flags(
      add_flags=add_flags, flags_obj=flags_obj
  )

  options = make_examples_options.shared_flags_to_options(
      add_flags=add_flags,
      flags_obj=flags_obj,
      samples_in_order=samples_in_order,
      sample_role_to_train=sample_role_to_train,
      main_sample_index=MAIN_SAMPLE_INDEX,
  )

  if add_flags:
    # bam_fname is "<child>|<parent1>|<parent2>" built from the basenames of
    # the reads flags. Any unset flag contributes the literal 'None'; this
    # includes --reads, which is only marked as required when the module is
    # run as a script, so os.path.basename(None) must not be reached here.
    options.bam_fname = '|'.join(
        os.path.basename(reads_path) if reads_path else 'None'
        for reads_path in (
            READS_.value,
            READS_PARENT1_.value,
            READS_PARENT2_.value,
        )
    )
    options.pic_options.sequencing_type = (
        deepvariant_pb2.PileupImageOptions.TRIO
    )
    # Only fill in the DeepTrio pileup dimensions that are still unset (0).
    if not options.pic_options.height:
      options.pic_options.height = dt_constants.PILEUP_DEFAULT_HEIGHT
    if not options.pic_options.width:
      options.pic_options.width = dt_constants.PILEUP_DEFAULT_WIDTH

  return options
365
366
367
def check_options_are_valid(options):
  """Checks that all the options chosen make sense together.

  Runs the checks shared with DeepVariant first and then the trio-specific
  ones: the child must not share its sample name with a parent,
  --alt_aligned_pileup="rows" is rejected, and candidate sweep mode needs a
  candidate positions path on the child sample.

  Args:
    options: MakeExamplesOptions proto; the child is read from
      options.sample_options[MAIN_SAMPLE_INDEX].

  Raises:
    errors.CommandLineError: if the child's sample name equals
      --sample_name_parent1 or --sample_name_parent2.
    The other two failures are reported through errors.log_and_raise with
    its default exception class.
  """

  # Check for general flags (shared for DeepVariant and DeepTrio).
  make_examples_options.check_options_are_valid(
      options, main_sample_index=MAIN_SAMPLE_INDEX
  )

  child = options.sample_options[MAIN_SAMPLE_INDEX]

  # Sanity check the sample_names (specific to trio).
  if (
      child.variant_caller_options.sample_name == FLAGS.sample_name_parent1
      or child.variant_caller_options.sample_name == FLAGS.sample_name_parent2
  ):
    errors.log_and_raise(
        'The sample_name of the child is the same as one of the parents.',
        errors.CommandLineError,
    )

  if options.pic_options.alt_aligned_pileup == 'rows':
    errors.log_and_raise(
        '--alt_aligned_pileup="rows" cannot be used with '
        'DeepTrio because the pileup images would become '
        'too tall for InceptionV3.'
    )

  # candidate_positions is a protobuf string field: when unset it reads as
  # the empty string, never None, so test truthiness rather than identity.
  if (
      options.mode == deepvariant_pb2.MakeExamplesOptions.CANDIDATE_SWEEP
      and not child.candidate_positions
  ):
    errors.log_and_raise(
        '--candidate_positions is required when --positions_sweep_mode is set.'
    )
401
402
403
def main(argv=()):
  """Validates the command line, builds the options and runs make_examples.

  Args:
    argv: Arguments left over after flag parsing. More than one element means
      positional arguments were given, which is reported as a
      CommandLineError.
  """
  with errors.clean_commandline_error_exit():
    has_positional_args = len(argv) > 1
    if has_positional_args:
      message = (
          'Command line parsing failure: make_examples does not accept '
          'positional arguments but some are present on the command line: '
          f'"{argv!s}".'
      )
      errors.log_and_raise(message, errors.CommandLineError)
    del argv  # Unused.

    proto_utils.uses_fast_cpp_protos_or_die()

    # Logging setup: our own level from the flag, htslib limited to warnings.
    logging_level.set_from_flag()
    hts_verbose.set(hts_verbose.htsLogLevel.HTS_LOG_WARNING)

    # Set up options; may do I/O.
    trio_options = default_options(add_flags=True, flags_obj=FLAGS)
    check_options_are_valid(trio_options)

    # Run!
    make_examples_core.make_examples_runner(trio_options)
425
426
427
if __name__ == '__main__':
  # Flags that must be supplied on the command line when run as a script.
  required_flag_names = ['examples', 'mode', 'reads', 'ref']
  flags.mark_flags_as_required(required_flag_names)
  app.run(main)