Switch to unified view

a b/deepvariant/make_examples_somatic.py
1
# Copyright 2021 Google LLC.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions
5
# are met:
6
#
7
# 1. Redistributions of source code must retain the above copyright notice,
8
#    this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
#    notice, this list of conditions and the following disclaimer in the
12
#    documentation and/or other materials provided with the distribution.
13
#
14
# 3. Neither the name of the copyright holder nor the names of its
15
#    contributors may be used to endorse or promote products derived from this
16
#    software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
# POSSIBILITY OF SUCH DAMAGE.
29
"""A prototype to create tumor-normal images (tf.Example protos)."""
30
31
import logging
32
import os
33
34
from absl import app
35
from absl import flags
36
37
from deepvariant import dv_constants
38
from deepvariant import logging_level
39
from deepvariant import make_examples_core
40
from deepvariant import make_examples_options
41
from deepvariant.protos import deepvariant_pb2
42
from third_party.nucleus.io.python import hts_verbose
43
from third_party.nucleus.util import errors
44
from third_party.nucleus.util import proto_utils
45
46
# Value of the --downsample_fraction_* flags that means "do not downsample".
NO_DOWNSAMPLING = 0.0

# Position of the matched normal sample in the list of samples. When a normal
# sample is present it is listed first (index 0) and the tumor follows at
# index 1. The tumor is the "main" sample because the goal is somatic calling.
NORMAL_SAMPLE_INDEX = 0
54
55
FLAGS = flags.FLAGS

# Adopt more general flags from make_examples_options.
flags.adopt_module_key_flags(make_examples_options)

# Flags related to samples in DeepSomatic:
# --reads_tumor is marked as required when this module is run as a script.
_READS_TUMOR = flags.DEFINE_string(
    'reads_tumor',
    None,
    (
        'Required. Reads from the tumor sample. '
        'Aligned, sorted, indexed BAM file. '
        'Should be aligned to a reference genome compatible with --ref. '
        'Can provide multiple BAMs (comma-separated).'
    ),
)
# --reads_normal is optional: it is not in the required-flags list, and the
# rest of this module falls back to tumor-only mode when it is unset. The help
# text therefore must not claim it is required.
_READS_NORMAL = flags.DEFINE_string(
    'reads_normal',
    None,
    (
        'Optional; omit to run in tumor-only mode. '
        'Reads from the normal matched sample. '
        'Aligned, sorted, indexed BAM file. '
        'Should be aligned to a reference genome compatible with --ref. '
        'Can provide multiple BAMs (comma-separated).'
    ),
)
81
# Optional overrides for the sample names written to the output protos.
_SAMPLE_NAME_TUMOR = flags.DEFINE_string(
    name='sample_name_tumor',
    default='',
    help=(
        'Sample name for tumor to use for our sample_name in the output '
        'Variant/DeepVariantCall protos. If not specified, will be inferred '
        'from the header information from --reads_tumor.'
    ),
)
_SAMPLE_NAME_NORMAL = flags.DEFINE_string(
    name='sample_name_normal',
    default='',
    help=(
        'Sample name for normal match to use for our sample_name in the output '
        'Variant/DeepVariantCall protos. If not specified, will be inferred '
        'from the header information from --reads_normal.'
    ),
)

# Per-sample read downsampling; NO_DOWNSAMPLING leaves the option unset.
_DOWNSAMPLE_FRACTION_TUMOR = flags.DEFINE_float(
    name='downsample_fraction_tumor',
    default=NO_DOWNSAMPLING,
    help=(
        f'If not {NO_DOWNSAMPLING} must be a value between 0.0 and 1.0. '
        'Reads will be kept (randomly) with a probability of downsample_fraction '
        'from the input tumor sample BAM. This argument makes it easy to create '
        'examples as though the input BAM had less coverage.'
    ),
)
_DOWNSAMPLE_FRACTION_NORMAL = flags.DEFINE_float(
    name='downsample_fraction_normal',
    default=NO_DOWNSAMPLING,
    help=(
        f'If not {NO_DOWNSAMPLING} must be a value between 0.0 and 1.0. '
        'Reads will be kept (randomly) with a probability of downsample_fraction '
        'from the input normal matched BAMs. This argument makes it easy to create '
        'examples as though the input BAMs had less coverage.'
    ),
)

# Height of each sample's portion of the pileup image.
_PILEUP_IMAGE_HEIGHT_TUMOR = flags.DEFINE_integer(
    name='pileup_image_height_tumor',
    default=dv_constants.PILEUP_DEFAULT_HEIGHT,
    help=(
        'Height for the part of the pileup image showing reads from the tumor. '
        f'Uses {dv_constants.PILEUP_DEFAULT_HEIGHT} by default.'
    ),
)
_PILEUP_IMAGE_HEIGHT_NORMAL = flags.DEFINE_integer(
    name='pileup_image_height_normal',
    default=dv_constants.PILEUP_DEFAULT_HEIGHT,
    help=(
        'Height for the part of the pileup image showing reads from the matched'
        f' normal. Uses {dv_constants.PILEUP_DEFAULT_HEIGHT} by default.'
    ),
)

_CANDIDATE_POSITIONS = flags.DEFINE_string(
    name='candidate_positions',
    default='',
    help='Path to the binary file containing candidate positions.',
)

# Override flag defaults that differ for DeepSomatic.
# The multiplier is infinite because candidates from the non-target (i.e.,
# normal) sample must never be included.
FLAGS.set_default('vsc_min_fraction_multiplier', float('inf'))
145
146
147
def tumor_normal_samples_from_flags(flags_obj):
  """Builds the per-sample options for the tumor and optional normal sample.

  Args:
    flags_obj: Source of the sample-related flag values (reads, sample names,
      pileup heights and downsample fractions for tumor and normal).

  Returns:
    A tuple (samples_in_order, sample_role_to_train). samples_in_order is a
    list of deepvariant_pb2.SampleOptions with the normal sample first (only
    when flags_obj.reads_normal is set) followed by the tumor sample.
    sample_role_to_train is always the string 'tumor'.
  """

  def build_sample_options(role):
    """Returns the SampleOptions proto for the given role."""
    is_tumor = role == 'tumor'
    if is_tumor:
      height = flags_obj.pileup_image_height_tumor
      reads_arg = flags_obj.reads_tumor
      name_arg = flags_obj.sample_name_tumor
    else:
      height = flags_obj.pileup_image_height_normal
      reads_arg = flags_obj.reads_normal
      name_arg = flags_obj.sample_name_normal

    # The reads flag may hold several comma-separated BAM paths.
    reads_list = reads_arg.split(',')
    resolved_name = make_examples_core.assign_sample_name(
        sample_name_flag=name_arg,
        reads_filenames=reads_arg,
    )
    vc_options = make_examples_core.make_vc_options(
        sample_name=resolved_name, flags_obj=flags_obj
    )

    proto = deepvariant_pb2.SampleOptions(
        role=role,
        name=resolved_name,
        variant_caller_options=vc_options,
        # Output generation is skipped for the normal sample only.
        skip_output_generation=(role == 'normal'),
        pileup_height=height,
        reads_filenames=reads_list,
    )

    if is_tumor:
      # Only the tumor sample carries an explicit sample ordering.
      both_present = flags_obj.reads_normal and flags_obj.reads_tumor
      proto.order.extend([0, 1] if both_present else [0])

    if is_tumor:
      fraction = flags_obj.downsample_fraction_tumor
    else:
      fraction = flags_obj.downsample_fraction_normal
    if fraction != NO_DOWNSAMPLING:
      proto.downsample_fraction = fraction
    return proto

  # The normal sample, when provided, always precedes the tumor sample.
  roles = ['tumor']
  if flags_obj.reads_normal:
    roles.insert(0, 'normal')

  samples_in_order = [build_sample_options(role) for role in roles]
  return samples_in_order, 'tumor'
203
204
205
def default_options(main_sample_index, add_flags=True, flags_obj=None):
  """Creates a MakeExamplesOptions proto populated with reasonable defaults.

  Args:
    main_sample_index: int. Indicates the position of the tumor sample.
    add_flags: bool. defaults to True. If True, we will push the value of
      certain FLAGS into our options. If False, those option fields are left
      uninitialized.
    flags_obj: object.  If not None, use as the source of flags, else use global
      FLAGS.

  Returns:
    deepvariant_pb2.MakeExamplesOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values.
  """
  if not flags_obj:
    flags_obj = FLAGS

  samples_in_order, sample_role_to_train = tumor_normal_samples_from_flags(
      flags_obj=flags_obj
  )
  # Candidate positions only apply to the main (tumor) sample.
  samples_in_order[main_sample_index].candidate_positions = (
      flags_obj.candidate_positions
  )

  options = make_examples_options.shared_flags_to_options(
      add_flags=add_flags,
      flags_obj=flags_obj,
      samples_in_order=samples_in_order,
      sample_role_to_train=sample_role_to_train,
      main_sample_index=main_sample_index,
  )

  # Decide whether a normal sample is present from the same flags source the
  # rest of this function uses (flags_obj), not from the global flag holder, so
  # that a caller-supplied flags_obj is honored consistently.
  tumor_basename = os.path.basename(flags_obj.reads_tumor)
  if flags_obj.reads_normal:
    normal_basename = os.path.basename(flags_obj.reads_normal)
    options.bam_fname = f'{tumor_basename}|{normal_basename}'
  else:
    options.bam_fname = tumor_basename

  return options
246
247
248
def check_options_are_valid(options, main_sample_index):
  """Validates the options and reconciles the read-limit settings.

  Runs the shared make_examples option checks, rejects identical tumor and
  normal sample names when --reads_normal is set, and, when candidate positions
  are configured for the main sample, moves max_reads_per_partition over to
  max_reads_for_dynamic_bases_per_region.

  Args:
    options: MakeExamplesOptions proto to validate; may be modified in place.
    main_sample_index: int. Position of the tumor sample in
      options.sample_options.
  """
  # General checks shared with the other make_examples flavors.
  make_examples_options.check_options_are_valid(
      options, main_sample_index=main_sample_index
  )

  tumor_sample = options.sample_options[main_sample_index]

  if _READS_NORMAL.value:
    normal_sample = options.sample_options[NORMAL_SAMPLE_INDEX]
    tumor_name = tumor_sample.variant_caller_options.sample_name
    normal_name = normal_sample.variant_caller_options.sample_name
    if tumor_name == normal_name:
      errors.log_and_raise(
          (
              'Sample names of tumor and normal samples cannot be the same. Use'
              ' --sample_name_tumor and --sample_name_normal with different'
              ' names '
          ),
          errors.CommandLineError,
      )

  # Nothing more to reconcile unless candidate positions are in use together
  # with a fixed per-partition read limit.
  if not tumor_sample.candidate_positions:
    return
  if not options.max_reads_per_partition:
    return

  logging.warning(
      'Since candidate_positions is set, we use '
      'max_reads_for_dynamic_bases_per_region instead of '
      'max_reads_per_partition. This is due to the dynamic nature of the '
      'partition size when candidate_positions is enabled, making a fixed '
      'max_reads_per_partition unsuitable.'
  )
  partition_limit = options.max_reads_per_partition
  options.max_reads_for_dynamic_bases_per_region = partition_limit
  options.max_reads_per_partition = 0
286
287
288
def main(argv=()):
  """Entry point: validates the command line, builds options, runs examples.

  Args:
    argv: Command-line arguments left over after flag parsing. Anything beyond
      the program name is rejected with a CommandLineError.
  """
  with errors.clean_commandline_error_exit():
    has_positional_args = len(argv) > 1
    if has_positional_args:
      message = (
          'Command line parsing failure: make_examples does not accept '
          'positional arguments but some are present on the command line: '
          f'"{str(argv)}".'
      )
      errors.log_and_raise(message, errors.CommandLineError)
    del argv  # Unused.

    proto_utils.uses_fast_cpp_protos_or_die()

    logging_level.set_from_flag()
    hts_verbose.set(hts_verbose.htsLogLevel.HTS_LOG_WARNING)

    # Set up options; may do I/O.
    # With a normal sample the tumor is second (index 1); tumor-only runs have
    # the tumor as the sole sample at index 0.
    if _READS_TUMOR.value and not _READS_NORMAL.value:
      main_sample_index = 0
    else:
      main_sample_index = 1
    options = default_options(main_sample_index, flags_obj=FLAGS)
    check_options_are_valid(options, main_sample_index)

    # Run!
    make_examples_core.make_examples_runner(options)
312
313
314
if __name__ == '__main__':
  # Flags that must be supplied when this module is run as a script. Note that
  # reads_normal is intentionally absent: tumor-only runs are supported.
  flags.mark_flags_as_required(['examples', 'mode', 'reads_tumor', 'ref'])
  app.run(main)