|
a |
|
b/deepvariant/make_examples_somatic.py |
|
|
1 |
# Copyright 2021 Google LLC. |
|
|
2 |
# |
|
|
3 |
# Redistribution and use in source and binary forms, with or without |
|
|
4 |
# modification, are permitted provided that the following conditions |
|
|
5 |
# are met: |
|
|
6 |
# |
|
|
7 |
# 1. Redistributions of source code must retain the above copyright notice, |
|
|
8 |
# this list of conditions and the following disclaimer. |
|
|
9 |
# |
|
|
10 |
# 2. Redistributions in binary form must reproduce the above copyright |
|
|
11 |
# notice, this list of conditions and the following disclaimer in the |
|
|
12 |
# documentation and/or other materials provided with the distribution. |
|
|
13 |
# |
|
|
14 |
# 3. Neither the name of the copyright holder nor the names of its |
|
|
15 |
# contributors may be used to endorse or promote products derived from this |
|
|
16 |
# software without specific prior written permission. |
|
|
17 |
# |
|
|
18 |
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
|
19 |
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
|
20 |
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
|
21 |
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
|
|
22 |
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
|
23 |
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
|
24 |
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
|
25 |
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
|
26 |
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
|
27 |
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
|
28 |
# POSSIBILITY OF SUCH DAMAGE. |
|
|
29 |
"""A prototype to create tumor-normal images (tf.Example protos).""" |
|
|
30 |
|
|
|
31 |
import logging |
|
|
32 |
import os |
|
|
33 |
|
|
|
34 |
from absl import app |
|
|
35 |
from absl import flags |
|
|
36 |
|
|
|
37 |
from deepvariant import dv_constants |
|
|
38 |
from deepvariant import logging_level |
|
|
39 |
from deepvariant import make_examples_core |
|
|
40 |
from deepvariant import make_examples_options |
|
|
41 |
from deepvariant.protos import deepvariant_pb2 |
|
|
42 |
from third_party.nucleus.io.python import hts_verbose |
|
|
43 |
from third_party.nucleus.util import errors |
|
|
44 |
from third_party.nucleus.util import proto_utils |
|
|
45 |
|
|
|
46 |
# Flag value that means "downsampling is disabled" for a sample.
NO_DOWNSAMPLING = 0.0

# Position of the matched-normal sample in the ordered sample list. When a
# normal sample is given it is listed first (index 0) and the tumor follows it
# (index 1). The tumor is the "main" sample because the goal is somatic calling.
NORMAL_SAMPLE_INDEX = 0
|
|
54 |
|
|
|
55 |
FLAGS = flags.FLAGS

# Adopt more general flags from make_examples_options.
flags.adopt_module_key_flags(make_examples_options)

# Flags related to samples in DeepSomatic:
_READS_TUMOR = flags.DEFINE_string(
    'reads_tumor',
    None,
    (
        'Required. Reads from the tumor sample. '
        'Aligned, sorted, indexed BAM file. '
        'Should be aligned to a reference genome compatible with --ref. '
        'Can provide multiple BAMs (comma-separated).'
    ),
)
# --reads_normal is optional: this module only sets up a normal sample when
# the flag is given, and it is not in the list of flags marked as required.
_READS_NORMAL = flags.DEFINE_string(
    'reads_normal',
    None,
    (
        'Optional. Reads from the normal matched sample. '
        'Aligned, sorted, indexed BAM file. '
        'Should be aligned to a reference genome compatible with --ref. '
        'Can provide multiple BAMs (comma-separated). '
        'If not provided, only the tumor sample is used.'
    ),
)
|
|
81 |
# Optional explicit sample names; an empty string is the default.
_SAMPLE_NAME_TUMOR = flags.DEFINE_string(
    name='sample_name_tumor',
    default='',
    help=(
        'Sample name for tumor to use for our sample_name in the output'
        ' Variant/DeepVariantCall protos. If not specified, will be inferred'
        ' from the header information from --reads_tumor.'
    ),
)
_SAMPLE_NAME_NORMAL = flags.DEFINE_string(
    name='sample_name_normal',
    default='',
    help=(
        'Sample name for normal match to use for our sample_name in the output'
        ' Variant/DeepVariantCall protos. If not specified, will be inferred'
        ' from the header information from --reads_normal.'
    ),
)

# Per-sample read downsampling; NO_DOWNSAMPLING (the default) turns it off.
_DOWNSAMPLE_FRACTION_TUMOR = flags.DEFINE_float(
    name='downsample_fraction_tumor',
    default=NO_DOWNSAMPLING,
    help=(
        f'If not {NO_DOWNSAMPLING} must be a value between 0.0 and 1.0. '
        'Reads will be kept (randomly) with a probability of '
        'downsample_fraction '
        'from the input tumor sample BAM. This argument makes it easy to '
        'create '
        'examples as though the input BAM had less coverage.'
    ),
)
_DOWNSAMPLE_FRACTION_NORMAL = flags.DEFINE_float(
    name='downsample_fraction_normal',
    default=NO_DOWNSAMPLING,
    help=(
        f'If not {NO_DOWNSAMPLING} must be a value between 0.0 and 1.0. '
        'Reads will be kept (randomly) with a probability of '
        'downsample_fraction '
        'from the input normal matched BAMs. This argument makes it easy to '
        'create '
        'examples as though the input BAMs had less coverage.'
    ),
)

# Height of each sample's portion of the pileup image.
_PILEUP_IMAGE_HEIGHT_TUMOR = flags.DEFINE_integer(
    name='pileup_image_height_tumor',
    default=dv_constants.PILEUP_DEFAULT_HEIGHT,
    help=(
        'Height for the part of the pileup image showing reads from the tumor. '
        f'Uses {dv_constants.PILEUP_DEFAULT_HEIGHT} by default.'
    ),
)
_PILEUP_IMAGE_HEIGHT_NORMAL = flags.DEFINE_integer(
    name='pileup_image_height_normal',
    default=dv_constants.PILEUP_DEFAULT_HEIGHT,
    help=(
        'Height for the part of the pileup image showing reads from the matched'
        f' normal. Uses {dv_constants.PILEUP_DEFAULT_HEIGHT} by default.'
    ),
)

_CANDIDATE_POSITIONS = flags.DEFINE_string(
    name='candidate_positions',
    default='',
    help='Path to the binary file containing candidate positions.',
)

# Override flag defaults that differ for DeepSomatic.
# The multiplier defaults to float('inf') because we don't want to include any
# candidates from the non-target (i.e., normal) sample.
FLAGS.set_default('vsc_min_fraction_multiplier', float('inf'))
|
|
145 |
|
|
|
146 |
|
|
|
147 |
def tumor_normal_samples_from_flags(flags_obj):
  """Builds the per-sample options for the normal (if any) and tumor samples.

  Args:
    flags_obj: Object exposing the flag values as attributes (for example the
      global FLAGS).

  Returns:
    A tuple (samples_in_order, role). samples_in_order is a list of
    deepvariant_pb2.SampleOptions with the normal sample first when
    --reads_normal is set, followed by the tumor sample. role is always the
    string 'tumor'.
  """

  def build_sample(role):
    """Creates the SampleOptions proto for the given role."""
    is_tumor = role == 'tumor'
    if is_tumor:
      height = flags_obj.pileup_image_height_tumor
      reads = flags_obj.reads_tumor
      name_flag = flags_obj.sample_name_tumor
    else:
      height = flags_obj.pileup_image_height_normal
      reads = flags_obj.reads_normal
      name_flag = flags_obj.sample_name_normal

    reads_list = reads.split(',')
    # The un-split, comma-separated string is what gets passed here.
    name = make_examples_core.assign_sample_name(
        sample_name_flag=name_flag,
        reads_filenames=reads,
    )
    vc_options = make_examples_core.make_vc_options(
        sample_name=name, flags_obj=flags_obj
    )

    sample = deepvariant_pb2.SampleOptions(
        role=role,
        name=name,
        variant_caller_options=vc_options,
        # Output generation is skipped for the normal sample only.
        skip_output_generation=(role == 'normal'),
        pileup_height=height,
        reads_filenames=reads_list,
    )

    if is_tumor:
      # Only the tumor sample gets an explicit sample order.
      has_both = flags_obj.reads_normal and flags_obj.reads_tumor
      sample.order.extend([0, 1] if has_both else [0])

    if is_tumor:
      fraction = flags_obj.downsample_fraction_tumor
    else:
      fraction = flags_obj.downsample_fraction_normal
    if fraction != NO_DOWNSAMPLING:
      sample.downsample_fraction = fraction
    return sample

  # The normal sample, when present, always precedes the tumor sample.
  roles = ['normal', 'tumor'] if flags_obj.reads_normal else ['tumor']
  return [build_sample(role) for role in roles], 'tumor'
|
|
203 |
|
|
|
204 |
|
|
|
205 |
def default_options(main_sample_index, add_flags=True, flags_obj=None):
  """Creates a MakeExamplesOptions proto populated with reasonable defaults.

  Args:
    main_sample_index: int. Indicates the position of the tumor sample.
    add_flags: bool. defaults to True. If True, we will push the value of
      certain FLAGS into our options. If False, those option fields are left
      uninitialized.
    flags_obj: object. If not None, use as the source of flags, else use global
      FLAGS.

  Returns:
    deepvariant_pb2.MakeExamplesOptions protobuf.

  Raises:
    ValueError: If we observe invalid flag values.
  """
  # Only fall back to the global FLAGS when no flags object was passed at all.
  if flags_obj is None:
    flags_obj = FLAGS

  samples_in_order, sample_role_to_train = tumor_normal_samples_from_flags(
      flags_obj=flags_obj
  )
  samples_in_order[main_sample_index].candidate_positions = (
      flags_obj.candidate_positions
  )

  options = make_examples_options.shared_flags_to_options(
      add_flags=add_flags,
      flags_obj=flags_obj,
      samples_in_order=samples_in_order,
      sample_role_to_train=sample_role_to_train,
      main_sample_index=main_sample_index,
  )

  # Decide from flags_obj (not the module-level flag holder) so bam_fname
  # always agrees with the samples built above from the same flags_obj.
  tumor_basename = os.path.basename(flags_obj.reads_tumor)
  if flags_obj.reads_normal:
    normal_basename = os.path.basename(flags_obj.reads_normal)
    options.bam_fname = f'{tumor_basename}|{normal_basename}'
  else:
    options.bam_fname = tumor_basename

  return options
|
|
246 |
|
|
|
247 |
|
|
|
248 |
def check_options_are_valid(options, main_sample_index):
  """Checks that all the options chosen make sense together.

  Besides validating, this may modify `options` in place: when the main sample
  has candidate_positions set, max_reads_per_partition is moved into
  max_reads_for_dynamic_bases_per_region and then reset to 0.

  Args:
    options: Options object exposing sample_options, max_reads_per_partition
      and max_reads_for_dynamic_bases_per_region.
    main_sample_index: Index of the tumor sample within options.sample_options.

  Raises:
    errors.CommandLineError: Requested via errors.log_and_raise when the tumor
      and normal samples have the same sample name.
  """

  # Check for general flags (shared for DeepVariant and DeepTrio).
  make_examples_options.check_options_are_valid(
      options, main_sample_index=main_sample_index
  )

  tumor = options.sample_options[main_sample_index]
  # The normal sample is only looked at when --reads_normal was given.
  if _READS_NORMAL.value:
    normal = options.sample_options[NORMAL_SAMPLE_INDEX]

    if (
        tumor.variant_caller_options.sample_name
        == normal.variant_caller_options.sample_name
    ):
      errors.log_and_raise(
          (
              'Sample names of tumor and normal samples cannot be the same. Use'
              ' --sample_name_tumor and --sample_name_normal with different'
              ' names '
          ),
          errors.CommandLineError,
      )

  if options.sample_options[main_sample_index].candidate_positions:
    # Warn only when a non-zero max_reads_per_partition is being replaced.
    if options.max_reads_per_partition:
      logging.warning(
          'Since candidate_positions is set, we use '
          'max_reads_for_dynamic_bases_per_region instead of '
          'max_reads_per_partition. This is due to the dynamic nature of the '
          'partition size when candidate_positions is enabled, making a fixed '
          'max_reads_per_partition unsuitable.'
      )
    # NOTE(review): indentation was lost in the source this was recovered from;
    # the two assignments below are assumed to sit directly under the
    # candidate_positions check rather than under the warning's `if` - confirm.
    options.max_reads_for_dynamic_bases_per_region = (
        options.max_reads_per_partition
    )
    options.max_reads_per_partition = 0
|
|
286 |
|
|
|
287 |
|
|
|
288 |
def main(argv=()):
  """Entry point: validates the command line, builds options and runs.

  Args:
    argv: Command-line arguments left over after flag parsing. More than one
      element means positional arguments were given, which is rejected.
  """
  with errors.clean_commandline_error_exit():
    if len(argv) > 1:
      errors.log_and_raise(
          'Command line parsing failure: make_examples does not accept '
          'positional arguments but some are present on the command line: '
          f'"{argv}".',
          errors.CommandLineError,
      )
    del argv  # Unused.

    proto_utils.uses_fast_cpp_protos_or_die()

    logging_level.set_from_flag()
    hts_verbose.set(hts_verbose.htsLogLevel.HTS_LOG_WARNING)

    # Set up options; may do I/O.
    # With no normal reads the tumor is the only sample, so it sits at index 0;
    # otherwise the normal comes first and the tumor is at index 1.
    tumor_only = _READS_TUMOR.value and not _READS_NORMAL.value
    if tumor_only:
      main_sample_index = 0
    else:
      main_sample_index = 1
    options = default_options(main_sample_index, flags_obj=FLAGS)
    check_options_are_valid(options, main_sample_index)

    # Run!
    make_examples_core.make_examples_runner(options)
|
|
312 |
|
|
|
313 |
|
|
|
314 |
if __name__ == '__main__':
  # These flags are mandatory only when the module is run as a script.
  flags.mark_flags_as_required(['examples', 'mode', 'reads_tumor', 'ref'])
  app.run(main)