Switch to unified view

a b/deepvariant/realigner/window_selector.py
1
# Copyright 2017 Google LLC.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions
5
# are met:
6
#
7
# 1. Redistributions of source code must retain the above copyright notice,
8
#    this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
#    notice, this list of conditions and the following disclaimer in the
12
#    documentation and/or other materials provided with the distribution.
13
#
14
# 3. Neither the name of the copyright holder nor the names of its
15
#    contributors may be used to endorse or promote products derived from this
16
#    software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
# POSSIBILITY OF SUCH DAMAGE.
29
"""Determine genomic ranges to perform local assembly."""
30
31
from third_party.nucleus.protos import reads_pb2
32
from third_party.nucleus.util import ranges
33
from deepvariant.protos import deepvariant_pb2
34
from deepvariant.protos import realigner_pb2
35
from deepvariant.python import allelecounter
36
from deepvariant.realigner.python import window_selector as cpp_window_selector
37
38
39
def _candidates_from_reads(config, ref_reader, reads, region):
40
  """Returns a list of candidate positions.
41
42
  Args:
43
    config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
44
      options determining the behavior of this window selector.
45
    ref_reader: GenomeReference. Indexed reference genome to query bases.
46
    reads: list[nucleus.protos.Read]. The reads we are processing into candidate
47
      positions.
48
    region: nucleus.protos.Range. The region we are processing.
49
50
  Returns:
51
    A list. The elements are reference positions within region.
52
53
  Raises:
54
    ValueError: if config.window_selector_model.model_type isn't a valid enum
55
    name in realigner_pb2.WindowSelectorModel.ModelType.
56
  """
57
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
58
      read_requirements=reads_pb2.ReadRequirements(
59
          min_mapping_quality=config.min_mapq,
60
          min_base_quality=config.min_base_quality),
61
      keep_legacy_behavior=config.keep_legacy_behavior)
62
  expanded_region = ranges.expand(
63
      region,
64
      config.region_expansion_in_bp,
65
      contig_map=ranges.contigs_dict(ref_reader.header.contigs))
66
67
  allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader,
68
                                               expanded_region, [],
69
                                               allele_counter_options)
70
71
  for read in reads:
72
    allele_counter.add(read, 'placeholder_sample_id')
73
74
  model_type = config.window_selector_model.model_type
75
  if model_type == realigner_pb2.WindowSelectorModel.VARIANT_READS:
76
    return _variant_reads_threshold_selector(
77
        allele_counter, config.window_selector_model.variant_reads_model,
78
        expanded_region)
79
  elif model_type == realigner_pb2.WindowSelectorModel.ALLELE_COUNT_LINEAR:
80
    return _allele_count_linear_selector(
81
        allele_counter, config.window_selector_model.allele_count_linear_model,
82
        expanded_region)
83
  else:
84
    raise ValueError('Unknown enum option "{}" for '
85
                     'WindowSelectorModel.model_type'.format(
86
                         config.window_selector_model.model_type))
87
88
89
def _variant_reads_threshold_selector(allele_counter, model_conf,
90
                                      expanded_region):
91
  """Returns a list of candidate positions.
92
93
  Following cigar operations generate candidate position:
94
    - ALIGNMENT_MATCH, SEQUENCE_MISMATCH, SEQUENCE_MATCH: at mismatch positions
95
      in the read when compared to the reference sequence.
96
    - DELETE: at positions within [cigar_start, cigar_start + cigar_len)
97
    - INSERT, CLIP_SOFT: at positions within
98
        [cigar_start - cigar_len, cigar_start + cigar_len)
99
100
   Note. Function implementation has changed to return positions beyond input
101
   region in case we have variants there. See the change at internal and
102
   internal.
103
104
  Args:
105
    allele_counter: learning.genomics.deepvariant.realigner.AlleleCounter in the
106
      considered region.
107
    model_conf: learning.genomics.deepvariant.realigner
108
      .WindowSelectorOptions.VariantReadsThresholdModel options determining the
109
      behavior of this window selector.
110
    expanded_region: nucleus.protos.Range. The region we are processing.
111
112
  Returns:
113
    A list. The elements are reference positions within region.
114
  """
115
116
  counts_vec = cpp_window_selector.variant_reads_candidates_from_allele_counter(
117
      allele_counter)
118
119
  return [
120
      expanded_region.start + i
121
      for i, count in enumerate(counts_vec)
122
      if (count >= model_conf.min_num_supporting_reads and
123
          count <= model_conf.max_num_supporting_reads)
124
  ]
125
126
127
def _allele_count_linear_selector(allele_counter, model_conf, expanded_region):
128
  """Returns a list of candidate positions.
129
130
  Candidate positions for realignment are generated by scoring each location.
131
  The score at a location is a weighted sum of the number of reads with each
132
  CIGAR operation at the location, where the weights are determined by the model
133
  coefficients. Locations whose score exceed the model decision boundary value
134
  are used to create realignment windows.
135
136
  Note. Function implementation has changed to return positions beyond input
137
  region in case we have variants there. See the change at internal and
138
  internal.
139
140
  Args:
141
    allele_counter: learning.genomics.deepvariant.realigner.AlleleCounter in the
142
      considered region.
143
    model_conf: learning.genomics.deepvariant.realigner
144
      .WindowSelectorOptions.AlleleCountLinearModel options determining the
145
      behavior of this window selector.
146
    expanded_region: nucleus.protos.Range. The region we are processing.
147
148
  Returns:
149
    A list. The elements are reference positions within region.
150
  """
151
152
  scores_vec = (
153
      cpp_window_selector.allele_count_linear_candidates_from_allele_counter(
154
          allele_counter, model_conf))
155
156
  return [
157
      expanded_region.start + i
158
      for i, score in enumerate(scores_vec)
159
      if score > model_conf.decision_boundary
160
  ]
161
162
163
def _candidates_to_windows(config, candidate_pos, ref_name):
164
  """"Process candidate positions to determine windows for local assembly.
165
166
  Windows are within range of
167
    [min(pos) - config.min_windows_distance,
168
     max(pos) + config.min_windows_distance)
169
170
  Args:
171
    config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
172
      options determining the behavior of this window selector.
173
    candidate_pos: A list of ref_pos.
174
    ref_name: Reference name, used in setting the output
175
      genomics.range.reference_name value.
176
177
  Returns:
178
    A sorted list of nucleus.protos.Range protos for all windows in this region.
179
  """
180
  windows = []
181
182
  def _add_window(start_pos, end_pos):
183
    windows.append(
184
        ranges.make_range(ref_name, start_pos - config.min_windows_distance,
185
                          end_pos + config.min_windows_distance))
186
187
  start_pos, end_pos = None, None
188
  for pos in sorted(candidate_pos):
189
    if start_pos is None:
190
      start_pos = pos
191
      end_pos = pos
192
    # We need to check if the previous end_pos is within 2*window_distance as we
193
    # generate a window of radius window_distance around each position.
194
    #
195
    #   <-------end_pos------->
196
    #                          <-------pos------->
197
    # where window_distance = ------->
198
    #
199
    # If this is the case, we need to merge the two windows.
200
    elif pos > end_pos + 2 * config.min_windows_distance:
201
      _add_window(start_pos, end_pos)
202
      start_pos = pos
203
      end_pos = pos
204
    else:
205
      end_pos = pos
206
  if start_pos is not None:
207
    _add_window(start_pos, end_pos)
208
209
  return sorted(windows, key=ranges.as_tuple)
210
211
212
def select_windows(config, ref_reader, reads, region):
213
  """"Process reads to determine candidate windows for local assembly.
214
215
  Windows are within range of
216
    [0 - config.min_windows_distance, ref_len + config.min_windows_distance)
217
218
  Args:
219
    config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
220
      options determining the behavior of this window selector.
221
    ref_reader: GenomeReference. Indexed reference genome to query bases.
222
    reads: A list of genomics.Read records.
223
    region: nucleus.protos.Range. The region we are processing.
224
225
  Returns:
226
    A list of nucleus.protos.Range protos sorted by their genomic position.
227
  """
228
  # This is a fast path for the case where we have no reads, so we have no
229
  # windows to assemble.
230
  if not reads:
231
    return []
232
233
  candidates = _candidates_from_reads(config, ref_reader, reads, region)
234
  return _candidates_to_windows(config, candidates, region.reference_name)