|
a |
|
b/deepvariant/realigner/window_selector.py |
|
|
1 |
# Copyright 2017 Google LLC. |
|
|
2 |
# |
|
|
3 |
# Redistribution and use in source and binary forms, with or without |
|
|
4 |
# modification, are permitted provided that the following conditions |
|
|
5 |
# are met: |
|
|
6 |
# |
|
|
7 |
# 1. Redistributions of source code must retain the above copyright notice, |
|
|
8 |
# this list of conditions and the following disclaimer. |
|
|
9 |
# |
|
|
10 |
# 2. Redistributions in binary form must reproduce the above copyright |
|
|
11 |
# notice, this list of conditions and the following disclaimer in the |
|
|
12 |
# documentation and/or other materials provided with the distribution. |
|
|
13 |
# |
|
|
14 |
# 3. Neither the name of the copyright holder nor the names of its |
|
|
15 |
# contributors may be used to endorse or promote products derived from this |
|
|
16 |
# software without specific prior written permission. |
|
|
17 |
# |
|
|
18 |
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
|
19 |
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
|
20 |
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
|
21 |
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
|
|
22 |
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
|
23 |
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
|
24 |
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
|
25 |
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
|
26 |
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
|
27 |
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
|
28 |
# POSSIBILITY OF SUCH DAMAGE. |
|
|
29 |
"""Determine genomic ranges to perform local assembly.""" |
|
|
30 |
|
|
|
31 |
from third_party.nucleus.protos import reads_pb2 |
|
|
32 |
from third_party.nucleus.util import ranges |
|
|
33 |
from deepvariant.protos import deepvariant_pb2 |
|
|
34 |
from deepvariant.protos import realigner_pb2 |
|
|
35 |
from deepvariant.python import allelecounter |
|
|
36 |
from deepvariant.realigner.python import window_selector as cpp_window_selector |
|
|
37 |
|
|
|
38 |
|
|
|
39 |
def _candidates_from_reads(config, ref_reader, reads, region): |
|
|
40 |
"""Returns a list of candidate positions. |
|
|
41 |
|
|
|
42 |
Args: |
|
|
43 |
config: learning.genomics.deepvariant.realigner.WindowSelectorOptions |
|
|
44 |
options determining the behavior of this window selector. |
|
|
45 |
ref_reader: GenomeReference. Indexed reference genome to query bases. |
|
|
46 |
reads: list[nucleus.protos.Read]. The reads we are processing into candidate |
|
|
47 |
positions. |
|
|
48 |
region: nucleus.protos.Range. The region we are processing. |
|
|
49 |
|
|
|
50 |
Returns: |
|
|
51 |
A list. The elements are reference positions within region. |
|
|
52 |
|
|
|
53 |
Raises: |
|
|
54 |
ValueError: if config.window_selector_model.model_type isn't a valid enum |
|
|
55 |
name in realigner_pb2.WindowSelectorModel.ModelType. |
|
|
56 |
""" |
|
|
57 |
allele_counter_options = deepvariant_pb2.AlleleCounterOptions( |
|
|
58 |
read_requirements=reads_pb2.ReadRequirements( |
|
|
59 |
min_mapping_quality=config.min_mapq, |
|
|
60 |
min_base_quality=config.min_base_quality), |
|
|
61 |
keep_legacy_behavior=config.keep_legacy_behavior) |
|
|
62 |
expanded_region = ranges.expand( |
|
|
63 |
region, |
|
|
64 |
config.region_expansion_in_bp, |
|
|
65 |
contig_map=ranges.contigs_dict(ref_reader.header.contigs)) |
|
|
66 |
|
|
|
67 |
allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader, |
|
|
68 |
expanded_region, [], |
|
|
69 |
allele_counter_options) |
|
|
70 |
|
|
|
71 |
for read in reads: |
|
|
72 |
allele_counter.add(read, 'placeholder_sample_id') |
|
|
73 |
|
|
|
74 |
model_type = config.window_selector_model.model_type |
|
|
75 |
if model_type == realigner_pb2.WindowSelectorModel.VARIANT_READS: |
|
|
76 |
return _variant_reads_threshold_selector( |
|
|
77 |
allele_counter, config.window_selector_model.variant_reads_model, |
|
|
78 |
expanded_region) |
|
|
79 |
elif model_type == realigner_pb2.WindowSelectorModel.ALLELE_COUNT_LINEAR: |
|
|
80 |
return _allele_count_linear_selector( |
|
|
81 |
allele_counter, config.window_selector_model.allele_count_linear_model, |
|
|
82 |
expanded_region) |
|
|
83 |
else: |
|
|
84 |
raise ValueError('Unknown enum option "{}" for ' |
|
|
85 |
'WindowSelectorModel.model_type'.format( |
|
|
86 |
config.window_selector_model.model_type)) |
|
|
87 |
|
|
|
88 |
|
|
|
89 |
def _variant_reads_threshold_selector(allele_counter, model_conf, |
|
|
90 |
expanded_region): |
|
|
91 |
"""Returns a list of candidate positions. |
|
|
92 |
|
|
|
93 |
Following cigar operations generate candidate position: |
|
|
94 |
- ALIGNMENT_MATCH, SEQUENCE_MISMATCH, SEQUENCE_MATCH: at mismatch positions |
|
|
95 |
in the read when compared to the reference sequence. |
|
|
96 |
- DELETE: at positions within [cigar_start, cigar_start + cigar_len) |
|
|
97 |
- INSERT, CLIP_SOFT: at positions within |
|
|
98 |
[cigar_start - cigar_len, cigar_start + cigar_len) |
|
|
99 |
|
|
|
100 |
Note. Function implementation has changed to return positions beyond input |
|
|
101 |
region in case we have variants there. See the change at internal and |
|
|
102 |
internal. |
|
|
103 |
|
|
|
104 |
Args: |
|
|
105 |
allele_counter: learning.genomics.deepvariant.realigner.AlleleCounter in the |
|
|
106 |
considered region. |
|
|
107 |
model_conf: learning.genomics.deepvariant.realigner |
|
|
108 |
.WindowSelectorOptions.VariantReadsThresholdModel options determining the |
|
|
109 |
behavior of this window selector. |
|
|
110 |
expanded_region: nucleus.protos.Range. The region we are processing. |
|
|
111 |
|
|
|
112 |
Returns: |
|
|
113 |
A list. The elements are reference positions within region. |
|
|
114 |
""" |
|
|
115 |
|
|
|
116 |
counts_vec = cpp_window_selector.variant_reads_candidates_from_allele_counter( |
|
|
117 |
allele_counter) |
|
|
118 |
|
|
|
119 |
return [ |
|
|
120 |
expanded_region.start + i |
|
|
121 |
for i, count in enumerate(counts_vec) |
|
|
122 |
if (count >= model_conf.min_num_supporting_reads and |
|
|
123 |
count <= model_conf.max_num_supporting_reads) |
|
|
124 |
] |
|
|
125 |
|
|
|
126 |
|
|
|
127 |
def _allele_count_linear_selector(allele_counter, model_conf, expanded_region): |
|
|
128 |
"""Returns a list of candidate positions. |
|
|
129 |
|
|
|
130 |
Candidate positions for realignment are generated by scoring each location. |
|
|
131 |
The score at a location is a weighted sum of the number of reads with each |
|
|
132 |
CIGAR operation at the location, where the weights are determined by the model |
|
|
133 |
coefficients. Locations whose score exceed the model decision boundary value |
|
|
134 |
are used to create realignment windows. |
|
|
135 |
|
|
|
136 |
Note. Function implementation has changed to return positions beyond input |
|
|
137 |
region in case we have variants there. See the change at internal and |
|
|
138 |
internal. |
|
|
139 |
|
|
|
140 |
Args: |
|
|
141 |
allele_counter: learning.genomics.deepvariant.realigner.AlleleCounter in the |
|
|
142 |
considered region. |
|
|
143 |
model_conf: learning.genomics.deepvariant.realigner |
|
|
144 |
.WindowSelectorOptions.AlleleCountLinearModel options determining the |
|
|
145 |
behavior of this window selector. |
|
|
146 |
expanded_region: nucleus.protos.Range. The region we are processing. |
|
|
147 |
|
|
|
148 |
Returns: |
|
|
149 |
A list. The elements are reference positions within region. |
|
|
150 |
""" |
|
|
151 |
|
|
|
152 |
scores_vec = ( |
|
|
153 |
cpp_window_selector.allele_count_linear_candidates_from_allele_counter( |
|
|
154 |
allele_counter, model_conf)) |
|
|
155 |
|
|
|
156 |
return [ |
|
|
157 |
expanded_region.start + i |
|
|
158 |
for i, score in enumerate(scores_vec) |
|
|
159 |
if score > model_conf.decision_boundary |
|
|
160 |
] |
|
|
161 |
|
|
|
162 |
|
|
|
163 |
def _candidates_to_windows(config, candidate_pos, ref_name): |
|
|
164 |
""""Process candidate positions to determine windows for local assembly. |
|
|
165 |
|
|
|
166 |
Windows are within range of |
|
|
167 |
[min(pos) - config.min_windows_distance, |
|
|
168 |
max(pos) + config.min_windows_distance) |
|
|
169 |
|
|
|
170 |
Args: |
|
|
171 |
config: learning.genomics.deepvariant.realigner.WindowSelectorOptions |
|
|
172 |
options determining the behavior of this window selector. |
|
|
173 |
candidate_pos: A list of ref_pos. |
|
|
174 |
ref_name: Reference name, used in setting the output |
|
|
175 |
genomics.range.reference_name value. |
|
|
176 |
|
|
|
177 |
Returns: |
|
|
178 |
A sorted list of nucleus.protos.Range protos for all windows in this region. |
|
|
179 |
""" |
|
|
180 |
windows = [] |
|
|
181 |
|
|
|
182 |
def _add_window(start_pos, end_pos): |
|
|
183 |
windows.append( |
|
|
184 |
ranges.make_range(ref_name, start_pos - config.min_windows_distance, |
|
|
185 |
end_pos + config.min_windows_distance)) |
|
|
186 |
|
|
|
187 |
start_pos, end_pos = None, None |
|
|
188 |
for pos in sorted(candidate_pos): |
|
|
189 |
if start_pos is None: |
|
|
190 |
start_pos = pos |
|
|
191 |
end_pos = pos |
|
|
192 |
# We need to check if the previous end_pos is within 2*window_distance as we |
|
|
193 |
# generate a window of radius window_distance around each position. |
|
|
194 |
# |
|
|
195 |
# <-------end_pos-------> |
|
|
196 |
# <-------pos-------> |
|
|
197 |
# where window_distance = -------> |
|
|
198 |
# |
|
|
199 |
# If this is the case, we need to merge the two windows. |
|
|
200 |
elif pos > end_pos + 2 * config.min_windows_distance: |
|
|
201 |
_add_window(start_pos, end_pos) |
|
|
202 |
start_pos = pos |
|
|
203 |
end_pos = pos |
|
|
204 |
else: |
|
|
205 |
end_pos = pos |
|
|
206 |
if start_pos is not None: |
|
|
207 |
_add_window(start_pos, end_pos) |
|
|
208 |
|
|
|
209 |
return sorted(windows, key=ranges.as_tuple) |
|
|
210 |
|
|
|
211 |
|
|
|
212 |
def select_windows(config, ref_reader, reads, region): |
|
|
213 |
""""Process reads to determine candidate windows for local assembly. |
|
|
214 |
|
|
|
215 |
Windows are within range of |
|
|
216 |
[0 - config.min_windows_distance, ref_len + config.min_windows_distance) |
|
|
217 |
|
|
|
218 |
Args: |
|
|
219 |
config: learning.genomics.deepvariant.realigner.WindowSelectorOptions |
|
|
220 |
options determining the behavior of this window selector. |
|
|
221 |
ref_reader: GenomeReference. Indexed reference genome to query bases. |
|
|
222 |
reads: A list of genomics.Read records. |
|
|
223 |
region: nucleus.protos.Range. The region we are processing. |
|
|
224 |
|
|
|
225 |
Returns: |
|
|
226 |
A list of nucleus.protos.Range protos sorted by their genomic position. |
|
|
227 |
""" |
|
|
228 |
# This is a fast path for the case where we have no reads, so we have no |
|
|
229 |
# windows to assemble. |
|
|
230 |
if not reads: |
|
|
231 |
return [] |
|
|
232 |
|
|
|
233 |
candidates = _candidates_from_reads(config, ref_reader, reads, region) |
|
|
234 |
return _candidates_to_windows(config, candidates, region.reference_name) |