|
a |
|
b/third_party/nucleus/util/ranges.py |
|
|
1 |
# Copyright 2018 Google LLC. |
|
|
2 |
# |
|
|
3 |
# Redistribution and use in source and binary forms, with or without |
|
|
4 |
# modification, are permitted provided that the following conditions |
|
|
5 |
# are met: |
|
|
6 |
# |
|
|
7 |
# 1. Redistributions of source code must retain the above copyright notice, |
|
|
8 |
# this list of conditions and the following disclaimer. |
|
|
9 |
# |
|
|
10 |
# 2. Redistributions in binary form must reproduce the above copyright |
|
|
11 |
# notice, this list of conditions and the following disclaimer in the |
|
|
12 |
# documentation and/or other materials provided with the distribution. |
|
|
13 |
# |
|
|
14 |
# 3. Neither the name of the copyright holder nor the names of its |
|
|
15 |
# contributors may be used to endorse or promote products derived from this |
|
|
16 |
# software without specific prior written permission. |
|
|
17 |
# |
|
|
18 |
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
|
19 |
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
|
20 |
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
|
21 |
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
|
|
22 |
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
|
23 |
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
|
24 |
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
|
25 |
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
|
26 |
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
|
27 |
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
|
28 |
# POSSIBILITY OF SUCH DAMAGE. |
|
|
29 |
"""Utilities for Range overlap detection.""" |
|
|
30 |
|
|
|
31 |
from __future__ import absolute_import |
|
|
32 |
from __future__ import division |
|
|
33 |
from __future__ import print_function |
|
|
34 |
|
|
|
35 |
import collections |
|
|
36 |
import re |
|
|
37 |
from typing import Dict, Iterable, Optional, Sequence |
|
|
38 |
|
|
|
39 |
from absl import logging |
|
|
40 |
from etils import epath |
|
|
41 |
import intervaltree |
|
|
42 |
import six |
|
|
43 |
|
|
|
44 |
from third_party.nucleus.io import bed |
|
|
45 |
from third_party.nucleus.protos import position_pb2 |
|
|
46 |
from third_party.nucleus.protos import range_pb2 |
|
|
47 |
from third_party.nucleus.protos import reference_pb2 |
|
|
48 |
from third_party.nucleus.protos import variants_pb2 |
|
|
49 |
|
|
|
50 |
|
|
|
51 |
# Regular expressions for matching literal chr:start-stop strings. |
|
|
52 |
_REGION_LITERAL_REGEXP = re.compile(r'^(\S+):([0-9,]+)-([0-9,]+)$') |
|
|
53 |
|
|
|
54 |
# Regular expressions for matching literal chr:start strings. |
|
|
55 |
_POSITION_LITERAL_REGEXP = re.compile(r'^(\S+):([0-9,]+)$') |
|
|
56 |
|
|
|
57 |
# Logging frequency when building our rangeset objects, which can take some time |
|
|
58 |
# to complete. Rather than just pausing for a few minutes, we provide an update |
|
|
59 |
# logging message every _LOG_EVERY_N_RANGES_IN_RANGESET_INIT records added. See |
|
|
60 |
# internal for more information. |
|
|
61 |
_LOG_EVERY_N_RANGES_IN_RANGESET_INIT = 250000 |
|
|
62 |
|
|
|
63 |
|
|
|
64 |
class RangeSet(object): |
|
|
65 |
"""Fast overlap detection of a genomic position against a database of Ranges. |
|
|
66 |
|
|
|
67 |
Enables O(log n) computation of whether a point chr:pos falls within one of a |
|
|
68 |
large number of genomic ranges. |
|
|
69 |
|
|
|
70 |
This class does not supports overlapping or adjacent intervals. Any such |
|
|
71 |
intervals will be automatically merged together in the constructor. |
|
|
72 |
|
|
|
73 |
This class is immutable. No methods should be added that directly modify the |
|
|
74 |
ranges held by the class. |
|
|
75 |
""" |
|
|
76 |
|
|
|
77 |
def __init__(self, ranges=None, contigs=None, quiet=False): |
|
|
78 |
"""Creates a RangeSet backed by ranges. |
|
|
79 |
|
|
|
80 |
Note that the Range objects in ranges are *not* stored directly here, so |
|
|
81 |
they can safely be modified after they are passed to this RangeSet. |
|
|
82 |
|
|
|
83 |
Args: |
|
|
84 |
ranges: list(nucleus.genomics.v1.Range) protos (or anything with |
|
|
85 |
reference_name, start, and end properties following the Range |
|
|
86 |
convention). If None, no ranges will be used, and overlaps() will always |
|
|
87 |
return False. |
|
|
88 |
contigs: list(nucleus.genomics.v1.ContigInfo) protos. Used to define the |
|
|
89 |
iteration order over contigs (i.e., by contig.pos_in_fasta). If this |
|
|
90 |
list is not provided, the iteration order will be determined by the |
|
|
91 |
alphabetical order of the contig names. |
|
|
92 |
quiet: bool; defaults to False: If False, we will emit a logging message |
|
|
93 |
every _LOG_EVERY_N_RANGES_IN_RANGESET_INIT records processed while |
|
|
94 |
building this intervaltree. Set to True to stop all of the logging. |
|
|
95 |
|
|
|
96 |
Raises: |
|
|
97 |
ValueError: if any range's reference_name does not correspond to any |
|
|
98 |
contig in `contigs`. |
|
|
99 |
""" |
|
|
100 |
if contigs is not None: |
|
|
101 |
self._contigs = contigs |
|
|
102 |
self._contig_map = contigs_dict(contigs) |
|
|
103 |
self._contig_sort_key_fn = lambda name: self._contig_map[ |
|
|
104 |
name |
|
|
105 |
].pos_in_fasta |
|
|
106 |
self._is_valid_contig = lambda name: name in self._contig_map |
|
|
107 |
else: |
|
|
108 |
self._contigs = None |
|
|
109 |
self._contig_map = None |
|
|
110 |
self._contig_sort_key_fn = lambda name: name |
|
|
111 |
self._is_valid_contig = lambda name: True |
|
|
112 |
|
|
|
113 |
if ranges is None: |
|
|
114 |
ranges = [] |
|
|
115 |
|
|
|
116 |
# Add each range to our contig-specific intervaltrees. |
|
|
117 |
self._by_chr = collections.defaultdict(intervaltree.IntervalTree) |
|
|
118 |
for i, range_ in enumerate(ranges): |
|
|
119 |
if not self._is_valid_contig(range_.reference_name): |
|
|
120 |
raise ValueError( |
|
|
121 |
'Range {} is on an unrecognized contig.'.format(range_) |
|
|
122 |
) |
|
|
123 |
self._by_chr[range_.reference_name].addi(range_.start, range_.end, None) |
|
|
124 |
if not quiet and i > 0 and i % _LOG_EVERY_N_RANGES_IN_RANGESET_INIT == 0: |
|
|
125 |
# We do our test directly here on i > 0 so we only see the log messages |
|
|
126 |
# if we add at least _LOG_EVERY_N_RANGES_IN_RANGESET_INIT records. |
|
|
127 |
logging.info('Adding interval %s to intervaltree', to_literal(range_)) |
|
|
128 |
|
|
|
129 |
# Merge overlapping / adjacent intervals in each tree. |
|
|
130 |
for tree in six.itervalues(self._by_chr): |
|
|
131 |
tree.merge_overlaps(strict=False) |
|
|
132 |
|
|
|
133 |
def __iter__(self): |
|
|
134 |
"""Iterate over the ranges in this RangeSet. |
|
|
135 |
|
|
|
136 |
Yields: |
|
|
137 |
Each range of this RangeSet, in sorted order (by chromosome, then start |
|
|
138 |
end positions). Relative ordering of chromosomes is defined by the |
|
|
139 |
contig.pos_in_fasta integer key for the associated contig. These objects |
|
|
140 |
are new range protos so can be freely modified. |
|
|
141 |
""" |
|
|
142 |
for refname in sorted( |
|
|
143 |
six.iterkeys(self._by_chr), key=self._contig_sort_key_fn |
|
|
144 |
): |
|
|
145 |
for start, end, _ in sorted(self._by_chr[refname]): |
|
|
146 |
yield make_range(refname, start, end) |
|
|
147 |
|
|
|
148 |
@classmethod |
|
|
149 |
def from_regions( |
|
|
150 |
cls, |
|
|
151 |
regions: Sequence[str], |
|
|
152 |
# TODO: Use X | None instead. |
|
|
153 |
contig_map: Optional[Dict[str, reference_pb2.ContigInfo]] = None, |
|
|
154 |
) -> 'RangeSet': |
|
|
155 |
"""Parses a command-line style literal regions flag into a RangeSet. |
|
|
156 |
|
|
|
157 |
Args: |
|
|
158 |
regions: An iterable or None. If not None, regions will be parsed with |
|
|
159 |
ranges.from_regions. |
|
|
160 |
contig_map: An optional dictionary mapping from contig names to ContigInfo |
|
|
161 |
protobufs. If provided, allows literals of the format "contig_name", |
|
|
162 |
which will be parsed into a Range with reference_name=contig_name, |
|
|
163 |
start=0, end=n_bases where n_bases comes from the ContigInfo; |
|
|
164 |
additionally the sort order of the RangeSet will be determined by |
|
|
165 |
contig.pos_in_fasta. |
|
|
166 |
|
|
|
167 |
Returns: |
|
|
168 |
A RangeSet object. |
|
|
169 |
""" |
|
|
170 |
if regions is None: |
|
|
171 |
return cls(ranges=[]) |
|
|
172 |
else: |
|
|
173 |
return cls(ranges=from_regions(regions, contig_map=contig_map)) |
|
|
174 |
|
|
|
175 |
@classmethod |
|
|
176 |
def from_contigs( |
|
|
177 |
cls, contigs: Sequence[reference_pb2.ContigInfo] |
|
|
178 |
) -> 'RangeSet': |
|
|
179 |
"""Creates a RangeSet with an interval covering each base of each contig.""" |
|
|
180 |
return cls( |
|
|
181 |
(make_range(contig.name, 0, contig.n_bases) for contig in contigs), |
|
|
182 |
contigs, |
|
|
183 |
) |
|
|
184 |
|
|
|
185 |
@classmethod |
|
|
186 |
def from_bed( |
|
|
187 |
cls, source, contigs=None, intersect_ranges=None, enable_logging=True |
|
|
188 |
): |
|
|
189 |
"""Creates a RangeSet containing the intervals from source. |
|
|
190 |
|
|
|
191 |
Args: |
|
|
192 |
source: A path to a BED (or equivalent) file of intervals. |
|
|
193 |
contigs: An optional list of ContigInfo proto, used by RangeSet |
|
|
194 |
constructor. |
|
|
195 |
intersect_ranges: An optional list of RangeSet objects to intersect with |
|
|
196 |
the intervals in the BED file before creating the RangeSet. |
|
|
197 |
enable_logging: Enables logging line while reading the file. |
|
|
198 |
|
|
|
199 |
Returns: |
|
|
200 |
A RangeSet. |
|
|
201 |
""" |
|
|
202 |
return cls(bed_parser(source, intersect_ranges, enable_logging), contigs) |
|
|
203 |
|
|
|
204 |
def intersection(self, *others: 'RangeSet') -> 'RangeSet': |
|
|
205 |
"""Computes the intersection among this RangeSet and *others RangeSets. |
|
|
206 |
|
|
|
207 |
This function computes the intersection of all of the intervals in self and |
|
|
208 |
*others, returning a RangeSet containing only intervals common to all. The |
|
|
209 |
intersection here is an ranged intersection, not an identity intersection, |
|
|
210 |
so the resulting set of intervals may not contain any of the original |
|
|
211 |
intervals in any of the sets. |
|
|
212 |
|
|
|
213 |
To be concrete, suppose we have three sets to intersect, each having two |
|
|
214 |
intervals: |
|
|
215 |
|
|
|
216 |
self : chr1:1-10, chr2:20-30 |
|
|
217 |
other1 : chr1:5-8, chr3:10-40 |
|
|
218 |
other2 : chr1:3-7, chr3:10-30 |
|
|
219 |
|
|
|
220 |
self.intersection(other1, other2) produces a RangeSet with one interval |
|
|
221 |
chr1:5-7, the common bases on chr1 in self, other1, and other2. No intervals |
|
|
222 |
on chr2 or chr3 are included since the chr2 only occurs in self and the two |
|
|
223 |
intervals on chr3, despite having some shared bases, don't have an |
|
|
224 |
overlapping interval in self. |
|
|
225 |
|
|
|
226 |
Args: |
|
|
227 |
*others: A list of RangeSet objects to intersect with the intervals in |
|
|
228 |
this RangeSet. |
|
|
229 |
|
|
|
230 |
Returns: |
|
|
231 |
A RangeSet. If *others is empty, this function returns self rather than |
|
|
232 |
making an unnecessary copy. In all other cases, the returned value will be |
|
|
233 |
a freshly allocated RangeSet. |
|
|
234 |
""" |
|
|
235 |
|
|
|
236 |
def _intersect2(refname, tree1, tree2): |
|
|
237 |
"""Intersects the intervals of two IntervalTrees.""" |
|
|
238 |
# Yields all of the overlapping intervals from each interval of tree1 |
|
|
239 |
# found in tree2. Since each tree has only non-adjacent, non-overlapping, |
|
|
240 |
# intervals this calculation is straightforward and safe and produces only |
|
|
241 |
# non-adjacent, non-overlapping intervals. |
|
|
242 |
if len(tree1) > len(tree2): |
|
|
243 |
(bigtree, smalltree) = (tree1, tree2) |
|
|
244 |
else: |
|
|
245 |
(bigtree, smalltree) = (tree2, tree1) |
|
|
246 |
return ( |
|
|
247 |
make_range( |
|
|
248 |
refname, |
|
|
249 |
max(interval1.begin, overlapping.begin), |
|
|
250 |
min(interval1.end, overlapping.end), |
|
|
251 |
) |
|
|
252 |
for interval1 in bigtree |
|
|
253 |
for overlapping in smalltree.overlap(interval1) |
|
|
254 |
) |
|
|
255 |
|
|
|
256 |
# Iteratively intersect each of our *other RangeSets with this RangeSet. |
|
|
257 |
# Sort by size so we do the smallest number of element merge first. |
|
|
258 |
# TODO: Note we could optimize this code by computing the set of |
|
|
259 |
# common contigs upfront across all others and only looping over those. |
|
|
260 |
intersected = self |
|
|
261 |
for other in sorted(others, key=len): |
|
|
262 |
intersected_intervals = [] |
|
|
263 |
# pylint: disable=protected-access |
|
|
264 |
# So we can intersect intervals within each contig separately. |
|
|
265 |
for refname, intervals in six.iteritems(intersected._by_chr): |
|
|
266 |
# If refname is present in other, intersect those two IntervalTrees |
|
|
267 |
# directly and add those contigs to our growing list of intersected |
|
|
268 |
# intervals. If refname isn't present, all of the intervals on refname |
|
|
269 |
# should be dropped as there are no intervals to overlap. |
|
|
270 |
other_chr = other._by_chr.get(refname, None) |
|
|
271 |
if other_chr: |
|
|
272 |
intersected_intervals.extend( |
|
|
273 |
_intersect2(refname, intervals, other_chr) |
|
|
274 |
) |
|
|
275 |
|
|
|
276 |
# Update our intersected RangeSet with the new intervals. |
|
|
277 |
intersected = RangeSet(intersected_intervals, self._contigs) |
|
|
278 |
|
|
|
279 |
return intersected |
|
|
280 |
|
|
|
281 |
def exclude_regions(self, other: 'RangeSet'): |
|
|
282 |
"""Chops out all of the intervals in other from this this RangeSet. |
|
|
283 |
|
|
|
284 |
NOTE: This is a *MUTATING* operation for performance reasons. Make a copy |
|
|
285 |
of self if you want to avoid modifying the RangeSet. |
|
|
286 |
|
|
|
287 |
Args: |
|
|
288 |
other: A RangeSet object whose intervals will be removed from this |
|
|
289 |
RangeSet. |
|
|
290 |
""" |
|
|
291 |
# pylint: disable=protected-access |
|
|
292 |
for chrname, chr_intervals in six.iteritems(other._by_chr): |
|
|
293 |
# If refname is present in self, difference those two IntervalTrees. |
|
|
294 |
self_intervals = self._by_chr.get(chrname, None) |
|
|
295 |
if self_intervals: |
|
|
296 |
for begin, end, _ in chr_intervals: |
|
|
297 |
self_intervals.chop(begin, end) |
|
|
298 |
if self_intervals.is_empty(): |
|
|
299 |
# Cleanup after ourselves by removing empty trees from our map. |
|
|
300 |
del self._by_chr[chrname] |
|
|
301 |
|
|
|
302 |
def __len__(self): |
|
|
303 |
"""Gets the number of ranges used by this RangeSet.""" |
|
|
304 |
return sum(len(for_chr) for for_chr in six.itervalues(self._by_chr)) |
|
|
305 |
|
|
|
306 |
def __nonzero__(self): |
|
|
307 |
"""Returns True if this RangeSet is not empty.""" |
|
|
308 |
return bool(self._by_chr) |
|
|
309 |
|
|
|
310 |
__bool__ = __nonzero__ # Python 3 compatibility. |
|
|
311 |
|
|
|
312 |
def variant_overlaps( |
|
|
313 |
self, |
|
|
314 |
variant: variants_pb2.Variant, |
|
|
315 |
empty_set_return_value: bool = True, |
|
|
316 |
): |
|
|
317 |
"""Returns True if the variant's range overlaps with any in this set.""" |
|
|
318 |
if not self: |
|
|
319 |
return empty_set_return_value |
|
|
320 |
else: |
|
|
321 |
return self.overlaps(variant.reference_name, variant.start) |
|
|
322 |
|
|
|
323 |
def overlaps(self, chrom: str, pos: int): |
|
|
324 |
"""Returns True if chr:pos overlaps with any range in this RangeSet. |
|
|
325 |
|
|
|
326 |
Uses a fast bisection algorithm to determine the overlap in O(log n) time. |
|
|
327 |
|
|
|
328 |
Args: |
|
|
329 |
chrom: str. The chromosome name. |
|
|
330 |
pos: int. The position (0-based). |
|
|
331 |
|
|
|
332 |
Returns: |
|
|
333 |
True if chr:pos overlaps with a range. |
|
|
334 |
""" |
|
|
335 |
chr_ranges = self._by_chr.get(chrom, None) |
|
|
336 |
if chr_ranges is None: |
|
|
337 |
return False |
|
|
338 |
return chr_ranges.overlaps(pos) |
|
|
339 |
|
|
|
340 |
def partition(self, max_size): |
|
|
341 |
"""Splits our intervals so that none are larger than max_size. |
|
|
342 |
|
|
|
343 |
Slices up the intervals in this RangeSet into a equivalent set of intervals |
|
|
344 |
(i.e., spanning the same set of bases), each of which is at most max_size in |
|
|
345 |
length. |
|
|
346 |
|
|
|
347 |
This function does not modify this RangeSet. |
|
|
348 |
|
|
|
349 |
Because RangeSet merges adjacent intervals, this function cannot use a |
|
|
350 |
RangeSet to represent the partitioned intervals and so instead generates |
|
|
351 |
these intervals via a yield statement. |
|
|
352 |
|
|
|
353 |
Args: |
|
|
354 |
max_size: int > 0. The maximum size of any interval. |
|
|
355 |
|
|
|
356 |
Yields: |
|
|
357 |
nucleus.genomics.v1.Range protos, in sorted order (see comment about order |
|
|
358 |
in __iter__). |
|
|
359 |
|
|
|
360 |
Raises: |
|
|
361 |
ValueError: if max_size <= 0. |
|
|
362 |
""" |
|
|
363 |
if max_size <= 0: |
|
|
364 |
raise ValueError('max_size must be > 0: {}'.format(max_size)) |
|
|
365 |
|
|
|
366 |
for interval in self: |
|
|
367 |
refname = interval.reference_name |
|
|
368 |
for pos in range(interval.start, interval.end, max_size): |
|
|
369 |
yield make_range(refname, pos, min(interval.end, pos + max_size)) |
|
|
370 |
|
|
|
371 |
def envelops(self, chrom, start, end): |
|
|
372 |
"""Returns True iff some range in this RangeSet envelops the range. |
|
|
373 |
|
|
|
374 |
Args: |
|
|
375 |
chrom: str. The chromosome of interest. |
|
|
376 |
start: int. Zero-based inclusive index of the query range. |
|
|
377 |
end: int: Zero-based exclusive index of the query range. |
|
|
378 |
|
|
|
379 |
Returns: |
|
|
380 |
True if and only if some range in `self` completely spans the query |
|
|
381 |
range. |
|
|
382 |
""" |
|
|
383 |
chr_ranges = self._by_chr.get(chrom, None) |
|
|
384 |
if chr_ranges is None: |
|
|
385 |
return False |
|
|
386 |
# The intervaltree package does the inverse check, i.e. whether ranges |
|
|
387 |
# contained in it overlap with the query region. So it returns nothing |
|
|
388 |
# when start == end. We by convention want anything overlapping the start |
|
|
389 |
# position to still indicate enveloping in this case. |
|
|
390 |
if start == end: |
|
|
391 |
return chr_ranges.overlaps(start) |
|
|
392 |
else: |
|
|
393 |
overlap_set = chr_ranges.overlap(begin=start, end=end) |
|
|
394 |
return any(ov.begin <= start and ov.end >= end for ov in overlap_set) |
|
|
395 |
|
|
|
396 |
|
|
|
397 |
def make_position(chrom, position, reverse_strand=False): |
|
|
398 |
"""Returns a nucleus.genomics.v1.Position. |
|
|
399 |
|
|
|
400 |
Args: |
|
|
401 |
chrom: str. The chromosome name. |
|
|
402 |
position: int. The start position (0-based, inclusive). |
|
|
403 |
reverse_strand: bool. If True, indicates the position is on the negative |
|
|
404 |
strand. |
|
|
405 |
""" |
|
|
406 |
return position_pb2.Position( |
|
|
407 |
reference_name=chrom, position=position, reverse_strand=reverse_strand |
|
|
408 |
) |
|
|
409 |
|
|
|
410 |
|
|
|
411 |
def make_range(chrom, start, end): |
|
|
412 |
"""Returns a nucleus.genomics.v1.Range. |
|
|
413 |
|
|
|
414 |
Args: |
|
|
415 |
chrom: str. The chromosome name. |
|
|
416 |
start: int. The start position (0-based, inclusive) of this range. |
|
|
417 |
end: int. The end position (0-based, exclusive) of this range. |
|
|
418 |
|
|
|
419 |
Returns: |
|
|
420 |
A nucleus.genomics.v1.Range. |
|
|
421 |
""" |
|
|
422 |
return range_pb2.Range(reference_name=chrom, start=start, end=end) |
|
|
423 |
|
|
|
424 |
|
|
|
425 |
def position_overlaps(chrom, pos, interval): |
|
|
426 |
"""Returns True iff the position chr:pos overlaps the interval. |
|
|
427 |
|
|
|
428 |
Args: |
|
|
429 |
chrom: str. The chromosome name. |
|
|
430 |
pos: int. The position (0-based, inclusive). |
|
|
431 |
interval: nucleus.genomics.v1.Range object. |
|
|
432 |
|
|
|
433 |
Returns: |
|
|
434 |
True if interval overlaps chr:pos. |
|
|
435 |
""" |
|
|
436 |
return ( |
|
|
437 |
chrom == interval.reference_name and interval.start <= pos < interval.end |
|
|
438 |
) |
|
|
439 |
|
|
|
440 |
|
|
|
441 |
def ranges_overlap(i1, i2): |
|
|
442 |
"""Returns True iff ranges i1 and i2 overlap. |
|
|
443 |
|
|
|
444 |
Args: |
|
|
445 |
i1: nucleus.genomics.v1.Range object. |
|
|
446 |
i2: nucleus.genomics.v1.Range object. |
|
|
447 |
|
|
|
448 |
Returns: |
|
|
449 |
True if and only if i1 and i2 overlap. |
|
|
450 |
""" |
|
|
451 |
return ( |
|
|
452 |
i1.reference_name == i2.reference_name |
|
|
453 |
and i1.end > i2.start |
|
|
454 |
and i1.start < i2.end |
|
|
455 |
) |
|
|
456 |
|
|
|
457 |
|
|
|
458 |
def bedpe_parser(filename: str) -> Iterable[range_pb2.Range]: |
|
|
459 |
"""Parses Range objects from a BEDPE-formatted file object. |
|
|
460 |
|
|
|
461 |
See http://bedtools.readthedocs.org/en/latest/content/general-usage.html |
|
|
462 |
for more information on the BEDPE format. |
|
|
463 |
|
|
|
464 |
Skips events that span across chromosomes. For example, if the starting |
|
|
465 |
location is on chr1 and the ending location is on chr2, that record will |
|
|
466 |
not appear in the output. |
|
|
467 |
|
|
|
468 |
Args: |
|
|
469 |
filename: file name of a BEDPE-formatted file. |
|
|
470 |
|
|
|
471 |
Yields: |
|
|
472 |
nucleus.genomics.v1.Range protobuf objects. |
|
|
473 |
""" |
|
|
474 |
with epath.Path(filename).open('r') as fp: |
|
|
475 |
for line in fp: |
|
|
476 |
parts = line.split('\t') |
|
|
477 |
if parts[0] == parts[3]: |
|
|
478 |
# only keep events on the same chromosome |
|
|
479 |
yield make_range(parts[0], int(parts[1]), int(parts[5])) |
|
|
480 |
|
|
|
481 |
|
|
|
482 |
def bed_parser(filename, intersect_ranges=None, enable_logging=True): |
|
|
483 |
"""Parses Range objects from a BED-formatted file object. |
|
|
484 |
|
|
|
485 |
See http://bedtools.readthedocs.org/en/latest/content/general-usage.html |
|
|
486 |
for more information on the BED format. |
|
|
487 |
|
|
|
488 |
Args: |
|
|
489 |
filename: File name of a BED-formatted file. |
|
|
490 |
intersect_ranges: An optional list of RangeSet objects to intersect with the |
|
|
491 |
intervals in the BED file before creating the RangeSet. Requires a tabix |
|
|
492 |
index. |
|
|
493 |
enable_logging: Enables logging line while reading the file. |
|
|
494 |
|
|
|
495 |
Yields: |
|
|
496 |
nucleus.genomics.v1.Range protobuf objects. |
|
|
497 |
""" |
|
|
498 |
with bed.BedReader(filename, enable_logging) as fin: |
|
|
499 |
if not fin.has_index(): |
|
|
500 |
logging.warning( |
|
|
501 |
'BED file does not have a tabix index. Reading full bed file.' |
|
|
502 |
) |
|
|
503 |
if intersect_ranges and fin.has_index(): |
|
|
504 |
for region in intersect_ranges: |
|
|
505 |
for r in fin.query(region): |
|
|
506 |
yield make_range(r.reference_name, r.start, r.end) |
|
|
507 |
else: |
|
|
508 |
for r in fin.iterate(): |
|
|
509 |
yield make_range(r.reference_name, r.start, r.end) |
|
|
510 |
|
|
|
511 |
|
|
|
512 |
def from_regions(regions, contig_map=None): |
|
|
513 |
"""Parses each region of `regions` into a Range proto. |
|
|
514 |
|
|
|
515 |
This function provides a super high-level interface for |
|
|
516 |
reading/parsing/converting objects into Range protos. Each `region` of |
|
|
517 |
`regions` is processed in turn, yielding one or more Range protos. This |
|
|
518 |
function inspects the contents of `region` to determine how to convert it to |
|
|
519 |
Range(s) protos. The following types of `region` strings are supported: |
|
|
520 |
|
|
|
521 |
* If region ends with an extension known in _get_parser_for_file, we treat |
|
|
522 |
region as a file and read the Range protos from it with the corresponding |
|
|
523 |
reader from _get_parser_for_file, yielding each Range from the file in |
|
|
524 |
order. |
|
|
525 |
* Otherwise we parse region as a region literal (`chr20:1-10`) and return |
|
|
526 |
the Range proto. |
|
|
527 |
|
|
|
528 |
Args: |
|
|
529 |
regions: iterable[str]. Converts each element of this iterable into |
|
|
530 |
region(s). |
|
|
531 |
contig_map: An optional dictionary mapping from contig names to ContigInfo |
|
|
532 |
protobufs. If provided, allows literals of the format "contig_name", which |
|
|
533 |
will be parsed into a Range with reference_name=contig_name, start=0, |
|
|
534 |
end=n_bases where n_bases comes from the ContigInfo. |
|
|
535 |
|
|
|
536 |
Yields: |
|
|
537 |
A Range proto. |
|
|
538 |
""" |
|
|
539 |
for region in regions: |
|
|
540 |
reader = _get_parser_for_file(region) |
|
|
541 |
if reader: |
|
|
542 |
for elt in reader(region): |
|
|
543 |
yield elt |
|
|
544 |
else: |
|
|
545 |
yield parse_literal(region, contig_map) |
|
|
546 |
|
|
|
547 |
|
|
|
548 |
# Cannot be at the top of the file because these parser functions need to be |
|
|
549 |
# defined before adding them to the dictionary. |
|
|
550 |
_REGION_FILE_READERS = { |
|
|
551 |
bed_parser: frozenset(['.bed']), |
|
|
552 |
bedpe_parser: frozenset(['.bedpe']), |
|
|
553 |
} |
|
|
554 |
|
|
|
555 |
|
|
|
556 |
def _get_parser_for_file(filename): |
|
|
557 |
for reader, exts in six.iteritems(_REGION_FILE_READERS): |
|
|
558 |
if any(filename.lower().endswith(ext) for ext in exts): |
|
|
559 |
return reader |
|
|
560 |
return None |
|
|
561 |
|
|
|
562 |
|
|
|
563 |
def to_literal(range_pb): |
|
|
564 |
"""Converts Range protobuf into string literal form. |
|
|
565 |
|
|
|
566 |
The string literal form looks like: |
|
|
567 |
|
|
|
568 |
reference_name:start+1-end |
|
|
569 |
|
|
|
570 |
since start and end are zero-based inclusive (start) and exclusive (end), |
|
|
571 |
while the literal form is one-based inclusive on both ends. |
|
|
572 |
|
|
|
573 |
Args: |
|
|
574 |
range_pb: A nucleus.genomics.v1.Range object. |
|
|
575 |
|
|
|
576 |
Returns: |
|
|
577 |
A string representation of the Range. |
|
|
578 |
""" |
|
|
579 |
return '{}:{}-{}'.format( |
|
|
580 |
range_pb.reference_name, range_pb.start + 1, range_pb.end |
|
|
581 |
) |
|
|
582 |
|
|
|
583 |
|
|
|
584 |
def parse_literal(region_literal, contig_map=None): |
|
|
585 |
"""Parses a Range from a string representation like chr:start-end. |
|
|
586 |
|
|
|
587 |
The region literal must conform to the following pattern: |
|
|
588 |
|
|
|
589 |
chromosome:start-end |
|
|
590 |
chromosome:position |
|
|
591 |
chromosome [if contig_map is provided] |
|
|
592 |
|
|
|
593 |
chromosome can be any non-empty string without whitespace. start and end must |
|
|
594 |
both be positive integers. They can contain commas for readability. start and |
|
|
595 |
end are positions not offsets, so start == 1 means an offset of zero. If only |
|
|
596 |
a single position is provided, this creates a 1 bp interval starting at |
|
|
597 |
position - 1 and ending at position. |
|
|
598 |
|
|
|
599 |
Inspired by the samtools region specification: |
|
|
600 |
http://www.htslib.org/doc/samtools.html |
|
|
601 |
|
|
|
602 |
Args: |
|
|
603 |
region_literal: str. The literal to parse. |
|
|
604 |
contig_map: An optional dictionary mapping from contig names to ContigInfo |
|
|
605 |
protobufs. If provided, allows literals of the format "contig_name", which |
|
|
606 |
will be parsed into a Range with reference_name=contig_name, start=0, |
|
|
607 |
end=n_bases where n_bases comes from the ContigInfo. |
|
|
608 |
|
|
|
609 |
Returns: |
|
|
610 |
nucleus.genomics.v1.Range. |
|
|
611 |
|
|
|
612 |
Raises: |
|
|
613 |
ValueError: if region_literal cannot be parsed. |
|
|
614 |
""" |
|
|
615 |
|
|
|
616 |
def parse_position(pos_str): |
|
|
617 |
return int(pos_str.replace(',', '')) |
|
|
618 |
|
|
|
619 |
matched = _REGION_LITERAL_REGEXP.match(region_literal) |
|
|
620 |
if matched: |
|
|
621 |
chrom, start, end = matched.groups() |
|
|
622 |
return make_range(chrom, parse_position(start) - 1, parse_position(end)) |
|
|
623 |
|
|
|
624 |
matched = _POSITION_LITERAL_REGEXP.match(region_literal) |
|
|
625 |
if matched: |
|
|
626 |
chrom, pos = matched.groups() |
|
|
627 |
pos = parse_position(pos) |
|
|
628 |
return make_range(chrom, pos - 1, pos) |
|
|
629 |
|
|
|
630 |
if contig_map and region_literal in contig_map: |
|
|
631 |
# If the region_literals is an exact contig name like chr1 or MT return a |
|
|
632 |
# range over the entire contig. |
|
|
633 |
return make_range(region_literal, 0, contig_map[region_literal].n_bases) |
|
|
634 |
raise ValueError( |
|
|
635 |
'Could not parse "{}" as a region literal. Region literals ' |
|
|
636 |
'should have the form "chr:start-stop" or "chr:start" or ' |
|
|
637 |
'just "chr". A common error is to use the "chr" prefix on ' |
|
|
638 |
"inputs that don't have it, or vice-versa.".format(region_literal) |
|
|
639 |
) |
|
|
640 |
|
|
|
641 |
|
|
|
642 |
def parse_literals(region_literals, contig_map=None): |
|
|
643 |
"""Parses each literal of region_literals in order.""" |
|
|
644 |
return [parse_literal(literal, contig_map) for literal in region_literals] |
|
|
645 |
|
|
|
646 |
|
|
|
647 |
def contigs_n_bases(contigs): |
|
|
648 |
"""Returns the sum of all n_bases of contigs.""" |
|
|
649 |
return sum(c.n_bases for c in contigs) |
|
|
650 |
|
|
|
651 |
|
|
|
652 |
def contigs_dict( |
|
|
653 |
contigs: Iterable[reference_pb2.ContigInfo], |
|
|
654 |
) -> Dict[str, reference_pb2.ContigInfo]: |
|
|
655 |
"""Creates a dictionary for contigs. |
|
|
656 |
|
|
|
657 |
Args: |
|
|
658 |
contigs: Iterable of ContigInfo protos. |
|
|
659 |
|
|
|
660 |
Returns: |
|
|
661 |
A dictionary mapping contig.name: contig for each contig in contigs. |
|
|
662 |
""" |
|
|
663 |
return {contig.name: contig for contig in contigs} |
|
|
664 |
|
|
|
665 |
|
|
|
666 |
def sorted_ranges(ranges, contigs=None): |
|
|
667 |
"""Sorts ranges by reference_name, start, and end. |
|
|
668 |
|
|
|
669 |
Args: |
|
|
670 |
ranges: Iterable of nucleus.genomics.v1.Range protos that we want to sort. |
|
|
671 |
contigs: None or an iterable of ContigInfo protos. If not None, we will use |
|
|
672 |
the order of the contigs (as defined by their pos_in_fasta field values) |
|
|
673 |
to sort the Ranges on different contigs with respect to each other. |
|
|
674 |
|
|
|
675 |
Returns: |
|
|
676 |
A newly allocated list of nucleus.genomics.v1.Range protos. |
|
|
677 |
""" |
|
|
678 |
if contigs: |
|
|
679 |
contig_map = contigs_dict(contigs) |
|
|
680 |
|
|
|
681 |
def to_key(range_): |
|
|
682 |
pos = contig_map[range_.reference_name].pos_in_fasta |
|
|
683 |
return pos, range_.start, range_.end |
|
|
684 |
|
|
|
685 |
else: |
|
|
686 |
to_key = as_tuple |
|
|
687 |
|
|
|
688 |
return sorted(ranges, key=to_key) |
|
|
689 |
|
|
|
690 |
|
|
|
691 |
def as_tuple(range_): |
|
|
692 |
"""Returns a Python tuple (reference_name, start, end).""" |
|
|
693 |
return range_.reference_name, range_.start, range_.end |
|
|
694 |
|
|
|
695 |
|
|
|
696 |
def overlap_len(range1, range2): |
|
|
697 |
"""Computes the number of overlapping bases of range1 and range2. |
|
|
698 |
|
|
|
699 |
Args: |
|
|
700 |
range1: nucleus.genomics.v1.Range. |
|
|
701 |
range2: nucleus.genomics.v1.Range. |
|
|
702 |
|
|
|
703 |
Returns: |
|
|
704 |
int. The number of basepairs in common. 0 if the ranges are not on the same |
|
|
705 |
contig. |
|
|
706 |
""" |
|
|
707 |
if range1.reference_name != range2.reference_name: |
|
|
708 |
return 0 |
|
|
709 |
return max(0, (min(range1.end, range2.end) - max(range1.start, range2.start))) |
|
|
710 |
|
|
|
711 |
|
|
|
712 |
def find_max_overlapping(query_range, search_ranges): |
|
|
713 |
"""Gets the index of the element in search_ranges with max overlap with query. |
|
|
714 |
|
|
|
715 |
In case of ties, selects the lowest index range in search_ranges. |
|
|
716 |
|
|
|
717 |
Args: |
|
|
718 |
query_range: nucleus.genomics.v1.Range, read genomic range. |
|
|
719 |
search_ranges: list[nucleus.genomics.v1.Read]. The list of regions we want |
|
|
720 |
to search for the maximal overlap with query_range. NOTE: this must be a |
|
|
721 |
list (not a generator) as we loop over the search_ranges multiple times. |
|
|
722 |
|
|
|
723 |
Returns: |
|
|
724 |
int, the search_ranges index with the maximum read overlap. Returns None |
|
|
725 |
when read has no overlap with any of the search_ranges or search_ranges is |
|
|
726 |
empty. |
|
|
727 |
""" |
|
|
728 |
if not search_ranges: |
|
|
729 |
return None |
|
|
730 |
overlaps = [overlap_len(query_range, srange) for srange in search_ranges] |
|
|
731 |
argmax = max(range(len(search_ranges)), key=lambda i: overlaps[i]) |
|
|
732 |
# We return None if the read doesn't overlap at all. |
|
|
733 |
return None if overlaps[argmax] == 0 else argmax |
|
|
734 |
|
|
|
735 |
|
|
|
736 |
def expand(region, n_bp, contig_map=None): |
|
|
737 |
"""Expands region by n_bp in both directions. |
|
|
738 |
|
|
|
739 |
Takes a Range(chrom, start, stop) and returns a new |
|
|
740 |
Range(chrom, new_start, new_stop), where: |
|
|
741 |
|
|
|
742 |
-- new_start is max(start - n_bp, 0) |
|
|
743 |
-- new_stop is stop + n_bp if contig_map is None, or min(stop + n_bp, max_bp) |
|
|
744 |
where max_bp is contig_map[chrom].n_bp. |
|
|
745 |
|
|
|
746 |
Args: |
|
|
747 |
region: A nucleus.genomics.v1.Range proto. |
|
|
748 |
n_bp: int >= 0. how many basepairs to increase region by. |
|
|
749 |
contig_map: dict[string, ContigInfo] or None. If not None, used to get the |
|
|
750 |
maximum extent to increase stop by. Must have region.reference_name as a |
|
|
751 |
key. |
|
|
752 |
|
|
|
753 |
Returns: |
|
|
754 |
nucleus.genomics.v1.Range proto. |
|
|
755 |
|
|
|
756 |
Raises: |
|
|
757 |
ValueError: if n_bp is invalid. |
|
|
758 |
KeyError: contig_map is not None and region.reference_name isn't a key. |
|
|
759 |
""" |
|
|
760 |
if n_bp < 0: |
|
|
761 |
raise ValueError('n_bp must be >= 0 but got {}'.format(n_bp)) |
|
|
762 |
|
|
|
763 |
new_start = max(region.start - n_bp, 0) |
|
|
764 |
new_end = region.end + n_bp |
|
|
765 |
if contig_map is not None: |
|
|
766 |
new_end = min(new_end, contig_map[region.reference_name].n_bases) |
|
|
767 |
return make_range(region.reference_name, new_start, new_end) |
|
|
768 |
|
|
|
769 |
|
|
|
770 |
def span(regions): |
|
|
771 |
"""Returns a region that spans all of the bases in regions. |
|
|
772 |
|
|
|
773 |
This function returns a Range(chrom, start, stop), where start is the min |
|
|
774 |
of the starts in regions, and stop is the max end in regions. It may not be |
|
|
775 |
freshly allocated. |
|
|
776 |
|
|
|
777 |
Args: |
|
|
778 |
regions: list[Range]: a list of Range protos. |
|
|
779 |
|
|
|
780 |
Returns: |
|
|
781 |
A single Range proto. |
|
|
782 |
|
|
|
783 |
Raises: |
|
|
784 |
ValueError: if not all regions have the same reference_name. |
|
|
785 |
ValueError: if regions is empty. |
|
|
786 |
""" |
|
|
787 |
if not regions: |
|
|
788 |
raise ValueError('regions is empty but must have at least one region') |
|
|
789 |
elif len(regions) == 1: |
|
|
790 |
return regions[0] |
|
|
791 |
elif any(r.reference_name != regions[0].reference_name for r in regions): |
|
|
792 |
raise ValueError('regions must be all on the same contig') |
|
|
793 |
else: |
|
|
794 |
start = min(r.start for r in regions) |
|
|
795 |
end = max(r.end for r in regions) |
|
|
796 |
return make_range(regions[0].reference_name, start, end) |
|
|
797 |
|
|
|
798 |
|
|
|
799 |
def length(region): |
|
|
800 |
"""Returns the length in basepairs of region.""" |
|
|
801 |
return region.end - region.start |