Switch to unified view

a b/third_party/nucleus/util/ranges.py
1
# Copyright 2018 Google LLC.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions
5
# are met:
6
#
7
# 1. Redistributions of source code must retain the above copyright notice,
8
#    this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
#    notice, this list of conditions and the following disclaimer in the
12
#    documentation and/or other materials provided with the distribution.
13
#
14
# 3. Neither the name of the copyright holder nor the names of its
15
#    contributors may be used to endorse or promote products derived from this
16
#    software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
# POSSIBILITY OF SUCH DAMAGE.
29
"""Utilities for Range overlap detection."""
30
31
from __future__ import absolute_import
32
from __future__ import division
33
from __future__ import print_function
34
35
import collections
36
import re
37
from typing import Dict, Iterable, Optional, Sequence
38
39
from absl import logging
40
from etils import epath
41
import intervaltree
42
import six
43
44
from third_party.nucleus.io import bed
45
from third_party.nucleus.protos import position_pb2
46
from third_party.nucleus.protos import range_pb2
47
from third_party.nucleus.protos import reference_pb2
48
from third_party.nucleus.protos import variants_pb2
49
50
51
# Regular expressions for matching literal chr:start-stop strings.
52
_REGION_LITERAL_REGEXP = re.compile(r'^(\S+):([0-9,]+)-([0-9,]+)$')
53
54
# Regular expressions for matching literal chr:start strings.
55
_POSITION_LITERAL_REGEXP = re.compile(r'^(\S+):([0-9,]+)$')
56
57
# Logging frequency when building our rangeset objects, which can take some time
58
# to complete. Rather than just pausing for a few minutes, we provide an update
59
# logging message every _LOG_EVERY_N_RANGES_IN_RANGESET_INIT records added. See
60
# internal for more information.
61
_LOG_EVERY_N_RANGES_IN_RANGESET_INIT = 250000
62
63
64
class RangeSet(object):
65
  """Fast overlap detection of a genomic position against a database of Ranges.
66
67
  Enables O(log n) computation of whether a point chr:pos falls within one of a
68
  large number of genomic ranges.
69
70
  This class does not supports overlapping or adjacent intervals. Any such
71
  intervals will be automatically merged together in the constructor.
72
73
  This class is immutable. No methods should be added that directly modify the
74
  ranges held by the class.
75
  """
76
77
  def __init__(self, ranges=None, contigs=None, quiet=False):
78
    """Creates a RangeSet backed by ranges.
79
80
    Note that the Range objects in ranges are *not* stored directly here, so
81
    they can safely be modified after they are passed to this RangeSet.
82
83
    Args:
84
      ranges: list(nucleus.genomics.v1.Range) protos (or anything with
85
        reference_name, start, and end properties following the Range
86
        convention). If None, no ranges will be used, and overlaps() will always
87
        return False.
88
      contigs: list(nucleus.genomics.v1.ContigInfo) protos. Used to define the
89
        iteration order over contigs (i.e., by contig.pos_in_fasta).  If this
90
        list is not provided, the iteration order will be determined by the
91
        alphabetical order of the contig names.
92
      quiet: bool; defaults to False: If False, we will emit a logging message
93
        every _LOG_EVERY_N_RANGES_IN_RANGESET_INIT records processed while
94
        building this intervaltree. Set to True to stop all of the logging.
95
96
    Raises:
97
      ValueError: if any range's reference_name does not correspond to any
98
        contig in `contigs`.
99
    """
100
    if contigs is not None:
101
      self._contigs = contigs
102
      self._contig_map = contigs_dict(contigs)
103
      self._contig_sort_key_fn = lambda name: self._contig_map[
104
          name
105
      ].pos_in_fasta
106
      self._is_valid_contig = lambda name: name in self._contig_map
107
    else:
108
      self._contigs = None
109
      self._contig_map = None
110
      self._contig_sort_key_fn = lambda name: name
111
      self._is_valid_contig = lambda name: True
112
113
    if ranges is None:
114
      ranges = []
115
116
    # Add each range to our contig-specific intervaltrees.
117
    self._by_chr = collections.defaultdict(intervaltree.IntervalTree)
118
    for i, range_ in enumerate(ranges):
119
      if not self._is_valid_contig(range_.reference_name):
120
        raise ValueError(
121
            'Range {} is on an unrecognized contig.'.format(range_)
122
        )
123
      self._by_chr[range_.reference_name].addi(range_.start, range_.end, None)
124
      if not quiet and i > 0 and i % _LOG_EVERY_N_RANGES_IN_RANGESET_INIT == 0:
125
        # We do our test directly here on i > 0 so we only see the log messages
126
        # if we add at least _LOG_EVERY_N_RANGES_IN_RANGESET_INIT records.
127
        logging.info('Adding interval %s to intervaltree', to_literal(range_))
128
129
    # Merge overlapping / adjacent intervals in each tree.
130
    for tree in six.itervalues(self._by_chr):
131
      tree.merge_overlaps(strict=False)
132
133
  def __iter__(self):
134
    """Iterate over the ranges in this RangeSet.
135
136
    Yields:
137
      Each range of this RangeSet, in sorted order (by chromosome, then start
138
      end positions). Relative ordering of chromosomes is defined by the
139
      contig.pos_in_fasta integer key for the associated contig. These objects
140
      are new range protos so can be freely modified.
141
    """
142
    for refname in sorted(
143
        six.iterkeys(self._by_chr), key=self._contig_sort_key_fn
144
    ):
145
      for start, end, _ in sorted(self._by_chr[refname]):
146
        yield make_range(refname, start, end)
147
148
  @classmethod
149
  def from_regions(
150
      cls,
151
      regions: Sequence[str],
152
      # TODO: Use X | None instead.
153
      contig_map: Optional[Dict[str, reference_pb2.ContigInfo]] = None,
154
  ) -> 'RangeSet':
155
    """Parses a command-line style literal regions flag into a RangeSet.
156
157
    Args:
158
      regions: An iterable or None. If not None, regions will be parsed with
159
        ranges.from_regions.
160
      contig_map: An optional dictionary mapping from contig names to ContigInfo
161
        protobufs. If provided, allows literals of the format "contig_name",
162
        which will be parsed into a Range with reference_name=contig_name,
163
        start=0, end=n_bases where n_bases comes from the ContigInfo;
164
        additionally the sort order of the RangeSet will be determined by
165
        contig.pos_in_fasta.
166
167
    Returns:
168
      A RangeSet object.
169
    """
170
    if regions is None:
171
      return cls(ranges=[])
172
    else:
173
      return cls(ranges=from_regions(regions, contig_map=contig_map))
174
175
  @classmethod
176
  def from_contigs(
177
      cls, contigs: Sequence[reference_pb2.ContigInfo]
178
  ) -> 'RangeSet':
179
    """Creates a RangeSet with an interval covering each base of each contig."""
180
    return cls(
181
        (make_range(contig.name, 0, contig.n_bases) for contig in contigs),
182
        contigs,
183
    )
184
185
  @classmethod
186
  def from_bed(
187
      cls, source, contigs=None, intersect_ranges=None, enable_logging=True
188
  ):
189
    """Creates a RangeSet containing the intervals from source.
190
191
    Args:
192
      source: A path to a BED (or equivalent) file of intervals.
193
      contigs: An optional list of ContigInfo proto, used by RangeSet
194
        constructor.
195
      intersect_ranges: An optional list of RangeSet objects to intersect with
196
        the intervals in the BED file before creating the RangeSet.
197
      enable_logging: Enables logging line while reading the file.
198
199
    Returns:
200
      A RangeSet.
201
    """
202
    return cls(bed_parser(source, intersect_ranges, enable_logging), contigs)
203
204
  def intersection(self, *others: 'RangeSet') -> 'RangeSet':
205
    """Computes the intersection among this RangeSet and *others RangeSets.
206
207
    This function computes the intersection of all of the intervals in self and
208
    *others, returning a RangeSet containing only intervals common to all. The
209
    intersection here is an ranged intersection, not an identity intersection,
210
    so the resulting set of intervals may not contain any of the original
211
    intervals in any of the sets.
212
213
    To be concrete, suppose we have three sets to intersect, each having two
214
    intervals:
215
216
      self   : chr1:1-10, chr2:20-30
217
      other1 : chr1:5-8, chr3:10-40
218
      other2 : chr1:3-7, chr3:10-30
219
220
    self.intersection(other1, other2) produces a RangeSet with one interval
221
    chr1:5-7, the common bases on chr1 in self, other1, and other2. No intervals
222
    on chr2 or chr3 are included since the chr2 only occurs in self and the two
223
    intervals on chr3, despite having some shared bases, don't have an
224
    overlapping interval in self.
225
226
    Args:
227
      *others: A list of RangeSet objects to intersect with the intervals in
228
        this RangeSet.
229
230
    Returns:
231
      A RangeSet. If *others is empty, this function returns self rather than
232
      making an unnecessary copy. In all other cases, the returned value will be
233
      a freshly allocated RangeSet.
234
    """
235
236
    def _intersect2(refname, tree1, tree2):
237
      """Intersects the intervals of two IntervalTrees."""
238
      # Yields all of the overlapping intervals from each interval of tree1
239
      # found in tree2. Since each tree has only non-adjacent, non-overlapping,
240
      # intervals this calculation is straightforward and safe and produces only
241
      # non-adjacent, non-overlapping intervals.
242
      if len(tree1) > len(tree2):
243
        (bigtree, smalltree) = (tree1, tree2)
244
      else:
245
        (bigtree, smalltree) = (tree2, tree1)
246
      return (
247
          make_range(
248
              refname,
249
              max(interval1.begin, overlapping.begin),
250
              min(interval1.end, overlapping.end),
251
          )
252
          for interval1 in bigtree
253
          for overlapping in smalltree.overlap(interval1)
254
      )
255
256
    # Iteratively intersect each of our *other RangeSets with this RangeSet.
257
    # Sort by size so we do the smallest number of element merge first.
258
    # TODO: Note we could optimize this code by computing the set of
259
    # common contigs upfront across all others and only looping over those.
260
    intersected = self
261
    for other in sorted(others, key=len):
262
      intersected_intervals = []
263
      # pylint: disable=protected-access
264
      # So we can intersect intervals within each contig separately.
265
      for refname, intervals in six.iteritems(intersected._by_chr):
266
        # If refname is present in other, intersect those two IntervalTrees
267
        # directly and add those contigs to our growing list of intersected
268
        # intervals. If refname isn't present, all of the intervals on refname
269
        # should be dropped as there are no intervals to overlap.
270
        other_chr = other._by_chr.get(refname, None)
271
        if other_chr:
272
          intersected_intervals.extend(
273
              _intersect2(refname, intervals, other_chr)
274
          )
275
276
      # Update our intersected RangeSet with the new intervals.
277
      intersected = RangeSet(intersected_intervals, self._contigs)
278
279
    return intersected
280
281
  def exclude_regions(self, other: 'RangeSet'):
282
    """Chops out all of the intervals in other from this this RangeSet.
283
284
    NOTE: This is a *MUTATING* operation for performance reasons. Make a copy
285
    of self if you want to avoid modifying the RangeSet.
286
287
    Args:
288
      other: A RangeSet object whose intervals will be removed from this
289
        RangeSet.
290
    """
291
    # pylint: disable=protected-access
292
    for chrname, chr_intervals in six.iteritems(other._by_chr):
293
      # If refname is present in self, difference those two IntervalTrees.
294
      self_intervals = self._by_chr.get(chrname, None)
295
      if self_intervals:
296
        for begin, end, _ in chr_intervals:
297
          self_intervals.chop(begin, end)
298
        if self_intervals.is_empty():
299
          # Cleanup after ourselves by removing empty trees from our map.
300
          del self._by_chr[chrname]
301
302
  def __len__(self):
303
    """Gets the number of ranges used by this RangeSet."""
304
    return sum(len(for_chr) for for_chr in six.itervalues(self._by_chr))
305
306
  def __nonzero__(self):
307
    """Returns True if this RangeSet is not empty."""
308
    return bool(self._by_chr)
309
310
  __bool__ = __nonzero__  # Python 3 compatibility.
311
312
  def variant_overlaps(
313
      self,
314
      variant: variants_pb2.Variant,
315
      empty_set_return_value: bool = True,
316
  ):
317
    """Returns True if the variant's range overlaps with any in this set."""
318
    if not self:
319
      return empty_set_return_value
320
    else:
321
      return self.overlaps(variant.reference_name, variant.start)
322
323
  def overlaps(self, chrom: str, pos: int):
324
    """Returns True if chr:pos overlaps with any range in this RangeSet.
325
326
    Uses a fast bisection algorithm to determine the overlap in O(log n) time.
327
328
    Args:
329
      chrom: str. The chromosome name.
330
      pos: int. The position (0-based).
331
332
    Returns:
333
      True if chr:pos overlaps with a range.
334
    """
335
    chr_ranges = self._by_chr.get(chrom, None)
336
    if chr_ranges is None:
337
      return False
338
    return chr_ranges.overlaps(pos)
339
340
  def partition(self, max_size):
341
    """Splits our intervals so that none are larger than max_size.
342
343
    Slices up the intervals in this RangeSet into a equivalent set of intervals
344
    (i.e., spanning the same set of bases), each of which is at most max_size in
345
    length.
346
347
    This function does not modify this RangeSet.
348
349
    Because RangeSet merges adjacent intervals, this function cannot use a
350
    RangeSet to represent the partitioned intervals and so instead generates
351
    these intervals via a yield statement.
352
353
    Args:
354
      max_size: int > 0. The maximum size of any interval.
355
356
    Yields:
357
      nucleus.genomics.v1.Range protos, in sorted order (see comment about order
358
      in __iter__).
359
360
    Raises:
361
      ValueError: if max_size <= 0.
362
    """
363
    if max_size <= 0:
364
      raise ValueError('max_size must be > 0: {}'.format(max_size))
365
366
    for interval in self:
367
      refname = interval.reference_name
368
      for pos in range(interval.start, interval.end, max_size):
369
        yield make_range(refname, pos, min(interval.end, pos + max_size))
370
371
  def envelops(self, chrom, start, end):
372
    """Returns True iff some range in this RangeSet envelops the range.
373
374
    Args:
375
      chrom: str. The chromosome of interest.
376
      start: int. Zero-based inclusive index of the query range.
377
      end: int: Zero-based exclusive index of the query range.
378
379
    Returns:
380
      True if and only if some range in `self` completely spans the query
381
      range.
382
    """
383
    chr_ranges = self._by_chr.get(chrom, None)
384
    if chr_ranges is None:
385
      return False
386
    # The intervaltree package does the inverse check, i.e. whether ranges
387
    # contained in it overlap with the query region. So it returns nothing
388
    # when start == end. We by convention want anything overlapping the start
389
    # position to still indicate enveloping in this case.
390
    if start == end:
391
      return chr_ranges.overlaps(start)
392
    else:
393
      overlap_set = chr_ranges.overlap(begin=start, end=end)
394
      return any(ov.begin <= start and ov.end >= end for ov in overlap_set)
395
396
397
def make_position(chrom, position, reverse_strand=False):
398
  """Returns a nucleus.genomics.v1.Position.
399
400
  Args:
401
    chrom: str. The chromosome name.
402
    position: int. The start position (0-based, inclusive).
403
    reverse_strand: bool. If True, indicates the position is on the negative
404
      strand.
405
  """
406
  return position_pb2.Position(
407
      reference_name=chrom, position=position, reverse_strand=reverse_strand
408
  )
409
410
411
def make_range(chrom, start, end):
412
  """Returns a nucleus.genomics.v1.Range.
413
414
  Args:
415
    chrom: str. The chromosome name.
416
    start: int. The start position (0-based, inclusive) of this range.
417
    end: int. The end position (0-based, exclusive) of this range.
418
419
  Returns:
420
    A nucleus.genomics.v1.Range.
421
  """
422
  return range_pb2.Range(reference_name=chrom, start=start, end=end)
423
424
425
def position_overlaps(chrom, pos, interval):
426
  """Returns True iff the position chr:pos overlaps the interval.
427
428
  Args:
429
    chrom: str. The chromosome name.
430
    pos: int. The position (0-based, inclusive).
431
    interval: nucleus.genomics.v1.Range object.
432
433
  Returns:
434
    True if interval overlaps chr:pos.
435
  """
436
  return (
437
      chrom == interval.reference_name and interval.start <= pos < interval.end
438
  )
439
440
441
def ranges_overlap(i1, i2):
442
  """Returns True iff ranges i1 and i2 overlap.
443
444
  Args:
445
    i1: nucleus.genomics.v1.Range object.
446
    i2: nucleus.genomics.v1.Range object.
447
448
  Returns:
449
    True if and only if i1 and i2 overlap.
450
  """
451
  return (
452
      i1.reference_name == i2.reference_name
453
      and i1.end > i2.start
454
      and i1.start < i2.end
455
  )
456
457
458
def bedpe_parser(filename: str) -> Iterable[range_pb2.Range]:
459
  """Parses Range objects from a BEDPE-formatted file object.
460
461
  See http://bedtools.readthedocs.org/en/latest/content/general-usage.html
462
  for more information on the BEDPE format.
463
464
  Skips events that span across chromosomes. For example, if the starting
465
  location is on chr1 and the ending location is on chr2, that record will
466
  not appear in the output.
467
468
  Args:
469
    filename: file name of a BEDPE-formatted file.
470
471
  Yields:
472
    nucleus.genomics.v1.Range protobuf objects.
473
  """
474
  with epath.Path(filename).open('r') as fp:
475
    for line in fp:
476
      parts = line.split('\t')
477
      if parts[0] == parts[3]:
478
        # only keep events on the same chromosome
479
        yield make_range(parts[0], int(parts[1]), int(parts[5]))
480
481
482
def bed_parser(filename, intersect_ranges=None, enable_logging=True):
483
  """Parses Range objects from a BED-formatted file object.
484
485
  See http://bedtools.readthedocs.org/en/latest/content/general-usage.html
486
  for more information on the BED format.
487
488
  Args:
489
    filename: File name of a BED-formatted file.
490
    intersect_ranges: An optional list of RangeSet objects to intersect with the
491
      intervals in the BED file before creating the RangeSet. Requires a tabix
492
      index.
493
    enable_logging: Enables logging line while reading the file.
494
495
  Yields:
496
    nucleus.genomics.v1.Range protobuf objects.
497
  """
498
  with bed.BedReader(filename, enable_logging) as fin:
499
    if not fin.has_index():
500
      logging.warning(
501
          'BED file does not have a tabix index. Reading full bed file.'
502
      )
503
    if intersect_ranges and fin.has_index():
504
      for region in intersect_ranges:
505
        for r in fin.query(region):
506
          yield make_range(r.reference_name, r.start, r.end)
507
    else:
508
      for r in fin.iterate():
509
        yield make_range(r.reference_name, r.start, r.end)
510
511
512
def from_regions(regions, contig_map=None):
513
  """Parses each region of `regions` into a Range proto.
514
515
  This function provides a super high-level interface for
516
  reading/parsing/converting objects into Range protos. Each `region` of
517
  `regions` is processed in turn, yielding one or more Range protos. This
518
  function inspects the contents of `region` to determine how to convert it to
519
  Range(s) protos. The following types of `region` strings are supported:
520
521
    * If region ends with an extension known in _get_parser_for_file, we treat
522
      region as a file and read the Range protos from it with the corresponding
523
      reader from _get_parser_for_file, yielding each Range from the file in
524
      order.
525
    * Otherwise we parse region as a region literal (`chr20:1-10`) and return
526
      the Range proto.
527
528
  Args:
529
    regions: iterable[str]. Converts each element of this iterable into
530
      region(s).
531
    contig_map: An optional dictionary mapping from contig names to ContigInfo
532
      protobufs. If provided, allows literals of the format "contig_name", which
533
      will be parsed into a Range with reference_name=contig_name, start=0,
534
      end=n_bases where n_bases comes from the ContigInfo.
535
536
  Yields:
537
    A Range proto.
538
  """
539
  for region in regions:
540
    reader = _get_parser_for_file(region)
541
    if reader:
542
      for elt in reader(region):
543
        yield elt
544
    else:
545
      yield parse_literal(region, contig_map)
546
547
548
# Cannot be at the top of the file because these parser functions need to be
549
# defined before adding them to the dictionary.
550
_REGION_FILE_READERS = {
551
    bed_parser: frozenset(['.bed']),
552
    bedpe_parser: frozenset(['.bedpe']),
553
}
554
555
556
def _get_parser_for_file(filename):
557
  for reader, exts in six.iteritems(_REGION_FILE_READERS):
558
    if any(filename.lower().endswith(ext) for ext in exts):
559
      return reader
560
  return None
561
562
563
def to_literal(range_pb):
564
  """Converts Range protobuf into string literal form.
565
566
  The string literal form looks like:
567
568
    reference_name:start+1-end
569
570
  since start and end are zero-based inclusive (start) and exclusive (end),
571
  while the literal form is one-based inclusive on both ends.
572
573
  Args:
574
    range_pb: A nucleus.genomics.v1.Range object.
575
576
  Returns:
577
    A string representation of the Range.
578
  """
579
  return '{}:{}-{}'.format(
580
      range_pb.reference_name, range_pb.start + 1, range_pb.end
581
  )
582
583
584
def parse_literal(region_literal, contig_map=None):
585
  """Parses a Range from a string representation like chr:start-end.
586
587
  The region literal must conform to the following pattern:
588
589
    chromosome:start-end
590
    chromosome:position
591
    chromosome  [if contig_map is provided]
592
593
  chromosome can be any non-empty string without whitespace. start and end must
594
  both be positive integers. They can contain commas for readability. start and
595
  end are positions not offsets, so start == 1 means an offset of zero. If only
596
  a single position is provided, this creates a 1 bp interval starting at
597
  position - 1 and ending at position.
598
599
  Inspired by the samtools region specification:
600
  http://www.htslib.org/doc/samtools.html
601
602
  Args:
603
    region_literal: str. The literal to parse.
604
    contig_map: An optional dictionary mapping from contig names to ContigInfo
605
      protobufs. If provided, allows literals of the format "contig_name", which
606
      will be parsed into a Range with reference_name=contig_name, start=0,
607
      end=n_bases where n_bases comes from the ContigInfo.
608
609
  Returns:
610
    nucleus.genomics.v1.Range.
611
612
  Raises:
613
    ValueError: if region_literal cannot be parsed.
614
  """
615
616
  def parse_position(pos_str):
617
    return int(pos_str.replace(',', ''))
618
619
  matched = _REGION_LITERAL_REGEXP.match(region_literal)
620
  if matched:
621
    chrom, start, end = matched.groups()
622
    return make_range(chrom, parse_position(start) - 1, parse_position(end))
623
624
  matched = _POSITION_LITERAL_REGEXP.match(region_literal)
625
  if matched:
626
    chrom, pos = matched.groups()
627
    pos = parse_position(pos)
628
    return make_range(chrom, pos - 1, pos)
629
630
  if contig_map and region_literal in contig_map:
631
    # If the region_literals is an exact contig name like chr1 or MT return a
632
    # range over the entire contig.
633
    return make_range(region_literal, 0, contig_map[region_literal].n_bases)
634
  raise ValueError(
635
      'Could not parse "{}" as a region literal.  Region literals '
636
      'should have the form "chr:start-stop" or "chr:start" or '
637
      'just "chr".  A common error is to use the "chr" prefix on '
638
      "inputs that don't have it, or vice-versa.".format(region_literal)
639
  )
640
641
642
def parse_literals(region_literals, contig_map=None):
643
  """Parses each literal of region_literals in order."""
644
  return [parse_literal(literal, contig_map) for literal in region_literals]
645
646
647
def contigs_n_bases(contigs):
648
  """Returns the sum of all n_bases of contigs."""
649
  return sum(c.n_bases for c in contigs)
650
651
652
def contigs_dict(
653
    contigs: Iterable[reference_pb2.ContigInfo],
654
) -> Dict[str, reference_pb2.ContigInfo]:
655
  """Creates a dictionary for contigs.
656
657
  Args:
658
    contigs: Iterable of ContigInfo protos.
659
660
  Returns:
661
    A dictionary mapping contig.name: contig for each contig in contigs.
662
  """
663
  return {contig.name: contig for contig in contigs}
664
665
666
def sorted_ranges(ranges, contigs=None):
667
  """Sorts ranges by reference_name, start, and end.
668
669
  Args:
670
    ranges: Iterable of nucleus.genomics.v1.Range protos that we want to sort.
671
    contigs: None or an iterable of ContigInfo protos. If not None, we will use
672
      the order of the contigs (as defined by their pos_in_fasta field values)
673
      to sort the Ranges on different contigs with respect to each other.
674
675
  Returns:
676
    A newly allocated list of nucleus.genomics.v1.Range protos.
677
  """
678
  if contigs:
679
    contig_map = contigs_dict(contigs)
680
681
    def to_key(range_):
682
      pos = contig_map[range_.reference_name].pos_in_fasta
683
      return pos, range_.start, range_.end
684
685
  else:
686
    to_key = as_tuple
687
688
  return sorted(ranges, key=to_key)
689
690
691
def as_tuple(range_):
692
  """Returns a Python tuple (reference_name, start, end)."""
693
  return range_.reference_name, range_.start, range_.end
694
695
696
def overlap_len(range1, range2):
697
  """Computes the number of overlapping bases of range1 and range2.
698
699
  Args:
700
    range1: nucleus.genomics.v1.Range.
701
    range2: nucleus.genomics.v1.Range.
702
703
  Returns:
704
    int. The number of basepairs in common. 0 if the ranges are not on the same
705
    contig.
706
  """
707
  if range1.reference_name != range2.reference_name:
708
    return 0
709
  return max(0, (min(range1.end, range2.end) - max(range1.start, range2.start)))
710
711
712
def find_max_overlapping(query_range, search_ranges):
713
  """Gets the index of the element in search_ranges with max overlap with query.
714
715
  In case of ties, selects the lowest index range in search_ranges.
716
717
  Args:
718
    query_range: nucleus.genomics.v1.Range, read genomic range.
719
    search_ranges: list[nucleus.genomics.v1.Read]. The list of regions we want
720
      to search for the maximal overlap with query_range. NOTE: this must be a
721
      list (not a generator) as we loop over the search_ranges multiple times.
722
723
  Returns:
724
    int, the search_ranges index with the maximum read overlap. Returns None
725
    when read has no overlap with any of the search_ranges or search_ranges is
726
    empty.
727
  """
728
  if not search_ranges:
729
    return None
730
  overlaps = [overlap_len(query_range, srange) for srange in search_ranges]
731
  argmax = max(range(len(search_ranges)), key=lambda i: overlaps[i])
732
  # We return None if the read doesn't overlap at all.
733
  return None if overlaps[argmax] == 0 else argmax
734
735
736
def expand(region, n_bp, contig_map=None):
737
  """Expands region by n_bp in both directions.
738
739
  Takes a Range(chrom, start, stop) and returns a new
740
  Range(chrom, new_start, new_stop), where:
741
742
  -- new_start is max(start - n_bp, 0)
743
  -- new_stop is stop + n_bp if contig_map is None, or min(stop + n_bp, max_bp)
744
     where max_bp is contig_map[chrom].n_bp.
745
746
  Args:
747
    region: A nucleus.genomics.v1.Range proto.
748
    n_bp: int >= 0. how many basepairs to increase region by.
749
    contig_map: dict[string, ContigInfo] or None. If not None, used to get the
750
      maximum extent to increase stop by. Must have region.reference_name as a
751
      key.
752
753
  Returns:
754
    nucleus.genomics.v1.Range proto.
755
756
  Raises:
757
    ValueError: if n_bp is invalid.
758
    KeyError: contig_map is not None and region.reference_name isn't a key.
759
  """
760
  if n_bp < 0:
761
    raise ValueError('n_bp must be >= 0 but got {}'.format(n_bp))
762
763
  new_start = max(region.start - n_bp, 0)
764
  new_end = region.end + n_bp
765
  if contig_map is not None:
766
    new_end = min(new_end, contig_map[region.reference_name].n_bases)
767
  return make_range(region.reference_name, new_start, new_end)
768
769
770
def span(regions):
771
  """Returns a region that spans all of the bases in regions.
772
773
  This function returns a Range(chrom, start, stop), where start is the min
774
  of the starts in regions, and stop is the max end in regions. It may not be
775
  freshly allocated.
776
777
  Args:
778
    regions: list[Range]: a list of Range protos.
779
780
  Returns:
781
    A single Range proto.
782
783
  Raises:
784
    ValueError: if not all regions have the same reference_name.
785
    ValueError: if regions is empty.
786
  """
787
  if not regions:
788
    raise ValueError('regions is empty but must have at least one region')
789
  elif len(regions) == 1:
790
    return regions[0]
791
  elif any(r.reference_name != regions[0].reference_name for r in regions):
792
    raise ValueError('regions must be all on the same contig')
793
  else:
794
    start = min(r.start for r in regions)
795
    end = max(r.end for r in regions)
796
    return make_range(regions[0].reference_name, start, end)
797
798
799
def length(region):
800
  """Returns the length in basepairs of region."""
801
  return region.end - region.start