a b/third_party/nucleus/util/utils.py
1
# Copyright 2018 Google LLC.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions
5
# are met:
6
#
7
# 1. Redistributions of source code must retain the above copyright notice,
8
#    this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
#    notice, this list of conditions and the following disclaimer in the
12
#    documentation and/or other materials provided with the distribution.
13
#
14
# 3. Neither the name of the copyright holder nor the names of its
15
#    contributors may be used to endorse or promote products derived from this
16
#    software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
# POSSIBILITY OF SUCH DAMAGE.
29
"""Utility functions for working with reads."""
30
31
from __future__ import absolute_import
32
from __future__ import division
33
from __future__ import print_function
34
35
import numpy as np
36
37
from third_party.nucleus.protos import range_pb2
38
from third_party.nucleus.util.python import utils as utils_cpp
39
40
41
def read_overlaps_region(read, region):
42
  """Returns True if read overlaps read.
43
44
  This function is equivalent to calling:
45
46
    `ranges.ranges_overlap(region, read_range(read))`
47
48
  But is optimized for speed and memory performance in C++.
49
50
  Args:
51
    read: nucleus.genomics.v1.Read.
52
    region: nucleus.genomics.v1.Range.
53
54
  Returns:
55
    True if read and region overlap (i.e, have the same reference_name and their
56
    start/ends overlap at least one basepair).
57
  """
58
  return utils_cpp.read_overlaps_region(read, region)
59
60
61
def read_range(read):
62
  """Creates a Range proto from the alignment of Read.
63
64
  Args:
65
    read: nucleus.genomics.v1.Read. The read to calculate the range for.
66
67
  Returns:
68
    A nucleus.genomics.v1.Range for read.
69
  """
70
  range_pb = range_pb2.Range()
71
  utils_cpp.read_range(read, range_pb)
72
  return range_pb
73
74
75
def read_end(read):
76
  """Returns the read start + alignment length for Read read."""
77
  return read_range(read).end
78
79
80
def reservoir_sample(iterable, k, random=None):
81
  """Samples k elements with uniform probability from an iterable.
82
83
  Selects a subset of k elements from n input elements with uniform probability
84
  without needing to hold all n elements in memory at the same time. This
85
  implementation has max space complexity O(min(k, n)), i.e., we allocate up to
86
  min(k, n) elements to store the samples. This means that we only use ~n
87
  elements when n is smaller than k, which can be important when k is large. If
88
  n elements are added to this sampler, and n <= k, all n elements will be
89
  retained. If n > k, each added element will be retained with a uniform
90
  probability of k / n.
91
92
  The order of the k retained samples from our n elements is undefined. In
93
  particular that means that the elements in the returned list can occur in a
94
  different order than they appeared in the iterable.
95
96
  More details about reservoir sampling (and the specific algorithm used here
97
  called Algorithm R) can be found on wikipedia:
98
99
  https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R
100
101
  Args:
102
    iterable: Python iterable. The iterable to sample from.
103
    k: int. The number of elements to sample.
104
    random: A random number generator or None.
105
106
  Returns:
107
    A list containing the k sampled elements.
108
109
  Raises:
110
    ValueError: If k is negative.
111
  """
112
  if k < 0:
113
    raise ValueError('k must be nonnegative, but got {}'.format(k))
114
  if random is None:
115
    random = np.random
116
  sample = []
117
  for i, item in enumerate(iterable):
118
    if len(sample) < k:
119
      sample.append(item)
120
    else:
121
      j = random.randint(0, i + 1)
122
      if j < k:
123
        sample[j] = item
124
  return sample