Switch to unified view

a b/third_party/nucleus/util/utils.cc
1
/*
2
 * Copyright 2018 Google LLC.
3
 *
4
 * Redistribution and use in source and binary forms, with or without
5
 * modification, are permitted provided that the following conditions
6
 * are met:
7
 *
8
 * 1. Redistributions of source code must retain the above copyright notice,
9
 *    this list of conditions and the following disclaimer.
10
 *
11
 * 2. Redistributions in binary form must reproduce the above copyright
12
 *    notice, this list of conditions and the following disclaimer in the
13
 *    documentation and/or other materials provided with the distribution.
14
 *
15
 * 3. Neither the name of the copyright holder nor the names of its
16
 *    contributors may be used to endorse or promote products derived from this
17
 *    software without specific prior written permission.
18
 *
19
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
 * POSSIBILITY OF SUCH DAMAGE.
30
 *
31
 */
32
33
#include <algorithm>
34
#include <map>
35
#include <string>
36
#include <vector>
37
38
#include "third_party/nucleus/util/utils.h"
39
40
#include "absl/log/check.h"
41
#include "absl/log/log.h"
42
#include "absl/strings/substitute.h"
43
#include "third_party/nucleus/protos/cigar.pb.h"
44
#include "third_party/nucleus/protos/reads.pb.h"
45
46
namespace nucleus {
47
48
using nucleus::genomics::v1::CigarUnit;
49
using nucleus::genomics::v1::Position;
50
using nucleus::genomics::v1::Range;
51
using nucleus::genomics::v1::Read;
52
using nucleus::genomics::v1::Variant;
53
54
using absl::string_view;
55
using absl::Substitute;
56
57
namespace {
58
59
// Returns a StringPiece containing the canonical base characters corresponding
60
// to the requested CanonicalBases in canon.
61
string_view GetCanonicalBases(const CanonicalBases canon) {
62
  switch (canon) {
63
    case CanonicalBases::ACGT:
64
      return "ACGT";
65
    case CanonicalBases::ACGTN:
66
      return "ACGTN";
67
      // We are not adding a default clause here, to explicitly make clang
68
      // detect the missing codes. This conversion method must stay in sync with
69
      // CanonicalBases enum values.
70
  }
71
72
  LOG(FATAL) << "Invalid CanonicalBases value" << static_cast<int>(canon);
73
  return "";
74
}
75
76
// Looks up the `variant` name in the mapping `contig_name_to_pos_in_fasta` and
77
// returns the corresponding pos_in_fasta. This functions assumes the variant
78
// name exists in the mapping.
79
int PosInFasta(const std::map<string, int>& contig_name_to_pos_in_fasta,
80
               const Variant& variant) {
81
  std::map<string, int>::const_iterator pos_in_fasta =
82
      contig_name_to_pos_in_fasta.find(variant.reference_name());
83
  QCHECK(pos_in_fasta != contig_name_to_pos_in_fasta.end())
84
      << "Reference name " << variant.reference_name()
85
      << " not in contig info.";
86
  return pos_in_fasta->second;
87
}
88
89
}  // namespace
90
91
bool IsCanonicalBase(const char base, const CanonicalBases canon) {
92
  return GetCanonicalBases(canon).find(base) != string_view::npos;
93
}
94
95
size_t FindNonCanonicalBase(string_view bases, const CanonicalBases canon) {
96
  for (size_t i = 0; i < bases.size(); i++) {
97
    // Meh.  This should be a static lookup table.
98
    if (!IsCanonicalBase(bases[i], canon)) {
99
      return i;
100
    }
101
  }
102
  return string::npos;
103
}
104
105
bool AreCanonicalBases(string_view bases, const CanonicalBases canon,
106
                       size_t* bad_position) {
107
  CHECK(!bases.empty()) << "bases cannot be empty";
108
  const size_t bad_pos = FindNonCanonicalBase(bases, canon);
109
  if (bad_pos == string::npos) return true;
110
  if (bad_position != nullptr) *bad_position = bad_pos;
111
  return false;
112
}
113
114
Position MakePosition(string_view chr, const int64 pos,
115
                      const bool reverse_strand) {
116
  Position position;
117
  position.set_reference_name(string(chr));  // TODO
118
  position.set_position(pos);
119
  position.set_reverse_strand(reverse_strand);
120
  return position;
121
}
122
123
Position MakePosition(const Variant& variant) {
124
  return MakePosition(variant.reference_name(), variant.start());
125
}
126
127
Range MakeRange(string_view chr, const int64 start, const int64 end) {
128
  Range range;
129
  range.set_reference_name(string(chr));  // TODO
130
  range.set_start(start);
131
  range.set_end(end);
132
  return range;
133
}
134
135
Range MakeRange(const Variant& variant) {
136
  return MakeRange(variant.reference_name(), variant.start(), variant.end());
137
}
138
139
Range MakeRange(const Read& read) {
140
  return MakeRange(AlignedContig(read), ReadStart(read), ReadEnd(read));
141
}
142
143
void ReadRangePython(
144
    const nucleus::ConstProtoPtr<const ::nucleus::genomics::v1::Read>&
145
        read_wrapped,
146
    nucleus::EmptyProtoPtr<::nucleus::genomics::v1::Range> range_wrapped) {
147
  const Read& read = *read_wrapped.p_;
148
  Range* range = range_wrapped.p_;
149
  range->set_reference_name(read.alignment().position().reference_name());
150
  range->set_start(ReadStart(read));
151
  range->set_end(ReadEnd(read));
152
}
153
154
bool RangeContains(const Range& haystack, const Range& needle) {
155
  return (needle.reference_name() == haystack.reference_name() &&
156
          needle.start() >= haystack.start() &&
157
          needle.end() <= haystack.end());
158
}
159
160
bool RangesContainVariant(
161
    const std::vector<nucleus::genomics::v1::Range>& ranges,
162
    const Variant& variant) {
163
  for (const auto& range : ranges) {
164
    if (range.reference_name() == variant.reference_name() &&
165
        range.start() <= variant.start() && range.end() > variant.start()) {
166
      return true;
167
    }
168
  }
169
  return false;
170
}
171
172
bool ReadOverlapsRegion(const ::nucleus::genomics::v1::Read& read,
173
                        const ::nucleus::genomics::v1::Range& range) {
174
  // Equivalent code in python from ranges.py:
175
  //
176
  // return (i1.reference_name == i2.reference_name and i1.end > i2.start and
177
  //         i1.start < i2.end)
178
  //
179
  // Here i1 is range and i2 is the range implied from the read.
180
  return
181
      // This is the cheapest calculation as read start is cheap to determine.
182
      range.end() > ReadStart(read) &&
183
      // Next we check read end, which is slightly more expensive as we need to
184
      // compute the end from the cigar.
185
      range.start() < ReadEnd(read) &&
186
      // Finally we compute if the reference_names are the same.
187
      range.reference_name() == AlignedContig(read);
188
}
189
190
// Creates an interval string from its arguments, like chr:start-end
191
string MakeIntervalStr(string_view chr, const int64 start, const int64 end,
192
                       bool base_zero) {
193
  int offset = base_zero ? 1 : 0;
194
  if (start == end) {
195
    // TODO: remove string conversion
196
    return Substitute("$0:$1", string(chr), start + offset);
197
  } else {
198
    // TODO: remove string conversion
199
    return Substitute("$0:$1-$2", string(chr), start + offset, end + offset);
200
  }
201
}
202
203
string MakeIntervalStr(const Position& position) {
204
  return MakeIntervalStr(position.reference_name(), position.position(),
205
                         position.position(), true);
206
}
207
208
string MakeIntervalStr(const Range& interval) {
209
  return MakeIntervalStr(interval.reference_name(), interval.start(),
210
                         interval.end(), true);
211
}
212
213
string AlignedContig(const Read& read) {
214
  return read.has_alignment() ? read.alignment().position().reference_name()
215
                              : "";
216
}
217
218
int64 ReadStart(const Read& read) {
219
  return read.alignment().position().position();
220
}
221
222
int64 ReadEnd(const Read& read) {
223
  int64 position = ReadStart(read);
224
  for (const auto& cigar : read.alignment().cigar()) {
225
    switch (cigar.operation()) {
226
      case CigarUnit::ALIGNMENT_MATCH:
227
      case CigarUnit::SEQUENCE_MATCH:
228
      case CigarUnit::DELETE:
229
      case CigarUnit::SKIP:
230
      case CigarUnit::SEQUENCE_MISMATCH:
231
        position += cigar.operation_length();
232
        break;
233
      default:
234
        // None of the other operations change the alignment offset
235
        break;
236
    }
237
  }
238
239
  return position;
240
}
241
242
int ComparePositions(const Position& pos1, const Position& pos2) {
243
  int result = pos1.reference_name().compare(pos2.reference_name());
244
  if (result == 0) {
245
    result = static_cast<int>(pos1.position() - pos2.position());
246
  }
247
  return result;
248
}
249
250
// TODO: should compare ranges, implement compare range
251
int ComparePositions(const Variant& variant1, const Variant& variant2) {
252
  return ComparePositions(MakePosition(variant1), MakePosition(variant2));
253
}
254
255
// True if:
256
// -- read is not part of a pair
257
// -- read is explicitly marked as properly placed by the aligner
258
// -- read has an unmapped mate (we only can see the next mate)
259
// -- read is unmapped itself
260
// -- read and mate are mapped to the same contig
261
bool IsReadProperlyPlaced(const Read& read) {
262
  return (read.number_reads() < 2 || read.proper_placement() ||
263
          read.next_mate_position().reference_name().empty() ||
264
          !read.has_alignment() ||
265
          AlignedContig(read) == read.next_mate_position().reference_name());
266
}
267
268
inline string_view ClippedSubstr(string_view s, size_t pos, size_t n) {
269
  pos = std::min(pos, static_cast<size_t>(s.size()));
270
  return s.substr(pos, n);
271
}
272
273
string_view Unquote(string_view input) {
274
  if (input.size() < 2) return input;
275
  char firstChar = input[0];
276
  char lastChar = input[input.size() - 1];
277
  if ((firstChar == '"' || firstChar == '\'') && (firstChar == lastChar)) {
278
    return nucleus::ClippedSubstr(input, 1, input.size() - 2);
279
  } else {
280
    return input;
281
  }
282
}
283
284
std::map<string, int> MapContigNameToPosInFasta(
285
    const std::vector<nucleus::genomics::v1::ContigInfo>& contigs) {
286
  // Create the mapping from from contig to pos_in_fasta.
287
  std::map<string, int> contig_name_to_pos_in_fasta;
288
  for (const nucleus::genomics::v1::ContigInfo& contig : contigs) {
289
    contig_name_to_pos_in_fasta[contig.name()] = contig.pos_in_fasta();
290
  }
291
  return contig_name_to_pos_in_fasta;
292
}
293
294
bool CompareVariants(const Variant& a, const Variant& b,
295
                     const std::map<string, int>& contig_name_to_pos_in_fasta) {
296
  const int pos_in_fasta_a = PosInFasta(contig_name_to_pos_in_fasta, a);
297
  const int pos_in_fasta_b = PosInFasta(contig_name_to_pos_in_fasta, b);
298
  if (pos_in_fasta_a != pos_in_fasta_b) {
299
    return pos_in_fasta_a < pos_in_fasta_b;
300
  }
301
  const int64 start_a = a.start();
302
  const int64 start_b = b.start();
303
  if (start_a != start_b) {
304
    return start_a < start_b;
305
  }
306
  return a.end() < b.end();
307
}
308
309
bool EndsWith(const string& s, const string& t) {
310
  if (t.size() > s.size()) return false;
311
  return std::equal(t.rbegin(), t.rend(), s.rbegin());
312
}
313
314
}  // namespace nucleus