[9b26b7]: / deepvariant / allelecounter.h

Download this file

473 lines (414 with data), 21.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/*
* Copyright 2017 Google LLC.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
// Compute AlleleCounts over an interval on the genome given the reads that
// align somewhere within that interval.
//
#ifndef LEARNING_GENOMICS_DEEPVARIANT_ALLELECOUNTER_H_
#define LEARNING_GENOMICS_DEEPVARIANT_ALLELECOUNTER_H_
#ifndef FRIEND_TEST
#define FRIEND_TEST(test_case_name, test_name)\
friend class test_case_name##_##test_name##_Test
#endif
#include <memory>
#include <string>
#include <vector>
#include "deepvariant/protos/deepvariant.pb.h"
#include "absl/strings/string_view.h"
#include "third_party/nucleus/io/reference.h"
#include "third_party/nucleus/protos/cigar.pb.h"
#include "third_party/nucleus/protos/position.pb.h"
#include "third_party/nucleus/protos/range.pb.h"
#include "third_party/nucleus/protos/reads.pb.h"
#include "third_party/nucleus/util/proto_ptr.h"
namespace learning {
namespace genomics {
namespace deepvariant {
using std::string;
// Summarizes the counts of all of the distinct alleles present in allele_count.
//
// Computes and returns a vector of Allele objects, one for each distinct Allele
// across all reads in allele_count. Each allele in the vector has its count
// field set to the number of reads that carried that allele.
std::vector<Allele> SumAlleleCounts(const AlleleCount& allele_count,
bool include_low_quality = false);
// Summarizes the counts of all of the distinct alleles present in allele_count
// for one position combined for all DeepTrio samples. Effectively this function
// merges allele_count from all DeepTrio samples.
// This function is similar to SumAlleleCounts(const AlleleCount& allele_count)
std::vector<Allele> SumAlleleCounts(
const std::vector<AlleleCount>& allele_counts,
bool include_low_quality = false);
// Gets the total count of observed alleles in this allele_count, which is the
// sum of the observed non-reference alleles in read_alleles + the total number
// of reference supporting reads.
int TotalAlleleCounts(const AlleleCount& allele_count,
bool include_low_quality = false);
// Gets the total count of observed alleles in allele_count from all DeepTrio
// samples, which is the sum of the observed non-reference alleles in
// read_alleles + the total number of reference supporting reads.
int TotalAlleleCounts(const std::vector<AlleleCount>& allele_counts,
bool include_low_quality = false);
// Binary search for allele index by position.
int AlleleIndex(const std::vector<AlleleCount>& allele_counts, int64_t pos);
// Represents an Allele observed in a read at a specific position in our
// interval. Supports the concept that the site should be skipped but still
// needs to be represented in a data processing chain. ReadAlleles marked as
// skip() won't be added to the AlleleCount at position but since they are
// explicitly represented can be reasoned about relative to other ReadAlleles.
// The pos() of a ReadAllele is an offset in our interval where this Allele
// should be added. For example, if pos() == 1, the Allele implied by this
// ReadAllele would be added to the second AlleleCount of this AlleleCounter.
// The position of the ReadAllele can be < 0 and beyond the length of the
// interval, indicating that some read that overlaps our interval carries an
// Allele but it shouldn't be added to our AlleleCounts.
class ReadAllele {
public:
ReadAllele() = default;
// Creates a ReadAllele with position, bases, and type.
ReadAllele(int position, absl::string_view bases, const AlleleType& type,
bool is_low_quality = false)
: position_(position),
bases_(bases),
type_(type),
low_quality_allele_(is_low_quality) {}
// Gets the position of this ReadAllele. Can be < 0 or >= IntervalLength(),
// indicating that the ReadAllele refers to a position outside of the
// interval.
int position() const { return position_; }
// Returns true if this ReadAllele should be skipped.
bool skip() const { return position_ == kInvalidPosition; }
// Gets the bases of the allele observed at this position.
const string& bases() const { return bases_; }
// Gets the type of the allele observed at this position.
const AlleleType& type() const { return type_; }
bool is_low_quality() const { return low_quality_allele_; }
private:
static constexpr int kInvalidPosition = -1;
const int position_ = kInvalidPosition;
const string bases_ = "";
const AlleleType type_ = AlleleType::UNSPECIFIED;
bool low_quality_allele_ = false;
};
// Workhorse class to compute AlleleCounts over an interval on the genome.
//
// AlleleCounter works roughly as follows:
//
// AlleleCounter counter = AlleleCounter(ref, interval)
//
// // Add reads that overlap our interval
// for ( read : reads_that_overlap_interval ) {
// counter.Add(read)
//
// // Tell the counter that we are done adding reads
// counter.Finalize()
// vector<AlleleCount> counter.Counts()
//
// The returned vector of AlleleCounts has an AlleleCount for every position in
// the interval. Each AlleleCount contains a position, the reference base at
// that position, and a repeated field of Allele protos which contain the bases,
// type, and counts of the alleles. Each observed allele comes from the
// alignment of a read at that position in the genome. For example, if we have:
//
// pos: 0123
// ref: ACGT
// read: CG
//
// We could produce AlleleCounts with 0 observed alleles at position 0, one C
// at position 1, one G at position 2, and no alleles at position 3.
//
// This becomes more complex when insertion and deletion alleles occur, so that
// you have:
//
// pos: 0123
// ref: ACGT
// read: C-T
//
// Would actually produce an allele CG deletion at position 1, indicating that
// the reference base was C and that a G base was deleted. No allele count would
// occur at G (after all it was never observed), and there would be 1 T at
// position 3. The AlleleCount proto and associated other protos provide
// additional details.
//
// The AlleleCount's generated by adding each read simply sum up independently
// with multiple reads, which is a very attractive property of the AlleleCount
// representation.
//
// Note that this code can diverge from the left-alignment requirement of VCF /
// variant protos when the input read cigars are themselves not left aligned.
// For example, suppose we have:
//
// ref: TAAAC
// sample: TAAC
//
// Ideally an aligner would place the DELETION cigar elements for any read
// covering this site to the left-most position:
//
// ref: TAAAC
// read1: T-AAC [cigar = 1M1D3M, ideal]
// read2: TA-AC [cigar = 2M1D2M, pretty far from ideal, but equivalent]
//
// This code doesn't try to do anything clever by left-aligning CIGAR elements
// in order to fix this problem. This is largely ok because (1) the standard
// aligner (BWA) does in fact do consistent left alignment and (2) we anticipate
// feeding this AlleleCounter reads that have been assembled into a consistent
// alignment across all of the reads in a way that is impossible for a read-by-
// read aligner to do. So beware with the aligner you use, unless you've cleaned
// up the reads in some way (left aligning the cigars or just doing assembly).
//
// It is ok to send reads that only partially overlap the interval; the
// AlleleCounter will only add counts from the part that overlap the interval.
//
// This code assumes that the reference genome and the reads have only upper
// case bases. By construction our GenomeReference will not have lower case
// bases. Lower case bases are allowed in read sequences (see
// https://samtools.github.io/hts-specs/SAMv1.pdf),
// so it may be worth upper casing the read sequence at some point to make this
// code more robust.
//
// AlleleCounter performs some specialized logic to deal with non-canonical
// bases that might occur in the reference and/or the reads. By canonical we
// mean bases that are one of {A,C,G,T}. The basic constraint is that we will
// not add Allele's that contain non-canonical bases. So if we have a read that
// has (e.g.) an N, we will not produce Allele objects that contain the N. So
// if it doesn't match the reference at a site, it will be skipped as a
// SUBSTITUTION allele, and if it were part of an insertion, that entire
// insertion will be skipped. We do support AlleleCount objects that have a N
// base as its reference base, and those bases get alleles just as a normal base
// would. Downstream processing should look at the reference base of the
// AlleleCounts produced by this code for non-canonical reference bases and
// handle if special handling is needed. Finally, if a read's deletion CIGAR
// spans across part of the reference genome that has an N, the corresponding
// DELETION allele will be dropped.
class AlleleCounter {
public:
// Creates an AlleleCounter.
//
// The counter will track AlleleCounts over the interval range, using ref to
// get reference data in that interval, using the options to determine which
// reads and bases are counted.
// Candidate_positions parameter is optional. When it is present allele
// counter saves ref alleles for positions found in this vector.
//
// The GenomeReference must be available throughout the lifetime of this
// AlleleCounter object.
AlleleCounter(const nucleus::GenomeReference* ref,
const nucleus::genomics::v1::Range& range,
const std::vector<int>& candidate_positions,
const AlleleCounterOptions& options);
// An alternative constructor that allows to use a wider reference region for
// allele counter. This is needed for read normalization for those reads that
// only partially overlap allele counter region.
AlleleCounter(const nucleus::GenomeReference* ref,
const nucleus::genomics::v1::Range& range,
const nucleus::genomics::v1::Range& full_range,
const std::vector<int>& candidate_positions,
const AlleleCounterOptions& options);
// This Init is used by unit tests only.
static AlleleCounter* InitFromAlleleCounts(
const std::vector<AlleleCount>& allele_counts);
// Adds the alleles from read to our AlleleCounts. This method is also called
// by NormalizeAndAdd. In that case allele counts are created using a
// normalized cigar and update read alignment position passed as an optional
// parameter.
void Add(const nucleus::genomics::v1::Read& read, absl::string_view sample,
const std::vector<nucleus::genomics::v1::CigarUnit>* cigar_to_use =
nullptr,
int read_shift = 0);
// Wrapper around Add() that normalize the input read first and then calls
// Add().
void NormalizeAndAdd(
const nucleus::genomics::v1::Read& read, absl::string_view sample,
std::unique_ptr<std::vector<nucleus::genomics::v1::CigarUnit>>&
norm_cigar,
int& read_shift);
// Python wrapper around NormalizeAndAdd. It allows to avoid serialization of
// protos when calling from Python.
std::unique_ptr<std::vector<nucleus::genomics::v1::CigarUnit>>
NormalizeAndAddPython(const nucleus::ConstProtoPtr<
const nucleus::genomics::v1::Read>& wrapped,
const string& sample, int* read_shift) {
auto norm_cigar =
std::make_unique<std::vector<nucleus::genomics::v1::CigarUnit>>(
std::vector<nucleus::genomics::v1::CigarUnit>());
NormalizeAndAdd(*(wrapped.p_), sample, norm_cigar, *read_shift);
return norm_cigar;
}
// Simple wrapper around Add() that allows us to efficiently pass large
// protobufs in from Python. Simply unwraps the ConstProtoPtr objects and
// calls Add(read).
void AddPython(const nucleus::ConstProtoPtr<
const nucleus::genomics::v1::Read>& wrapped,
const string& sample) {
Add(*(wrapped.p_), sample, nullptr);
}
// Gets the options in use by this AlleleCounter
const AlleleCounterOptions& Options() const { return options_; }
// Gets the interval we are counting alleles over.
const nucleus::genomics::v1::Range& Interval() const { return interval_; }
// Gets the interval overlapping all the reads.
const nucleus::genomics::v1::Range& ReadsInterval() const {
return reads_interval_;
}
// Returns the number of basepairs in our interval.
int64_t IntervalLength() const { return interval_.end() - interval_.start(); }
// Return the number of basepairs of the interval overlapping all the reads.
int64_t ReadsIntervalLength() const {
return reads_interval_.end() - reads_interval_.start();
}
// Gets the completed AlleleCounts over this counter's interval.
//
// This function returns a vector of AlleleCount objects, one for each
// basepair in our interval, filled in according to the reads that have been
// added via calls to Add*() routines.
//
// Calling this routine finalizes the AlleleCounts(), so that subsequent calls
// to Add*() will fail.
const std::vector<AlleleCount>& Counts() const { return counts_; }
// Similar to Counts() function but returns a lighter-weight summary proto.
//
// This function has all of the behavior of calling Counts() but instead of
// returning the heavy-weight AlleleCount proto this returns a simpler proto.
// See the proto description for more information about the proto fields.
std::vector<AlleleCountSummary> SummaryCounts(int left_padding = 0,
int right_padding = 0) const;
// How many reads have been added to this counter?
int NCountedReads() const { return n_reads_counted_; }
// Constructs a unique string key for this read. The key is the concatenation
// of fragment_name, "/", and read_number.
string ReadKey(const nucleus::genomics::v1::Read& read);
private:
// This constructor is used for unit testing only.
AlleleCounter();
// Initialize allele counter.
void Init();
// Helper function to get the reference bases between offsets rel_start
// (inclusive) and rel_end (exclusive). The offsets are both relative to our
// interval, so rel_start = 0 means the first base in our interval. Because
// of their relative nature, it's meaningful to ask for offsets that are
// negative (gets bases before the start of the interval) or offsets that are
// longer than the interval. Will return the empty string if the actual
// genomic coordinates implied by the offsets aren't all on the chromosome.
string RefBases(int64_t rel_start, int64_t len);
// Returns True if ref_offset (where 0 indicates the first position in the
// interval, which could be base 1234 in genomic coordinates, for example), is
// within our interval. This means that ref_offset >= 0 and ref_offset <
// IntervalLength().
bool IsValidRefOffset(int ref_offset) {
return ref_offset >= 0 && ref_offset < ReadsIntervalLength();
}
// Returns True if interval_offset is within our allele counter interval.
bool IsValidIntervalOffset(int interval_offset) {
return interval_offset >= 0 && interval_offset < IntervalLength();
}
// Gets the base before read_offset in read, or if that would be before the
// start of the read (i.e., read_offset == 0) then return the previous base on
// the reference genome (at interval_offset - 1).
string GetPrevBase(const nucleus::genomics::v1::Read& read, int read_offset,
int interval_offset);
// Creates a ReadAllele for an indel (type based on cigar) from read starting
// at read_offset position in the read to the AlleleCount at interval_offset.
// Does all of the necessary quality checks to ensure we only add good bases
// to the our AlleleCounts, as well as manages the complexity of determining
// the correct allele to add. May return a ReadAllele marked as skip() if the
// implied allele isn't valid for some reason (e.g., bases are too low
// quality).
ReadAllele MakeIndelReadAllele(
const nucleus::genomics::v1::Read& read, int interval_offset,
int ref_offset, int read_offset,
const nucleus::genomics::v1::CigarUnit& cigar);
// Adds the ReadAlleles in to_add to our AlleleCounts.
void AddReadAlleles(const nucleus::genomics::v1::Read& read,
absl::string_view sample,
const std::vector<ReadAllele>& to_add);
// Normalize cigar by shifting INDELs in the middle of a repeat all the way
// to the left. As a result of shifting two INDELs may become merged. Merged
// INDEL may become non-normalized so the process is repeated up to 10 times.
// If INDEL is shifted all the way to the beginning of the read then this
// INDEL is removed and read alignment position has to be shifted.
bool NormalizeCigar(const absl::string_view read_seq, int interval_offset,
std::vector<nucleus::genomics::v1::CigarUnit>& cigar,
int& read_shift) const;
// Helper function used in NormalizeCigar function. Returns true if deletion
// operation can be shifted left preserving the alignment.
bool CanDelBeShifted(
const absl::string_view read_seq,
std::vector<nucleus::genomics::v1::CigarUnit>::const_iterator cigar_elt,
int read_offset,
int interval_offset,
int op_len) const;
// Helper function used in NormalizeCigar function. Returns true if insertion
// operation can be shifted left preserving the alignment.
bool CanInsBeShifted(
const absl::string_view read_seq,
std::vector<nucleus::genomics::v1::CigarUnit>::const_iterator cigar_elt,
int read_offset,
int interval_offset,
int op_len) const;
// Our GenomeReference, which we use to get information about the reference
// bases in our interval.
const nucleus::GenomeReference* const ref_;
// The interval chr from start (0-based, inclusive) to end (0-based,
// exclusive) describing where we are counting on the genome. We will produce
// one AlleleCount for each base in interval, from start to end (exclusive).
const nucleus::genomics::v1::Range interval_;
// The interval chr from start (0-based, inclusive) to end (0-based,
// exclusive) of the available ref bases. By default this interval is equal to
// to the interval_. If read normalization is needed then reads_interval_ may
// be an etentsion of interval_. reads_interval_ spans from the first position
// of the very first read to the last position of the last read.
const nucleus::genomics::v1::Range reads_interval_;
// Vector of potential candidate positions. Ref alleles are stored only for
// positions found in this vector. This functionality is optional and
// governed by track_ref_reads flag.
std::vector<int> candidate_positions_;
// The options that are controlling how we count reads.
const AlleleCounterOptions options_;
// The number of reads we've added to this interval.
int n_reads_counted_ = 0;
// Our AlleleCount objects, one for each base in our interval, in order.
std::vector<AlleleCount> counts_;
// The reference bases covering our interval;
const string ref_bases_;
// Following tests call protected method NormalizeCigar.
FRIEND_TEST(AlleleCounterTest, NormalizeCigarDel);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarIns);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarInsDel);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarInsertAtTheEnd);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarTwoDelsMerged);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarDelInsMerged);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarInsShiftedToEdge);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarInsShiftedAllTheWayToSoftClip);
FRIEND_TEST(AlleleCounterTest, NormalizeCigarDelInsMergedNoShift);
};
} // namespace deepvariant
} // namespace genomics
} // namespace learning
#endif // LEARNING_GENOMICS_DEEPVARIANT_ALLELECOUNTER_H_