[9b26b7]: / deepvariant / realigner / fast_pass_aligner.h

Download this file

437 lines (356 with data), 16.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
/*
* Copyright 2018 Google LLC.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LEARNING_GENOMICS_DEEPVARIANT_REALIGNER_FAST_PASS_ALIGNER_H_
#define LEARNING_GENOMICS_DEEPVARIANT_REALIGNER_FAST_PASS_ALIGNER_H_
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <utility>
#include <vector>
#include "deepvariant/protos/realigner.pb.h"
#include "deepvariant/realigner/ssw.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/node_hash_map.h"
#include "absl/memory/memory.h"
#include "absl/strings/string_view.h"
#include "third_party/nucleus/protos/cigar.pb.h"
#include "third_party/nucleus/protos/reads.pb.h"
#include "re2/re2.h"
namespace learning {
namespace genomics {
namespace deepvariant {
using nucleus::genomics::v1::CigarUnit;
using nucleus::genomics::v1::LinearAlignment;
using nucleus::genomics::v1::Position;
// TODO Add ChromosomePosition type instead of uint64_t
struct Kmer {
string sequence;
};
struct ReadId {
ReadId() : is_set(false), id(0) {}
explicit ReadId(size_t id) : is_set(true), id(id) {}
explicit operator int64_t() const { return id; }
explicit operator uint64_t() const { return id; }
bool operator<(const ReadId& that) const { return id < that.id; }
bool operator==(const ReadId& that) const { return id == that.id; }
bool is_set;
size_t id;
};
struct KmerOffset {
KmerOffset() : is_set(false), pos(0) {}
explicit KmerOffset(size_t pos) : is_set(true), pos(pos) {}
bool operator==(const KmerOffset& that) const {
return pos == that.pos && is_set == that.is_set;
}
bool is_set;
size_t pos;
};
// Used in Reads kmer index
struct KmerOccurrence {
KmerOccurrence() {}
KmerOccurrence(ReadId read_id, KmerOffset pos)
: read_id(read_id), read_pos(pos) {}
bool operator==(const KmerOccurrence& that) const {
return read_id == that.read_id && read_pos == that.read_pos;
}
ReadId read_id; // ID if the read this Kmer appears in.
KmerOffset read_pos; // Position in the read.
};
// Store read alignment to haplotype
// TODO position could be of type KmerOffset, but then it needs to be
// renamed. That would be better because default position = 0 is a valid
// position.
struct ReadAlignment {
static constexpr uint16_t kNotAligned = std::numeric_limits<uint16_t>::max();
ReadAlignment() : position(kNotAligned), cigar(""), score(0) {}
ReadAlignment(uint16_t position_param, absl::string_view cigar_param,
int score_param)
: position(position_param), cigar(cigar_param), score(score_param) {}
bool operator==(const ReadAlignment& that) const {
return score == that.score && position == that.position &&
cigar == that.cigar;
}
void reset() {
score = 0;
position = kNotAligned;
cigar = "";
}
uint16_t position;
string cigar;
int score;
};
// Cigar operation is defined by operation type and its length.
struct CigarOp {
CigarOp()
: operation(nucleus::genomics::v1::CigarUnit::OPERATION_UNSPECIFIED),
length(0) {}
CigarOp(CigarUnit::Operation op, int len) : operation(op), length(len) {}
bool operator==(const CigarOp& that) const {
return operation == that.operation && length == that.length;
}
CigarUnit::Operation operation;
int length;
};
// TODO HaplotypeReadsAlignment is not a good name for this structure.
// It contains not only read alignments to haplotype but also haplotype to
// reference alignment.
// Stores alignment scores for all reads to one haplotype.
// haplotype_score = sum of all aligned read scores, where read score
// is a number of matched bases. This way we can order haplotypes by number of
// supporting reads.
struct HaplotypeReadsAlignment {
HaplotypeReadsAlignment() : haplotype_index(0), haplotype_score(0) {}
HaplotypeReadsAlignment(
size_t haplotype_index,
int score,
const std::vector<ReadAlignment>& read_alignment_scores)
: haplotype_index(haplotype_index), haplotype_score(score) {
this->read_alignment_scores.assign(read_alignment_scores.begin(),
read_alignment_scores.end());
}
bool operator==(const HaplotypeReadsAlignment& that) const {
return haplotype_index == that.haplotype_index &&
haplotype_score == that.haplotype_score &&
read_alignment_scores == that.read_alignment_scores &&
cigar == that.cigar && cigar_ops == that.cigar_ops &&
is_reference == that.is_reference &&
hap_to_ref_positions_map == that.hap_to_ref_positions_map;
}
bool operator<(const HaplotypeReadsAlignment& that) const {
return haplotype_score < that.haplotype_score;
}
// halpotype index in haplotypes member of FastPassAligner class
size_t haplotype_index;
// sum of all aligned read scores. Each read's
// haplotype_score = number of matched bases.
int haplotype_score;
// Simple alignment haplotype_score for each read.
std::vector<ReadAlignment> read_alignment_scores;
// Cigar string for haplotype against reference alignment.
string cigar;
// Cigar as a list of CigarOp
std::list<CigarOp> cigar_ops;
// Hypolotype to reference position.
uint64_t ref_pos;
// Map of shifts that allow to easily calculate read to reference position
// from read to haplotype position. read pos = read_to_haplotype_pos
// + haplotype_to_ref_position
// + hap_to_ref_positions_map[read_to_haplotype_pos]
std::vector<int> hap_to_ref_positions_map;
// If true the haplotype is a reference.
bool is_reference;
};
// Calculate a shift for each position of a haplotype from a haplotype to
// reference alignment.
void SetPositionsMap(size_t haplotype_size,
HaplotypeReadsAlignment* hyplotype_alignment);
void MergeCigarOp(const CigarOp& op, int read_len, std::list<CigarOp>* cigar);
using KmerIndexType =
absl::flat_hash_map<absl::string_view, std::vector<KmerOccurrence>>;
// Align a set of reads to a target sequence.
// This class is intended for realigning reads to haplotypes (graph paths)
// generated by DeBrujn graph. Since graph's paths are constructed from the same
// reads all "correct" paths should have almost perfect alignment with exception
// of reads with sequencing errors. Alignment is done by using reads kmer index.
// Using the index we can quickly (in O(n)) find all reads overlapping all kmers
// in a given haplotype. Haplotype is quickly discarded If we cannot get a 100%
// coverage using read kmer index. Details of 1st step: For each kmer in
// haplotype:
// 1. Find all reads overlapping given kmer
// 2. Compare each read with target sequence. Keep the read if we can have (x)M
// alignment, allowing up to 3 mismatches, where
// x is the length of the read. This operation can be very fast if we
// encode each base with 2 bits. We may use special instructions that count
// set bits in one cycle.
// 3. Keep only reads that have perfect alignment.
// 4. Due to sequencing errors some reads may be left unaligned. For those read
// we can use a SW
// alignment. Number of such reads should be very low (less than error rate
// because only structural errors may contribute)
// Next step we need to align haplotype to reference.
// Next step using haplotype to reference alignment we can correct all reads
// alignment for the final result.
class FastPassAligner {
public:
void set_reference(const string& reference);
void set_reads(const std::vector<string>& reads);
std::vector<string> get_reads() const { return reads_; }
void set_ref_start(const string& chromosome, uint64_t position);
void set_haplotypes(const std::vector<string>& haplotypes);
void set_normalize_reads(bool normalize_reads) {
normalize_reads_ = normalize_reads;
}
uint8_t get_match_score() const { return match_score_; }
uint8_t get_mismatch_penalty() const { return mismatch_penalty_; }
void set_options(const AlignerOptions& options);
void set_is_debug(bool is_debug) { debug_out_ = is_debug; }
void set_debug_read_id(int read_id) { debug_read_id_ = read_id; }
void set_ref_prefix_len(int ref_prefix_len) {
ref_prefix_len_ = ref_prefix_len;
}
void set_ref_suffix_len(int ref_suffix_len) {
ref_suffix_len_ = ref_suffix_len;
}
int16_t get_ssw_alignment_score_threshold() const {
return ssw_alignment_score_threshold_;
}
// Align reads to the reference by first aligning reads to haplotypes and
// then by merging haplotype to reference cigars and reads to haplotype
// cigars.
// This function is an entry point for FastPassAligner.
std::unique_ptr<std::vector<nucleus::genomics::v1::Read>> AlignReads(
const std::vector<nucleus::genomics::v1::Read>& reads_param);
// Build K-mer index for all reads.
void BuildIndex();
KmerIndexType GetKmerIndex() const { return kmer_index_; }
// Align all reads to a haplotype using fast pass alignment.
void FastAlignReadsToHaplotype(
absl::string_view haplotype, int* haplotype_score,
std::vector<ReadAlignment>* haplotype_read_alignment_scores);
// Align reads to haplotypes using SSW library. Only reads that could not
// be aligned with FastAlignReadsToHaplotype are aligned here.
// Only alignment with better than score_threshold score are kept.
void SswAlignReadsToHaplotypes(uint16_t score_threshold);
// Initialize SSW library.
void InitSswLib();
void SswSetReference(const string& reference);
// Align target sequence to the reference using SSW library. InitSswLib must
// be called prior to calling SswAlign.
Alignment SswAlign(const string& target) const;
// Align all haplotypes to the reference using SSW library.
void AlignHaplotypesToReference();
const std::vector<HaplotypeReadsAlignment>& GetReadToHaplotypeAlignments()
const {
return read_to_haplotype_alignments_;
}
// Finds the best alignment by iterating all haplotype alignments.
// TODO Can be optimized if we store the best alignment with the read.
bool GetBestReadAlignment(size_t readId, int* best_hap_index) const;
// Calculate a read alignment by merging read to haplotype and haplotype to
// reference alignments.
// 1. Extracts a portion of haplotype to reference cigar that overlaps
// positions of read to haplotype alignment.
// 2. Iterates through each cigar operation for both alignments, merging 2
// operations of length 1 at a time.
// 3. Different logic is implemented for each type of merges: =:=, DEL:=,
// =:DEL, INS:=, =:INS, DEL:DEL, INS:INS, DEL:INS
void CalculateReadToRefAlignment(
size_t read_index,
const ReadAlignment& read_to_haplotype_alignment,
const std::list<CigarOp>& haplotype_to_ref_cigar_ops_input,
std::list<CigarOp>* read_to_ref_cigar_ops) const;
bool IsAlignmentNormalized(
const std::list<CigarOp>& cigar,
int ref_offset,
absl::string_view read_sequence) const;
// Replace each read alignment with the better one (if available), otherwise
// an original alignment is preserved.
// realigned_reads param is eventually passed to Python wrapped in unique_ptr
// as per clif requirements.
void RealignReadsToReference(
const std::vector<nucleus::genomics::v1::Read>& reads,
std::unique_ptr<std::vector<nucleus::genomics::v1::Read>>*
realigned_reads);
void CalculateSswAlignmentScoreThreshold();
private:
// Reference sequence for the window
string reference_;
// Chromosome name for the window start position
string region_chromosome_;
// Position in chromosome for the window start position
uint64_t region_position_in_chr_;
// Vector of haploptypes sequences
std::vector<string> haplotypes_;
std::vector<HaplotypeReadsAlignment> read_to_haplotype_alignments_;
// index of reads. Allows to find all reads that contain a given k-mer
// and their align position.
KmerIndexType kmer_index_;
// Vector of reads that need to be realigned
std::vector<string> reads_;
// K-mer size that is used for indexing input reads
int kmer_size_ = 32;
// Expected read length. It is needed for sanity checking. Actual reads may
// be different sizes. Although, actual read sizes should be close to
// read_size.
int read_size_;
int max_num_of_mismatches_ = 2;
// Only read alignments with the score greater than
// ssw_alignment_score_threshold_ are kept. ssw_alignment_score_threshold_
// is calculated from similarity_threshold_ and read_size.
// Most of the reads should almost perfectly align to haplotypes. Read may
// not align to haplotype perfectly if:
// - there is a sequencing error;
// - read does not really come from this region;
// - haplotype does not represent a real target genome sequence.
int16_t ssw_alignment_score_threshold_;
// Four parameters below specify alignment scoring schema.
uint8_t match_score_ = 4;
uint8_t mismatch_penalty_ = 6;
uint8_t gap_opening_penalty_ = 8;
uint8_t gap_extending_penalty_ = 1;
bool force_alignment_ = false;
// Threshold is calculated from this flag using the following formula.
// score_threshold = match_score_ * read_size_ * <similarity_threshold_>
// - mismatch_penalty_ * read_size_ * (1 - <similarity_threshold_ >);
// This threshold is used for alignments performed with SSW.
double similarity_threshold_ = 0.85;
// SSW aligner instance
std::unique_ptr<Aligner> ssw_aligner_;
// These attributes allow debug output.
bool debug_out_ = false;
int debug_read_id_ = 0;
int ref_prefix_len_;
int ref_suffix_len_;
// Set to the same value of --normalize_reads flag from make_examples.
bool normalize_reads_ = false;
// Alingn reads to haplotypes by simply comparing strings. This way we will
// be able align all the reads that are aligned to haplotypes w/o indels.
void FastAlignReadsToHaplotypes();
void AddReadToIndex(absl::string_view read, ReadId read_id);
void AddKmerToIndex(absl::string_view kmer, ReadId read_id,
KmerOffset pos);
int FastAlignStrings(absl::string_view s1, absl::string_view s2,
int max_mismatches, int* num_of_mismatches) const;
void UpdateBestHaplotypes(
size_t haplotype_index, int haplotype_score,
const std::vector<ReadAlignment>& current_read_scores);
// Update position map for each haplotype. Position map stores shifts for
// each position of a haplotype in respect to haplotype to reference
// alignment. This map helps to quickly calculate reference position for
// a read from a read to haplotype alignment.
void CalculatePositionMaps();
};
} // namespace deepvariant
} // namespace genomics
} // namespace learning
#endif // LEARNING_GENOMICS_DEEPVARIANT_REALIGNER_FAST_PASS_ALIGNER_H_