deepvariant-r1.6.1 / Git / [9b26b7] /deepvariant/realigner/fast_pass

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / deepvariant / realigner / fast_pass_aligner.h
History
Download this file
437 lines (356 with data), 16.3 kB

/*
 * Copyright 2018 Google LLC.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef LEARNING_GENOMICS_DEEPVARIANT_REALIGNER_FAST_PASS_ALIGNER_H_
#define LEARNING_GENOMICS_DEEPVARIANT_REALIGNER_FAST_PASS_ALIGNER_H_

#include <limits>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <utility>
#include <vector>

#include "deepvariant/protos/realigner.pb.h"
#include "deepvariant/realigner/ssw.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/node_hash_map.h"
#include "absl/memory/memory.h"
#include "absl/strings/string_view.h"
#include "third_party/nucleus/protos/cigar.pb.h"
#include "third_party/nucleus/protos/reads.pb.h"
#include "re2/re2.h"

namespace learning {
namespace genomics {
namespace deepvariant {

using nucleus::genomics::v1::CigarUnit;
using nucleus::genomics::v1::LinearAlignment;
using nucleus::genomics::v1::Position;

// TODO Add ChromosomePosition type instead of uint64_t

struct Kmer {
  string sequence;
};

struct ReadId {
  ReadId() : is_set(false), id(0) {}
  explicit ReadId(size_t id) : is_set(true), id(id) {}

  explicit operator int64_t() const { return id; }
  explicit operator uint64_t() const { return id; }

  bool operator<(const ReadId& that) const { return id < that.id; }
  bool operator==(const ReadId& that) const { return id == that.id; }

  bool is_set;
  size_t id;
};

struct KmerOffset {
  KmerOffset() : is_set(false), pos(0) {}
  explicit KmerOffset(size_t pos) : is_set(true), pos(pos) {}

  bool operator==(const KmerOffset& that) const {
    return pos == that.pos && is_set == that.is_set;
  }

  bool is_set;
  size_t pos;
};

// Used in Reads kmer index
struct KmerOccurrence {
  KmerOccurrence() {}
  KmerOccurrence(ReadId read_id, KmerOffset pos)
      : read_id(read_id), read_pos(pos) {}

  bool operator==(const KmerOccurrence& that) const {
    return read_id == that.read_id && read_pos == that.read_pos;
  }

  ReadId read_id;       // ID if the read this Kmer appears in.
  KmerOffset read_pos;  // Position in the read.
};

// Store read alignment to haplotype
// TODO position could be of type KmerOffset, but then it needs to be
// renamed. That would be better because default position = 0 is a valid
// position.
struct ReadAlignment {
  static constexpr uint16_t kNotAligned = std::numeric_limits<uint16_t>::max();
  ReadAlignment() : position(kNotAligned), cigar(""), score(0) {}

  ReadAlignment(uint16_t position_param, absl::string_view cigar_param,
                int score_param)
      : position(position_param), cigar(cigar_param), score(score_param) {}

  bool operator==(const ReadAlignment& that) const {
    return score == that.score && position == that.position &&
           cigar == that.cigar;
  }

  void reset() {
    score = 0;
    position = kNotAligned;
    cigar = "";
  }

  uint16_t position;
  string cigar;
  int score;
};

// Cigar operation is defined by operation type and its length.
struct CigarOp {
  CigarOp()
      : operation(nucleus::genomics::v1::CigarUnit::OPERATION_UNSPECIFIED),
        length(0) {}
  CigarOp(CigarUnit::Operation op, int len) : operation(op), length(len) {}

  bool operator==(const CigarOp& that) const {
    return operation == that.operation && length == that.length;
  }

  CigarUnit::Operation operation;
  int length;
};

// TODO HaplotypeReadsAlignment is not a good name for this structure.
// It contains not only read alignments to haplotype but also haplotype to
// reference alignment.
// Stores alignment scores for all reads to one haplotype.
// haplotype_score = sum of all aligned read scores, where read score
// is a number of matched bases. This way we can order haplotypes by number of
// supporting reads.
struct HaplotypeReadsAlignment {
  HaplotypeReadsAlignment() : haplotype_index(0), haplotype_score(0) {}
  HaplotypeReadsAlignment(
      size_t haplotype_index,
      int score,
      const std::vector<ReadAlignment>& read_alignment_scores)
        : haplotype_index(haplotype_index), haplotype_score(score) {
    this->read_alignment_scores.assign(read_alignment_scores.begin(),
                                       read_alignment_scores.end());
  }

  bool operator==(const HaplotypeReadsAlignment& that) const {
    return haplotype_index == that.haplotype_index &&
           haplotype_score == that.haplotype_score &&
           read_alignment_scores == that.read_alignment_scores &&
           cigar == that.cigar && cigar_ops == that.cigar_ops &&
           is_reference == that.is_reference &&
           hap_to_ref_positions_map == that.hap_to_ref_positions_map;
  }

  bool operator<(const HaplotypeReadsAlignment& that) const {
    return haplotype_score < that.haplotype_score;
  }

  // halpotype index in haplotypes member of FastPassAligner class
  size_t haplotype_index;

  // sum of all aligned read scores. Each read's
  // haplotype_score = number of matched bases.
  int haplotype_score;

  // Simple alignment haplotype_score for each read.
  std::vector<ReadAlignment> read_alignment_scores;

  // Cigar string for haplotype against reference alignment.
  string cigar;

  // Cigar as a list of CigarOp
  std::list<CigarOp> cigar_ops;

  // Hypolotype to reference position.
  uint64_t ref_pos;

  // Map of shifts that allow to easily calculate read to reference position
  // from read to haplotype position. read pos = read_to_haplotype_pos
  //    + haplotype_to_ref_position
  //    + hap_to_ref_positions_map[read_to_haplotype_pos]
  std::vector<int> hap_to_ref_positions_map;

  // If true the haplotype is a reference.
  bool is_reference;
};

// Calculate a shift for each position of a haplotype from a haplotype to
// reference alignment.
void SetPositionsMap(size_t haplotype_size,
                     HaplotypeReadsAlignment* hyplotype_alignment);

void MergeCigarOp(const CigarOp& op, int read_len, std::list<CigarOp>* cigar);

using KmerIndexType =
    absl::flat_hash_map<absl::string_view, std::vector<KmerOccurrence>>;

// Align a set of reads to a target sequence.
// This class is intended for realigning reads to haplotypes (graph paths)
// generated by DeBrujn graph. Since graph's paths are constructed from the same
// reads all "correct" paths should have almost perfect alignment with exception
// of reads with sequencing errors. Alignment is done by using reads kmer index.
// Using the index we can quickly (in O(n)) find all reads overlapping all kmers
// in a given haplotype. Haplotype is quickly discarded If we cannot get a 100%
// coverage using read kmer index. Details of 1st step: For each kmer in
// haplotype:
//  1. Find all reads overlapping given kmer
//  2. Compare each read with target sequence. Keep the read if we can have (x)M
//  alignment, allowing up to 3 mismatches, where
//      x is the length of the read. This operation can be very fast if we
//      encode each base with 2 bits. We may use special instructions that count
//      set bits in one cycle.
//  3. Keep only reads that have perfect alignment.
//  4. Due to sequencing errors some reads may be left unaligned. For those read
//  we can use a SW
//      alignment. Number of such reads should be very low (less than error rate
//      because only structural errors may contribute)
// Next step we need to align haplotype to reference.
// Next step using haplotype to reference alignment we can correct all reads
// alignment for the final result.
class FastPassAligner {
 public:
  void set_reference(const string& reference);
  void set_reads(const std::vector<string>& reads);
  std::vector<string> get_reads() const { return reads_; }
  void set_ref_start(const string& chromosome, uint64_t position);
  void set_haplotypes(const std::vector<string>& haplotypes);
  void set_normalize_reads(bool normalize_reads) {
    normalize_reads_ = normalize_reads;
  }
  uint8_t get_match_score() const { return match_score_; }
  uint8_t get_mismatch_penalty() const { return mismatch_penalty_; }
  void set_options(const AlignerOptions& options);
  void set_is_debug(bool is_debug) { debug_out_ = is_debug; }
  void set_debug_read_id(int read_id) { debug_read_id_ = read_id; }
  void set_ref_prefix_len(int ref_prefix_len) {
    ref_prefix_len_ = ref_prefix_len;
  }
  void set_ref_suffix_len(int ref_suffix_len) {
    ref_suffix_len_ = ref_suffix_len;
  }
  int16_t get_ssw_alignment_score_threshold() const {
    return ssw_alignment_score_threshold_;
  }

  // Align reads to the reference by first aligning reads to haplotypes and
  // then by merging haplotype to reference cigars and reads to haplotype
  // cigars.
  // This function is an entry point for FastPassAligner.
  std::unique_ptr<std::vector<nucleus::genomics::v1::Read>> AlignReads(
      const std::vector<nucleus::genomics::v1::Read>& reads_param);

  // Build K-mer index for all reads.
  void BuildIndex();

  KmerIndexType GetKmerIndex() const { return kmer_index_; }

  // Align all reads to a haplotype using fast pass alignment.
  void FastAlignReadsToHaplotype(
      absl::string_view haplotype, int* haplotype_score,
      std::vector<ReadAlignment>* haplotype_read_alignment_scores);

  // Align reads to haplotypes using SSW library. Only reads that could not
  // be aligned with FastAlignReadsToHaplotype are aligned here.
  // Only alignment with better than score_threshold score are kept.
  void SswAlignReadsToHaplotypes(uint16_t score_threshold);

  // Initialize SSW library.
  void InitSswLib();

  void SswSetReference(const string& reference);

  // Align target sequence to the reference using SSW library. InitSswLib must
  // be called prior to calling SswAlign.
  Alignment SswAlign(const string& target) const;

  // Align all haplotypes to the reference using SSW library.
  void AlignHaplotypesToReference();

  const std::vector<HaplotypeReadsAlignment>& GetReadToHaplotypeAlignments()
      const {
    return read_to_haplotype_alignments_;
  }

  // Finds the best alignment by iterating all haplotype alignments.
  // TODO Can be optimized if we store the best alignment with the read.
  bool GetBestReadAlignment(size_t readId, int* best_hap_index) const;

  // Calculate a read alignment by merging read to haplotype and haplotype to
  // reference alignments.
  // 1. Extracts a portion of haplotype to reference cigar that overlaps
  //    positions of read to haplotype alignment.
  // 2. Iterates through each cigar operation for both alignments, merging 2
  //    operations of length 1 at a time.
  // 3. Different logic is implemented for each type of merges: =:=, DEL:=,
  //    =:DEL, INS:=, =:INS, DEL:DEL, INS:INS, DEL:INS
  void CalculateReadToRefAlignment(
      size_t read_index,
      const ReadAlignment& read_to_haplotype_alignment,
      const std::list<CigarOp>& haplotype_to_ref_cigar_ops_input,
      std::list<CigarOp>* read_to_ref_cigar_ops) const;

  bool IsAlignmentNormalized(
    const std::list<CigarOp>& cigar,
    int ref_offset,
    absl::string_view read_sequence) const;

  // Replace each read alignment with the better one (if available), otherwise
  // an original alignment is preserved.
  // realigned_reads param is eventually passed to Python wrapped in unique_ptr
  // as per clif requirements.
  void RealignReadsToReference(
      const std::vector<nucleus::genomics::v1::Read>& reads,
      std::unique_ptr<std::vector<nucleus::genomics::v1::Read>>*
          realigned_reads);

  void CalculateSswAlignmentScoreThreshold();

 private:
  // Reference sequence for the window
  string reference_;

  // Chromosome name for the window start position
  string region_chromosome_;

  // Position in chromosome for the window start position
  uint64_t region_position_in_chr_;

  // Vector of haploptypes sequences
  std::vector<string> haplotypes_;

  std::vector<HaplotypeReadsAlignment> read_to_haplotype_alignments_;

  // index of reads. Allows to find all reads that contain a given k-mer
  // and their align position.
  KmerIndexType kmer_index_;

  // Vector of reads that need to be realigned
  std::vector<string> reads_;

  // K-mer size that is used for indexing input reads
  int kmer_size_ = 32;

  // Expected read length. It is needed for sanity checking. Actual reads may
  // be different sizes. Although, actual read sizes should be close to
  // read_size.
  int read_size_;

  int max_num_of_mismatches_ = 2;

  // Only read alignments with the score greater than
  // ssw_alignment_score_threshold_ are kept. ssw_alignment_score_threshold_
  // is calculated from similarity_threshold_ and read_size.
  // Most of the reads should almost perfectly align to haplotypes. Read may
  // not align to haplotype perfectly if:
  //  - there is a sequencing error;
  //  - read does not really come from this region;
  //  - haplotype does not represent a real target genome sequence.
  int16_t ssw_alignment_score_threshold_;

  // Four parameters below specify alignment scoring schema.
  uint8_t match_score_ = 4;
  uint8_t mismatch_penalty_ = 6;
  uint8_t gap_opening_penalty_ = 8;
  uint8_t gap_extending_penalty_ = 1;
  bool force_alignment_ = false;

  // Threshold is calculated from this flag using the following formula.
  // score_threshold = match_score_ * read_size_ * <similarity_threshold_>
  //    - mismatch_penalty_ * read_size_ * (1 - <similarity_threshold_ >);
  // This threshold is used for alignments performed with SSW.
  double similarity_threshold_ = 0.85;

  // SSW aligner instance
  std::unique_ptr<Aligner> ssw_aligner_;

  // These attributes allow debug output.
  bool debug_out_ = false;
  int debug_read_id_ = 0;

  int ref_prefix_len_;
  int ref_suffix_len_;

  // Set to the same value of --normalize_reads flag from make_examples.
  bool normalize_reads_ = false;

  // Alingn reads to haplotypes by simply comparing strings. This way we will
  // be able align all the reads that are aligned to haplotypes w/o indels.
  void FastAlignReadsToHaplotypes();

  void AddReadToIndex(absl::string_view read, ReadId read_id);

  void AddKmerToIndex(absl::string_view kmer, ReadId read_id,
                      KmerOffset pos);

  int FastAlignStrings(absl::string_view s1, absl::string_view s2,
                       int max_mismatches, int* num_of_mismatches) const;

  void UpdateBestHaplotypes(
      size_t haplotype_index, int haplotype_score,
      const std::vector<ReadAlignment>& current_read_scores);

  // Update position map for each haplotype. Position map stores shifts for
  // each position of a haplotype in respect to haplotype to reference
  // alignment. This map helps to quickly calculate reference position for
  // a read from a read to haplotype alignment.
  void CalculatePositionMaps();
};

}  // namespace deepvariant
}  // namespace genomics
}  // namespace learning

#endif  // LEARNING_GENOMICS_DEEPVARIANT_REALIGNER_FAST_PASS_ALIGNER_H_