deepvariant-r1.6.1 / Git / [9b26b7] /deepvariant/realigner/fast_pass

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / deepvariant / realigner / fast_pass_aligner.cc
History
Download this file
971 lines (888 with data), 36.8 kB

/*
 * Copyright 2018 Google LLC.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "deepvariant/realigner/fast_pass_aligner.h"

#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/memory/memory.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "third_party/nucleus/protos/cigar.pb.h"
#include "third_party/nucleus/protos/position.pb.h"
#include "re2/re2.h"

namespace learning {
namespace genomics {
namespace deepvariant {

void FastPassAligner::set_reference(const string& reference) {
  this->reference_ = reference;
}

void FastPassAligner::set_reads(const std::vector<string>& reads) {
  this->reads_ = reads;
}

void FastPassAligner::set_ref_start(const string& chromosome,
                                    uint64_t position) {
  this->region_chromosome_ = chromosome;
  this->region_position_in_chr_ = position;
}

void FastPassAligner::set_haplotypes(const std::vector<string>& haplotypes) {
  this->haplotypes_ = haplotypes;
}

void FastPassAligner::set_options(const AlignerOptions& options) {
  // There is no is_set method in proto so we assume that value is set if it is
  // not zero.
  if (options.kmer_size() > 0) {
    this->kmer_size_ = options.kmer_size();
  }
  if (options.read_size() > 0) {
    this->read_size_ = options.read_size();
  }
  if (options.max_num_of_mismatches() > 0) {
    this->max_num_of_mismatches_ = options.max_num_of_mismatches();
  }
  if (options.realignment_similarity_threshold() > 0.0) {
    this->similarity_threshold_ = options.realignment_similarity_threshold();
  }
  if (options.match() > 0) {
    this->match_score_ = options.match();
  }
  if (options.mismatch() > 0) {
    this->mismatch_penalty_ = options.mismatch();
  }
  if (options.gap_open() > 0) {
    this->gap_opening_penalty_ = options.gap_open();
  }
  if (options.gap_extend() > 0) {
    this->gap_extending_penalty_ = options.gap_extend();
  }
  this->force_alignment_ = options.force_alignment();

  CHECK(kmer_size_ >= 3 && kmer_size_ <= 32);
  CHECK_GE(similarity_threshold_, 0.0);
  CHECK_LE(similarity_threshold_, 1.0);
  CHECK_GE(max_num_of_mismatches_, 0);
  CHECK(kmer_size_ >= 3 && kmer_size_ <= 32);
}

void FastPassAligner::CalculateSswAlignmentScoreThreshold() {
  ssw_alignment_score_threshold_ = match_score_
      * read_size_
      * similarity_threshold_
          - mismatch_penalty_
      * read_size_
      * (1 - similarity_threshold_);
  if (ssw_alignment_score_threshold_ < 0) {
    ssw_alignment_score_threshold_ = 1;
  }
}

// Fast align reads to haplotypes using reads index.
// Align reads that could not be aligned in the first step using ssw aligner.
// Keep the best alignment for each read, or preserve an original one if read
// could not be realigned with a high enough score.
std::unique_ptr<std::vector<nucleus::genomics::v1::Read>>
FastPassAligner::AlignReads(
    const std::vector<nucleus::genomics::v1::Read>& reads_param) {

  // Copy reads
  for (const auto& read : reads_param) {
    reads_.push_back(absl::AsciiStrToUpper(read.aligned_sequence()));
  }

  CalculateSswAlignmentScoreThreshold();

  // Build index
  BuildIndex();

  // Align reads to haplotypes using reads index. This is O(n) operation per
  // read, where n = read size.
  FastAlignReadsToHaplotypes();

  // Initialize ssw library. Set reference.
  InitSswLib();

  // Align haplotypes to the reference.
  AlignHaplotypesToReference();

  // calculate position shifts.
  CalculatePositionMaps();

  // Align reads that couldn't be aligned in FastAlignReadsToHaplotypes using
  // ssw library.
  SswAlignReadsToHaplotypes(ssw_alignment_score_threshold_);

  // Sort haplotypes by number of supporting reads. First haplotype is the one
  // that has fewer supporting reads.
  std::sort(read_to_haplotype_alignments_.begin(),
            read_to_haplotype_alignments_.end());

  // Realign reads that we could successfully realign in previous steps back to
  // reference. From all read to haplotype alignments the best one is picked.
  // In the case where read alignments are equally good to ref haplotype and
  // non-ref haplotype, a non-ref haplotype is preferred.
  std::unique_ptr<std::vector<nucleus::genomics::v1::Read>> realigned_reads(
      new std::vector<nucleus::genomics::v1::Read>());
  RealignReadsToReference(reads_param, &realigned_reads);

  return realigned_reads;
}

void FastPassAligner::InitSswLib() {
  // Initialize ssw library. Set reference.
  Filter filter;
  ssw_aligner_ =
      std::make_unique<Aligner>(match_score_, mismatch_penalty_,
                                gap_opening_penalty_, gap_extending_penalty_);
}

void FastPassAligner::SswSetReference(const string& reference) {
  CHECK(ssw_aligner_);
  ssw_aligner_->SetReferenceSequence(reference);
}

Alignment FastPassAligner::SswAlign(const string& target) const {
  CHECK(ssw_aligner_);
  Filter filter;
  Alignment alignment;
  if (ssw_aligner_->Align(target, filter, &alignment) == 0) {
    return alignment;
  } else {
    LOG(WARNING) << "SSW alignment failed for query: '" << target << "'";
    return Alignment();
  }
}

// For each haplotype try to find all reads that can be aligned using index.
void FastPassAligner::FastAlignReadsToHaplotypes() {
  std::vector<ReadAlignment> read_alignment_scores(reads_.size());
  for (int i = 0; i < haplotypes_.size(); i++) {
    const auto& haplotype = haplotypes_[i];
    int haplotype_score = 0;
    for (auto& readAlignment : read_alignment_scores) {
      readAlignment.reset();
    }
    FastAlignReadsToHaplotype(haplotype,
                              &haplotype_score,
                              &read_alignment_scores);

    // haplotype_score == 0 means we found a problem with this haplotype. In
    // this case we need to discard of this haplotype.
    if (haplotype_score == 0) {
      for (auto& readAlignment : read_alignment_scores) {
        readAlignment.reset();
      }
    }

    read_to_haplotype_alignments_.push_back(
        HaplotypeReadsAlignment(i, haplotype_score, read_alignment_scores));
  }
}

void FastPassAligner::FastAlignReadsToHaplotype(
    absl::string_view haplotype, int* haplotype_score,
    std::vector<ReadAlignment>* haplotype_read_alignment_scores) {
  CHECK(haplotype_score != nullptr);
  CHECK(haplotype_read_alignment_scores != nullptr);

  bool is_ref = (haplotype == reference_);
  std::vector<int> coverage(haplotype.size(), 0);
  // In the loop we try to align reads for each position in haplotype up to
  // lastPos.
  const auto& lastPos = haplotype.length() - kmer_size_;
  for (int i = 0; i <= lastPos; i++) {
    // get all reads that are aligned against i-th position
    auto index_it = kmer_index_.find(haplotype.substr(i, kmer_size_));
    if (index_it == kmer_index_.end()) {
      continue;
    }
    // Iterate through all the reads that are found in the index for the current
    // kmer.
    for (const auto& it : index_it->second) {
      uint64_t read_id_index = static_cast<uint64_t>(it.read_id);
      CHECK(read_id_index < reads_.size() && it.read_id.is_set);
      size_t target_start_pos = std::max(
          static_cast<int64_t>(0),
          static_cast<int64_t>(i) - static_cast<int64_t>(it.read_pos.pos));
      size_t cur_read_size = reads_[read_id_index].size();
      size_t span = cur_read_size;
      if (target_start_pos + cur_read_size > haplotype.length()) {
        continue;
      }
      auto& read_alignment =
          (*haplotype_read_alignment_scores)[read_id_index];

      // This read is already aligned, skip it.
      if (read_alignment.position != ReadAlignment::kNotAligned &&
          read_alignment.position == target_start_pos) {
        continue;
      }
      CHECK(target_start_pos + span <= haplotype.size());
      int num_of_mismatches = 0;
      int new_read_alignment_score = FastAlignStrings(
          haplotype.substr(target_start_pos, span),
          reads_[read_id_index],
          max_num_of_mismatches_ + 1, &num_of_mismatches);

      if (num_of_mismatches <= max_num_of_mismatches_) {
        CHECK(it.read_id.is_set &&
            read_id_index < haplotype_read_alignment_scores->size());
        int oldScore = read_alignment.score;

        for (auto pos = target_start_pos; pos < target_start_pos + span;
             pos++) {
          coverage[pos]++;
        }

        if (oldScore < new_read_alignment_score) {
          read_alignment.score = new_read_alignment_score;
          *haplotype_score -= oldScore;
          *haplotype_score += read_alignment.score;
          read_alignment.position = target_start_pos;
          read_alignment.cigar = std::to_string(cur_read_size) + "=";
        }
      }
    }  // for (matching reads)
    // We want to discard haplotypes that don't have good read support.
    // At the same time we don't want to discard reference haplotype because,
    // there might be cases were not all haplotypes were generated and we can
    // get away with that by aligning reads to reference haplotype.
    if (coverage[i] == 0 && i >= ref_prefix_len_ &&
        i < haplotype.size() - ref_suffix_len_ && !is_ref) {
      *haplotype_score = 0;
      return;
    }
  }    // for (all k-mer positions)
}

// Align 2 same length strings by comparing each character.
int FastPassAligner::FastAlignStrings(absl::string_view s1,
                                      absl::string_view s2,
                                      int max_mismatches,
                                      int* num_of_mismatches) const {
  int num_of_matches = 0;
  *num_of_mismatches = 0;
  CHECK(s1.size() == s2.size());
  for (int i = 0; i < s1.size(); i++) {
    const auto& c1 = s1[i];
    const auto& c2 = s2[i];
    if (c1 != c2 && (c1 != 'N' && c2 != 'N')) {
      if (c1 != c2) {
        (*num_of_mismatches)++;
      }
      if (*num_of_mismatches == max_mismatches) {
        return 0;
      }
    } else {
      num_of_matches++;
    }
  }
  return num_of_matches * match_score_ - *num_of_mismatches * mismatch_penalty_;
}

CigarUnit::Operation CigarOperationFromChar(char op) {
  switch (op) {
    case '=':
    case 'X':
      return nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH;
    case 'S':
      return nucleus::genomics::v1::CigarUnit::CLIP_SOFT;
    case 'D':
      return nucleus::genomics::v1::CigarUnit::DELETE;
    case 'I':
      return nucleus::genomics::v1::CigarUnit::INSERT;
    default:
      return nucleus::genomics::v1::CigarUnit::OPERATION_UNSPECIFIED;
  }
}

std::list<CigarOp> CigarStringToVector(absl::string_view cigar) {
  std::list<CigarOp> cigarOps;
  absl::string_view input(cigar);
  RE2 pattern("(\\d+)([XIDS=])");
  int opLen;
  string opType;
  while (RE2::Consume(&input, pattern, &opLen, &opType)) {
    CHECK_EQ(opType.length(), 1);
    CigarUnit::Operation op = CigarOperationFromChar(opType[0]);
    cigarOps.push_back(CigarOp(op, opLen));
  }
  return cigarOps;
}

inline bool AlignmentIsRef(absl::string_view cigar, size_t target_len) {
  return cigar == absl::StrCat(target_len, "=");
}

// Align haplotypes to reference using ssw library.
void FastPassAligner::AlignHaplotypesToReference() {
  SswSetReference(reference_);

  // Initialize read_to_haplotype_alignments_ if it is not initialized yet.
  if (read_to_haplotype_alignments_.empty()) {
    for (int i = 0; i < haplotypes_.size(); i++) {
      read_to_haplotype_alignments_.push_back(HaplotypeReadsAlignment(
          i, -1, std::vector<ReadAlignment>(reads_.size())));
    }
  }

  for (auto& haplotype_alignment : read_to_haplotype_alignments_) {
    Filter filter;
    CHECK(haplotype_alignment.haplotype_index < haplotypes_.size());
    auto hap_len = haplotypes_[haplotype_alignment.haplotype_index].size();
    // Most of the time, one haplotype will perfectly match the reference.
    if (haplotypes_[haplotype_alignment.haplotype_index] == reference_) {
      haplotype_alignment.is_reference = true;
      haplotype_alignment.cigar = absl::StrCat(hap_len, "=");
      haplotype_alignment.cigar_ops =
          CigarStringToVector(haplotype_alignment.cigar);
      haplotype_alignment.ref_pos = 0;
    } else {
      Alignment alignment =
          SswAlign(haplotypes_[haplotype_alignment.haplotype_index]);
      if (alignment.sw_score > 0) {
        // In rare cases, the ref haplotype will be a substring of the ref, and
        // therefore not caught by the string equality check above.
        haplotype_alignment.is_reference =
            AlignmentIsRef(alignment.cigar_string, hap_len);

        haplotype_alignment.cigar = alignment.cigar_string;
        haplotype_alignment.cigar_ops =
            CigarStringToVector(haplotype_alignment.cigar);
        haplotype_alignment.ref_pos = alignment.ref_begin;
      }
    }
  }
}

void FastPassAligner::SswAlignReadsToHaplotypes(uint16_t score_threshold) {
  // For each read
  for (int i = 0; i < reads_.size(); i++) {
    bool has_at_least_one_alignment = false;
    // Check if this read is aligned to at least one haplotype
    for (const auto& hap_alignment : read_to_haplotype_alignments_) {
      if (hap_alignment.read_alignment_scores[i].score > 0) {
        has_at_least_one_alignment = true;
        break;
      }
    }
    // If this read is not aligned to any of the haplotypes we try SSW.
    if (!has_at_least_one_alignment) {
      for (auto& hap_alignment : read_to_haplotype_alignments_) {
        // Skip haplotypes with no read support (score=0), except if
        // force_alignment, then compute an alignment against the reference no
        // matter what.
        if (hap_alignment.haplotype_score == 0 &&
            !(force_alignment_ && hap_alignment.is_reference)) {
          continue;
        }
        CHECK(hap_alignment.haplotype_index < haplotypes_.size());
        SswSetReference(haplotypes_[hap_alignment.haplotype_index]);
        Alignment alignment = SswAlign(reads_[i]);
        if (alignment.sw_score > 0) {
          // TODO Remove score_threshold condition. It is effectively
          // not used.
          if (alignment.sw_score >= score_threshold ||
              (force_alignment_ && hap_alignment.is_reference)) {
            hap_alignment.read_alignment_scores[i].score = alignment.sw_score;
            hap_alignment.read_alignment_scores[i].cigar =
                alignment.cigar_string;
            hap_alignment.read_alignment_scores[i].position =
                alignment.ref_begin;
          }
        } else if (force_alignment_ && hap_alignment.is_reference) {
        }
      }
    }
  }  // for all reads
}

// Each operation except MATCH is checked in the input cigar.
// If reference base preceding operation is the same as the last base of the
// operation alt bases then alignment is not normalized.
// Example:
// Reference: GATCGAGAGAGAGATC
// Read.    : GATCGA--GAGAGATC
// This alignment is not normalized because last base in deletion 'GA' is the
// same as the base preceding the deletion in the read.
// The reason we sometimes see these non-normalized alignments is because reads
// are aligned to the haplotype first. Continuing the example above, the
// haplotype could be the following:
// Reference: GATCGAGAGAGAGATC
// Haplotype: GATCGT--GAGAGATC
// Read.    : GATCGA--GAGAGATC
// In this case haplotype to the reference alignment is normalized (cannot be
// shifted left), read to haplotype alignment is also normalized, but read to
// reference alignment is not normalized.
// Normalizing the alignment could be done but it is not easy. If we shift
// cigar operation to the left we may need to deal with merging it with other
// operations that may exist.
bool FastPassAligner::IsAlignmentNormalized(
    const std::list<CigarOp>& cigar,
    int ref_offset,
    const absl::string_view read_sequence) const {
  if (ref_offset < 0) {
    return true;
  }
  int cur_ref_offset = ref_offset;
  int cur_read_offset = 0;
  for (const auto& op : cigar) {
    if (op.operation == nucleus::genomics::v1::CigarUnit::CLIP_SOFT) {
      cur_read_offset += op.length;
      continue;
    }
    if (op.operation != nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH) {
      std::string op_sequence;
      if  (op.operation == nucleus::genomics::v1::CigarUnit::DELETE) {
        if (cur_ref_offset + op.length > reference_.size()) {
          return false;
        }
        CHECK(cur_ref_offset + op.length <= reference_.size());
        op_sequence = reference_.substr(cur_ref_offset, op.length);
      } else {
        CHECK(cur_read_offset + op.length <= read_sequence.size());
        op_sequence =  read_sequence.substr(cur_read_offset, op.length);
      }
      if ((op.operation == nucleus::genomics::v1::CigarUnit::INSERT
          && op_sequence.back() == reference_[cur_ref_offset - 1]) ||
         (cur_read_offset > 0 &&
          op.operation == nucleus::genomics::v1::CigarUnit::DELETE
          && op_sequence.back() == read_sequence[cur_read_offset - 1])) {
        return false;
      }
    }
    if (op.operation != nucleus::genomics::v1::CigarUnit::INSERT) {
      cur_ref_offset += op.length;
    }
    if (op.operation != nucleus::genomics::v1::CigarUnit::DELETE) {
      cur_read_offset += op.length;
    }
  }
  return true;
}

void FastPassAligner::RealignReadsToReference(
    const std::vector<nucleus::genomics::v1::Read>& reads,
    std::unique_ptr<std::vector<nucleus::genomics::v1::Read>>*
        realigned_reads) {
  // Loop through all reads
  for (size_t read_index = 0; read_index < reads.size(); read_index++) {
    const nucleus::genomics::v1::Read& read = reads[read_index];
    nucleus::genomics::v1::Read realigned_read;
    realigned_read.MergeFrom(read);
    int best_hap_index = -1;
    // See if we have a better alignment
    if (GetBestReadAlignment(read_index, &best_hap_index)) {
      const HaplotypeReadsAlignment& bestHaplotypeAlignments =
          read_to_haplotype_alignments_[best_hap_index];
      std::unique_ptr<LinearAlignment> new_alignment(new LinearAlignment());
      new_alignment->MergeFrom(read.alignment());
      new_alignment->clear_cigar();
      // Calculate new alignment position.
      std::unique_ptr<nucleus::genomics::v1::Position> new_position(
          new nucleus::genomics::v1::Position());
      new_position->MergeFrom(read.alignment().position());
      auto read_to_hap_pos = bestHaplotypeAlignments
          .read_alignment_scores[read_index]
          .position;
      CHECK(read_to_hap_pos < bestHaplotypeAlignments
          .hap_to_ref_positions_map.size());
      int hap_to_ref_position = bestHaplotypeAlignments
          .hap_to_ref_positions_map[read_to_hap_pos];
      // We only change position of original read alignment and don't change
      // chromosome, it shouldn't change anyway!
      new_position->set_position(
          region_position_in_chr_
              + bestHaplotypeAlignments.ref_pos
              + read_to_hap_pos
              + hap_to_ref_position);
      new_alignment->set_allocated_position(new_position.release());
      std::list<CigarOp> readToRefCigarOps;
      // Calculate new cigar by merging read to haplotype and haplotype to ref
      // alignments.
      CalculateReadToRefAlignment(
          read_index, bestHaplotypeAlignments.read_alignment_scores[read_index],
          bestHaplotypeAlignments.cigar_ops, &readToRefCigarOps);

      // The following block is only executed if normalize_reads flag is not
      // set. This is because if --normalize_reads is true, they will be
      // normalize later on.
      if (!normalize_reads_) {
        // If read is not normalized (any cigar operation can be shifter left)
        // we discard the alignment.
        if (!IsAlignmentNormalized(
            readToRefCigarOps,
            bestHaplotypeAlignments.ref_pos
                + read_to_hap_pos
                + hap_to_ref_position,
            reads_[read_index])) {
            readToRefCigarOps.clear();
        }
      }

      for (auto& op : readToRefCigarOps) {
        CigarUnit* cu = new_alignment->add_cigar();
        cu->set_operation(op.operation);
        cu->set_operation_length(op.length);
      }

      if (!readToRefCigarOps.empty()) {
        realigned_read.set_allocated_alignment(new_alignment.release());
      } else if (force_alignment_) {
      }
      (*realigned_reads)->push_back(realigned_read);
    } else {  // Could not find a new alignment.
      if (force_alignment_) {
      } else {
        // Keeping original alignment (force_alignment is off).
        (*realigned_reads)->push_back(realigned_read);
      }
    }
  }  // for
}

void FastPassAligner::AddKmerToIndex(absl::string_view kmer,
                                     ReadId read_id, KmerOffset pos) {
  kmer_index_[kmer].push_back(KmerOccurrence(read_id, pos));
}

void FastPassAligner::AddReadToIndex(absl::string_view read, ReadId read_id) {
  // Ignoring reads that are too short for a kmer size. Those reads will still
  // be realigned with SSW.
  if (read.length() <= kmer_size_) {
    return;
  }
  auto last_pos = read.length() - kmer_size_;
  for (int i = 0; i <= last_pos; i++) {
    AddKmerToIndex(read.substr(i, kmer_size_), read_id, KmerOffset(i));
  }
}

void FastPassAligner::BuildIndex() {
  size_t read_id = 0;
  for (const auto& read : reads_) {
    AddReadToIndex(read, ReadId(read_id++));
  }
}

void SetPositionsMap(size_t haplotype_size,
                     HaplotypeReadsAlignment* hyplotype_alignment) {
  std::vector<int>& positions_map =
      hyplotype_alignment->hap_to_ref_positions_map;
  positions_map.resize(haplotype_size);
  RE2 pattern("(\\d+)([XIDS=])");  // matches cigar operation
  absl::string_view input(hyplotype_alignment->cigar);
  int cur_shift = 0;
  int haplotype_pos = 0;
  int last_pos = 0;
  int operation_len;
  string operation_type;
  while (RE2::Consume(&input, pattern, &operation_len, &operation_type)) {
    CHECK_EQ(operation_type.length(), 1);

    char op = operation_type[0];
    switch (op) {
      case '=':
      case 'X':
        last_pos = haplotype_pos + operation_len;
        while (haplotype_pos != last_pos) {
          positions_map[haplotype_pos] = cur_shift;
          haplotype_pos++;
        }
        break;
      case 'S':
        last_pos = haplotype_pos + operation_len;
        cur_shift -= operation_len;
        while (haplotype_pos != last_pos) {
          positions_map[haplotype_pos] = cur_shift;
          haplotype_pos++;
        }
        break;
      case 'D':
        cur_shift += operation_len;
        break;
      case 'I':
        last_pos = haplotype_pos + operation_len;
        while (haplotype_pos != last_pos) {
          positions_map[haplotype_pos] = cur_shift;
          cur_shift--;
          haplotype_pos++;
        }
        break;
    }
  }
}

void FastPassAligner::CalculatePositionMaps() {
  for (auto& hyplotype_alignment : read_to_haplotype_alignments_) {
    SetPositionsMap(haplotypes_[hyplotype_alignment.haplotype_index].size(),
                    &hyplotype_alignment);
  }
}

bool FastPassAligner::GetBestReadAlignment(
    size_t readId,
    int* best_hap_index) const {
  int best_score = 0;
  bool best_haplotype_found = false;
  for (int hap_index = 0; hap_index < haplotypes_.size(); hap_index++) {
    int hap_score = read_to_haplotype_alignments_[hap_index]
                        .read_alignment_scores[readId]
                        .score;
    // If compared scores are equal, preference is given to a read alignment
    // against a non-reference haplotype.
    if (hap_score > best_score ||
        (best_score > 0 && hap_score == best_score &&
         !read_to_haplotype_alignments_[hap_index].is_reference)) {
      best_score = read_to_haplotype_alignments_[hap_index]
                       .read_alignment_scores[readId]
                       .score;
      *best_hap_index = hap_index;
      best_haplotype_found = true;
    }
  }
  return best_haplotype_found;
}

// Calculate aligned length from cigar.
int AlignedLength(const std::list<CigarOp>& cigar) {
  int len = 0;
  for (auto& op : cigar) {
    if (op.operation != nucleus::genomics::v1::CigarUnit::DELETE) {
      len += op.length;
    }
  }
  return len;
}

// Merge cigar op to the end of the output cigar.
// <op> input parameter is always a one base operation to be merged.
// <cigar> input/output parameter is our merged output cigar.
// - If op.operation is the same as the last one in the cigar then the length
//   of the last operation is increased by op.length.
// - If op.operation is not the same then new operation is added to the cigar.
// For all operations except DELETE we need to make sure that aligned length
// does not go over the read length. This can happen for example when we merge
// a large INS (larger than a read itself)
void MergeCigarOp(const CigarOp& op, int read_len, std::list<CigarOp>* cigar) {
  const auto& last_cigar_op =
      cigar->empty() ? nucleus::genomics::v1::CigarUnit::OPERATION_UNSPECIFIED
                     : cigar->back().operation;
  int aligned_length_before_merge = AlignedLength(*cigar);
  int new_op_length = 0;
  if (op.operation != nucleus::genomics::v1::CigarUnit::DELETE) {
    new_op_length = std::min(op.length, read_len - aligned_length_before_merge);
  } else {
    new_op_length = op.length;
  }

  // Nothing is merged if we already aligned all positions of the read.
  if (new_op_length <= 0 || aligned_length_before_merge == read_len) {
    return;
  }

  // Special processing is used when merging INS and DEL. Going one base at a
  // time INS and DEL cancel each other. In addition we need to add new
  // match operations in place of removed INS and DEL. New Match operation is
  // added right before the trailing INDEL operation.
  if ((op.operation == nucleus::genomics::v1::CigarUnit::INSERT
      && last_cigar_op == nucleus::genomics::v1::CigarUnit::DELETE)
      ||
     (op.operation == nucleus::genomics::v1::CigarUnit::DELETE
     && last_cigar_op == nucleus::genomics::v1::CigarUnit::INSERT)) {
    // Determine the last element in the current cigar list.
    std::list<CigarOp>::iterator last_element = cigar->end();
    if (!cigar->empty()) {
      last_element--;
    }

    // Determine one before last element in cigar list.
    std::list<CigarOp>::iterator one_before_last = cigar->end();
    if (cigar->size() > 1) {
      one_before_last = std::prev(one_before_last, 2);
    } else {
      one_before_last = std::prev(one_before_last, 1);
    }
    if (one_before_last->operation !=
        nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH) {
      cigar->insert(last_element,
          CigarOp(nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH, 1));
    } else {
      one_before_last->length += 1;
    }

    // Reduce the size of the last operation. If last operation's length was 1
    // then it is removed.
    if (cigar->back().length == 1) {
      cigar->pop_back();
    } else {
      cigar->back().length -= 1;
    }
  } else if (op.operation == last_cigar_op) {
    cigar->back().length += new_op_length;

    // If the op we are adding is not the same as the last, set the proper
    // length and add it to the list.
  }  else {
    cigar->push_back(CigarOp(op.operation, new_op_length));
  }
}

// Following functions are for internal use only.
namespace {

std::list<CigarOp> LeftTrimHaplotypeToRefAlignment(
    const std::list<CigarOp>& haplotype_to_ref_cigar_ops_input,
    int read_to_haplotype_pos) {
  int cur_pos = 0;
  std::list<CigarOp> haplotype_to_ref_cigar_ops(
      haplotype_to_ref_cigar_ops_input);
  while (cur_pos != read_to_haplotype_pos) {
    CHECK(!haplotype_to_ref_cigar_ops.empty());
    CigarOp cur_hap_op = haplotype_to_ref_cigar_ops.front();
    haplotype_to_ref_cigar_ops.pop_front();
    if (cur_hap_op.operation ==
            nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH ||
        cur_hap_op.operation == nucleus::genomics::v1::CigarUnit::CLIP_HARD ||
        cur_hap_op.operation == nucleus::genomics::v1::CigarUnit::CLIP_SOFT ||
        cur_hap_op.operation == nucleus::genomics::v1::CigarUnit::INSERT) {
      if (cur_hap_op.length + cur_pos > read_to_haplotype_pos) {
        haplotype_to_ref_cigar_ops.push_front(
            CigarOp(cur_hap_op.operation,
                    cur_hap_op.length - (read_to_haplotype_pos - cur_pos)));
      }
      cur_pos = std::min(cur_hap_op.length + cur_pos, read_to_haplotype_pos);
    }
  }

  // If after trimming the first operation is DEL we need to remove it,
  // because read alignment cannot start with DEL.
  if (haplotype_to_ref_cigar_ops.front().operation ==
      nucleus::genomics::v1::CigarUnit::DELETE) {
    haplotype_to_ref_cigar_ops.pop_front();
  }

  return haplotype_to_ref_cigar_ops;
}

// Merging one base operations.
// This function handles all possible combinations of one base merges except
// INS+DEL and DEL+INS. Below is the list of all possible combinations of
// operations and how they are resolved. Note, that all mergings below are
// symmetrical (DEL + MATCH is the same as MATCH + DEL).
// DEL + MATCH = DEL
// INS + MATCH = INS
// DEL + DEL = DEL
// INS + INS = INS
// CLIP_SOFT + MATCH = CLIP_SOFT
// CLIP_SOFT + CLIP_SOFT = CLIP_SOFT
// MATCH + MATCH = MATCH
// INS + DEL = Exception!
// DEL + INS = Exception!
inline void MergeOneBaseOperations(
    const CigarOp& cur_read_to_hap_op,
    const CigarOp& cur_hap_to_ref_op,
    int read_len,
    std::list<CigarOp>* read_to_ref_cigar_ops) {
    // Assert that operations are not INS and DEL
    std::vector<CigarUnit::Operation> input_operations{
      cur_read_to_hap_op.operation, cur_hap_to_ref_op.operation};
    CHECK((input_operations != std::vector<CigarUnit::Operation>{
      nucleus::genomics::v1::CigarUnit::DELETE,
      nucleus::genomics::v1::CigarUnit::INSERT}) &&
         (input_operations != std::vector<CigarUnit::Operation>{
      nucleus::genomics::v1::CigarUnit::INSERT,
      nucleus::genomics::v1::CigarUnit::DELETE}));

    std::vector<CigarUnit::Operation> operations{
        nucleus::genomics::v1::CigarUnit::CLIP_SOFT,
        nucleus::genomics::v1::CigarUnit::DELETE,
        nucleus::genomics::v1::CigarUnit::INSERT,
        nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH
        };
    for (auto op : operations) {
      if (cur_read_to_hap_op.operation == op
          || cur_hap_to_ref_op.operation == op) {
        MergeCigarOp(CigarOp(op, 1), read_len, read_to_ref_cigar_ops);
        break;
      }
    }
}

}  // namespace

void FastPassAligner::CalculateReadToRefAlignment(
    size_t read_index,
    const ReadAlignment& read_to_haplotype_alignment,
    const std::list<CigarOp>& haplotype_to_ref_cigar_ops_input,
    std::list<CigarOp>* read_to_ref_cigar_ops) const {
  CHECK(read_index < reads_.size());
  int read_len = reads_[read_index].length();
  int read_to_haplotype_pos = read_to_haplotype_alignment.position;
  std::list<CigarOp> read_to_haplotype_cigar_ops =
      CigarStringToVector(read_to_haplotype_alignment.cigar);

  // Left trim haplotype to reference cigar to match read to haplotype
  // alignment position.
  std::list<CigarOp> haplotype_to_ref_cigar_ops =
      LeftTrimHaplotypeToRefAlignment(haplotype_to_ref_cigar_ops_input,
                                      read_to_haplotype_pos);

  // Sanity check. By design haplotype is built from reads. Therefore it should
  // be impossible that read does not overlap with haplotype.
  CHECK(!haplotype_to_ref_cigar_ops.empty());

  // Skip heading soft clips.
  if (!read_to_haplotype_cigar_ops.empty() &&
      read_to_haplotype_cigar_ops.front().operation ==
          nucleus::genomics::v1::CigarUnit::CLIP_SOFT) {
    MergeCigarOp(CigarOp(nucleus::genomics::v1::CigarUnit::CLIP_SOFT,
                         read_to_haplotype_cigar_ops.front().length),
                 read_len, read_to_ref_cigar_ops);
    read_to_haplotype_cigar_ops.pop_front();
  }

  CigarOp cur_read_to_hap_op(
      nucleus::genomics::v1::CigarUnit::OPERATION_UNSPECIFIED, 0);
  CigarOp cur_hap_to_ref_op(
      nucleus::genomics::v1::CigarUnit::OPERATION_UNSPECIFIED, 0);

  // Build read to reference cigar by iterating CigarOp overlaps.
  while ((!read_to_haplotype_cigar_ops.empty() ||
          !haplotype_to_ref_cigar_ops.empty()) &&
         AlignedLength(*read_to_ref_cigar_ops) < read_len) {
    // TODO Need to verify this logic.
    // This can happen if read was aligned to hyplotype partially. In this case
    // The tail (or head) of read to haplotype alignment would be soft-clipped.
    if (!read_to_haplotype_cigar_ops.empty() &&
        haplotype_to_ref_cigar_ops.empty() && cur_hap_to_ref_op.length == 0) {
      MergeCigarOp(read_to_haplotype_cigar_ops.front(), read_len,
                   read_to_ref_cigar_ops);
      read_to_haplotype_cigar_ops.pop_front();
      continue;
    }

    // Read is aligned completely, we are done.
    if (read_to_haplotype_cigar_ops.empty() && cur_read_to_hap_op.length == 0 &&
        !haplotype_to_ref_cigar_ops.empty()) {
      break;
    }

    // Assign current Cigar Ops for each alignment.
    if (cur_read_to_hap_op.length == 0) {
      cur_read_to_hap_op = read_to_haplotype_cigar_ops.front();
      read_to_haplotype_cigar_ops.pop_front();
    }
    if (cur_hap_to_ref_op.length == 0) {
      cur_hap_to_ref_op = haplotype_to_ref_cigar_ops.front();
      haplotype_to_ref_cigar_ops.pop_front();
    }

    while (cur_read_to_hap_op.length > 0 && cur_hap_to_ref_op.length > 0) {
      if ((cur_read_to_hap_op.operation
              == nucleus::genomics::v1::CigarUnit::DELETE &&
          cur_hap_to_ref_op.operation
              == nucleus::genomics::v1::CigarUnit::INSERT)
           ||
          (cur_read_to_hap_op.operation
              == nucleus::genomics::v1::CigarUnit::INSERT &&
          cur_hap_to_ref_op.operation
              == nucleus::genomics::v1::CigarUnit::DELETE)) {
        cur_hap_to_ref_op.length--;
        cur_read_to_hap_op.length--;
        // When hap_to_ref deletion is consumed by read_to_hap insertion we
        // need to convert the deletion to match.
        if (cur_hap_to_ref_op.operation ==
            nucleus::genomics::v1::CigarUnit::DELETE) {
          haplotype_to_ref_cigar_ops.push_front(
              CigarOp(nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH, 1));
          read_to_haplotype_cigar_ops.push_front(
              CigarOp(nucleus::genomics::v1::CigarUnit::ALIGNMENT_MATCH, 1));
        }
        continue;
      }

      MergeOneBaseOperations(cur_read_to_hap_op, cur_hap_to_ref_op,
                             read_len, read_to_ref_cigar_ops);

      // If read_to_hap is INS or DEL then we consume these operations first.
      if (cur_read_to_hap_op.operation ==
              nucleus::genomics::v1::CigarUnit::INSERT) {
        cur_read_to_hap_op.length--;
      } else if (cur_hap_to_ref_op.operation ==
              nucleus::genomics::v1::CigarUnit::DELETE) {
        cur_hap_to_ref_op.length--;
      } else {
        cur_hap_to_ref_op.length--;
        cur_read_to_hap_op.length--;
      }
    }  // while
  }  // while
}

}  // namespace deepvariant
}  // namespace genomics
}  // namespace learning