deepvariant-r1.6.1 / Git / [9b26b7] /deepvariant/allelecounter.cc

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / deepvariant / allelecounter.cc
History
Download this file
912 lines (846 with data), 37.6 kB

/*
 * Copyright 2017 Google LLC.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

// Implementation of allelecounter.h.
#include "deepvariant/allelecounter.h"

#include <algorithm>
#include <cstddef>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <ostream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "deepvariant/protos/deepvariant.pb.h"
#include "deepvariant/utils.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "third_party/nucleus/protos/cigar.pb.h"
#include "third_party/nucleus/protos/position.pb.h"
#include "third_party/nucleus/util/utils.h"
#include "absl/log/log.h"

namespace learning {
namespace genomics {
namespace deepvariant {

// Separator string that will appear between the fragment name and read number
// the string key constructed from a Read with ReadKey().
static constexpr char kFragmentNameReadNumberSeparator[] = "/";

using absl::string_view;

using absl::StrCat;
using nucleus::GenomeReference;
using nucleus::genomics::v1::CigarUnit;
using nucleus::genomics::v1::LinearAlignment;
using nucleus::genomics::v1::Range;
using nucleus::genomics::v1::Read;

// TODO Consolidate SumAlleleCounts functions into one since the
// functionality is identical.
std::vector<Allele> SumAlleleCounts(const AlleleCount& allele_count,
                                    bool include_low_quality) {
  std::map<std::pair<string_view, AlleleType>, int> allele_sums;
  for (const auto& entry : allele_count.read_alleles()) {
    if (include_low_quality || !entry.second.is_low_quality()) {
      ++allele_sums[{entry.second.bases(), entry.second.type()}];
    }
  }

  std::vector<Allele> to_return;
  to_return.reserve(allele_sums.size());
  for (const auto& entry : allele_sums) {
    to_return.push_back(
        MakeAllele(entry.first.first, entry.first.second, entry.second));
  }

  // TODO SumAlleleCounts is only used in one place in variant_calling.cc
  // where ref alleles are filtered out. The code below is redundant.
  // Verify that there are no other usages of ref alleles and remove this code.
  //
  // Creates a synthetic reference Allele if we saw any reference containing
  // alleles, whose count is tracked (for performance reasons) as an integer
  // in the AlleleCount.ref_supporting_read_count field of the proto. This
  // synthetic allele allows us to provide the same API from this function: a
  // vector of the Alleles observed in allele_count without having to track the
  // read names for reference containing reads, which is very memory-intensive.
  if (allele_count.ref_supporting_read_count() > 0 &&
      !allele_count.track_ref_reads()) {
    to_return.push_back(MakeAllele(allele_count.ref_base(),
                                   AlleleType::REFERENCE,
                                   allele_count.ref_supporting_read_count()));
  }

  return to_return;
}

std::vector<Allele> SumAlleleCounts(
    const std::vector<AlleleCount>& allele_counts, bool include_low_quality) {
  std::map<std::pair<string_view, AlleleType>, int> allele_sums;
  for (const AlleleCount& allele_count : allele_counts) {
    for (const auto& entry : allele_count.read_alleles()) {
      if (include_low_quality || !entry.second.is_low_quality()) {
        ++allele_sums[{entry.second.bases(), entry.second.type()}];
      }
    }
  }

  std::vector<Allele> to_return;
  to_return.reserve(allele_sums.size());
  for (const auto& entry : allele_sums) {
    to_return.push_back(
        MakeAllele(entry.first.first, entry.first.second, entry.second));
  }

  // TODO SumAlleleCounts is only used in one place in variant_calling.cc
  // where ref alleles are filtered out. The code below is redundant.
  // Verify that there are no other usages of ref alleles and remove this code.
  //
  // Creates a synthetic reference Allele if we saw any reference containing
  // alleles, whose count is tracked (for performance reasons) as an integer
  // in the AlleleCount.ref_supporting_read_count field of the proto. This
  // synthetic allele allows us to provide the same API from this function: a
  // vector of the Alleles observed in allele_count without having to track the
  // read names for reference containing reads, which is very memory-intensive.
  int ref_support_for_all_samples = 0;
  for (const AlleleCount& allele_count : allele_counts) {
    ref_support_for_all_samples += allele_count.ref_supporting_read_count();
  }
  if (ref_support_for_all_samples > 0 && !allele_counts.empty() &&
      !allele_counts[0].track_ref_reads()) {
    to_return.push_back(MakeAllele(allele_counts[0].ref_base(),
                                   AlleleType::REFERENCE,
                                   ref_support_for_all_samples));
  }

  return to_return;
}

// TODO Consolidate TotalAlleleCounts functions into one since the
// functionality is identical.
// Allele counter tracks reads supporting alt alleles. Simple counter is used
// for ref supporting reads. If track_ref_reads flag is set then ref supporting
// reads are tracked as well but only for positions marked as potential
// candidates.
int TotalAlleleCounts(const AlleleCount& allele_count,
                      bool include_low_quality) {
  int total_allele_counts = std::count_if(
      allele_count.read_alleles().begin(), allele_count.read_alleles().end(),
      [include_low_quality](google::protobuf::Map<string, Allele>::value_type e) {
        return (!e.second.is_low_quality() || include_low_quality) &&
               e.second.type() != AlleleType::REFERENCE;
      });
  total_allele_counts += allele_count.ref_supporting_read_count();
  return total_allele_counts;
}

// Allele counter tracks reads supporting alt alleles. Simple counter is used
// for ref supporting reads. If track_ref_reads flag is set then ref supporting
// reads are tracked as well but only for positions marked as potential
// candidates.
int TotalAlleleCounts(const std::vector<AlleleCount>& allele_counts,
                      bool include_low_quality) {
  int total_allele_count = 0;
  for (const AlleleCount& allele_count : allele_counts) {
    total_allele_count += std::count_if(
        allele_count.read_alleles().begin(), allele_count.read_alleles().end(),
        [include_low_quality](google::protobuf::Map<string, Allele>::value_type e) {
          return (!e.second.is_low_quality() || include_low_quality) &&
                 e.second.type() != AlleleType::REFERENCE;
        });
    total_allele_count += allele_count.ref_supporting_read_count();
  }
  return total_allele_count;
}

// Returns false if any of the bases from offset to offset+len are canonical
// bases.
// If `keep_legacy_behavior` is set to true, this function will also return
// false when any of the bases in read from offset to offset + len is below
// the quality threshold.
//
// There is a separate bool output `is_low_quality`, which will be set to
// true if all the bases in read from offset to offset+len is lower than
// the quality threshold to be used for generating alleles for our counts.
bool CanBasesBeUsed(const nucleus::genomics::v1::Read& read, int offset,
                    int len, const AlleleCounterOptions& options,
                    bool& is_low_quality) {
  CHECK_LE(offset + len, read.aligned_quality_size());

  const int min_base_quality = options.read_requirements().min_base_quality();
  int indel_base_quality = 0;
  for (int i = 0; i < len; i++) {
    indel_base_quality += read.aligned_quality(offset + i);
    if (read.aligned_quality(offset + i) < min_base_quality &&
        options.keep_legacy_behavior()) {
      return false;
    }
    if (!nucleus::IsCanonicalBase(read.aligned_sequence()[offset + i])) {
      return false;
    }
  }
  is_low_quality = false;
  if (!options.keep_legacy_behavior()) {
    if (indel_base_quality < min_base_quality * len) {
      is_low_quality = true;
    }
  }
  return true;
}

bool allele_pos_cmp(const AlleleCount& allele_count, int64_t pos) {
  return allele_count.position().position() < pos;
}

// Return the allele index by base position in allele_counts vector.
int AlleleIndex(const std::vector<AlleleCount>& allele_counts, int64_t pos) {
  auto idx = std::lower_bound(allele_counts.begin(), allele_counts.end(), pos,
                              allele_pos_cmp);
  if (idx == allele_counts.end() || idx->position().position() != pos) {
    return -1;
  }
  return std::distance(allele_counts.begin(), idx);
}

void AlleleCounter::Init() {
  // Initialize our counts vector of AlleleCounts with proper position and
  // reference base information. Initially the alleles repeated field is empty.
  const int64_t len = IntervalLength();
  counts_.reserve(len);
  // Set candidate positions relative to the interval.
  for (auto& candidate_position : candidate_positions_) {
    candidate_position -= interval_.start();
  }
  auto full_interval_offset = interval_.start() - reads_interval_.start();
  for (int i = 0; i < len; ++i) {
    AlleleCount allele_count;
    const int64_t pos = interval_.start() + i;
    *(allele_count.mutable_position()) =
        nucleus::MakePosition(interval_.reference_name(), pos);
    allele_count.set_ref_base(ref_bases_.substr(i + full_interval_offset, 1));
    allele_count.set_track_ref_reads(options_.track_ref_reads());
    counts_.push_back(allele_count);
  }
}

// AlleleCounter objects are passed to Python by pointers. We need to return
// a raw pointer here in order to test a Python specific API.
AlleleCounter* AlleleCounter::InitFromAlleleCounts(
    const std::vector<AlleleCount>& allele_counts) {
  auto allele_counter = new AlleleCounter();
  allele_counter->counts_.assign(allele_counts.begin(), allele_counts.end());
  return allele_counter;
}

// This constructor is only used for unit testing, therefore it is defined as
// private.
AlleleCounter::AlleleCounter() : ref_(nullptr) {}

AlleleCounter::AlleleCounter(const GenomeReference* const ref,
                             const Range& range,
                             const nucleus::genomics::v1::Range& full_range,
                             const std::vector<int>& candidate_positions,
                             const AlleleCounterOptions& options)
    : ref_(ref),
      interval_(range),
      reads_interval_(full_range),
      candidate_positions_(candidate_positions),
      options_(options),
      ref_bases_(ref_->GetBases(full_range).ValueOrDie()) {
  Init();
}

AlleleCounter::AlleleCounter(const GenomeReference* const ref,
                             const Range& range,
                             const std::vector<int>& candidate_positions,
                             const AlleleCounterOptions& options)
    : ref_(ref),
      interval_(range),
      reads_interval_(range),
      candidate_positions_(candidate_positions),
      options_(options),
      ref_bases_(ref_->GetBases(range).ValueOrDie()) {
  Init();
}

string AlleleCounter::RefBases(const int64_t rel_start, const int64_t len) {
  CHECK_GT(len, 0) << "Length must be >= 1";

  // If our region isn't valid (e.g., it is off the end of the chromosome),
  // return an empty string, otherwise get the actual bases from reference.
  const int abs_start = reads_interval_.start() + rel_start;
  const Range region = nucleus::MakeRange(reads_interval_.reference_name(),
                                          abs_start, abs_start + len);
  if (!ref_->IsValidInterval(region)) {
    return "";
  } else {
    return ref_->GetBases(region).ValueOrDie();
  }
}

string AlleleCounter::GetPrevBase(const Read& read, const int read_offset,
                                  const int interval_offset) {
  CHECK_GE(read_offset, 0) << "read_offset should be 0 or greater";
  if (read_offset == 0) {
    // The read_offset case is here to handle the case where the insertion/
    // deletion/soft_clip is the first cigar element of the read, and there's no
    // previous base in the read, and so we take our previous base from the
    // reference genome instead.
    return RefBases(interval_offset - 1, 1);
  } else {
    // In all other cases we actually take our previous base from the read
    // itself.
    return read.aligned_sequence().substr(read_offset - 1, 1);
  }
}

ReadAllele AlleleCounter::MakeIndelReadAllele(const Read& read,
                                              const int interval_offset,
                                              const int ref_offset,
                                              const int read_offset,
                                              const CigarUnit& cigar) {
  const int op_len = cigar.operation_length();
  const string prev_base = GetPrevBase(read, read_offset, ref_offset);
  bool is_low_quality_read_allele = false;

  if (prev_base.empty() || !nucleus::AreCanonicalBases(prev_base) ||
      (cigar.operation() != CigarUnit::DELETE &&
       !CanBasesBeUsed(read, read_offset, op_len, options_,
                       is_low_quality_read_allele))) {
    // There is no prev_base (we are at the start of the contig), or the bases
    // are unusable, so don't actually add the indel allele.
    return ReadAllele();
  }

  AlleleType type;
  string bases;
  switch (cigar.operation()) {
    case CigarUnit::DELETE:
      type = AlleleType::DELETION;
      bases = RefBases(ref_offset, op_len);
      if (bases.empty()) {
        // We couldn't get the ref bases for the deletion (which can happen if
        // the deletion spans off the end of the contig), so abort now without
        // considering this read any longer. It's rare but such things happen in
        // genomes but they do occur in practice, such as when: (1) the reads
        // spans off the chromosome, but because there's more sequence there
        // (the chromosome isn't complete), which means the read can have
        // whatever CIGAR it likes, which may include a deletion; (2) the
        // chromosome is actually circular, and the aligner is clever enough to
        // know that, and the read's cigar reflect true differences of the read
        // to the alignment at the start of the contig.  Nasty, I know.
        VLOG(2) << "Deletion spans off the chromosome for read: "
                << read.ShortDebugString() << " at cigar "
                << cigar.ShortDebugString() << " with interval "
                << Interval().ShortDebugString() << " with interval_offset "
                << interval_offset << " and read_offset " << read_offset;
        return ReadAllele();
      }

      if (!nucleus::AreCanonicalBases(bases)) {
        // The reference genome has non-canonical bases that are being deleted.
        // We don't add deletions with non-canonical bases so we return an empty
        // ReadAllele().
        return ReadAllele();
      }

      break;
    case CigarUnit::INSERT:
      type = AlleleType::INSERTION;
      bases = read.aligned_sequence().substr(read_offset, op_len);
      break;
    case CigarUnit::CLIP_SOFT:
      type = AlleleType::SOFT_CLIP;
      bases = read.aligned_sequence().substr(read_offset, op_len);
      break;
    default:
      LOG(FATAL) << "Unexpected cigar operation: " << cigar.DebugString();
  }

  return ReadAllele(interval_offset - 1, StrCat(prev_base, bases), type,
                    is_low_quality_read_allele);
}

void AlleleCounter::AddReadAlleles(const Read& read, absl::string_view sample,
                                   const std::vector<ReadAllele>& to_add) {
  for (size_t i = 0; i < to_add.size(); ++i) {
    const ReadAllele& to_add_i = to_add[i];

    // The read can span beyond and after the interval, so don't add counts
    // outside our interval boundaries.
    if (to_add_i.skip() || !IsValidIntervalOffset(to_add_i.position())) {
      continue;
    }

    // If sequential alleles have the same position, skip the first one. This
    // occurs, for example, when we observe a base at position p on the genome
    // which is enqueued as the ith element of our to_add vector. But the next
    // allele is an indel allele which, because of VCF convention, occurs at
    // position p, is enqueued at i+1 and supersedes the previous base
    // substitution. Resolving these conflicts here allows us to keep the
    // Read => ReadAllele algorithm logic simple.
    if (i + 1 < to_add.size() &&
        to_add_i.position() == to_add[i + 1].position()) {
      continue;
    }

    AlleleCount& allele_count = counts_[to_add_i.position()];

    if (to_add_i.type() == AlleleType::REFERENCE) {
      if (!to_add_i.is_low_quality()) {
        const int prev_count = allele_count.ref_supporting_read_count();
        allele_count.set_ref_supporting_read_count(prev_count + 1);
      }
    }

    // Always create non reference alleles.
    // Reference alleles are created only when the track_ref_reads flag is set
    // and we know that this position contains a potential candidate.
    if (to_add_i.type() != AlleleType::REFERENCE ||
        (options_.track_ref_reads() &&
         std::binary_search(candidate_positions_.begin(),
                            candidate_positions_.end(), to_add_i.position()))) {
      auto* read_alleles = allele_count.mutable_read_alleles();
      auto* sample_alleles = allele_count.mutable_sample_alleles();
      const string key = ReadKey(read);
      const Allele allele = MakeAllele(to_add_i.bases(), to_add_i.type(), 1,
                                       to_add_i.is_low_quality());

      // Naively, there should never be multiple counts for the same read key.
      // We detect such a situation here but only write out a warning. It would
      // be better to have a stronger response (FATAL), but unfortunately we see
      // data in the wild that we need to process that has duplicates.
      if (read_alleles->count(key)) {
        // Not thread safe.
        static int counter = 0;
        if (counter++ < 1) {
          VLOG(2) << "Found duplicate read: " << key << " at "
                  << allele_count.position().ShortDebugString();
        }
      }

      (*read_alleles)[key] = allele;
      // Update sample to allele map. This may allows us to determine set of
      // samples that support each allele.
      Allele* new_allele = (*sample_alleles)[std::string(sample)].add_alleles();
      *new_allele = allele;
    }
  }
}

// Convenience function to check if operations is match. Note, that we treat
// SEQUENCE_MISMATCH as ALIGNMENT_MATCH.
bool IsOperationMatch(const nucleus::genomics::v1::CigarUnit& op) {
  return (op.operation() == CigarUnit::ALIGNMENT_MATCH ||
          op.operation() == CigarUnit::SEQUENCE_MATCH ||
          op.operation() == CigarUnit::SEQUENCE_MISMATCH);
}

// Merge two operations. If operations are the same type then first operation's
// length is icreased and second operation length's is set to zero. If
// operations are different types then M operation of length MIN(op1, op2) is
// added instead of op1. Op2 is converted to the type of a larger operation
// and length is set to Max(op) - Min(op).
// Function returns true if operations are merged.
bool MergeOperations(nucleus::genomics::v1::CigarUnit& op1,
                     nucleus::genomics::v1::CigarUnit& op2) {
  // Simple merge if operations are of the same type. There are three different
  // types of "match" operation therefore it is not enough to just compare
  // operation types.
  if (op1.operation() == op2.operation() ||
      (IsOperationMatch(op1) && IsOperationMatch(op2))) {
    op1.set_operation_length(op1.operation_length() + op2.operation_length());
    op2.set_operation_length(0);
  } else if ((op1.operation() == CigarUnit::DELETE ||
              op1.operation() == CigarUnit::INSERT) &&
             (op2.operation() == CigarUnit::DELETE ||
              op2.operation() == CigarUnit::INSERT)) {
    auto min_indel_len =
        std::min(op1.operation_length(), op2.operation_length());
    auto new_indel_len =
        std::max(op1.operation_length(), op2.operation_length()) -
        min_indel_len;
    if (op1.operation_length() > op2.operation_length()) {
      op2.set_operation(op1.operation());
    }
    op1.set_operation(CigarUnit::ALIGNMENT_MATCH);
    op1.set_operation_length(min_indel_len);
    op2.set_operation_length(new_indel_len);
  } else {
    return false;
  }
  return true;
}

// Advance reference and read pointers depending on the cigar operation and
// its length.
void AdvanceReadReferencePointers(
    int increment, const nucleus::genomics::v1::CigarUnit_Operation& operation,
    int& read_offset, int& ref_offset) {
  switch (operation) {
    case CigarUnit::ALIGNMENT_MATCH:
    case CigarUnit::SEQUENCE_MATCH:
    case CigarUnit::SEQUENCE_MISMATCH:
      read_offset += increment;
      ref_offset += increment;
      break;
    case CigarUnit::CLIP_SOFT:
    case CigarUnit::INSERT:
      read_offset += increment;
      // No interval offset change, since an insertion doesn't move us on ref.
      break;
    case CigarUnit::DELETE:
    case CigarUnit::PAD:
    case CigarUnit::SKIP:
      // No read offset change, since a del/pad/skip don't consume read bases.
      ref_offset += increment;
      break;
    case CigarUnit::CLIP_HARD:
      break;
    default:
      // Lots of misc. enumerated values from proto that aren't useful such as
      // enumeration values INT_MIN_SENTINEL_DO_NOT_USE_ and
      // OPERATION_UNSPECIFIED.
      break;
  }
}

// Handle the case when INDEL is at the head of a cigar.
// DEL is removed and alignment position is shifted to the right.
// INS is converted into a REF and alignment position is shifted to the left.
int HandleHeadingIndel(
    std::vector<nucleus::genomics::v1::CigarUnit>::iterator it,
    std::vector<nucleus::genomics::v1::CigarUnit>& norm_cigar) {
  int read_alignment_shift = 0;
  // it must be a first operation or the first op following soft clip.
  CHECK(it == norm_cigar.begin() ||
        (!norm_cigar.empty() &&
         norm_cigar.begin()->operation() == CigarUnit::CLIP_SOFT &&
         it == norm_cigar.begin() + 1));
  if (it->operation() == CigarUnit::DELETE) {
    read_alignment_shift = it->operation_length();
    norm_cigar.erase(it);
  } else if (it->operation() == CigarUnit::INSERT) {
    read_alignment_shift = -it->operation_length();
    it->set_operation(CigarUnit::ALIGNMENT_MATCH);
  }
  return read_alignment_shift;
}
// Shift cigar operation according to the shift parameter. Only INDELs are
// shifted and only to the left. It is expected that operation to the left is
// REF or SOFT_CLIP or there is no operation. Operation to the left is decreased
// in length, operation to the right is increased in length. If there is no
// operation to the right  REF is created.
int ShiftOperation(int shift,
                   std::vector<nucleus::genomics::v1::CigarUnit>::iterator it,
                   std::vector<nucleus::genomics::v1::CigarUnit>& norm_cigar) {
  // If previous operation is ref or soft clip then it is reduced in length.
  // If it is the first operation then read alignment is shifted. In this case
  // it is removed if it is del or turned into ref if it is ins.
  int read_alignment_shift = 0;
  // If indel is first operation or second after a soft clip then indel is
  // treated specially.
  if (it == norm_cigar.begin() ||
      (!norm_cigar.empty() && it == norm_cigar.begin() + 1 &&
       norm_cigar.begin()->operation() == CigarUnit::CLIP_SOFT)) {
    return HandleHeadingIndel(it, norm_cigar);
  } else {
    auto prev_op = it - 1;
    // Previous operation should not be a soft clip. Soft clip in the middle of
    // cigar is an error in alignment. It should not happen.
    CHECK(prev_op->operation() != CigarUnit::CLIP_SOFT);
    if (IsOperationMatch(*prev_op)) {
      CHECK(shift <= prev_op->operation_length());
      prev_op->set_operation_length(prev_op->operation_length() - shift);
    } else {
      // Do nothing if prev operation is not REF.
      return read_alignment_shift;
    }
  }

  // Expand existing ref following it or add a new one if it is the last element
  auto post_op = it + 1;
  if (post_op == norm_cigar.end()) {
    nucleus::genomics::v1::CigarUnit post_ref;
    post_ref.set_operation_length(shift);
    post_ref.set_operation(CigarUnit::ALIGNMENT_MATCH);
    norm_cigar.insert(it + 1, post_ref);
  } else {
    post_op->set_operation_length(post_op->operation_length() + shift);
  }
  return read_alignment_shift;
}

// Iterate cigar operations and attempt merging adjacent operations of the same
// type or indels. Returns true of merging was done.
bool FindAndMergeOperations(
    std::vector<nucleus::genomics::v1::CigarUnit>& norm_cigar) {
  for (auto op = norm_cigar.begin(); op != norm_cigar.end(); op++) {
    auto next_op = op + 1;
    if (next_op != norm_cigar.end() && MergeOperations(*op, *next_op)) {
        return true;
    }
  }
  return false;
}

// Remove all zero length operations and merge operations that can be merged.
// Operations of the same type are merged by adding their lengths. If DEL and
// INS has to be merged then their overlapping part is converted to REF and
// non overlapping part is preserved.
// For example. 3D5I (3 del and 5 ins) is merged into 3M2I (3 ref and 2 ins).
// Return true if any change was made to the cigar.
bool SwipeAndMerge(std::vector<nucleus::genomics::v1::CigarUnit>& norm_cigar) {
  // Repeat the loop until nothing is merged.
  bool merged = true;
  bool is_modified = false;
  while (merged) {
    merged = false;
    // First remove all operations of length zero
    auto before_size = norm_cigar.size();
    norm_cigar.erase(
        std::remove_if(norm_cigar.begin(), norm_cigar.end(),
                       [](const nucleus::genomics::v1::CigarUnit& op) {
                         return (op.operation_length() == 0);
                       }),
        norm_cigar.end());
    if (norm_cigar.size() < before_size) {
      is_modified = true;
    }
    // Then merge operations that are mergable.
    merged  = FindAndMergeOperations(norm_cigar);
    if (merged) {
      is_modified = true;
    }
  }  // while(merged)
  return is_modified;
}

bool AlleleCounter::CanDelBeShifted(
    const absl::string_view read_seq,
    std::vector<nucleus::genomics::v1::CigarUnit>::const_iterator cigar_elt,
    int read_offset,
    int interval_offset,
    int op_len) const {
  if (cigar_elt->operation() != CigarUnit::DELETE) {
    return false;
  }
  if (read_offset <= 0) {
    return false;
  }
  if (interval_offset + op_len - 1 >= ref_bases_.size()) {
    return false;
  }

  return  read_seq[read_offset - 1] ==
       ref_bases_[interval_offset + op_len - 1];
}

bool AlleleCounter::CanInsBeShifted(
    const absl::string_view read_seq,
    std::vector<nucleus::genomics::v1::CigarUnit>::const_iterator cigar_elt,
    int read_offset,
    int interval_offset,
    int op_len) const {
  if (cigar_elt->operation() != CigarUnit::INSERT) {
    return false;
  }
  if (interval_offset <= 0) {
    return false;
  }
  if (read_offset + op_len - 1 >= read_seq.size()) {
    return false;
  }

  return read_seq[read_offset + op_len - 1] ==
                     ref_bases_[interval_offset - 1];
}

// Normalize cigar of a given read following
// https://genome.sph.umich.edu/wiki/Variant_Normalization. As a result
// alignment position may need to be adjusted, in this case read_shift parameter
// is set to a non-zero value. As a result of shifting indel operations
// sometimes merging of adjecent indels may be performed.
bool AlleleCounter::NormalizeCigar(
    const absl::string_view read_seq, int interval_offset,
    std::vector<nucleus::genomics::v1::CigarUnit>& norm_cigar,
    int& read_shift) const {
  bool is_modified = false;
  read_shift = 0;
  if (norm_cigar.empty()) {
    return is_modified;
  }
  int iteration = 0;  // while loop will run up to 10 times.
  while (iteration++ < 10) {
    int read_offset = 0;
    int cur_interval_offset = interval_offset + read_shift;
    // Iterate cigar operations and shift indels if possible
    // If shift occurred break the loop and recalculate
    int prev_op_len = norm_cigar.front().operation_length();
    bool is_shifted = false;
    for (auto cigar_elt = norm_cigar.begin(); cigar_elt != norm_cigar.end();
         cigar_elt++) {
      const int op_len = cigar_elt->operation_length();
      int shift = 0;
      if (cigar_elt->operation() == CigarUnit::INSERT ||
          cigar_elt->operation() == CigarUnit::DELETE) {
        // Move INS/DEL to the left until last base of INS/DEL is the same
        // as REF base at the start of the operation. Moving left can only
        // be done if cigar has a REF operation preceding the INS/DEL. By
        // moving INS>DEL to the left we consume the length of a preceding
        // REF operation.
        while (prev_op_len > 0 &&
               ( CanDelBeShifted(read_seq, cigar_elt, read_offset,
                                 cur_interval_offset, op_len) ||
                CanInsBeShifted(read_seq, cigar_elt, read_offset,
                                 cur_interval_offset, op_len))) {
          cur_interval_offset--;
          prev_op_len--;
          read_offset--;
          shift++;
        }
        if (shift > 0) {
          read_shift += ShiftOperation(shift, cigar_elt, norm_cigar);
          is_modified = true;
          is_shifted = true;
          break;
        }
      }
      prev_op_len = cigar_elt->operation_length();
      AdvanceReadReferencePointers(op_len, cigar_elt->operation(), read_offset,
                                   cur_interval_offset);
    }  // for

    // Input BAM may contain non-normalized records, therefore we try to
    // SwipeAndMerge even if there was no shift.
    bool is_merged = SwipeAndMerge(norm_cigar);
    if (is_merged) {
      is_modified = true;
    }
    if (!is_shifted && !is_merged) {
      break;
    }
  }  // while (iteration < 10)
  // Call shift to deal with an indel at the beginning of cigar.
  read_shift += HandleHeadingIndel(norm_cigar.begin(), norm_cigar);
  return is_modified;
}

void AlleleCounter::NormalizeAndAdd(
    const nucleus::genomics::v1::Read& read, absl::string_view sample,
    std::unique_ptr<std::vector<nucleus::genomics::v1::CigarUnit>>& norm_cigar,
    int& read_shift) {
  // Make sure our incoming read has a mapping quality above our min. threshold.
  if (read.alignment().mapping_quality() <
      options_.read_requirements().min_mapping_quality()) {
    return;
  }

  const LinearAlignment& aln = read.alignment();
  std::vector<ReadAllele> to_add;
  to_add.reserve(read.aligned_quality_size());
  int interval_offset = aln.position().position() - ReadsInterval().start();
  const string_view read_seq(read.aligned_sequence());
  // Copy input cigar into the local variable since it can be modified.
  std::vector<CigarUnit> input_output_cigar(aln.cigar().begin(),
                                            aln.cigar().end());
  bool is_modified =
      NormalizeCigar(read_seq, interval_offset, input_output_cigar, read_shift);
  if (is_modified) {
    norm_cigar->assign(input_output_cigar.begin(), input_output_cigar.end());
  }
  Add(read, sample, &input_output_cigar, read_shift);
}

void AlleleCounter::Add(const nucleus::genomics::v1::Read& read,
                        absl::string_view sample,
                        const std::vector<CigarUnit>* cigar_to_use,
                        int read_shift) {
  // Make sure our incoming read has a mapping quality above our min. threshold.
  if (read.alignment().mapping_quality() <
      options_.read_requirements().min_mapping_quality()) {
    return;
  }

  const LinearAlignment& aln = read.alignment();
  std::vector<ReadAllele> to_add;
  to_add.reserve(read.aligned_quality_size());
  int read_offset = 0;
  int ref_interval_offset =
      aln.position().position() + read_shift - ReadsInterval().start();
  int interval_offset =
      aln.position().position() + read_shift - Interval().start();
  const string_view read_seq(read.aligned_sequence());
  std::vector<CigarUnit> cigar;
  if (cigar_to_use != nullptr) {
    cigar.assign(cigar_to_use->begin(), cigar_to_use->end());
  } else {
    cigar.assign(aln.cigar().begin(), aln.cigar().end());
  }

  for (const auto& cigar_elt : cigar) {
    const int op_len = cigar_elt.operation_length();
    switch (cigar_elt.operation()) {
      case CigarUnit::ALIGNMENT_MATCH:
      case CigarUnit::SEQUENCE_MATCH:
      case CigarUnit::SEQUENCE_MISMATCH:
        for (int i = 0; i < op_len; ++i) {
          const int ref_offset = ref_interval_offset + i;
          const int base_offset = read_offset + i;
          bool is_low_quality_read_allele = false;
          if (IsValidRefOffset(ref_offset) &&
              CanBasesBeUsed(read, base_offset, 1, options_,
                             is_low_quality_read_allele)) {
            const AlleleType type =
                ref_bases_[ref_offset] == read_seq[base_offset]
                    ? AlleleType::REFERENCE
                    : AlleleType::SUBSTITUTION;
            to_add.emplace_back(interval_offset + i,
                                string(read_seq.substr(base_offset, 1)), type,
                                is_low_quality_read_allele);
          }
        }
        read_offset += op_len;
        ref_interval_offset += op_len;
        interval_offset += op_len;
        break;
      case CigarUnit::CLIP_SOFT:
      case CigarUnit::INSERT:
        // Note, by convention VCF insertion/deletion are at the preceding base.
        to_add.push_back(MakeIndelReadAllele(read, interval_offset,
                                             ref_interval_offset, read_offset,
                                             cigar_elt));
        read_offset += op_len;
        // No interval offset change, since an insertion doesn't move us on ref.
        break;
      case CigarUnit::DELETE:
        // By convention VCF insertion/deletion are at the preceding base.
        to_add.push_back(MakeIndelReadAllele(read, interval_offset,
                                             ref_interval_offset, read_offset,
                                             cigar_elt));
        // No read offset change, since a deletion doesn't consume read bases.
        ref_interval_offset += op_len;
        interval_offset += op_len;
        break;
      case CigarUnit::PAD:
      case CigarUnit::SKIP:
        // No read offset change, since a pad/skip don't consume read bases.
        ref_interval_offset += op_len;
        interval_offset += op_len;
        break;
      case CigarUnit::CLIP_HARD:
        break;
      default:
        // Lots of misc. enumerated values from proto that aren't useful such as
        // enumeration values INT_MIN_SENTINEL_DO_NOT_USE_ and
        // OPERATION_UNSPECIFIED.
        break;
    }
  }

  AddReadAlleles(read, sample, to_add);
  ++n_reads_counted_;
}

string AlleleCounter::ReadKey(const Read& read) {
  return StrCat(read.fragment_name(), kFragmentNameReadNumberSeparator,
                read.read_number());
}

std::vector<AlleleCountSummary> AlleleCounter::SummaryCounts(
    int left_padding, int right_padding) const {
  std::vector<AlleleCountSummary> summaries;
  CHECK_GE(left_padding, 0);
  CHECK_GE(right_padding, 0);
  CHECK_LT(left_padding + right_padding, counts_.size());
  summaries.reserve(counts_.size() - left_padding - right_padding);
  for (int i = left_padding; i < counts_.size() - right_padding; i++) {
    const AlleleCount& allele_count = counts_[i];
    AlleleCountSummary summary;
    summary.set_reference_name(allele_count.position().reference_name());
    summary.set_position(allele_count.position().position());
    summary.set_ref_base(allele_count.ref_base());
    summary.set_ref_supporting_read_count(
        allele_count.ref_supporting_read_count());
    summary.set_total_read_count(TotalAlleleCounts(allele_count));
    summary.set_ref_nonconfident_read_count(
        allele_count.ref_nonconfident_read_count());
    summaries.push_back(summary);
  }
  return summaries;
}

}  // namespace deepvariant
}  // namespace genomics
}  // namespace learning