deepvariant-r1.6.1 / Git / [9b26b7] /deepvariant/pileup_image

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / deepvariant / pileup_image_native.cc
History
Download this file
525 lines (475 with data), 20.7 kB

/*
 * Copyright 2017 Google LLC.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "deepvariant/pileup_image_native.h"

#include <math.h>

#include <algorithm>
#include <functional>
#include <iterator>
#include <memory>
#include <string>
#include <vector>

#include "deepvariant/pileup_channel_lib.h"
#include "third_party/nucleus/protos/cigar.pb.h"
#include "third_party/nucleus/protos/position.pb.h"
#include "third_party/nucleus/protos/reads.pb.h"
#include "third_party/nucleus/protos/struct.pb.h"
#include "third_party/nucleus/protos/variants.pb.h"
#include "absl/log/check.h"
#include "absl/log/log.h"

using nucleus::genomics::v1::CigarUnit;
using nucleus::genomics::v1::Read;
using std::vector;

using learning::genomics::deepvariant::DeepVariantCall;

namespace learning {
namespace genomics {
namespace deepvariant {

namespace {

// Get the allele frequency of the alt allele that is carried by a read.
inline float ReadAlleleFrequency(const DeepVariantCall& dv_call,
                                 const Read& read,
                                 const std::vector<std::string>& alt_alleles) {
  string key =
      (read.fragment_name() + "/" + std::to_string(read.read_number()));

  // Iterate over all alts, not just alt_alleles.
  for (const string& alt_allele : dv_call.variant().alternate_bases()) {
    const auto& allele_support = dv_call.allele_support();
    auto it_read = allele_support.find(alt_allele);

    if (it_read != allele_support.end()) {
      const auto& supp_read_names = it_read->second.read_names();
      for (const string& read_name : supp_read_names) {
        const bool alt_in_alt_alleles =
            std::find(alt_alleles.begin(), alt_alleles.end(), alt_allele) !=
            alt_alleles.end();
        // If the read supports an alt we are currently considering, return the
        // associated allele frequency.
        if (read_name == key && alt_in_alt_alleles) {
          auto it = dv_call.allele_frequency().find(alt_allele);
          if (it != dv_call.allele_frequency().end())
            return it->second;
          else
            return 0;
        }
      }
    }
  }
  // If cannot find the matching variant, set the frequency to 0.
  return 0;
}

int GetHPValueForHPChannel(const Read& read,
                           int hp_tag_for_assembly_polishing) {
  if (!read.info().contains("HP")) {
    return 0;
  }
  const auto& hp_values = read.info().at("HP").values();
  if (hp_values.empty()) {
    return 0;
  }
  if (hp_values.size() > 1) {
    LOG(WARNING) << "Unexpected: Read contains more than one HP tag. Return 0";
    return 0;
  }
  int hp_value = hp_values[0].int_value();
  // See the description of --add_hp_channel flag in make_examples.py: Currently
  // we only support value of 1, 2, or 0.
  if (hp_value != 0 && hp_value != 1 && hp_value != 2) {
    LOG(FATAL)
        << "This function is currently used when --add_hp_channel is set. "
        << "HP value has to be either 1, 2, or 0. Found a read with HP="
        << hp_value << ", read=" << read.DebugString();
  }
  // If hp_tag_for_assembly_polishing is set to 2, this is a special case
  // assembly polishing:
  // If we're calling HP=2, when displayed with --add_hp_channel, we want to
  // swap the color of reads with HP=2 and HP=1.
  if (hp_tag_for_assembly_polishing == 2) {
    if (hp_value == 1) return 2;
    if (hp_value == 2) return 1;
  }

  // Otherwise, keep the default behavior.
  return hp_value;
}

}  // namespace

ImageRow::ImageRow(int width, int num_channels, bool use_allele_frequency,
                   bool add_hp_channel, const std::vector<string>& channels)
    : base(width, 0),
      base_quality(width, 0),
      mapping_quality(width, 0),
      on_positive_strand(width, 0),
      supports_alt(width, 0),
      matches_ref(width, 0),
      sequencing_type(width, 0),
      allele_frequency(width, 0),
      hp_value(width, 0),
      num_channels(num_channels),
      use_allele_frequency(use_allele_frequency),
      add_hp_channel(add_hp_channel),
      channels(channels) {}

int ImageRow::Width() const {
  CHECK(
      base.size() == base_quality.size() &&
      base.size() == mapping_quality.size() &&
      base.size() == on_positive_strand.size() &&
      base.size() == supports_alt.size() && base.size() == matches_ref.size() &&
      base.size() == sequencing_type.size() &&
      base.size() == allele_frequency.size() && base.size() == hp_value.size());
  return base.size();
}

PileupImageEncoderNative::PileupImageEncoderNative(
    const PileupImageOptions& options)
    : options_(options) {
  CHECK((options_.width() % 2 == 1) && options_.width() >= 3)
      << "Width must be odd; found " << options_.width();
}

// Gets the pixel color (int) for a base.
int PileupImageEncoderNative::BaseColor(char base) const {
  switch (base) {
    case 'A':
      return (options_.base_color_offset_a_and_g() +
              options_.base_color_stride() * 3);
    case 'G':
      return (options_.base_color_offset_a_and_g() +
              options_.base_color_stride() * 2);
    case 'T':
      return (options_.base_color_offset_t_and_c() +
              options_.base_color_stride() * 1);
    case 'C':
      return (options_.base_color_offset_t_and_c() +
              options_.base_color_stride() * 0);
    default:
      return 0;
  }
}

int PileupImageEncoderNative::BaseColor(const string& base) const {
  CHECK_EQ(base.size(), 1) << "'base' string should be a single character";
  return BaseColor(base[0]);
}

int PileupImageEncoderNative::MatchesRefColor(bool base_matches_ref) const {
  float alpha =
      (base_matches_ref ? options_.reference_matching_read_alpha()
                        : options_.reference_mismatching_read_alpha());
  return static_cast<int>(kMaxPixelValueAsFloat * alpha);
}

// Get allele frequency color for a read.
// Convert a frequency value in float to color intensity (int) and normalize.
int PileupImageEncoderNative::AlleleFrequencyColor(
    float allele_frequency) const {
  if (allele_frequency <= options_.min_non_zero_allele_frequency()) {
    return 0;
  } else {
    float log10_af = log10(allele_frequency);
    float log10_min = log10(options_.min_non_zero_allele_frequency());
    return ((log10_min - log10_af) / log10_min) *
           static_cast<int>(kMaxPixelValueAsFloat);
  }
}

int PileupImageEncoderNative::SupportsAltColor(int read_supports_alt) const {
  float alpha;
  if (read_supports_alt == 0) {
    alpha = options_.allele_unsupporting_read_alpha();
  } else if (read_supports_alt == 1) {
    alpha = options_.allele_supporting_read_alpha();
  } else {
    CHECK_EQ(read_supports_alt, 2) << "read_supports_alt can only be 0/1/2.";
    alpha = options_.other_allele_supporting_read_alpha();
  }
  return static_cast<int>(kMaxPixelValueAsFloat * alpha);
}

int PileupImageEncoderNative::BaseQualityColor(int base_qual) const {
  float capped =
      static_cast<float>(std::min(options_.base_quality_cap(), base_qual));
  return static_cast<int>(kMaxPixelValueAsFloat *
                          (capped / options_.base_quality_cap()));
}

int PileupImageEncoderNative::MappingQualityColor(int mapping_qual) const {
  float capped = static_cast<float>(
      std::min(options_.mapping_quality_cap(), mapping_qual));
  return static_cast<int>(kMaxPixelValueAsFloat *
                          (capped / options_.mapping_quality_cap()));
}

int PileupImageEncoderNative::StrandColor(bool on_positive_strand) const {
  return (on_positive_strand ? options_.positive_strand_color()
                             : options_.negative_strand_color());
}

vector<DeepVariantChannelEnum> PileupImageEncoderNative::AllChannelsEnum(
    const std::string& alt_aligned_representation) {
  std::vector<DeepVariantChannelEnum> channels_list;
  // The list here corresponds to the order in EncodeRead.
  channels_list.push_back(DeepVariantChannelEnum::CH_READ_BASE);
  channels_list.push_back(DeepVariantChannelEnum::CH_BASE_QUALITY);
  channels_list.push_back(DeepVariantChannelEnum::CH_MAPPING_QUALITY);
  channels_list.push_back(DeepVariantChannelEnum::CH_STRAND);
  channels_list.push_back(DeepVariantChannelEnum::CH_READ_SUPPORTS_VARIANT);
  channels_list.push_back(DeepVariantChannelEnum::CH_BASE_DIFFERS_FROM_REF);
  if (options_.use_allele_frequency()) {
    channels_list.push_back(DeepVariantChannelEnum::CH_ALLELE_FREQUENCY);
  }
  if (options_.add_hp_channel()) {
    channels_list.push_back(DeepVariantChannelEnum::CH_HAPLOTYPE_TAG);
  }
  // Fill OptChannel set.
  const std::vector<std::string> opt_channels = ToVector(options_.channels());
  for (int j = 0; j < opt_channels.size(); j++) {
    channels_list.push_back(ChannelStrToEnum(opt_channels[j]));
  }
  // Then, in pileup_image.py, _represent_alt_aligned_pileups can potentially
  // add two more channels.
  if (alt_aligned_representation == "diff_channels") {
    channels_list.push_back(
        DeepVariantChannelEnum::CH_DIFF_CHANNELS_ALTERNATE_ALLELE_1);
    channels_list.push_back(
        DeepVariantChannelEnum::CH_DIFF_CHANNELS_ALTERNATE_ALLELE_2);
  } else if (alt_aligned_representation == "base_channels") {
    channels_list.push_back(
        DeepVariantChannelEnum::CH_BASE_CHANNELS_ALTERNATE_ALLELE_1);
    channels_list.push_back(
        DeepVariantChannelEnum::CH_BASE_CHANNELS_ALTERNATE_ALLELE_2);
  }
  return channels_list;
}


std::unique_ptr<ImageRow> PileupImageEncoderNative::EncodeRead(
    const DeepVariantCall& dv_call, const string& ref_bases, const Read& read,
    int image_start_pos, const vector<std::string>& alt_alleles) {
  ImageRow img_row(ref_bases.size(), options_.num_channels(),
                   options_.use_allele_frequency(), options_.add_hp_channel(),
                   ToVector(options_.channels()));

  // Calculate base channels.
  const int supports_alt = ReadSupportsAlt(dv_call, read, alt_alleles);
  const int mapping_quality = read.alignment().mapping_quality();
  const int min_mapping_quality =
      options_.read_requirements().min_mapping_quality();
  // Bail early if this read's mapping quality is too low.
  if (mapping_quality < min_mapping_quality) {
    return nullptr;
  }
  const bool is_forward_strand = !read.alignment().position().reverse_strand();
  const std::uint8_t alt_color = SupportsAltColor(supports_alt);
  const std::uint8_t mapping_color = MappingQualityColor(mapping_quality);
  const std::uint8_t strand_color = StrandColor(is_forward_strand);
  const int min_base_quality = options_.read_requirements().min_base_quality();

  // Calculate AUX channels.
  const float allele_frequency =
      (options_.use_allele_frequency())
          ? ReadAlleleFrequency(dv_call, read, alt_alleles)
          : 0;
  const std::uint8_t allele_frequency_color =
      AlleleFrequencyColor(allele_frequency);
  const int hp_value = (options_.add_hp_channel())
                           ? GetHPValueForHPChannel(
                                 read, options_.hp_tag_for_assembly_polishing())
                           : 0;

  // Calculate OptChannels.
  OptChannels channel_set{options_};
  bool ok = channel_set.CalculateChannels(img_row.channels, read,
                                          ref_bases, dv_call, alt_alleles,
                                          image_start_pos);
  // Bail out if we found an issue while calculating channels
  // (a low-quality base at the call site, mapping quality is too low, etc)
  if (!ok) {
    return nullptr;
  }

  // Fill OptChannel set.
  img_row.channel_data.resize(img_row.channels.size(),
                            std::vector<unsigned char>(ref_bases.size(), 0));
  for (int j = 0; j < img_row.channels.size(); j++) {
    const std::string channel = img_row.channels[j];
    img_row.channel_data[j] = channel_set.data_[channel];
  }

  // Handler for each component of the CIGAR string, as subdivided
  // according the rules below.
  // Side effect: draws in img_row
  // Return value: true on normal exit; false if we determine that we
  // have a low quality base at the call position (in which case we
  // should return null) from EncodeRead.
  std::function<bool(int, int, const CigarUnit::Operation&)>
      action_per_cigar_unit =
          [&](int ref_i, int read_i, const CigarUnit::Operation& cigar_op) {
            char read_base = 0;
            if (cigar_op == CigarUnit::INSERT) {
              // TODO: fix this to be a char in the proto
              read_base = options_.indel_anchoring_base_char()[0];
            } else if (cigar_op == CigarUnit::DELETE) {
              ref_i -= 1;  // Adjust anchor base on reference
              read_base = options_.indel_anchoring_base_char()[0];
            } else if (cigar_op == CigarUnit::ALIGNMENT_MATCH ||
                       cigar_op == CigarUnit::SEQUENCE_MATCH ||
                       cigar_op == CigarUnit::SEQUENCE_MISMATCH) {
              read_base = read.aligned_sequence()[read_i];
            }

            size_t col = ref_i - image_start_pos;
            if (read_base && 0 <= col && col < ref_bases.size()) {
              int base_quality = read.aligned_quality(read_i);
              if (ref_i == dv_call.variant().start() &&
                  base_quality < min_base_quality) {
                return false;
              }
              bool matches_ref = (read_base == ref_bases[col]);

              // Fill Base channel set.
              img_row.base[col] = BaseColor(read_base);
              img_row.base_quality[col] = BaseQualityColor(base_quality);
              img_row.mapping_quality[col] = mapping_color;
              img_row.on_positive_strand[col] = strand_color;
              img_row.supports_alt[col] = alt_color;
              img_row.matches_ref[col] = MatchesRefColor(matches_ref);

              // Fill AUX channel set.
              if (img_row.use_allele_frequency) {
                img_row.allele_frequency[col] = allele_frequency_color;
              }
              if (img_row.add_hp_channel) {
                img_row.hp_value[col] = ScaleColor(hp_value, 2);
              }
            }
            return true;
          };

  // In the following, we iterate over alignment information for each
  // base of read, invoking action_per_cigar_unit for every segment of
  // the alignment.
  //
  // The handling of each cigar element type is given below, assuming
  // it has length n.
  //
  // ALIGNMENT_MATCH, SEQUENCE_MATCH, SEQUENCE_MISMATCH:
  //   Provide a segment ref_i, read_i for each of the n bases in the
  //   operator, where ref_i is the position on the genome where this
  //   base aligns.
  //
  // INSERT, CLIP_SOFT:
  //   Provides a single ref_i, read_i segment regardless of n. ref_i
  //   is set to the preceding base of the insertion; i.e., the anchor
  //   base. Beware that ref_i could be -1 if the insertion is aligned
  //   to the first base of a contig.  read_i points to the first base
  //   of the insertion. So if our cigar is 1M2I1M for a read starting
  //   at S, we'd see first (S, 0, '1M'), followed by one (S, 1,
  //   '2I'), and then (S + 1, 3, '1M').
  //
  // DELETE, SKIP:
  //   Provides a single ref_i, read_i segment regardless of n. ref_i
  //   is set to the first base of the deletion, just like in an
  //   ALIGNMENT_MATCH. read_i points to the previous base in the
  //   read, as there's no actual read sequence associated with a
  //   deletion. Beware that read_i could be -1 if the deletion is the
  //   first cigar of the read. So if our cigar is 1M2D1M for a read
  //   starting at S, we'd see first (S, 0, '1M'), followed by one (S
  //   + 1, 0, '2D'), and then (S + 3, 1, '1M').
  //
  // CLIP_HARD, PAD:
  //   These operators are ignored by as they don't impact the
  //   alignment of the read w.r.t. the reference.
  //
  // Any other CIGAR op:
  //   Fatal error, at present; later we should fail with a status encoding.
  int ref_i = read.alignment().position().position();
  int read_i = 0;
  ok = true;

  for (const auto& cigar_elt : read.alignment().cigar()) {
    const CigarUnit::Operation& op = cigar_elt.operation();
    int op_len = cigar_elt.operation_length();

    switch (op) {
      case CigarUnit::ALIGNMENT_MATCH:
      case CigarUnit::SEQUENCE_MATCH:
      case CigarUnit::SEQUENCE_MISMATCH:
        // Alignment op.
        for (int i = 0; i < op_len; i++) {
          ok = ok && action_per_cigar_unit(ref_i, read_i, op);
          ref_i++;
          read_i++;
        }
        break;
      case CigarUnit::INSERT:
      case CigarUnit::CLIP_SOFT:
        // Insert op.
        ok = action_per_cigar_unit(ref_i - 1, read_i, op);
        read_i += op_len;
        break;
      case CigarUnit::DELETE:
      case CigarUnit::SKIP:
        // Delete op.
        ok = action_per_cigar_unit(ref_i, read_i - 1, op);
        ref_i += op_len;
        break;
      case CigarUnit::CLIP_HARD:
      case CigarUnit::PAD:
        // Ignored ops.  Do nothing.
        break;
      default:
        LOG(FATAL) << "Unrecognized CIGAR op";
    }

    // Bail out if we found this read had a low-quality base at the
    // call site.
    if (!ok) {
      return nullptr;
    }
  }

  return std::make_unique<ImageRow>(img_row);
}

std::unique_ptr<ImageRow> PileupImageEncoderNative::EncodeReference(
    const string& ref_bases) {
  int ref_qual = options_.reference_base_quality();
  std::uint8_t base_quality_color = BaseQualityColor(ref_qual);
  std::uint8_t mapping_quality_color = MappingQualityColor(ref_qual);
  // We use "+" strand color for the reference.
  std::uint8_t strand_color = StrandColor(true);
  std::uint8_t alt_color = SupportsAltColor(0);
  std::uint8_t ref_color = MatchesRefColor(true);
  std::uint8_t allele_frequency_color = AlleleFrequencyColor(0);

  ImageRow img_row(ref_bases.size(), options_.num_channels(),
                   options_.use_allele_frequency(), options_.add_hp_channel(),
                   ToVector(options_.channels()));

  // Initialize dynamic channels
  img_row.channel_data.resize(img_row.channels.size(),
                              std::vector<unsigned char>(ref_bases.size(), 0));

  // Calculate reference rows at the top of each channel image.
  // These are retrieved for each position in the loop below.
  OptChannels channel_set{options_};
  channel_set.CalculateRefRows(img_row.channels, ref_bases);

  for (size_t col = 0; col < ref_bases.size(); ++col) {
    img_row.base[col] = BaseColor(ref_bases[col]);
    img_row.base_quality[col] = base_quality_color;
    img_row.mapping_quality[col] = mapping_quality_color;
    img_row.on_positive_strand[col] = strand_color;
    img_row.supports_alt[col] = alt_color;
    img_row.matches_ref[col] = ref_color;
    if (img_row.use_allele_frequency) {
      img_row.allele_frequency[col] = allele_frequency_color;
    }
    if (img_row.add_hp_channel) {
      img_row.hp_value[col] = ScaleColor(0, 2);
    }

    // Optional channels
    for (int j = 0; j < img_row.channels.size(); j++) {
      img_row.channel_data[j][col] =
          channel_set.GetRefRows(img_row.channels[j], col);
    }
  }

  return std::make_unique<ImageRow>(img_row);
}

}  // namespace deepvariant
}  // namespace genomics
}  // namespace learning