deepvariant-r1.6.1 / Git / [9b26b7] /third_party/nucleus/io/reference.cc

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / third_party / nucleus / io / reference.cc
History
Download this file
545 lines (477 with data), 19.4 kB

/*
 * Copyright 2018 Google LLC.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "third_party/nucleus/io/reference.h"

#include <stddef.h>
#include <stdlib.h>

#include <algorithm>
#include <utility>

#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "htslib/tbx.h"
#include "third_party/nucleus/io/hts_path.h"
#include "third_party/nucleus/io/reader_base.h"
#include "third_party/nucleus/protos/fasta.pb.h"
#include "third_party/nucleus/protos/range.pb.h"
#include "third_party/nucleus/protos/reference.pb.h"
#include "third_party/nucleus/util/utils.h"
#include "third_party/nucleus/core/status.h"
#include "third_party/nucleus/core/statusor.h"

namespace nucleus {

using nucleus::genomics::v1::Range;
using nucleus::genomics::v1::ReferenceSequence;

// ###########################################################################
//
// GenomeReference code
//
// ###########################################################################

bool GenomeReference::HasContig(const string& contig_name) const {
  const auto& contigs = Contigs();
  return std::any_of(contigs.cbegin(), contigs.cend(),
                     [&](const nucleus::genomics::v1::ContigInfo& contig) {
                       return contig.name() == contig_name;
                     });
}

std::vector<string> GenomeReference::ContigNames() const {
  const auto& contigs = Contigs();
  std::vector<string> keys;
  keys.reserve(contigs.size());
  for (const auto& contig : contigs) {
    keys.push_back(contig.name());
  }
  return keys;
}

StatusOr<const nucleus::genomics::v1::ContigInfo*> GenomeReference::Contig(
    const string& contig_name) const {
  for (const auto& contig : Contigs()) {
    if (contig.name() == contig_name) {
      return &contig;
    }
  }
  return ::nucleus::NotFound(absl::StrCat("Unknown contig ", contig_name));
}

// Note that start and end are 0-based, and end is exclusive. So end
// can go up to the number of bases on contig.
bool GenomeReference::IsValidInterval(const Range& range) const {
  StatusOr<const nucleus::genomics::v1::ContigInfo*> contig_status =
      Contig(range.reference_name());
  if (!contig_status.ok()) return false;
  const int64 n_bases = contig_status.ValueOrDie()->n_bases();
  return range.start() >= 0 && range.start() <= range.end() &&
         range.start() < n_bases && range.end() <= n_bases;
}

// ###########################################################################
//
// IndexedFastaReader code
//
// ###########################################################################

namespace {
// Gets information about the contigs from the fai index faidx.
std::vector<nucleus::genomics::v1::ContigInfo> ExtractContigsFromFai(
    const faidx_t* faidx) {
  int n_contigs = faidx_nseq(faidx);
  std::vector<nucleus::genomics::v1::ContigInfo> contigs(n_contigs);
  for (int i = 0; i < n_contigs; ++i) {
    nucleus::genomics::v1::ContigInfo* contig = &contigs[i];
    const char* name = faidx_iseq(faidx, i);
    CHECK_NE(name, nullptr) << "Name of " << i << " contig in is null";
    contig->set_name(name);
    contig->set_description("");
    contig->set_n_bases(faidx_seq_len(faidx, name));
    CHECK_GE(contig->n_bases(), 0) << "Contig " << name << "Has < 0 bases";
    contig->set_pos_in_fasta(i);
  }
  return contigs;
}
}  // namespace

// Iterable class for traversing all Fasta records in the file.
class IndexedFastaReaderIterable : public GenomeReferenceRecordIterable {
 public:
  // Advance to the next record.
  StatusOr<bool> Next(GenomeReferenceRecord* out) override;

  // Constructor is invoked via IndexedFastaReader::Iterate.
  IndexedFastaReaderIterable(const IndexedFastaReader* reader);
  ~IndexedFastaReaderIterable() override;

 private:
  size_t pos_ = 0;
};

StatusOr<std::unique_ptr<IndexedFastaReader>> IndexedFastaReader::FromFile(
    const string& fasta_path, const string& fai_path, int cache_size_bases) {
  nucleus::genomics::v1::FastaReaderOptions options =
      nucleus::genomics::v1::FastaReaderOptions();
  return FromFile(fasta_path, fai_path, options, cache_size_bases);
}

StatusOr<std::unique_ptr<IndexedFastaReader>> IndexedFastaReader::FromFile(
    const string& fasta_path, const string& fai_path,
    const nucleus::genomics::v1::FastaReaderOptions& options,
    int cache_size_bases) {
  const string gzi = fasta_path + ".gzi";
  faidx_t* faidx = fai_load3_x(fasta_path, fai_path, gzi, 0);
  if (faidx == nullptr) {
    return ::nucleus::NotFound(
        absl::StrCat("could not load fasta and/or fai for fasta ", fasta_path));
  }
  return std::unique_ptr<IndexedFastaReader>(
      new IndexedFastaReader(fasta_path, faidx, options, cache_size_bases));
}

IndexedFastaReader::IndexedFastaReader(
    const string& fasta_path, faidx_t* faidx,
    const nucleus::genomics::v1::FastaReaderOptions& options,
    int cache_size_bases)
    : fasta_path_(fasta_path),
      faidx_(faidx),
      options_(options),
      contigs_(ExtractContigsFromFai(faidx)),
      cache_size_bases_(cache_size_bases),
      small_read_cache_(),
      cached_range_() {}

IndexedFastaReader::~IndexedFastaReader() {
  if (faidx_) {
    NUCLEUS_CHECK_OK(Close());
  }
}

StatusOr<string> IndexedFastaReader::GetBases(const Range& range) const {
  if (faidx_ == nullptr) {
    return ::nucleus::FailedPrecondition(
        "can't read from closed IndexedFastaReader object.");
  }
  if (!IsValidInterval(range))
    return ::nucleus::InvalidArgument(
        absl::StrCat("Invalid interval: ", range.ShortDebugString()));

  if (range.start() == range.end()) {
    // We are requesting an empty string. faidx_fetch_seq does not allow this,
    // so we have to special case it here.
    return string("");
  }

  bool use_cache = (cache_size_bases_ > 0) &&
                   (range.end() - range.start() <= cache_size_bases_);
  Range range_to_fetch;

  if (use_cache) {
    if (cached_range_ && RangeContains(*cached_range_, range)) {
      // Get from cache!
      string result = small_read_cache_.substr(
          range.start() - cached_range_->start(), range.end() - range.start());
      return result;
    } else {
      // Prepare to fetch a sizeable chunk from the FASTA.
      int64 contig_n_bases =
          Contig(range.reference_name()).ValueOrDie()->n_bases();
      range_to_fetch = MakeRange(
          range.reference_name(), range.start(),
          std::min(static_cast<int64>(range.start() + cache_size_bases_),
                   contig_n_bases));
      CHECK(IsValidInterval(range_to_fetch));
    }
  } else {
    range_to_fetch = range;
  }

  // According to htslib docs, faidx_fetch_seq c_name is the contig name,
  // start is the first base (zero-based) to include and end is the last base
  // (zero-based) to include. Len is an output variable returning the length
  // of the fetched region, -2 c_name not present, or -1 for a general error.
  // The returned pointer must be freed. We need to subtract one from our end
  // since end is exclusive in GenomeReference but faidx has an inclusive one.
  int len;
  char* bases =
      faidx_fetch_seq(faidx_, range_to_fetch.reference_name().c_str(),
                      range_to_fetch.start(), range_to_fetch.end() - 1, &len);
  if (len <= 0)
    return ::nucleus::InvalidArgument(
        absl::StrCat("Couldn't fetch bases for ", range.ShortDebugString()));
  string result(bases);
  if (!options_.keep_true_case()) {
    absl::AsciiStrToUpper(&result);
  }
  free(bases);

  if (use_cache) {
    // Update cache.
    small_read_cache_ = result;
    cached_range_ = range_to_fetch;
    // Return the requested substring.
    result = small_read_cache_.substr(0, range.end() - range.start());
  }
  return result;
}

StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>>
IndexedFastaReader::Iterate() const {
  return StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>>(
      MakeIterable<IndexedFastaReaderIterable>(this));
}

::nucleus::Status IndexedFastaReader::Close() {
  if (faidx_ == nullptr) {
    return ::nucleus::FailedPrecondition("IndexedFastaReader already closed");
  } else {
    fai_destroy(faidx_);
    faidx_ = nullptr;
  }
  return ::nucleus::Status();
}

StatusOr<bool> IndexedFastaReaderIterable::Next(GenomeReferenceRecord* out) {
  NUCLEUS_RETURN_IF_ERROR(CheckIsAlive());
  const IndexedFastaReader* fasta_reader =
      static_cast<const IndexedFastaReader*>(reader_);
  if (pos_ >= fasta_reader->contigs_.size()) {
    return false;
  }
  const genomics::v1::ContigInfo& contig = fasta_reader->contigs_.at(pos_);
  const string& reference_name = contig.name();
  out->first = reference_name;
  out->second =
      fasta_reader->GetBases(MakeRange(reference_name, 0, contig.n_bases()))
          .ValueOrDie();
  pos_++;
  return true;
}

IndexedFastaReaderIterable::~IndexedFastaReaderIterable() {}

IndexedFastaReaderIterable::IndexedFastaReaderIterable(
    const IndexedFastaReader* reader)
    : Iterable(reader) {}

// ###########################################################################
//
// UnindexedFastaReader code
//
// ###########################################################################

namespace {

// Helper method to get the name in a header line. This function assumes the
// first character is '>'.
absl::string_view GetNameInHeaderLine(absl::string_view line) {
  DCHECK_LT(1, line.size()) << "name must contain more than >";
  size_t space_idx = line.find(' ');
  if (space_idx == string::npos) {
    // No space is found. The name is the entire string after >.
    space_idx = line.size();
  }
  return line.substr(1, space_idx - 1);
}

}  // namespace

// Iterable class for traversing all Fasta records in the file.
class UnindexedFastaReaderIterable : public GenomeReferenceRecordIterable {
 public:
  // Advance to the next record.
  StatusOr<bool> Next(GenomeReferenceRecord* out) override;

  // Constructor is invoked via UnindexedFastaReader::Iterate.
  UnindexedFastaReaderIterable(const UnindexedFastaReader* reader);
  ~UnindexedFastaReaderIterable() override;

 private:
  // If non-empty, contains the name/id in the header line of the next record.
  std::string next_name_;
};

StatusOr<std::unique_ptr<UnindexedFastaReader>> UnindexedFastaReader::FromFile(
    const string& fasta_path) {
  StatusOr<std::unique_ptr<TextReader>> textreader_or =
      TextReader::FromFile(fasta_path);
  NUCLEUS_RETURN_IF_ERROR(textreader_or.status());
  return std::unique_ptr<UnindexedFastaReader>(
      new UnindexedFastaReader(std::move(textreader_or.ValueOrDie())));
}

UnindexedFastaReader::~UnindexedFastaReader() {}

const std::vector<nucleus::genomics::v1::ContigInfo>&
UnindexedFastaReader::Contigs() const {
  LOG(FATAL) << "Unimplemented function invoked : " << __func__;
  return contigs_;
}

StatusOr<string> UnindexedFastaReader::GetBases(const Range& range) const {
  LOG(FATAL) << "Unimplemented function invoked : " << __func__;
  return ::nucleus::Unimplemented("");
}

StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>>
UnindexedFastaReader::Iterate() const {
  return StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>>(
      MakeIterable<UnindexedFastaReaderIterable>(this));
}

::nucleus::Status UnindexedFastaReader::Close() {
  if (!text_reader_) {
    return ::nucleus::FailedPrecondition("UnindexedFastaReader already closed");
  }
  // Close the file pointer.
  ::nucleus::Status close_status = text_reader_->Close();
  text_reader_ = nullptr;
  return close_status;
}

UnindexedFastaReader::UnindexedFastaReader(
    std::unique_ptr<TextReader> text_reader)
    : text_reader_(std::move(text_reader)) {}

StatusOr<bool> UnindexedFastaReaderIterable::Next(GenomeReferenceRecord* out) {
  NUCLEUS_RETURN_IF_ERROR(CheckIsAlive());
  DCHECK(out && out->first.empty() && out->second.empty())
      << "out must be default initialized";

  const UnindexedFastaReader* fasta_reader =
      static_cast<const UnindexedFastaReader*>(reader_);
  if (!fasta_reader->text_reader_) {
    return ::nucleus::FailedPrecondition(
        "Cannot iterate a closed UnindexedFastaReader.");
  }
  if (!next_name_.empty()) {
    out->first = next_name_;
    next_name_.clear();
  }
  bool eof = false;
  while (true) {
    // Read one line.
    StatusOr<string> line = fasta_reader->text_reader_->ReadLine();
    if (!line.ok()) {
      if (::nucleus::IsOutOfRange(line.status())) {
        eof = true;
        break;
      }
      return ::nucleus::DataLoss("Failed to parse FASTA");
    }
    std::string l = line.ValueOrDie();

    if (l.empty()) continue;
    // Check if the line is a header or a sequence.
    if (l.at(0) == '>') {
      absl::string_view parsed_name = GetNameInHeaderLine(l);
      if (out->first.empty()) {
        out->first = string(parsed_name);
        continue;
      }
      next_name_ = string(parsed_name);
      return true;
    }
    // Processing a sequence line. If name is absent by now, return an error.
    if (out->first.empty()) {
      return ::nucleus::DataLoss("Name not found in FASTA");
    }
    out->second.append(
        absl::AsciiStrToUpper(absl::StripTrailingAsciiWhitespace(l)));
  }
  if (eof && out->first.empty()) {
    // No more records.
    return false;
  }
  return true;
}

UnindexedFastaReaderIterable::~UnindexedFastaReaderIterable() {}

UnindexedFastaReaderIterable::UnindexedFastaReaderIterable(
    const UnindexedFastaReader* reader)
    : Iterable(reader) {}

// ###########################################################################
//
// InMemoryFastaReader code
//
// ###########################################################################

// Iterable class for traversing all Fasta records in the file.
class FastaFullFileIterable : public GenomeReferenceRecordIterable {
 public:
  // Advance to the next record.
  StatusOr<bool> Next(GenomeReferenceRecord* out) override;

  // Constructor is invoked via InMemoryFastaReader::Iterate.
  FastaFullFileIterable(const InMemoryFastaReader* reader);
  ~FastaFullFileIterable() override;

 private:
  size_t pos_ = 0;
};

// Initializes an InMemoryFastaReader from contigs and seqs.
//
// contigs is a vector describing the "contigs" of this GenomeReference. These
// should include only the contigs present in seqs. A ContigInfo object for a
// contig `chrom` should describe the entire chromosome `chrom` even if the
// corresponding ReferenceSequence only contains a subset of the bases.
//
// seqs is a vector where each element describes a region of the genome we are
// caching in memory and will use to provide bases in the query() operation.
//
// Note that only a single ReferenceSequence for each contig is currently
// supported.
//
// There should be exactly one ContigInfo for each reference_name referred to
// across all ReferenceSequences, and no extra ContigInfos.
StatusOr<std::unique_ptr<InMemoryFastaReader>> InMemoryFastaReader::Create(
    const std::vector<nucleus::genomics::v1::ContigInfo>& contigs,
    const std::vector<nucleus::genomics::v1::ReferenceSequence>& seqs) {
  std::unordered_map<string, nucleus::genomics::v1::ReferenceSequence> seqs_map;

  for (const auto& seq : seqs) {
    if (seq.region().reference_name().empty() || seq.region().start() < 0 ||
        seq.region().start() > seq.region().end()) {
      return ::nucleus::InvalidArgument(
          absl::StrCat("Malformed region ", seq.region().ShortDebugString()));
    }

    const size_t region_len = seq.region().end() - seq.region().start();
    if (region_len != seq.bases().length()) {
      return ::nucleus::InvalidArgument(
          absl::StrCat("Region size = ", region_len,
                       " not equal to bases.length() ", seq.bases().length()));
    }

    auto insert_result = seqs_map.emplace(seq.region().reference_name(), seq);
    if (!insert_result.second) {
      return ::nucleus::InvalidArgument(absl::StrCat(
          "Each ReferenceSequence must be on a different chromosome but "
          "multiple ones were found on ",
          seq.region().reference_name()));
    }
  }

  return std::unique_ptr<InMemoryFastaReader>(
      new InMemoryFastaReader(contigs, seqs_map));
}

StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>>
InMemoryFastaReader::Iterate() const {
  return StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>>(
      MakeIterable<FastaFullFileIterable>(this));
}

StatusOr<string> InMemoryFastaReader::GetBases(const Range& range) const {
  if (!IsValidInterval(range))
    return ::nucleus::InvalidArgument(
        absl::StrCat("Invalid interval: ", range.ShortDebugString()));

  const ReferenceSequence& seq = seqs_.at(range.reference_name());

  if (range.start() < seq.region().start() ||
      range.end() > seq.region().end()) {
    return ::nucleus::InvalidArgument(absl::StrCat(
        "Cannot query range=", range.ShortDebugString(),
        " as this InMemoryFastaReader only has bases in the interval=",
        seq.region().ShortDebugString()));
  }
  const int64 pos = range.start() - seq.region().start();
  const int64 len = range.end() - range.start();
  return seq.bases().substr(pos, len);
}

StatusOr<bool> FastaFullFileIterable::Next(GenomeReferenceRecord* out) {
  NUCLEUS_RETURN_IF_ERROR(CheckIsAlive());
  const InMemoryFastaReader* fasta_reader =
      static_cast<const InMemoryFastaReader*>(reader_);
  if (pos_ >= fasta_reader->contigs_.size()) {
    return false;
  }
  const string& reference_name = fasta_reader->contigs_.at(pos_).name();
  auto seq_iter = fasta_reader->seqs_.find(reference_name);
  if (seq_iter == fasta_reader->seqs_.end()) {
    return false;
  }
  DCHECK_NE(nullptr, out) << "FASTA record cannot be null";
  out->first = reference_name;
  out->second = seq_iter->second.bases();
  pos_++;
  return true;
}

FastaFullFileIterable::~FastaFullFileIterable() {}

FastaFullFileIterable::FastaFullFileIterable(const InMemoryFastaReader* reader)
    : Iterable(reader) {}

}  // namespace nucleus