--- a +++ b/third_party/nucleus/util/utils.h @@ -0,0 +1,330 @@ +/* + * Copyright 2018 Google LLC. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +// Core utilities functions used throughout deepvariant. +#ifndef THIRD_PARTY_NUCLEUS_UTIL_UTILS_H_ +#define THIRD_PARTY_NUCLEUS_UTIL_UTILS_H_ + +// TODO: cleanup deps. +#include <map> +#include <type_traits> +#include <vector> + +#include "absl/strings/string_view.h" +#include "third_party/nucleus/protos/position.pb.h" +#include "third_party/nucleus/protos/range.pb.h" +#include "third_party/nucleus/protos/reads.pb.h" +#include "third_party/nucleus/protos/reference.pb.h" +#include "third_party/nucleus/protos/struct.pb.h" +#include "third_party/nucleus/protos/variants.pb.h" +#include "third_party/nucleus/platform/types.h" +#include "third_party/nucleus/util/proto_ptr.h" + +namespace nucleus { + +// Enum specifying a group of canonical DNA sequence bases. +enum class CanonicalBases { + // Only allows ACGT bases. + ACGT, + // Allows ACGT bases as well as the somewhat standard N base. + ACGTN, +}; + +// Returns true if all character in bases are one of canonical bases specified +// in the CanonicalBases canon parameter. If bad_position is not nullptr, we +// will write the position of the first bad base in bases here. If bad_position +// is nullptr nothing will be written to this location. +// +// bases must not be the empty string. +bool AreCanonicalBases(absl::string_view bases, + CanonicalBases canon = CanonicalBases::ACGT, + size_t* bad_position = nullptr); + +// Returns true if base is one of canonical bases specified in the +// CanonicalBases canon parameter. +bool IsCanonicalBase(const char base, + CanonicalBases canon = CanonicalBases::ACGT); + +// Creates a Position proto from chr and pos. +nucleus::genomics::v1::Position MakePosition(absl::string_view chr, int64 pos, + const bool reverse_strand = false); + +// Creates a Position proto from reference_name and start position of Variant. +nucleus::genomics::v1::Position MakePosition( + const nucleus::genomics::v1::Variant& variant); + +// Creates a Range proto from chr, start, and end arguments. +nucleus::genomics::v1::Range MakeRange(absl::string_view chr, int64 start, + int64 end); + +// Creates a Range proto from the reference_name, start, and end of Variant. +nucleus::genomics::v1::Range MakeRange( + const nucleus::genomics::v1::Variant& variant); + +// Creates a Range proto from the alignment of Read. +nucleus::genomics::v1::Range MakeRange( + const nucleus::genomics::v1::Read& read); + +// Returns true iff range `needle` is wholly contained in `haystack`. +bool RangeContains(const nucleus::genomics::v1::Range& haystack, + const nucleus::genomics::v1::Range& needle); + +// Returns true iff any of the given `ranges` contains the `variant` by start +// position only. +bool RangesContainVariant( + const std::vector<nucleus::genomics::v1::Range>& ranges, + const nucleus::genomics::v1::Variant& variant); + +// Creates an interval string from its arguments, like chr:start-end. +string MakeIntervalStr(absl::string_view chr, int64 start, int64 end, + bool base_zero = true); + +// Makes an interval string from a Position proto. +string MakeIntervalStr(const nucleus::genomics::v1::Position& position); + +// Makes an interval string from a Range interval. +string MakeIntervalStr(const nucleus::genomics::v1::Range& interval); + +// Compares pos1 and pos2, lexicographically by reference_name then by position. +int ComparePositions(const nucleus::genomics::v1::Position& pos1, + const nucleus::genomics::v1::Position& pos2); + +// Compares the positions of two Variants via ComparePositions(), so the +// comparison is done on the reference name and start position only. +int ComparePositions(const nucleus::genomics::v1::Variant& variant1, + const nucleus::genomics::v1::Variant& variant2); + +// Returns the contig name to which this read is aligned. Returns empty string +// if the read is unaligned. +string AlignedContig(const nucleus::genomics::v1::Read& read); + +// Get the starting position of read, which is the first base covered +// by cigar operations of read. This is very fast since the start +// is encoded in the read proto. +int64 ReadStart(const nucleus::genomics::v1::Read& read); + +// Gets the end position of the, which is the index of the last base on the +// genome covered by cigar operations in the read. Note this means that the +// end is INCLUSIVE, not exclusive, as many range operations are. Note that +// this operation is substantially more expensive than ReadStart as the +// end must be computed by examining the cigar elements of Read. Implements +// getReferenceLength (excludes padding) as found at: +// http://grepcode.com/file/repo1.maven.org/maven2/org.seqdoop/htsjdk/1.118/htsjdk/samtools/Cigar.java#Cigar.getReferenceLength%28%29 +int64 ReadEnd(const nucleus::genomics::v1::Read& read); + +// Simple wrapper around ReadEnd that allows us to efficiently pass large +// protobufs in from Python. Simply unwraps the ConstProtoPtr read and calls +// ReadEnd(). +inline int64 ReadEndPython( + const nucleus::ConstProtoPtr<const ::nucleus::genomics::v1::Read>& + wrapped) { + return ReadEnd(*(wrapped.p_)); +} + +// Fills into the range information (reference_name, start, end) of +// range_wrapped with the alignment reference_name, start, and ReadEnd of read. +void ReadRangePython( + const nucleus::ConstProtoPtr<const ::nucleus::genomics::v1::Read>& + read_wrapped, + nucleus::EmptyProtoPtr<::nucleus::genomics::v1::Range> range_wrapped); + +// Fills into the range information (reference_name, start, end) of +// range_wrapped with the alignment reference_name, start, and ReadEnd of read. +// void ReadRangePython( +// const nucleus::ConstProtoPtr<const ::nucleus::genomics::v1::Read>& +// read_wrapped, +// nucleus::EmptyProtoPtr<::nucleus::genomics::v1::Range> range_wrapped); + +// Returns true if the alignment span of read overlaps with range. +bool ReadOverlapsRegion(const ::nucleus::genomics::v1::Read& read, + const ::nucleus::genomics::v1::Range& range); + +// Simple wrapper around ReadOverlapsRegion that allows us to efficiently pass +// large protobufs in from Python. Simply unwraps the ConstProtoPtr read and +// range ReadOverlapsRegion(). +inline bool ReadOverlapsRegionPython( + const nucleus::ConstProtoPtr<const ::nucleus::genomics::v1::Read>& + read_wrapped, + const nucleus::ConstProtoPtr<const ::nucleus::genomics::v1::Range> + range_wrapped) { + return ReadOverlapsRegion(*(read_wrapped.p_), *(range_wrapped.p_)); +} + +// Returns true if the read is properly placed. We define properly placed as +// read and mate both mapped to the same contig if mapped at all. This is less +// strict than the proper pair SAM flag. +bool IsReadProperlyPlaced(const nucleus::genomics::v1::Read& read); + +// Return a string_view that reflects removing quotation from the ends the +// input. (e.g. '"foo"' -> "foo"; '\'foo\'' -> 'foo') +// If the input string not quoted (on both sides, using the same quote mark), +// returns the input. +absl::string_view Unquote(absl::string_view input); + +// Creates a mapping from string to int for mapping of contig names to position +// in FASTA. This is used with the CompareVariants function. +std::map<string, int> MapContigNameToPosInFasta( + const std::vector<nucleus::genomics::v1::ContigInfo>& contigs); + +// Returns true if Variant `a` should appear before Variant `b`, +// false otherwise. +// Implements Compare to be used in std::sort. See: +// http://en.cppreference.com/w/cpp/concept/Compare +// "The return value of the function call operation applied to an object of " +// type Compare, when contextually converted to bool, yields true if the +// first argument of the call appears before the second in the strict weak +// ordering relation induced by this Compare type, and false otherwise." +bool CompareVariants(const nucleus::genomics::v1::Variant& a, + const nucleus::genomics::v1::Variant& b, + const std::map<string, int>& contig_name_to_pos_in_fasta); + +// Returns true if the string s ends with the string t. +bool EndsWith(const string& s, const string& t); + +// Templated convenience functions to set a value in a +// nucleus::genomics::v1::Value based on its C++ type. +// +// These are intended as helper functions for SetInfoField, but can be used +// standalone. +// Sets value to anything that set_string_value accepts as an argument type. +template <typename T> +void SetValuesValue(T value, nucleus::genomics::v1::Value* protobuf_value) { + protobuf_value->set_string_value(value); +} + +// Sets the value with set_bool_value for the bool C++ type. +template <typename T> +void SetValuesValue( + typename std::enable_if<std::is_same<T, bool>::value, T>::type value, + nucleus::genomics::v1::Value* protobuf_value) { + protobuf_value->set_bool_value(value); +} + +// Sets the value with set_int_value for any integral C++ type that is not bool. +template <typename T> +void SetValuesValue(typename std::enable_if<std::is_integral<T>::value && + !std::is_same<T, bool>::value, + T>::type value, + nucleus::genomics::v1::Value* protobuf_value) { + protobuf_value->set_int_value(value); +} + +// Sets the value with set_number_value for any floating point C++ type. +template <typename T> +void SetValuesValue( + typename std::enable_if<std::is_floating_point<T>::value, T>::type value, + nucleus::genomics::v1::Value* protobuf_value) { + protobuf_value->set_number_value(value); +} + +// Sets info field map[key] of proto to the vector of values. +// +// This function is heavily templated so that it's easy to use on both Variant +// and VariantCall protos to set the key of their info field maps (of type +// string => google.protobuf.ListValue) to a specific set of values. +// +// Example usages are: +// +// Variant variant; +// VariantCall call; +// SetInfoField("DP", 10, &variant); +// SetInfoField("AD", vector<int>{10, 20}, &call); +// SetInfoField("FLOAT_KEY", 1.234, &variant); +// SetInfoField("STRING_KEY", "a_string", &variant); +// +// Note that any existing binding for key will be overwritten. +template <typename ListValueInfoProto, typename Value> +void SetInfoField(const string& key, const std::vector<Value> values, + ListValueInfoProto* proto) { + (*proto->mutable_info())[key].clear_values(); + for (const Value& value : values) { + SetValuesValue<Value>(value, (*proto->mutable_info())[key].add_values()); + } +} + +template <typename ListValueInfoProto, typename Value> +void SetInfoField(const string& key, const Value value, + ListValueInfoProto* proto) { + SetInfoField(key, std::vector<Value>{value}, proto); +} + +// Given a ListValue proto, this function makes it easy to get the a vector of +// the number values or string values from that proto, in order. The function is +// templated so one get ints, floats, or strings as needed. Example usages: +// +// vector<int> ints = ListValues<int>(list_value_containing_ints); +// vector<float> floats = ListValues<float>(list_value_containing_floats); +// vector<string> strings = ListValues(list_value_containing_strings); +template <typename T> +std::vector<typename std::enable_if<std::is_same<T, bool>::value, T>::type> +ListValues(const nucleus::genomics::v1::ListValue& list_value) { + std::vector<T> values; + for (const auto& value : list_value.values()) { + values.push_back(value.bool_value()); + } + return values; +} + +template <typename T> +std::vector<typename std::enable_if< + std::is_integral<T>::value && !std::is_same<T, bool>::value, T>::type> +ListValues(const nucleus::genomics::v1::ListValue& list_value) { + std::vector<T> values; + for (const auto& value : list_value.values()) { + values.push_back(value.int_value()); + } + return values; +} + +template <typename T> +std::vector<typename std::enable_if<std::is_floating_point<T>::value, T>::type> +ListValues(const nucleus::genomics::v1::ListValue& list_value) { + std::vector<T> values; + for (const auto& value : list_value.values()) { + values.push_back(value.number_value()); + } + return values; +} + +template <typename T> +std::vector<typename std::enable_if<std::is_same<T, string>::value, T>::type> +ListValues(const nucleus::genomics::v1::ListValue& list_value) { + std::vector<string> values; + for (const auto& value : list_value.values()) { + values.push_back(value.string_value()); + } + return values; +} + +} // namespace nucleus + +#endif // THIRD_PARTY_NUCLEUS_UTIL_UTILS_H_