[9b26b7]: / deepvariant / direct_phasing.h

Download this file

337 lines (277 with data), 13.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
/*
* Copyright 2021 Google LLC.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LEARNING_GENOMICS_DEEPVARIANT_DIRECT_PHASING_H_
#define LEARNING_GENOMICS_DEEPVARIANT_DIRECT_PHASING_H_
#ifndef FRIEND_TEST
#define FRIEND_TEST(test_case_name, test_name)\
friend class test_case_name##_##test_name##_Test
#endif
#include <string>
#include <string_view>
#include <vector>
#include "deepvariant/protos/deepvariant.pb.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"
#include "absl/strings/string_view.h"
#include "boost/graph/adjacency_list.hpp"
#include "boost/graph/graph_traits.hpp"
#include "third_party/nucleus/protos/reads.pb.h"
#include "third_party/nucleus/protos/variants.pb.h"
#include "third_party/nucleus/util/proto_ptr.h"
#include "third_party/nucleus/core/statusor.h"
namespace learning {
namespace genomics {
namespace deepvariant {
inline constexpr absl::string_view kUncalledAllele = "UNCALLED_ALLELE";
// TODO Add an overflow check when read indices are generated.
using ReadIndex = uint16_t;
// Data type storing read id and mapping quality. It is used in AlleleInfo.
struct ReadSupportInfo {
ReadIndex read_index;
bool is_low_quality;
bool is_first_allele;
bool operator==(const ReadSupportInfo& rs) const {
return rs.read_index == read_index && rs.is_low_quality == is_low_quality;
}
};
// Data type associated with graph nodes. It uniquely defines an allele by its
// type and bases along with the vector of supporting read ids.
struct AlleleInfo {
AlleleType type;
int64_t position = 0;
std::string bases = "";
int phase = 0;
std::vector<ReadSupportInfo> read_support;
};
// Data structure to hold variant phases. It is only used for reporting.
struct PhasedVariant {
int64_t position;
std::string phase_1_bases;
std::string phase_2_bases;
};
inline bool operator==(const AlleleInfo& lhs, const AlleleInfo& rhs) {
return lhs.type == rhs.type && lhs.position == rhs.position &&
lhs.bases == rhs.bases && lhs.read_support == rhs.read_support;
}
// Class that implements Direct Phasing algorithm. This class is only used by
// make_examples.py. There are two exported methods:
// * PhaseReads - called for each region and returns read phases calculated from
// candidates.
// * GraphViz - auxiliary methods to create graphviz output for debugging
// purposes.
class DirectPhasing {
public:
struct VertexInfo {
AlleleInfo allele_info;
};
struct EdgeInfo {
float weight;
};
using BoostGraph =
boost::adjacency_list<boost::setS, // Out edge list type.
boost::listS, // Vertex list type.
boost::bidirectionalS, // Directed graph.
VertexInfo, // Vertex label.
EdgeInfo>; // Edge label.
using Vertex = boost::graph_traits<BoostGraph>::vertex_descriptor;
using Edge = boost::graph_traits<BoostGraph>::edge_descriptor;
using RawVertexIndexMap = absl::flat_hash_map<Vertex, int>;
using VertexIndexMap =
boost::const_associative_property_map<RawVertexIndexMap>;
using VertexIterator = boost::graph_traits<BoostGraph>::vertex_iterator;
using EdgeIterator = boost::graph_traits<BoostGraph>::edge_iterator;
using InEdgeIterator = boost::graph_traits<BoostGraph>::in_edge_iterator;
using AdjacencyIterator = boost::graph_traits<BoostGraph>::adjacency_iterator;
struct AlleleSupport {
bool is_set = false;
Vertex vertex;
ReadSupportInfo read_support;
};
struct VertexPair {
Vertex phase_1_vertex;
Vertex phase_2_vertex;
template <typename H>
friend H AbslHashValue(H h, const VertexPair& vertex_pair) {
return H::combine(std::move(h), vertex_pair.phase_1_vertex,
vertex_pair.phase_2_vertex);
}
bool operator==(const VertexPair& vert_pair) const {
return this->phase_1_vertex == vert_pair.phase_1_vertex &&
this->phase_2_vertex == vert_pair.phase_2_vertex;
}
};
struct Score {
int score = 0;
// Source vertices are needed for back tracking.
Vertex from[2]; // Phase 1: Vertex[0], Phase 2: Vertex[1].
absl::flat_hash_set<ReadIndex> read_support[2]; // Read support for phase 1
// and phase 2.
};
// Function returns read phases for each read in the input reads preserving
// the order. Python wrapper will be used to add phases to read protos in
// order to avoid copying gigabytes of memory.
nucleus::StatusOr<std::vector<int>> PhaseReads(
const std::vector<DeepVariantCall>& candidates,
const std::vector<
nucleus::ConstProtoPtr<const nucleus::genomics::v1::Read>>& reads);
// PhaseReads wrapper for Python clif interface.
// TODO In other similar wrappers ConstProtoPtr is unwrapped in the
// wrapper. See if it makes sense to do here as well.
std::vector<int> PhaseReadsPython(
const std::vector<DeepVariantCall>& candidates,
const std::vector<
nucleus::ConstProtoPtr<const nucleus::genomics::v1::Read>>& reads) {
return PhaseReads(candidates, reads).ValueOrDie();
}
// Helper function to output graph into graphviz for debugging. This function
// is exported to Python.
std::string GraphViz() const;
// Returns variant phases. It is only used for reporting purposes.
std::vector<PhasedVariant> GetPhasedVariants() const;
private:
// Dynamic score for the partition. This score defines the best phasing up to
// a certain position.
// Convert Read protos to ReadSupportInfo, filtering low quality reads.
std::vector<ReadSupportInfo> ReadSupportFromProto(
const google::protobuf::RepeatedPtrField<DeepVariantCall_ReadSupport>& read_support)
const;
// Build graph from candidates.
void Build(
const std::vector<DeepVariantCall>& candidates,
const std::vector<
nucleus::ConstProtoPtr<const nucleus::genomics::v1::Read>>& reads);
void Build(const std::vector<DeepVariantCall>& candidates,
const std::vector<nucleus::genomics::v1::Read>& reads);
// Add nodes to the graph for each allele of the candidate. Fill auxiliary
// data structures.
void AddCandidate(const DeepVariantCall& candidate);
// Initializes all members of the class.
void Clear();
void InitializeReadMaps(
const std::vector<
nucleus::ConstProtoPtr<const nucleus::genomics::v1::Read>>& reads);
Vertex AddVertex(
int64_t position, AlleleType allele_type, absl::string_view bases,
const google::protobuf::RepeatedPtrField<DeepVariantCall_ReadSupport>& reads);
// Add edge to the graph using the provided weight.
Edge AddEdge(const Vertex& in_vertex, const Vertex& out_vertex, float weight);
// Add edge to the graph. The weight is calculated from read support for
// starting and ending vertices.
Edge AddEdge(const Vertex& in_vertex, bool is_low_quality_in,
const Vertex& out_vertex, bool is_low_quality_out);
void Prune();
void RebuildIndexMap();
// Update internal structures. It is assumed that this function is called
// once and only once for every vertex.
void UpdateReadToAllelesMap(const Vertex& v);
// Find all reads supporting starting_score partition and <vertex>.
// Reads that start at <vertex> are also counted.
absl::flat_hash_set<ReadIndex> FindSupportingReads(
const Vertex& vertex, const Score& starting_score, int phase) const;
// Calculate phasing score for pair of vertices that end <edge1> and <edge2>
// The score is calculated by adding a number of reads that support this path
// to the preceding score.
Score CalculateScore(const Edge& edge1, const Edge& edge2) const;
// Calculate phasing score for all pairs of verts when there are no incoming
// edges to any of the vers.
void UpdateStartingScore(const std::vector<Vertex>& verts);
// Assign phase to vertices. Some vertices will stay unassigned.
// Scores are assigned starting from the last position following the best
// score path.
void AssignPhasesToVertices();
// Assign phase to each read. Return a vector containing phases (0, 1, 2)
// for each read in the same order as input <reads>. Read objects are large
// therefore phases are returned in a separate vector instead of modifying
// input <reads>.
std::vector<int> AssignPhasesToReads(
const std::vector<
nucleus::ConstProtoPtr<const nucleus::genomics::v1::Read>>& reads)
const;
bool CompareVertexPairByBases(const Vertex& v1_1, const Vertex& v1_2,
const Vertex& v2_1, const Vertex& v2_2) const;
private:
BoostGraph graph_;
Vertex source_;
Vertex sink_;
RawVertexIndexMap vertex_index_map_; // This is needed for GraphViz.
absl::flat_hash_set<int> hom_positions_;
// Ordered candidate positions
std::vector<int> positions_;
// Helper structure to keep track of vertices at each position.
// Key is a position and value is a vector of vertices at this position.
absl::flat_hash_map<int, std::vector<Vertex>> vertices_by_position_;
// Pair of vertices define a partition (phasing) for a candidate.
// scores_ allows to keep track of the current best score for each partition.
absl::flat_hash_map<VertexPair, Score> scores_;
// Allele support for each read. Map is keyed by read id. Alleles are sorted
// by position. This map allows to quickly query all alleles that a read
// supports. Boolean variable designates if read to allele support is
// low_quality. If true then read supports the allele with low quality.
absl::flat_hash_map<ReadIndex, std::vector<AlleleSupport>> read_to_alleles_;
// Map read name to read id.
absl::flat_hash_map<std::string, ReadIndex> read_to_index_;
// Graph Vizualization
VertexIndexMap IndexMap() const;
// Unit test helper functions.
struct ReadFields {
std::string read_name;
ReadIndex read_index;
};
void PopulateReadsTest(const std::vector<ReadFields>& reads) {
for (const auto& read : reads) {
read_to_index_.insert(std::pair(read.read_name, read.read_index));
}
}
FRIEND_TEST(DirectPhasingTest, ReadSupportFromProtoSimple);
FRIEND_TEST(DirectPhasingTest, ReadSupportFromProtoLQReads);
FRIEND_TEST(DirectPhasingTest, BuildGraphSimple);
FRIEND_TEST(DirectPhasingTest, CalculateScoreFirstIteration);
FRIEND_TEST(DirectPhasingTest, CalculateScoreWithPreviousScore);
FRIEND_TEST(DirectPhasingTest, PhaseReadBrokenPath);
FRIEND_TEST(DirectPhasingTest, FilterOneAlleleCandidate);
FRIEND_TEST(DirectPhasingTest, FilterCandidateWithIndel);
};
// Helper functions.
// Calculate AlleleType by comparing alt allele size and candidate interval.
AlleleType AlleleTypeFromCandidate(std::string_view bases,
const DeepVariantCall& candidate);
// Calculate number of alt alleles that are SUBs.
int NumOfSubstitutionAlleles(const DeepVariantCall& candidate);
// Calculate number of alt alleles that are INDELs.
int NumOfIndelAlleles(const DeepVariantCall& candidate);
// Calculate the depth of all SUB alt alleles. This is done by enumerating all
// supporting reads for all SUB alleles.
int SubstitutionAllelesDepth(const DeepVariantCall& candidate);
} // namespace deepvariant
} // namespace genomics
} // namespace learning
#endif // LEARNING_GENOMICS_DEEPVARIANT_DIRECT_PHASING_H_