[9b26b7]: / third_party / nucleus / io / reference.h

Download this file

378 lines (320 with data), 15.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
/*
* Copyright 2018 Google LLC.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
// Get basic information about a reference genome as well as make it cpu and
// memory efficient and scalable to get the reference bases for an interval on
// the genome.
//
// The GenomeReference provides the core functionality needed to use a reference
// genome for data processing and analyses tools:
//
// -- Get information about the contigs (aka chromosomes) present the FASTA,
// such as its name, description, and number of basepairs.
// -- Efficiently lookup the sequence of bases in an interval in the reference
// genome. For example, GetBases("chr1", 0, 10) gets the basepair sequence
// from the first base to the ninth base on chr1. This function call has
// cost roughly proportional to the size of the query interval, regardless
// of its position in the original FASTA file.
//
// The code here makes some strong assumptions about what a client could want.
// It doesn't record the position of bases in the original FASTA, and it doesn't
// track line breaks, comments, and other features of the FASTA. It uppercases
// the basepair sequences, so complexity or other information encoded in the
// case of the bases is lost. It also ensures that all of the bases in the
// reference are either {A,C,G,T,N} by refusing to import reference sequences
// containing other characters. The code assumes that random accesses of
// reasonably small chunks of sequence is important at the expense of low-cost
// (but still reasonably efficient) iteration of all sequences in the FASTA.
#ifndef THIRD_PARTY_NUCLEUS_IO_REFERENCE_H_
#define THIRD_PARTY_NUCLEUS_IO_REFERENCE_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "absl/types/optional.h"
#include "htslib/faidx.h"
#include "third_party/nucleus/io/reader_base.h"
#include "third_party/nucleus/io/text_reader.h"
#include "third_party/nucleus/platform/types.h"
#include "third_party/nucleus/protos/fasta.pb.h"
#include "third_party/nucleus/protos/range.pb.h"
#include "third_party/nucleus/protos/reference.pb.h"
#include "third_party/nucleus/core/status.h"
#include "third_party/nucleus/core/statusor.h"
namespace nucleus {
constexpr int INDEXED_FASTA_READER_DEFAULT_CACHE_SIZE = 64 * 1024;
// Alias for the abstract base class for FASTA record iterables, which
// corresponds to (name, sequence) pairs.
using GenomeReferenceRecord = std::pair<string, string>;
using GenomeReferenceRecordIterable = Iterable<GenomeReferenceRecord>;
class GenomeReference : public Reader {
public:
GenomeReference(const GenomeReference&) = delete;
GenomeReference& operator=(const GenomeReference&) = delete;
virtual ~GenomeReference() {}
virtual const std::vector<nucleus::genomics::v1::ContigInfo>& Contigs()
const = 0;
// Returns true if reference has a contig named contig_name.
virtual bool HasContig(const string& contig_name) const;
// Gets a vector of the contig names in this reference, in the order that they
// appeared in the original FASTA file. For example, would return:
//
// {"chrM", "chr1", "chr2"}
//
// for a reference with three contigs named chrM, chr1, and chr2.
virtual std::vector<string> ContigNames() const;
// Gets the metadata about a contig in the fasta, such as its name,
// description, length, etc. If contig_name isn't found in this reference,
// returns a value whose status is not ok().
virtual StatusOr<const nucleus::genomics::v1::ContigInfo*> Contig(
const string& contig_name) const;
// Gets the basepairs in the FASTA file from Range range.
//
// This follows the Range convention of getting bases from start
// (inclusive) to end (exclusive), both 0-based. That is,
// GetBases(Range("chr1", 2, 4)) gets a string starting with the *3rd* base on
// chr1 and extending through the *4th* base (excluding 4 as an offset).
// If chr isn't present in this reference, start is invalid or end is beyond
// the length of chr, returns a value whose status is not ok().
virtual StatusOr<string> GetBases(
const nucleus::genomics::v1::Range& range) const = 0;
// Gets all of the FASTA records in this file in order.
//
// The specific parsing, filtering, etc behavior is determined by the options
// provided during construction. Returns an OK status if the iterable can be
// constructed, or not OK otherwise.
virtual StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>> Iterate()
const = 0;
// Returns true iff the Range chr:start-end is a valid interval on chr and chr
// is a known contig in this reference.
bool IsValidInterval(const nucleus::genomics::v1::Range& range) const;
// Close the underlying resource descriptors.
virtual ::nucleus::Status Close() { return ::nucleus::Status(); }
// This no-op function is needed only for Python context manager support. Do
// not use it!
virtual ::nucleus::Status PythonEnter() const { return ::nucleus::Status(); }
protected:
// Default constructor for subclasses.
GenomeReference() {}
};
// A FASTA reader backed by a htslib FAI index.
//
// FASTA files store information about DNA/RNA/Amino Acid sequences:
//
// https://en.wikipedia.org/wiki/FASTA_format
//
// This reader is specialized for the FASTA variant used in NGS analyses, which
// has a FAI index created by samtools that allows efficient query() operations
// to obtain the subsequence of the FASTA on a specific contig between a start
// and end offsets:
//
// http://www.htslib.org/doc/faidx.html
// http://www.htslib.org/doc/samtools.html [faidx section]
//
// The FASTA file can be optionally block-gzipped compressed.
//
// This class provides methods to iterate through a the FASTA records and to
// also query() for the bases spanning a specific region on the genome.
//
// Uses the htslib C API for reading the FASTA and FAI. For details of
// the API, see:
//
// https://github.com/samtools/htslib/tree/develop/htslib
//
// The objects returned by iterate() or query() are strings containing the
// bases, all upper-cased.
class IndexedFastaReader : public GenomeReference {
public:
// Creates a new GenomeReference backed by the FASTA file fasta_path.
//
// Returns this newly allocated IndexedFastaReader object, passing ownership
// to the caller via a unique_ptr.
//
// htslib currently assumes that the FAI file is named fasta_path + '.fai',
// so that file must exist and be readable by htslib.
//
// We maintain a single entry cache of the bases from the last FASTA fetch, to
// reduce the number of file reads, which can be quite costly for remote
// filesystems. 64K is the default block size for htslib faidx fetches, so
// there is no penalty to rounding up all small access sizes to 64K. The
// cache can be disabled using `cache_size=0`.
static StatusOr<std::unique_ptr<IndexedFastaReader>> FromFile(
const string& fasta_path, const string& fai_path,
const nucleus::genomics::v1::FastaReaderOptions& options,
int cache_size_bases = INDEXED_FASTA_READER_DEFAULT_CACHE_SIZE);
static StatusOr<std::unique_ptr<IndexedFastaReader>> FromFile(
const string& fasta_path, const string& fai_path,
int cache_size_bases = INDEXED_FASTA_READER_DEFAULT_CACHE_SIZE);
~IndexedFastaReader();
// Disable copy and assignment operations
IndexedFastaReader(const IndexedFastaReader& other) = delete;
IndexedFastaReader& operator=(const IndexedFastaReader&) = delete;
const std::vector<nucleus::genomics::v1::ContigInfo>& Contigs()
const override {
return contigs_;
}
StatusOr<string> GetBases(
const nucleus::genomics::v1::Range& range) const override;
// Get the options controlling the behavior of this FastaReader.
const nucleus::genomics::v1::FastaReaderOptions& Options() const {
return options_;
}
StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>> Iterate()
const override;
// Close the underlying resource descriptors.
::nucleus::Status Close() override;
private:
// Allow iteration to access the underlying reader.
friend class IndexedFastaReaderIterable;
// Must use one of the static factory methods.
IndexedFastaReader(const string& fasta_path, faidx_t* faidx,
const nucleus::genomics::v1::FastaReaderOptions& options,
int cache_size_bases);
// Path to the FASTA file containing our genomic bases.
const string fasta_path_;
// Initialized htslib faidx_t index for our fasta. Effectively constant during
// the life of this object, but htslib API doesn't allow us to mark this as
// const.
faidx_t* faidx_;
// The options controlling the behavior of this FastaReader.
const nucleus::genomics::v1::FastaReaderOptions options_;
// A list of ContigInfo, each of which contains the information about the
// contigs used by this BAM file.
const std::vector<nucleus::genomics::v1::ContigInfo> contigs_;
// Size, in bases, of the read cache.
const int cache_size_bases_;
// Cache of the last "small" read from the FASTA (<= kFastaCacheSize).
mutable string small_read_cache_;
// The range that is held in the cache, or "empty" if there is no range cached
// yet. Range must be <= kFastaCacheSize in length.
mutable absl::optional<nucleus::genomics::v1::Range> cached_range_;
};
// A FASTA reader that is not backed by a htslib FAI index.
//
// FASTA files store information about DNA/RNA/Amino Acid sequences:
//
// https://en.wikipedia.org/wiki/FASTA_format
//
// This reader is for FASTA files that contain many small records and are
// explicitly not indexed. The FASTA files can be optionally block-gzipped
// compressed.
//
// This class provides methods to iterate through a the FASTA records but
// doesn't support query() for the bases spanning a specific region on the
// genome.
//
// The (name, bases) tuple returned by iterate() are strings containing the
// bases in uppercase.
class UnindexedFastaReader : public GenomeReference {
public:
// Creates a new GenomeReference backed by the FASTA file fasta_path.
//
// Returns this newly allocated UnindexedFastaReader object, passing ownership
// to the caller via a unique_ptr.
static StatusOr<std::unique_ptr<UnindexedFastaReader>> FromFile(
const string& fasta_path);
~UnindexedFastaReader();
// Disable copy and assignment operations
UnindexedFastaReader(const UnindexedFastaReader& other) = delete;
UnindexedFastaReader& operator=(const UnindexedFastaReader&) = delete;
const std::vector<nucleus::genomics::v1::ContigInfo>& Contigs()
const override;
StatusOr<string> GetBases(
const nucleus::genomics::v1::Range& range) const override;
StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>> Iterate()
const override;
// Close the underlying resource descriptors.
::nucleus::Status Close() override;
private:
// Allow iteration to access the underlying reader.
friend class UnindexedFastaReaderIterable;
// Must use one of the static factory methods.
UnindexedFastaReader(std::unique_ptr<TextReader> text_reader);
const std::vector<nucleus::genomics::v1::ContigInfo> contigs_;
// Underlying file reader.
std::unique_ptr<TextReader> text_reader_;
};
// An FASTA reader backed by in-memory ReferenceSequence protos.
//
// FASTA files store information about DNA/RNA/Amino Acid sequences:
//
// https://en.wikipedia.org/wiki/FASTA_format
//
//
// An InMemoryFastaReader provides the same API as GenomeReferenceFAI but
// doesn't fetch its data from an on-disk FASTA file but rather fetches the
// bases from an in-memory cache containing ReferenceSequence protos.
//
// In particular the GetBases(Range(chrom, start, end)) operation fetches bases
// from the tuple where chrom == chromosome, and then from the bases where the
// first base of bases starts at start. If start > 0, then the bases string is
// assumed to contain bases starting from that position in the region. For
// example, the record ('1', 10, 'ACGT') implies that
// GetBases(ranges.make_range('1', 11, 12)) will return the base 'C', as the 'A'
// base is at position 10. This makes it straightforward to cache a small region
// of a full chromosome without having to store the entire chromosome sequence
// in memory (potentially big!).
class InMemoryFastaReader : public GenomeReference {
public:
// Creates a new InMemoryFastaReader backed by ReferenceSequence protos.
static StatusOr<std::unique_ptr<InMemoryFastaReader>> Create(
const std::vector<nucleus::genomics::v1::ContigInfo>& contigs,
const std::vector<nucleus::genomics::v1::ReferenceSequence>& seqs);
// Disable copy and assignment operations
InMemoryFastaReader(const InMemoryFastaReader& other) = delete;
InMemoryFastaReader& operator=(const InMemoryFastaReader&) = delete;
const std::vector<nucleus::genomics::v1::ContigInfo>& Contigs()
const override {
return contigs_;
}
const std::unordered_map<string, nucleus::genomics::v1::ReferenceSequence>&
ReferenceSequences() const {
return seqs_;
}
StatusOr<string> GetBases(
const nucleus::genomics::v1::Range& range) const override;
StatusOr<std::shared_ptr<GenomeReferenceRecordIterable>> Iterate()
const override;
private:
// Allow iteration to access the underlying reader.
friend class FastaFullFileIterable;
// Must use one of the static factory methods.
InMemoryFastaReader(
const std::vector<nucleus::genomics::v1::ContigInfo>& contigs,
std::unordered_map<string, nucleus::genomics::v1::ReferenceSequence> seqs)
: contigs_(contigs), seqs_(seqs) {}
const std::vector<nucleus::genomics::v1::ContigInfo> contigs_;
const std::unordered_map<string, nucleus::genomics::v1::ReferenceSequence>
seqs_;
};
} // namespace nucleus
#endif // THIRD_PARTY_NUCLEUS_IO_REFERENCE_H_