[9b26b7]: / third_party / nucleus / io / converter.py

Download this file

203 lines (158 with data), 6.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# Copyright 2018 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""A universal converter program for nucleus-supported genomics file formats.
Invoked with a single argument, this program will open a genomics data file and
iterate over its contents, doing no writing. This is a good benchmark for I/O
and reader processing speed.
Invoked with two arguments, the program will open the first file, read its
records, and write them, one at a time, to the second file. The filetypes for
the first and second filename must be compatible ways of encoding the same
nucleus genomics record type (for example, `infile.gff` and
`outfile.gff.tfrecord.gz` are compatible, but `infile.gff` and `outfile.bam` are
not).
Note: at present we have no convention for encoding a file *header* in
tfrecords, so conversion is not possible from tfrecord to any native file format
for which a header is compulsory.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import re
import sys
import time
from absl import app
from absl import logging
from third_party.nucleus.io import bed
from third_party.nucleus.io import fastq
from third_party.nucleus.io import gff
from third_party.nucleus.io import sam
from third_party.nucleus.io import vcf
def _is_native_file(filename):
"""Returns true if filename is a native (non-tfrecord) genomics data file."""
return not re.match(r".*\.tfrecord(\.gz)?", filename)
def _filename_pattern(ext):
"""Returns an re matching native or tfrecord files of format `ext`."""
return r".*\.{}(\.tfrecord)?(\.gz)?".format(ext)
_FileType = collections.namedtuple(
"_FileType", ("reader_class", "writer_class", "has_header"))
# Maps a filename regular expression (see _filename_pattern) to the _FileType
# describing how files with a matching name are read and written.
_FILETYPE_LOOKUP = {
    _filename_pattern("bed"): _FileType(
        reader_class=bed.BedReader,
        writer_class=bed.BedWriter,
        has_header=False),
    _filename_pattern("(fastq|fq)"): _FileType(
        reader_class=fastq.FastqReader,
        writer_class=fastq.FastqWriter,
        has_header=False),
    _filename_pattern("gff"): _FileType(
        reader_class=gff.GffReader,
        writer_class=gff.GffWriter,
        has_header=True),
    _filename_pattern("(bam|sam)"): _FileType(
        reader_class=sam.SamReader,
        writer_class=sam.SamWriter,
        has_header=True),
    _filename_pattern("vcf"): _FileType(
        reader_class=vcf.VcfReader,
        writer_class=vcf.VcfWriter,
        has_header=True),
}
def _lookup_filetype(filename):
  """Finds the _FileType registered for `filename`.

  Args:
    filename: str; the filename to classify.

  Returns:
    The value of the first entry in _FILETYPE_LOOKUP whose pattern matches
    `filename`.

  Raises:
    ConversionError: if no pattern in _FILETYPE_LOOKUP matches `filename`.
  """
  for pattern, filetype in _FILETYPE_LOOKUP.items():
    if re.match(pattern, filename) is not None:
      return filetype
  raise ConversionError("Unrecognized extension!")
# Interval passed to logging.log_every_n when convert() reports its progress.
LOG_EVERY = 100000
class ConversionError(Exception):
  """Raised to signal that a file conversion cannot be carried out."""
class NullWriter(object):
  """A stand-in writer that discards every record handed to it.

  It can be created and used wherever a writer object is required by context
  but nothing should be written to any file.
  """

  def __init__(self, unused_filename, header=None):
    """Accepts and ignores a filename and an optional header."""

  def write(self, unused_record):
    """Discards `unused_record`."""
    return None

  def __enter__(self):
    """Returns this writer so it can be used in a `with` statement."""
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    """Does nothing; returns None, so exceptions are not suppressed."""
    return None
def _reader_writer_classes(in_filename, out_filename):
  """Picks the reader and writer classes for a conversion, if it is possible.

  Args:
    in_filename: filename of a genomics data file to use as input.
    out_filename: filename of a genomics data file to use as output, or None,
      if no output should be written.

  Returns:
    A (reader_class, writer_class) pair. writer_class is NullWriter when
    out_filename is None or empty.

  Raises:
    ConversionError: if in_filename is not convertible to out_filename.
  """
  source_type = _lookup_filetype(in_filename)

  # Read-only mode: there is no output to validate, records are discarded.
  if not out_filename:
    return source_type.reader_class, NullWriter

  target_type = _lookup_filetype(out_filename)
  if source_type != target_type:
    raise ConversionError(
        "Input and output filetypes specified are incompatible.")

  # Only native (non-tfrecord) files count as having / requiring a header.
  header_available = source_type.has_header and _is_native_file(in_filename)
  header_needed = target_type.has_header and _is_native_file(out_filename)
  if header_needed and not header_available:
    raise ConversionError(
        "Input file does not have a header, which is needed to construct "
        "output file")

  return source_type.reader_class, target_type.writer_class
def convert(in_filename, out_filename):
  """Converts a recognized genomics file `in_filename` to `out_filename`.

  Records are read from the input one at a time and handed to the writer
  chosen for `out_filename`. Progress is logged every LOG_EVERY records and a
  summary with the record count and elapsed time is logged at the end.

  Args:
    in_filename: str; filename of a genomics data file to use as input.
    out_filename: str; filename of a genomics data file to use as output, or
      None, if no output should be written.

  Raises:
    ConversionError, if the conversion could not be executed.
  """
  reader_class, writer_class = _reader_writer_classes(in_filename, out_filename)

  # Construct the reader exactly once, inside the `with` statement. Creating
  # an extra reader beforehand would open the input a second time and leave
  # that instance without anything to close it.
  with reader_class(in_filename) as reader:
    # The writer is given the reader's header.
    with writer_class(out_filename, header=reader.header) as writer:
      start = time.time()
      i = 0
      for record in reader:
        i += 1
        writer.write(record)
        logging.log_every_n(logging.INFO, "Progress: %d records", LOG_EVERY, i)
      elapsed = time.time() - start
      logging.info("Done, processed %d records in %0.2f seconds.", i, elapsed)
def main(argv):
  """Command-line entry point: validates arguments and runs the conversion.

  Args:
    argv: list of command-line arguments. argv[0] is the program name,
      argv[1] the input filename and the optional argv[2] the output
      filename.

  Exits with status 1 after printing a message if the argument count is wrong
  or if the conversion raises ConversionError.
  """
  argc = len(argv)
  if argc != 2 and argc != 3:
    print("Usage: %s <input_filename> [<output_filename>]" % argv[0])
    sys.exit(1)

  input_filename = argv[1]
  # With no second filename the input is only read, nothing is written.
  output_filename = argv[2] if argc == 3 else None

  try:
    convert(input_filename, output_filename)
  except ConversionError as e:
    print("Could not execute conversion:", e)
    sys.exit(1)
# When executed as a script, hand main() to absl's app.run.
if __name__ == "__main__":
  app.run(main)