deepvariant-r1.6.1 / Git / [9b26b7] /third_party/nucleus/io/converter.py

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / third_party / nucleus / io / converter.py
History
Download this file
203 lines (158 with data), 6.9 kB

# Copyright 2018 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from this
#    software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""A universal converter program for nucleus-supported genomics file formats.

Invoked with a single argument, this program will open a genomics data file and
iterate over its contents, doing no writing.  This is a good benchmark for I/O
and reader processing speed.

Invoked with two arguments, the program will open the first file, read its
records, and write them, one at a time, to the second file.  The filetypes for
the first and second filename must be compatible ways of encoding the same
nucleus genomics record type (for example, `infile.gff` and
`outfile.gff.tfrecord.gz` are compatible, but `infile.gff` and `outfile.bam` are
not.

Note: at present we have no convention for encoding a file *header* in
tfrecords, so conversion is not possible from tfrecord to any native file format
for which a header is compulsory.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re
import sys
import time

from absl import app
from absl import logging

from third_party.nucleus.io import bed
from third_party.nucleus.io import fastq
from third_party.nucleus.io import gff
from third_party.nucleus.io import sam
from third_party.nucleus.io import vcf


def _is_native_file(filename):
  """Returns true if filename is a native (non-tfrecord) genomics data file."""
  return not re.match(r".*\.tfrecord(\.gz)?", filename)


def _filename_pattern(ext):
  """Returns an re matching native or tfrecord files of format `ext`."""
  return r".*\.{}(\.tfrecord)?(\.gz)?".format(ext)


_FileType = collections.namedtuple(
    "_FileType", ("reader_class", "writer_class", "has_header"))

_FILETYPE_LOOKUP = {
    _filename_pattern("bed"):
        _FileType(bed.BedReader, bed.BedWriter, False),
    _filename_pattern("(fastq|fq)"):
        _FileType(fastq.FastqReader, fastq.FastqWriter, False),
    _filename_pattern("gff"):
        _FileType(gff.GffReader, gff.GffWriter, True),
    _filename_pattern("(bam|sam)"):
        _FileType(sam.SamReader, sam.SamWriter, True),
    _filename_pattern("vcf"):
        _FileType(vcf.VcfReader, vcf.VcfWriter, True),
}


def _lookup_filetype(filename):
  for pattern in _FILETYPE_LOOKUP:
    if re.match(pattern, filename):
      return _FILETYPE_LOOKUP[pattern]
  raise ConversionError("Unrecognized extension!")

LOG_EVERY = 100000


class ConversionError(Exception):
  """An exception used to signal file conversion error."""
  pass


class NullWriter(object):
  """A writer class whose .write() method is a no-op.

  This allows us to create and use a writer object where one is required by
  context but we do not wish to write to any file.
  """

  def __init__(self, unused_filename, header=None):
    pass

  def write(self, unused_record):
    pass

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    pass


def _reader_writer_classes(in_filename, out_filename):
  """Returns reader, writer classes for filenames, if conversion is possible.

  Args:
    in_filename: filename of a genomics data file to use as input.
    out_filename: filename of a genomics data file to use as output, or None,
      if no output should be written.

  Raises:
    ConversionError: if in_filename is not convertible to out_filename.
  """
  in_filetype = _lookup_filetype(in_filename)
  out_filetype = _lookup_filetype(out_filename) if out_filename else None

  if out_filetype:
    if in_filetype != out_filetype:
      raise ConversionError(
          "Input and output filetypes specified are incompatible.")
    input_has_header = in_filetype.has_header and _is_native_file(in_filename)
    output_requires_header = (
        out_filetype.has_header and _is_native_file(out_filename))
    if output_requires_header and not input_has_header:
      raise ConversionError(
          "Input file does not have a header, which is needed to construct "
          "output file")
    writer_class = out_filetype.writer_class

  else:
    writer_class = NullWriter

  return in_filetype.reader_class, writer_class


def convert(in_filename, out_filename):
  """Converts a recognized genomics file `in_filename` to `out_filename`.

  Args:
    in_filename: str; filename of a genomics data file to use as input.
    out_filename: str; filename of a genomics data file to use as output, or
      None, if no output should be written.

  Raises:
    ConversionError, if the conversion could not be executed.
  """
  reader_class, writer_class = _reader_writer_classes(in_filename, out_filename)
  reader = reader_class(in_filename)

  with reader_class(in_filename) as reader:
    with writer_class(out_filename, header=reader.header) as writer:
      start = time.time()
      i = 0
      for record in reader:
        i += 1
        writer.write(record)
        logging.log_every_n(logging.INFO, "Progress: %d records", LOG_EVERY, i)
      elapsed = time.time() - start
      logging.info("Done, processed %d records in %0.2f seconds.", i, elapsed)


def main(argv):
  if len(argv) not in (2, 3):
    print("Usage: %s <input_filename> [<output_filename>]" % argv[0])
    sys.exit(1)

  input_filename = argv[1]
  output_filename = None if len(argv) == 2 else argv[2]

  try:
    convert(input_filename, output_filename)
  except ConversionError as e:
    print("Could not execute conversion:", e)
    sys.exit(1)


if __name__ == "__main__":
  app.run(main)