openomics/io/read_gtf.py

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
import logging
from collections import OrderedDict
from os.path import exists
from typing import List, Dict, Union
import dask.dataframe as dd
import numpy as np
import pandas as pd
from six import string_types
from six.moves import intern

COLUMN_NAMES = [
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attribute",
]

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def read_gtf(filepath_or_buffer, blocksize=None, compression=None, expand_attribute_column=True,
             infer_biotype_column=False, column_converters=None, usecols=None, features=None,
             chunksize=1024 * 1024):
    """Parse a GTF into a dictionary mapping column names to sequences of
    values.

    Args:
        filepath_or_buffer (str or buffer object): Path to GTF file (may be gzip
            compressed) or buffer object such as StringIO.
        blocksize (int): Block size (in bytes) for the dask dataframe. Default
            None to use a pandas.DataFrame instead.
        compression (str): Compression type to be passed into
            dask.dataframe.read_table(). Default None.
        expand_attribute_column (bool): Replace strings of semicolon-separated
            key-value pairs in the 'attribute' column with one column per
            distinct key, with a list of values for each row (using None for
            rows where the key didn't occur).
        infer_biotype_column (bool): Due to the ambiguity of the second GTF
            column across Ensembl releases, try to figure out whether an older
            GTF's source column is actually the gene_biotype or
            transcript_biotype.
        column_converters (dict, optional): Dictionary mapping column names to
            conversion functions. Replaces empty strings with None and
            otherwise passes values to the given conversion function.
        usecols (list of str or None): Restrict which columns are loaded to the
            given set. If None, then load all columns.
        features (set of str or None): Drop rows which aren't one of the
            features in the supplied set.
        chunksize (int): The chunksize for pandas.read_csv(); ignored if
            blocksize is not None.
    """
    if isinstance(filepath_or_buffer, string_types) and not exists(filepath_or_buffer):
        raise ValueError("GTF file does not exist: %s" % filepath_or_buffer)

    if expand_attribute_column:
        result_df = parse_gtf_and_expand_attributes(filepath_or_buffer, blocksize=blocksize,
                                                    compression=compression,
                                                    chunksize=chunksize,
                                                    restrict_attribute_columns=usecols,
                                                    features=features)
    else:
        if blocksize:
            result_df = parse_gtf_dask(filepath_or_buffer, blocksize=blocksize, features=features,
                                       compression=compression)
        else:
            result_df = parse_gtf(filepath_or_buffer, features=features, compression=compression)

    # `column_converters` may be None, meaning no conversions are applied
    for column_name, column_type in list((column_converters or {}).items()):
        result_df[column_name] = [
            column_type(string_value) if len(string_value) > 0 else None
            for string_value in result_df[column_name]
        ]

    # Hackishly infer whether the values in the 'source' column of this GTF
    # are actually representing a biotype by checking for the most common
    # gene_biotype and transcript_biotype value 'protein_coding'
    if infer_biotype_column:
        unique_source_values = set(result_df["source"])
        if "protein_coding" in unique_source_values:
            column_names = set(result_df.columns)
            # Disambiguate between the two biotypes by checking whether
            # gene_biotype is already present in another column. If it is,
            # the 2nd column is the transcript_biotype (otherwise, it's the
            # gene_biotype)
            if "gene_biotype" not in column_names:
                logger.info("Using column 'source' to replace missing 'gene_biotype'")
                result_df["gene_biotype"] = result_df["source"]
            if "transcript_biotype" not in column_names:
                logger.info("Using column 'source' to replace missing 'transcript_biotype'")
                result_df["transcript_biotype"] = result_df["source"]

    if usecols is not None:
        column_names = set(result_df.columns)
        valid_columns = [c for c in usecols if c in column_names]
        result_df = result_df[valid_columns]

    return result_df
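

def _example_read_gtf_usage():
    # Illustrative sketch only, not part of the module's public API: the file
    # path, column selection, and feature set below are hypothetical.
    df = read_gtf(
        "annotations.gtf.gz",  # hypothetical path to a gzip-compressed GTF
        compression="gzip",
        features={"gene", "transcript"},  # keep only these feature rows
        usecols=["seqname", "start", "end", "gene_id", "gene_name"],
    )
    return df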


def parse_gtf(filepath_or_buffer, chunksize=1024 * 1024, features=None,
              intern_columns=["seqname", "source", "strand", "frame"],
              fix_quotes_columns=["attribute"], compression=None) -> pd.DataFrame:
    """
    Borrowed code from https://github.com/openvax/gtfparse

    Args:
        filepath_or_buffer (str or buffer object): Path to GTF file or buffer
            object such as StringIO.
        chunksize (int): Default 1048576.
        features (set or None): Drop entries which aren't one of these features.
        intern_columns (list): These columns are short strings which should be
            interned.
        fix_quotes_columns (list): Most commonly the 'attribute' column, which
            has broken quotes on some Ensembl release GTF files.
        compression (str): Compression type passed to pandas.read_csv().
    """
    if features is not None:
        features = set(features)

    def parse_frame(s):
        # GTF uses "." for missing frame values; treat those as 0
        if s == ".":
            return 0
        else:
            return int(s)

    # GTF columns:
    # 1) seqname: str ("1", "X", "chrX", etc...)
    # 2) source : str
    #    Different versions of GTF use the second column as one of:
    #    (a) gene biotype
    #    (b) transcript biotype
    #    (c) the annotation source
    #    See: https://www.biostars.org/p/120306/#120321
    # 3) feature : str ("gene", "transcript", &c)
    # 4) start : int
    # 5) end : int
    # 6) score : float or "."
    # 7) strand : "+", "-", or "."
    # 8) frame : 0, 1, 2 or "."
    # 9) attribute : key-value pairs separated by semicolons
    # (see more complete description in docstring at top of file)
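    # For example, a typical GTF line (tab-separated; illustrative record for
    # the well-known Ensembl placeholder gene DDX11L1) looks like:
    #   1  havana  gene  11869  14409  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1";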
    chunk_iterator = pd.read_csv(
        filepath_or_buffer,
        sep="\t",
        comment="#",
        names=COLUMN_NAMES,
        skipinitialspace=True,
        skip_blank_lines=True,
        on_bad_lines='error',
        chunksize=chunksize,
        compression=compression,
        engine="c",
        dtype={
            "start": np.int64,
            "end": np.int64,
            "score": np.float32,
            "seqname": str,
        },
        na_values=".",
        converters={"frame": parse_frame})

    dataframes = []
    for df in chunk_iterator:
        for intern_column in intern_columns:
            df[intern_column] = [intern(str(s)) for s in df[intern_column]]

        # compare feature strings after interning
        if features is not None:
            df = df[df["feature"].isin(features)]

        for fix_quotes_column in fix_quotes_columns:
            # Catch mistaken semicolons by replacing "xyz;" with "xyz".
            # Required since the Ensembl GTF for release 78 has mistakes
            # such as:
            #   gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
            df[fix_quotes_column] = [
                s.replace(';\"', '\"').replace(";-", "-")
                for s in df[fix_quotes_column]
            ]
        dataframes.append(df)

    df = pd.concat(dataframes)
    return df
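

def _example_parse_gtf_from_buffer():
    # Illustrative sketch only: parse_gtf also accepts in-memory buffers such
    # as io.StringIO, which is handy for unit tests. The single-line GTF
    # record below is made up for demonstration.
    from io import StringIO
    gtf_text = '1\thavana\tgene\t11869\t14409\t.\t+\t.\tgene_id "ENSG00000223972";\n'
    return parse_gtf(StringIO(gtf_text), features={"gene"})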


def parse_gtf_dask(filepath_or_buffer, blocksize=None, compression=None, features=None) -> dd.DataFrame:
    """
    Args:
        filepath_or_buffer (str or buffer object): Path to GTF file or buffer
            object such as StringIO.
        blocksize (int): Block size (in bytes) for the dask dataframe. Default
            None.
        compression (str): Compression type to be passed into
            dask.dataframe.read_table(). Default None.
        features (set or None): Drop entries which aren't one of these features.
    """
    if features is not None:
        features = set(features)

    def parse_frame(s):
        # GTF uses "." for missing frame values; treat those as 0
        if s == ".":
            return 0
        else:
            return int(s)
    # GTF columns:
    # 1) seqname: str ("1", "X", "chrX", etc...)
    # 2) source : str
    #    Different versions of GTF use the second column as one of:
    #    (a) gene biotype
    #    (b) transcript biotype
    #    (c) the annotation source
    #    See: https://www.biostars.org/p/120306/#120321
    # 3) feature : str ("gene", "transcript", &c)
    # 4) start : int
    # 5) end : int
    # 6) score : float or "."
    # 7) strand : "+", "-", or "."
    # 8) frame : 0, 1, 2 or "."
    # 9) attribute : key-value pairs separated by semicolons
    # (see more complete description in docstring at top of file)

    # Uses dask.dataframe.read_table to read the file lazily in blocks
    dataframe = dd.read_table(
        filepath_or_buffer,
        sep="\t",
        compression=compression,
        blocksize=None if isinstance(blocksize, bool) else blocksize,
        comment="#",
        names=COLUMN_NAMES,
        skipinitialspace=True,
        skip_blank_lines=True,
        on_bad_lines='error',
        engine="c",
        dtype={
            "start": np.int64,
            "end": np.int64,
            "score": np.float32,
            "seqname": str,
        },
        na_values=".",
        converters={"frame": parse_frame})

    # Drop rows which aren't one of the requested features
    if features is not None:
        dataframe = dataframe[dataframe["feature"].isin(features)]

    return dataframe
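

def _example_parse_gtf_dask_usage():
    # Illustrative sketch only (hypothetical path): with a ~25 MB block size
    # on an uncompressed GTF, dask reads the file lazily in parallel blocks;
    # .compute() materializes the filtered result as a pandas DataFrame.
    ddf = parse_gtf_dask("annotations.gtf", blocksize=25 * 1024 * 1024)
    return ddf[ddf["feature"] == "gene"].compute()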


def parse_gtf_and_expand_attributes(filepath_or_buffer, blocksize=None, compression=None, chunksize=1024 * 1024,
                                    restrict_attribute_columns=None, features=None
                                    ) -> Union[dd.DataFrame, pd.DataFrame]:
    """Parse lines into a column->values dictionary and then expand the
    'attribute' column into multiple columns. This expansion happens by
    replacing strings of semicolon-separated key-value pairs in the
    'attribute' column with one column per distinct key, with a list of values
    for each row (using None for rows where the key didn't occur).

    Args:
        filepath_or_buffer (str or buffer object): Path to GTF file or buffer
            object such as StringIO.
        blocksize (int): Block size (in bytes) for the dask dataframe. Default
            None to use pandas instead.
        compression (str): Compression type passed to the underlying reader.
        chunksize (int): The chunksize for pandas.read_csv().
        restrict_attribute_columns (list/set of str or None): If given, then
            only expand these attribute columns.
        features (set or None): Ignore entries which don't correspond to one of
            the supplied features.
    """
    if blocksize:
        # Dask dataframe
        def expand_attr_strings_to_dict(attribute_string: str, quote_char='\"', usecols=None,
                                        column_interned_strings={}, value_interned_strings={}
                                        ) -> Dict[str, str]:
            """The last column of GTF has a variable number of key-value pairs
            of the format: "key1 value1; key2 value2;". Parse these into a
            dictionary mapping each key onto its value.

            Args:
                attribute_string (str): A single GTF attribute string.
                quote_char (str): Quote character to remove from values.
                usecols (list of str or None): If not None, then only expand
                    columns included in this set, otherwise use all columns.
                column_interned_strings (dict): Mutable default argument used
                    as a shared intern cache across calls on the same data.
                value_interned_strings (dict): Same, for attribute values.
            """
            output = {}
            for kv in attribute_string.split(";"):
                # We're slicing the first two elements out of split() because
                # Ensembl release 79 added values like:
                #   transcript_support_level "1 (assigned to previous version 5)";
                # ...which gets mangled by splitting on spaces.
                parts = kv.strip().split(" ", 2)[:2]
                if len(parts) != 2:
                    continue
                column_name, value = parts

                try:
                    column_name = column_interned_strings[column_name]
                except KeyError:
                    column_name = intern(str(column_name))
                    column_interned_strings[column_name] = column_name

                if usecols is not None and column_name not in usecols:
                    continue

                value = value.replace(quote_char, "") if value.startswith(quote_char) else value

                try:
                    value = value_interned_strings[value]
                except KeyError:
                    value = intern(str(value))
                    value_interned_strings[value] = value

                output[column_name] = value
            return output
        df = parse_gtf_dask(filepath_or_buffer, blocksize=blocksize, compression=compression, features=features)
        attribute_values: dd.Series = df.pop("attribute")

        # Sample 500 rows to determine the meta dataframe
        attr_vals = attribute_values.head(500)
        expected_df = attr_vals.map(
            lambda s: expand_attr_strings_to_dict(s, usecols=restrict_attribute_columns)).apply(pd.Series)

        attr_df = attribute_values.to_bag(index=False, format='tuple') \
            .map(lambda s: expand_attr_strings_to_dict(s, usecols=restrict_attribute_columns)) \
            .to_dataframe(meta=expected_df)
        attr_df.index = df.index

        df = dd.concat([df, attr_df], axis=1)
    else:
        # Pandas
        df = parse_gtf(filepath_or_buffer, chunksize=chunksize, features=features, compression=compression)
        attribute_values = df.pop("attribute")
        for column_name, values in expand_attribute_strings(attribute_values,
                                                            usecols=restrict_attribute_columns).items():
            df[column_name] = values
    return df


def expand_attribute_strings(attribute_strings: List[str], quote_char='\"', nan_value=None,
                             usecols=None) -> OrderedDict:
    """The last column of GTF has a variable number of key-value pairs of the
    format: "key1 value1; key2 value2;". Parse these into a dictionary mapping
    each key onto a list of values, where the value is None for any row where
    the key was missing.

    Args:
        attribute_strings (list of str):
        quote_char (str): Quote character to remove from values.
        nan_value (any): If an attribute is missing from a row, give it this
            value.
        usecols (list of str or None): If not None, then only expand columns
            included in this set, otherwise use all columns.
    """
    n = len(attribute_strings)
    extra_columns = {}
    column_order = []

    #
    # SOME NOTES ABOUT THE BIZARRE STRING INTERNING GOING ON BELOW
    #
    # While parsing millions of repeated strings (e.g. "gene_id" and "TP53"),
    # we can save a lot of memory by making sure there's only one string
    # object per unique string. The canonical way to do this is using
    # the 'intern' function. One problem is that Py2 won't let you intern
    # unicode objects, so to get around this we call intern(str(...)).
    #
    # It also turns out to be faster to check interned strings ourselves
    # using a local dictionary, hence the two dictionaries below
    # and the pair of try/except blocks in the loop.
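    # For example (illustrative): after a = intern(str("gene_id")) and
    # b = intern(str("gene_id")), `a is b` holds, so millions of repeated
    # attribute keys share a single string object instead of one copy per row.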
    column_interned_strings = {}
    value_interned_strings = {}

    for (i, attribute_string) in enumerate(attribute_strings):
        for kv in attribute_string.split(";"):
            # We're slicing the first two elements out of split() because
            # Ensembl release 79 added values like:
            #   transcript_support_level "1 (assigned to previous version 5)";
            # ...which gets mangled by splitting on spaces.
            parts = kv.strip().split(" ", 2)[:2]
            if len(parts) != 2:
                continue

            column_name, value = parts

            try:
                column_name = column_interned_strings[column_name]
            except KeyError:
                column_name = intern(str(column_name))
                column_interned_strings[column_name] = column_name

            if usecols is not None and column_name not in usecols:
                continue

            try:
                column = extra_columns[column_name]
            except KeyError:
                column = [nan_value] * n
                extra_columns[column_name] = column
                column_order.append(column_name)

            value = value.replace(quote_char, "") if value.startswith(quote_char) else value

            try:
                value = value_interned_strings[value]
            except KeyError:
                value = intern(str(value))
                value_interned_strings[value] = value

            # if an attribute key occurs repeatedly in a row then
            # keep all of its values as a comma-joined string
            old_value = column[i]
            if old_value is nan_value:
                column[i] = value
            else:
                column[i] = "%s,%s" % (old_value, value)

    return OrderedDict(
        (column_name, extra_columns[column_name])
        for column_name in column_order)
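

if __name__ == "__main__":
    # A minimal smoke test / usage sketch: the attribute strings below are
    # made-up examples in standard GTF syntax, not taken from a real file.
    example_attributes = [
        'gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_biotype "transcribed_unprocessed_pseudogene";',
        'gene_id "ENSG00000227232"; gene_name "WASH7P";',
    ]
    expanded = expand_attribute_strings(example_attributes)
    # Expected shape: one list per attribute key, with None where a key is
    # absent, e.g. expanded["gene_biotype"][1] is None for the second row.
    for key, values in expanded.items():
        print(key, values)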