|
a |
|
b/openomics/transcriptomics.py |
|
|
1 |
import io |
|
|
2 |
import logging |
|
|
3 |
import os |
|
|
4 |
import re |
|
|
5 |
from glob import glob |
|
|
6 |
from typing import Union |
|
|
7 |
|
|
|
8 |
import dask.dataframe as dd |
|
|
9 |
import numpy as np |
|
|
10 |
import pandas as pd |
|
|
11 |
import validators |
|
|
12 |
# from Bio.UniProt import GOA |
|
|
13 |
from dask import delayed |
|
|
14 |
|
|
|
15 |
from .database.base import Annotatable |
|
|
16 |
from .io.files import get_pkg_data_filename |
|
|
17 |
from .transforms.df import drop_duplicate_columns |
|
|
18 |
|
|
|
19 |
# Public names exported by ``from <module> import *``.
__all__ = ['Expression', 'MessengerRNA', 'MicroRNA', 'LncRNA', ]
|
|
20 |
|
|
|
21 |
class Expression(object):
    """Container for a quantitative omics table (e.g. csv, tsv, excel).

    The table is loaded with the user-specified columns and gene column
    name, then optionally transposed so that rows are samples and columns
    are genes/transcripts/peptides. The caller chooses which column holds
    the gene identifiers (``gene_index``); picking the right identifier
    makes it easier to annotate functional, sequence, and interaction data
    later. Apart from the gene column and the sample barcode index, only
    numeric columns are kept.
    """
    expressions: pd.DataFrame

    def __init__(self, data, transpose, gene_index=None, usecols=None, gene_level=None, sample_level="sample_index",
                 transform_fn=None, dropna=False, npartitions=None, **kwargs):
        """Load, preprocess and optionally transform an expression table.

        Args:
            data (str, io.StringIO, pandas.DataFrame, dask DataFrame): Path,
                glob pattern, URL or text stream of the table to import, or
                an already loaded dataframe.
            transpose (bool): True if the given table has samples as columns
                and variables as rows. False if the table has samples as the
                row index and gene names as columns.
            gene_index (str): The column name of gene/transcript/protein to
                index by.
            usecols (str or list): A regex string selecting column names
                (alternatives separated by "|"), or an explicit list of
                column names.
            gene_level (str): {"gene", "transcript", "peptide"} If given, the
                name assigned to the columns axis of ``self.expressions``.
            sample_level (str): {"sample_index", "patient_index"} The name
                assigned to the row index of ``self.expressions``.
            transform_fn (callable or str): Default None. A callable applied
                to every single value, or the string "log2" to apply
                ``log2(x + 1)``.
            dropna (bool): Whether to drop rows with null values.
            npartitions (int): Default None. If truthy and the preprocessed
                table is a pandas DataFrame, it is converted to a Dask
                DataFrame with this many partitions.
            **kwargs: Any arguments to pass into pd.read_table(**kwargs).
        """
        self.gene_level = gene_level
        self.sample_level = sample_level

        # The subclasses in this module forward `cohort_name` through
        # **kwargs. It is not a pandas reader option, so keep it away from
        # pd.read_table (which would raise TypeError on it).
        # NOTE(review): the value is not stored anywhere here - confirm
        # whether it should be.
        reader_kwargs = {key: value for key, value in kwargs.items() if key != "cohort_name"}

        df = self.load_dataframe(data, transpose=transpose, usecols=usecols, gene_index=gene_index, dropna=dropna,
                                 **reader_kwargs)

        self.expressions = self.preprocess_table(
            df,
            usecols=usecols,
            gene_index=gene_index,
            transposed=transpose,
            dropna=dropna,
        )

        # TODO load DD from file directly
        if npartitions and isinstance(self.expressions, pd.DataFrame):
            self.expressions = dd.from_pandas(self.expressions, npartitions=npartitions)

        if gene_level is not None:
            self.expressions.columns.name = gene_level

        self.expressions.index.name = self.sample_level

        if callable(transform_fn):
            self.expressions = self.expressions.applymap(transform_fn)
        elif transform_fn == "log2":
            # +1 pseudo-count keeps zero expression values finite.
            self.expressions = self.expressions.applymap(
                lambda x: np.log2(x + 1))

    @property
    def gene_index(self):
        """Name of the columns axis of ``self.expressions``."""
        return self.expressions.columns.name

    @gene_index.setter
    def gene_index(self, value):
        # Needed by set_genes_index(), which assigns to this attribute.
        self.expressions.columns.name = value

    def load_dataframe(self,
                       data: "Union[str, pd.DataFrame, dd.DataFrame, io.StringIO]",
                       transpose: bool,
                       usecols: str,
                       gene_index: str,
                       dropna: bool, **kwargs) -> "Union[pd.DataFrame, dd.DataFrame]":
        """Read table data inputs to create a DataFrame.

        Args:
            data: either a file path, a glob file path (e.g. "table-*.tsv"),
                a URL, an io.StringIO, a pandas.DataFrame, or a dask
                DataFrame.
            transpose (bool): True if table oriented with samples columns,
                else False. Only used for glob inputs.
            usecols (str): A regex string to select columns. Only used for
                glob inputs.
            gene_index (str): The column name that contains the gene names
                or IDs. Only used for glob inputs.
            dropna (bool): Whether to drop rows with null values. Only used
                for glob inputs.
            **kwargs: Forwarded to pd.read_table.

        Returns:
            Union[pd.DataFrame, dd.DataFrame]: The loaded dataframe.

        Raises:
            FileNotFoundError: If `data` matches none of the supported
                input kinds (e.g. a path that does not exist).
        """
        if isinstance(data, (pd.DataFrame, dd.DataFrame)):
            # Already a dataframe: hand it back untouched.
            df = data

        elif isinstance(data, str) and "*" in data:
            # TODO implement handling for multiple file ByteIO
            df = self.load_dataframe_glob(globstring=data, usecols=usecols, gene_index=gene_index,
                                          transpose=transpose, dropna=dropna, **kwargs)

        elif isinstance(data, io.StringIO):
            # Needed since the file was previously read to extract columns information
            data.seek(0)
            df = pd.read_table(data, **kwargs)

        elif isinstance(data, str) and validators.url(data):
            dataurl, filename = os.path.split(data)
            # presumably resolves the remote file to a local path - confirm
            # against get_pkg_data_filename
            file = get_pkg_data_filename(dataurl + "/", filename)
            df = pd.read_table(file, **kwargs)

        elif isinstance(data, str) and os.path.isfile(data):
            read_kwargs = dict(kwargs)
            if "sep" not in read_kwargs and "delimiter" not in read_kwargs:
                # No separator requested: let pandas sniff it, which needs
                # the python engine.
                read_kwargs["sep"] = None
                read_kwargs.setdefault("engine", "python")
            df = pd.read_table(data, **read_kwargs)

        else:
            raise FileNotFoundError(data)

        return df

    def preprocess_table(self,
                         df: "Union[pd.DataFrame, dd.DataFrame]",
                         usecols: str = None,
                         gene_index: str = None,
                         transposed: bool = True,
                         sort_index: bool = False,
                         dropna: bool = True,
                         ):
        """Preprocess an expression table whose columns are samples and rows
        are genes/transcripts.

        Steps, in order: filter columns, drop duplicate columns, drop rows
        with nulls, index by the gene column, optionally sort the index,
        keep numeric columns only, and optionally transpose.

        Args:
            df (pd.DataFrame or dd.DataFrame): The table to preprocess. It
                is not modified in place.
            usecols (str or list): A regular expression string for the
                column names to fetch, or a list of column names. The list
                is not modified.
            gene_index (str): The column name containing the
                gene/transcript names or ids. It is always kept by the
                column filter and becomes the row index.
            transposed (bool): Default True. Whether to transpose the
                dataframe so columns are genes (features) and rows are
                samples.
            sort_index (bool): Default False. Whether to sort rows by index.
            dropna (bool): Default True. Whether to drop every row that
                contains a null value.

        Returns:
            Union[pd.DataFrame, dd.DataFrame]: the processed dataframe
        """
        # Filter columns
        if usecols is not None and isinstance(usecols, str):
            if gene_index is not None and gene_index not in usecols:
                # include index column in the filter regex query
                usecols = usecols + "|" + gene_index

            if isinstance(df, pd.DataFrame):
                df = df.filter(regex=usecols)
            elif isinstance(df, dd.DataFrame):
                # `search`, to agree with pandas' DataFrame.filter(regex=...)
                columns = list(filter(re.compile(usecols).search, df.columns))
                df = df[columns]

        elif usecols is not None and isinstance(usecols, list):
            if gene_index is not None and gene_index not in usecols:
                # Build a new list so the caller's list is left untouched.
                usecols = usecols + [gene_index]
            df = df[usecols]

        # Drop duplicate column names
        df = drop_duplicate_columns(df)

        # Drop rows containing any null value (reassign instead of
        # inplace=True so the caller's dataframe is never mutated).
        if dropna:
            df = df.dropna()

        if gene_index is not None and df.index.name != gene_index:
            df = df.set_index(gene_index)

        # Needed for Dask Delayed
        if sort_index is True:
            df = df.sort_index(axis=0, ascending=True)

        # Select only numerical columns
        df = df.select_dtypes(include="number")

        # Transpose dataframe to sample rows and gene columns
        if transposed:
            df = df.T
            # Drop duplicate genes
            df = drop_duplicate_columns(df)

        return df

    def load_dataframe_glob(self, globstring: str, usecols: str, gene_index: str, transpose: bool, dropna: bool,
                            **kwargs):
        """Lazily read and preprocess every file matching a glob pattern.

        Each matched file is read with pd.read_table and passed through
        `preprocess_table` (with sort_index=True) as Dask delayed tasks.

        Args:
            globstring (str): Glob pattern of the table files.
            usecols (str): Column filter forwarded to `preprocess_table`.
            gene_index (str): Gene column forwarded to `preprocess_table`.
            transpose (bool): Forwarded to `preprocess_table` as
                `transposed`.
            dropna (bool): Forwarded to `preprocess_table`.
            **kwargs: Forwarded to pd.read_table.

        Returns:
            dd.DataFrame

        Raises:
            FileNotFoundError: If no file matches `globstring`.
        """
        file_paths = glob(globstring)
        if not file_paths:
            # Fail early with the offending pattern.
            raise FileNotFoundError(globstring)

        filenames = [os.path.split(file_path)[1] for file_path in file_paths]

        lazy_dataframes = []
        for file_path in file_paths:
            df = delayed(pd.read_table)(file_path, **kwargs)
            df = delayed(self.preprocess_table)(
                df,
                usecols=usecols,
                gene_index=gene_index,
                transposed=transpose,
                sort_index=True,
                dropna=dropna,
            )
            lazy_dataframes.append(df)

        logging.info("Files matched: {}".format(filenames))

        return dd.from_delayed(lazy_dataframes, divisions=None, verify_meta=True)

    def set_genes_index(self, index: str, old_index: str):
        """Rename the gene columns from one identifier to another.

        Requires `self` to also be an `Annotatable`, which supplies
        `get_rename_dict` and `set_index`.

        Args:
            index (str): The identifier to rename the gene columns to.
            old_index (str): The identifier the gene columns currently use.
        """
        assert isinstance(self, Annotatable) and isinstance(self, Expression)
        # Change gene name columns in expressions
        rename_dict = self.get_rename_dict(from_index=old_index,
                                           to_index=index)
        self.expressions = self.expressions.rename(columns=rename_dict)
        self.gene_index = index

        # Change index name in annotation
        self.set_index(index)

    def drop_genes(self, gene_ids: "Union[str, list]"):
        """Drop columns representing genes/rna/proteins in self.expressions
        dataframe, and the matching rows of `self.annotations` when that
        attribute exists and is not empty.

        Args:
            gene_ids (str or list): column label(s) that are a subset of the
                columns list
        """
        self.expressions = self.expressions.drop(gene_ids, axis=1)
        if hasattr(self, "annotations") and not self.annotations.empty:
            self.annotations = self.annotations.drop(gene_ids, axis=0)

    def drop_samples(self, sample_ids):
        """Drop rows (samples) from the self.expressions dataframe.

        Args:
            sample_ids: row label(s) to drop.
        """
        self.expressions = self.expressions.drop(sample_ids, axis=0)

    @classmethod
    def name(cls):
        """Return the name of the omics type; subclasses must override."""
        raise NotImplementedError

    def get_genes_list(self, level: int = None):
        """Return the gene labels (columns) of the expression table.

        Args:
            level (int): Default None. Only needed if gene index is a
                :class:`pd.MultiIndex`; when None, the level named
                `self.gene_index` is used.
        """
        index = self.expressions.columns

        if isinstance(index, pd.MultiIndex):
            return index.get_level_values(
                self.gene_index if level is None else level)
        else:
            return index

    def get_samples_list(self, level=None):
        """Return the sample labels (row index) of the expression table.

        Args:
            level: Default None. Only needed if the sample index is a
                :class:`pd.MultiIndex`; when None, the level named
                `self.sample_level` is used.
        """
        index = self.expressions.index
        if isinstance(index, pd.MultiIndex):
            # Sample levels are looked up by the sample level name, not by
            # the gene index name.
            return index.get_level_values(
                self.sample_level if level is None else level)
        else:
            return index

    samples = property(get_samples_list)
    features = property(get_genes_list)
|
|
329 |
|
|
|
330 |
|
|
|
331 |
class LncRNA(Expression, Annotatable):
    """Expression table registered under the name ``LncRNA``, combining
    `Expression` loading with the `Annotatable` base."""

    def __init__(self, data, transpose, gene_index=None, usecols=None, gene_level=None,
                 sample_level="sample_index", transform_fn=None, dropna=False, npartitions=None,
                 cohort_name=None):
        """Forward every argument unchanged to `Expression.__init__`.

        Args:
            data: Table input handed to `Expression.__init__`.
            transpose: Whether the table must be transposed.
            gene_index: Column name holding the gene identifiers.
            usecols: Column filter.
            gene_level: Name for the gene (columns) axis.
            sample_level: Name for the sample (row) index.
            transform_fn: Value transformation.
            dropna: Whether to drop rows with null values.
            npartitions: Number of Dask partitions.
            cohort_name: Forwarded as a keyword argument to the parent
                constructor.
        """
        forwarded = {
            "data": data,
            "transpose": transpose,
            "gene_index": gene_index,
            "usecols": usecols,
            "gene_level": gene_level,
            "sample_level": sample_level,
            "transform_fn": transform_fn,
            "dropna": dropna,
            "npartitions": npartitions,
            "cohort_name": cohort_name,
        }
        super().__init__(**forwarded)

    @classmethod
    def name(cls):
        """Return the class name as the name of this omics type."""
        return cls.__name__
|
|
365 |
|
|
|
366 |
|
|
|
367 |
class MessengerRNA(Expression, Annotatable):
    """Expression table registered under the name ``MessengerRNA``,
    combining `Expression` loading with the `Annotatable` base."""

    def __init__(self, data, transpose, gene_index=None, usecols=None, gene_level=None,
                 sample_level="sample_index", transform_fn=None, dropna=False, npartitions=None,
                 cohort_name=None):
        """Forward every argument unchanged to `Expression.__init__`.

        Args:
            data: Table input handed to `Expression.__init__`.
            transpose: Whether the table must be transposed.
            gene_index: Column name holding the gene identifiers.
            usecols: Column filter.
            gene_level: Name for the gene (columns) axis.
            sample_level: Name for the sample (row) index.
            transform_fn: Value transformation.
            dropna: Whether to drop rows with null values.
            npartitions: Number of Dask partitions.
            cohort_name: Forwarded as a keyword argument to the parent
                constructor.
        """
        forwarded = {
            "data": data,
            "transpose": transpose,
            "gene_index": gene_index,
            "usecols": usecols,
            "gene_level": gene_level,
            "sample_level": sample_level,
            "transform_fn": transform_fn,
            "dropna": dropna,
            "npartitions": npartitions,
            "cohort_name": cohort_name,
        }
        super().__init__(**forwarded)

    @classmethod
    def name(cls):
        """Return the class name as the name of this omics type."""
        return cls.__name__
|
|
388 |
|
|
|
389 |
|
|
|
390 |
class MicroRNA(Expression, Annotatable):
    """Expression table registered under the name ``MicroRNA``, combining
    `Expression` loading with the `Annotatable` base."""

    def __init__(self, data, transpose, gene_index=None, usecols=None, gene_level=None,
                 sample_level="sample_index", transform_fn=None, dropna=False, npartitions=None,
                 cohort_name=None):
        """Forward every argument unchanged to `Expression.__init__`.

        Args:
            data: Table input handed to `Expression.__init__`.
            transpose: Whether the table must be transposed.
            gene_index: Column name holding the gene identifiers.
            usecols: Column filter.
            gene_level: Name for the gene (columns) axis.
            sample_level: Name for the sample (row) index.
            transform_fn: Value transformation.
            dropna: Whether to drop rows with null values.
            npartitions: Number of Dask partitions.
            cohort_name: Forwarded as a keyword argument to the parent
                constructor.
        """
        forwarded = {
            "data": data,
            "transpose": transpose,
            "gene_index": gene_index,
            "usecols": usecols,
            "gene_level": gene_level,
            "sample_level": sample_level,
            "transform_fn": transform_fn,
            "dropna": dropna,
            "npartitions": npartitions,
            "cohort_name": cohort_name,
        }
        super().__init__(**forwarded)

    @classmethod
    def name(cls):
        """Return the class name as the name of this omics type."""
        return cls.__name__