Switch to unified view

a b/openomics/transcriptomics.py
1
import io
2
import logging
3
import os
4
import re
5
from glob import glob
6
from typing import Union
7
8
import dask.dataframe as dd
9
import numpy as np
10
import pandas as pd
11
import validators
12
# from Bio.UniProt import GOA
13
from dask import delayed
14
15
from .database.base import Annotatable
16
from .io.files import get_pkg_data_filename
17
from .transforms.df import drop_duplicate_columns
18
19
# Public names exported by a star-import of this module.
__all__ = [
    "Expression",
    "MessengerRNA",
    "MicroRNA",
    "LncRNA",
]
20
21
class Expression(object):
    """Import and hold a quantitative omics table (e.g. csv, tsv, excel).

    The table is loaded into a DataFrame, optionally filtered to the
    user-specified columns, indexed by the gene column, reduced to its numeric
    columns and, when requested, transposed so that rows are samples and
    columns are genes/transcripts/peptides. The caller chooses which column
    identifies the genes (e.g. Ensembl gene ID, gene name, transcript ID);
    picking the right one makes it easier to annotate functional, sequence and
    interaction data later on. Apart from the gene column and the sample
    identifiers the table should only contain numeric values.

    Attributes:
        expressions: The processed table (pandas or Dask DataFrame).
        gene_level: Name given to the columns axis, if any.
        sample_level: Name given to the row index.
        cohort_name: Optional cohort label supplied by the caller.
    """
    expressions: pd.DataFrame

    def __init__(self, data, transpose, gene_index=None, usecols=None, gene_level=None, sample_level="sample_index",
                 transform_fn=None, dropna=False, npartitions=None, cohort_name=None, **kwargs):
        """Load ``data`` and preprocess it into ``self.expressions``.

        Args:
            data (str, io.StringIO, pandas.DataFrame, dask DataFrame): Path,
                glob pattern, URL or text stream of the table to import, or an
                already loaded DataFrame.
            transpose (bool): True if the given table has samples for columns
                and genes for rows. False if the table already has samples
                for the row index and genes as columns.
            gene_index (str): The column name of gene/transcript/protein to
                index by.
            usecols (str or list): A regex string selecting the column names
                to keep (alternatives separated by "|"), or an explicit list
                of column names.
            gene_level (str): {"gene", "transcript", "peptide"} If given, the
                columns axis of ``self.expressions`` is renamed to this.
            sample_level (str): {"sample_index", "patient_index"} Name given
                to the row index of ``self.expressions``.
            transform_fn (callable or str): A callable applied to every single
                value, or the string "log2" to apply ``log2(x + 1)``.
                Default None (no transformation).
            dropna (bool): Whether to drop rows with null values.
            npartitions (int): If truthy and the processed table is a pandas
                DataFrame, convert it to a Dask DataFrame with this many
                partitions.
            cohort_name (str): Optional cohort label. It is stored on the
                instance and never forwarded to the table reader.
            **kwargs: Any arguments to pass into pd.read_table(**kwargs)
        """
        self.gene_level = gene_level
        self.sample_level = sample_level
        # Subclasses pass cohort_name down; accepting it explicitly keeps it
        # out of **kwargs, which are handed to pd.read_table and would
        # otherwise raise TypeError on the unknown keyword.
        self.cohort_name = cohort_name

        df = self.load_dataframe(data, transpose=transpose, usecols=usecols, gene_index=gene_index, dropna=dropna,
                                 **kwargs)

        # NOTE(review): for glob inputs load_dataframe_glob() already runs
        # preprocess_table() on every file, so such tables are preprocessed a
        # second time here - confirm this is intended.
        self.expressions = self.preprocess_table(
            df,
            usecols=usecols,
            gene_index=gene_index,
            transposed=transpose,
            dropna=dropna,
        )

        # TODO load DD from file directly
        if npartitions and isinstance(self.expressions, pd.DataFrame):
            self.expressions = dd.from_pandas(self.expressions, npartitions=npartitions)

        if gene_level is not None:
            self.expressions.columns.name = gene_level

        self.expressions.index.name = self.sample_level

        if callable(transform_fn):
            self.expressions = self.expressions.applymap(transform_fn)
        elif transform_fn == "log2":
            # The +1 offset keeps zero counts finite.
            self.expressions = self.expressions.applymap(
                lambda x: np.log2(x + 1))

    @property
    def gene_index(self):
        """Name of the columns axis of ``self.expressions``."""
        return self.expressions.columns.name

    @gene_index.setter
    def gene_index(self, value):
        # set_genes_index() assigns to this attribute; without a setter the
        # assignment raises AttributeError.
        self.expressions.columns.name = value

    def load_dataframe(self,
                       data: "Union[str, pd.DataFrame, dd.DataFrame, io.StringIO]",
                       transpose: bool,
                       usecols: str,
                       gene_index: str,
                       dropna: bool, **kwargs) -> pd.DataFrame:
        """Reading table data inputs to create a DataFrame.

        Args:
            data: either a file path, a glob file path (e.g. "table-*.tsv"), a
                URL, an io.StringIO stream, a pandas.DataFrame, or a dask
                DataFrame.
            transpose (bool): True if table oriented with samples columns, else
                False. Only used for glob inputs.
            usecols (str): A regex string to select columns. Only used for
                glob inputs.
            gene_index (str): The column name what contains the gene names or
                IDs. Only used for glob inputs.
            dropna (bool): Whether to drop rows with null values. Only used
                for glob inputs.
            **kwargs: Keyword arguments forwarded to pd.read_table.

        Returns:
            Union[pd.DataFrame, dd.DataFrame]: The loaded dataframe.

        Raises:
            FileNotFoundError: If ``data`` matches none of the supported
                input kinds.
        """
        if isinstance(data, (pd.DataFrame, dd.DataFrame)):
            df = data

        elif isinstance(data, str) and "*" in data:
            # TODO implement handling for multiple file ByteIO
            df = self.load_dataframe_glob(globstring=data, usecols=usecols, gene_index=gene_index, transpose=transpose,
                                          dropna=dropna, **kwargs)

        elif isinstance(data, io.StringIO):
            # Needed since the file was previous read to extract columns information
            data.seek(0)
            df = pd.read_table(data, **kwargs)

        elif isinstance(data, str) and validators.url(data):
            dataurl, filename = os.path.split(data)
            file = get_pkg_data_filename(dataurl + "/", filename)
            df = pd.read_table(file, **kwargs)

        elif isinstance(data, str) and os.path.isfile(data):
            # Sniff the delimiter by default, but let caller-supplied reader
            # options take precedence instead of silently dropping them.
            read_kwargs = {"sep": None, "engine": "python", **kwargs}
            df = pd.read_table(data, **read_kwargs)

        else:
            raise FileNotFoundError(data)

        return df

    def preprocess_table(self,
        df: "Union[pd.DataFrame, dd.DataFrame]",
        usecols: str = None,
        gene_index: str = None,
        transposed: bool = True,
        sort_index: bool = False,
        dropna: bool = True,
    ):
        """Preprocess an expression table whose rows are genes/transcripts.

        Steps, in order: keep the requested columns, drop duplicated column
        names, optionally drop rows containing nulls, index by ``gene_index``,
        optionally sort the index, keep only numeric columns and optionally
        transpose.

        Neither ``df`` nor ``usecols`` is modified in place.

        Args:
            df (pd.DataFrame): A Dask or Pandas DataFrame.
            usecols (str): A regular expression string for the column names
                to fetch, or a list of column names. ``gene_index`` is added
                to the selection when it is not already part of it.
            gene_index (str): The column name containing the gene/transcript
                names or id's.
            transposed (bool): Default True. Whether to transpose the
                dataframe so columns are genes (features) and rows are
                samples.
            sort_index (bool): Whether to sort rows by the gene index.
            dropna (bool): Whether to drop rows containing null values.

        Returns:
            Union[pd.DataFrame, dd.DataFrame]: the processed DataFrame
        """
        # Filter columns
        if isinstance(usecols, str):
            if gene_index is not None and gene_index not in usecols:
                # include index column in the filter regex query
                usecols = usecols + "|" + gene_index

            if isinstance(df, pd.DataFrame):
                df = df.filter(regex=usecols)
            elif isinstance(df, dd.DataFrame):
                # DataFrame.filter(regex=...) keeps labels matched by
                # re.search, so search (not match) is used here as well to
                # select the same columns for Dask and pandas inputs.
                pattern = re.compile(usecols)
                df = df[[col for col in df.columns if pattern.search(str(col))]]

        elif isinstance(usecols, list):
            if gene_index is not None and gene_index not in usecols:
                # Build a new list so the caller's list is left untouched.
                usecols = usecols + [gene_index]
            df = df[usecols]

        # Drop duplicate column names
        df = drop_duplicate_columns(df)

        # Drop rows containing null values. Not done in place: that would
        # alter a DataFrame handed in by the caller.
        if dropna:
            df = df.dropna()

        if gene_index is not None and df.index.name != gene_index:
            df = df.set_index(gene_index)

        # Needed for Dask Delayed
        if sort_index:
            df = df.sort_index(axis=0, ascending=True)

        # Select only numerical columns
        df = df.select_dtypes(include="number")

        # Transpose dataframe to sample rows and gene columns
        if transposed:
            df = df.T
            # Drop duplicate genes
            df = drop_duplicate_columns(df)

        return df

    def load_dataframe_glob(self, globstring: str, usecols: str, gene_index: str, transpose: bool, dropna: bool,
                            **kwargs):
        """Lazily load and preprocess every file matching a glob pattern.

        Each matched file is read with pd.read_table and passed through
        preprocess_table (with ``sort_index=True``) as a delayed task; the
        tasks are then combined into one Dask DataFrame.

        Args:
            globstring (str): Glob pattern of the files to load.
            usecols (str): Forwarded to preprocess_table.
            gene_index (str): Forwarded to preprocess_table.
            transpose (bool): Forwarded to preprocess_table as ``transposed``.
            dropna (bool): Forwarded to preprocess_table.
            **kwargs: Keyword arguments forwarded to pd.read_table.

        Returns:
            dd.DataFrame

        Raises:
            FileNotFoundError: If no file matches ``globstring``.
        """
        # glob() yields paths in arbitrary order; sorting keeps the order of
        # the delayed tasks reproducible between runs.
        file_paths = sorted(glob(globstring))
        if not file_paths:
            raise FileNotFoundError(globstring)

        lazy_dataframes = []
        for file_path in file_paths:
            table = delayed(pd.read_table)(file_path, **kwargs)
            table = delayed(self.preprocess_table)(
                table,
                usecols=usecols,
                gene_index=gene_index,
                transposed=transpose,
                sort_index=True,
                dropna=dropna,
            )
            lazy_dataframes.append(table)

        filenames = [os.path.split(file_path)[1] for file_path in file_paths]
        logging.info("Files matched: %s", filenames)

        return dd.from_delayed(lazy_dataframes, divisions=None, verify_meta=True)

    def set_genes_index(self, index: str, old_index: str):
        """Rename the gene columns from one identifier to another.

        The mapping comes from ``self.get_rename_dict``, which this class
        does not define itself; the instance is asserted to be an
        Annotatable.

        Args:
            index (str): The identifier to rename the gene columns to.
            old_index (str): The identifier the gene columns currently use.
        """
        assert isinstance(self, Annotatable) and isinstance(self, Expression)
        # Change gene name columns in expressions
        rename_dict = self.get_rename_dict(from_index=old_index,
                                           to_index=index)
        self.expressions = self.expressions.rename(columns=rename_dict)
        self.gene_index = index

        # Change index name in annotation
        self.set_index(index)

    def drop_genes(self, gene_ids: Union[str, list]):
        """Drop columns representing genes/rna/proteins in self.expressions
        dataframe.

        If the instance has a non-empty ``annotations`` table, the same labels
        are dropped from its rows.

        Args:
            gene_ids (str or list): column label(s) that are a subset of the
                columns list
        """
        self.expressions = self.expressions.drop(gene_ids, axis=1)
        if hasattr(self, "annotations") and not self.annotations.empty:
            self.annotations = self.annotations.drop(gene_ids, axis=0)

    def drop_samples(self, sample_ids):
        """Drop rows (samples) from ``self.expressions``.

        Args:
            sample_ids: Row label(s) to drop.
        """
        self.expressions = self.expressions.drop(sample_ids, axis=0)

    @classmethod
    def name(cls):
        """Return the name of this data type; subclasses must override."""
        raise NotImplementedError

    def get_genes_list(self, level: int = None):
        """Return the gene labels, i.e. the columns of ``self.expressions``.

        Args:
            level (int): Default None. Only needed if gene index is a
                :class:`pd.MultiIndex`; defaults to the level named after
                ``self.gene_index``.
        """
        index = self.expressions.columns

        if isinstance(index, pd.MultiIndex):
            return index.get_level_values(
                self.gene_index if level is None else level)
        else:
            return index

    def get_samples_list(self, level=None):
        """Return the sample labels, i.e. the row index of ``self.expressions``.

        Args:
            level: Default None. Only needed if the sample index is a
                :class:`pd.MultiIndex`; defaults to the level named after
                ``self.sample_level``.
        """
        index = self.expressions.index
        if isinstance(index, pd.MultiIndex):
            # The row index is named after sample_level (see __init__);
            # gene_index names the columns axis and is not a row level.
            return index.get_level_values(
                self.sample_level if level is None else level)
        else:
            return index

    samples = property(get_samples_list)
    features = property(get_genes_list)
329
330
331
class LncRNA(Expression, Annotatable):
    """Expression table whose data-type name is ``LncRNA``."""

    def __init__(
        self,
        data,
        transpose,
        gene_index=None,
        usecols=None,
        gene_level=None,
        sample_level="sample_index",
        transform_fn=None,
        dropna=False,
        npartitions=None,
        cohort_name=None,
    ):
        """Load and preprocess the table via ``Expression.__init__``.

        Args:
            data: Path, glob pattern, URL, text stream or DataFrame to import.
            transpose: True if the table has samples for columns and genes
                for rows.
            gene_index: Column name of the gene identifiers to index by.
            usecols: Regex string (or list) selecting the columns to keep.
            gene_level: Name given to the columns axis, if any.
            sample_level: Name given to the row index.
            transform_fn: Callable applied to every value, or "log2".
            dropna: Whether to drop rows with null values.
            npartitions: If truthy, convert the table to a Dask DataFrame
                with this many partitions.
            cohort_name: Optional cohort label, stored as
                ``self.cohort_name``.
        """
        super().__init__(data=data, transpose=transpose, gene_index=gene_index, usecols=usecols,
                         gene_level=gene_level, sample_level=sample_level, transform_fn=transform_fn,
                         dropna=dropna, npartitions=npartitions)
        # Kept on the instance rather than forwarded, so it can never end up
        # among the keyword arguments handed to the table reader.
        self.cohort_name = cohort_name

    @classmethod
    def name(cls):
        """Return the class name, used as the name of this data type."""
        return cls.__name__
365
366
367
class MessengerRNA(Expression, Annotatable):
    """Expression table whose data-type name is ``MessengerRNA``."""

    def __init__(
        self,
        data,
        transpose,
        gene_index=None,
        usecols=None,
        gene_level=None,
        sample_level="sample_index",
        transform_fn=None,
        dropna=False,
        npartitions=None,
        cohort_name=None,
    ):
        """Load and preprocess the table via ``Expression.__init__``.

        Args:
            data: Path, glob pattern, URL, text stream or DataFrame to import.
            transpose: True if the table has samples for columns and genes
                for rows.
            gene_index: Column name of the gene identifiers to index by.
            usecols: Regex string (or list) selecting the columns to keep.
            gene_level: Name given to the columns axis, if any.
            sample_level: Name given to the row index.
            transform_fn: Callable applied to every value, or "log2".
            dropna: Whether to drop rows with null values.
            npartitions: If truthy, convert the table to a Dask DataFrame
                with this many partitions.
            cohort_name: Optional cohort label, stored as
                ``self.cohort_name``.
        """
        super().__init__(data=data, transpose=transpose, gene_index=gene_index, usecols=usecols,
                         gene_level=gene_level, sample_level=sample_level, transform_fn=transform_fn,
                         dropna=dropna, npartitions=npartitions)
        # Kept on the instance rather than forwarded, so it can never end up
        # among the keyword arguments handed to the table reader.
        self.cohort_name = cohort_name

    @classmethod
    def name(cls):
        """Return the class name, used as the name of this data type."""
        return cls.__name__
388
389
390
class MicroRNA(Expression, Annotatable):
    """Expression table whose data-type name is ``MicroRNA``."""

    def __init__(
        self,
        data,
        transpose,
        gene_index=None,
        usecols=None,
        gene_level=None,
        sample_level="sample_index",
        transform_fn=None,
        dropna=False,
        npartitions=None,
        cohort_name=None,
    ):
        """Load and preprocess the table via ``Expression.__init__``.

        Args:
            data: Path, glob pattern, URL, text stream or DataFrame to import.
            transpose: True if the table has samples for columns and genes
                for rows.
            gene_index: Column name of the gene identifiers to index by.
            usecols: Regex string (or list) selecting the columns to keep.
            gene_level: Name given to the columns axis, if any.
            sample_level: Name given to the row index.
            transform_fn: Callable applied to every value, or "log2".
            dropna: Whether to drop rows with null values.
            npartitions: If truthy, convert the table to a Dask DataFrame
                with this many partitions.
            cohort_name: Optional cohort label, stored as
                ``self.cohort_name``.
        """
        super().__init__(data=data, transpose=transpose, gene_index=gene_index, usecols=usecols,
                         gene_level=gene_level, sample_level=sample_level, transform_fn=transform_fn,
                         dropna=dropna, npartitions=npartitions)
        # Kept on the instance rather than forwarded, so it can never end up
        # among the keyword arguments handed to the table reader.
        self.cohort_name = cohort_name

    @classmethod
    def name(cls):
        """Return the class name, used as the name of this data type."""
        return cls.__name__