Diff of /openomics/multiomics.py [000000] .. [548210]

Switch to unified view

a b/openomics/multiomics.py
1
import logging
2
import os
3
import warnings
4
from os.path import exists, join
5
from typing import List, Dict, Union
6
7
import pandas as pd
8
from logzero import logger
9
from ruamel import yaml
10
11
import openomics
12
from .clinical import (
13
    ClinicalData,
14
    HISTOLOGIC_SUBTYPE_COL,
15
    PATHOLOGIC_STAGE_COL,
16
    TUMOR_NORMAL_COL,
17
    PREDICTED_SUBTYPE_COL,
18
)
19
from .database.base import Annotatable
20
from .genomics import SomaticMutation, CopyNumberVariation, DNAMethylation
21
from .imageomics import WholeSlideImage
22
from .proteomics import Protein
23
from .transcriptomics import MessengerRNA, MicroRNA, LncRNA, Expression
24
25
__all__ = ['MultiOmics']
26
27
class MultiOmics:
28
    """A data object which holds multiple -omics data for a single clinical
29
    cohort.
30
    """
31
    def __init__(self, cohort_name, omics_data=None):
32
        """
33
        Args:
34
            cohort_name (str): the clinical cohort name
35
            omics_data:
36
        """
37
        self._cohort_name = cohort_name
38
        self._omics = []
39
40
        # This is a data dictionary accessor to retrieve individual -omic data
41
        self.data: Dict[str, pd.DataFrame] = {}
42
43
        if omics_data:
44
            for omics in omics_data:
45
                self.add_omic(omics)
46
47
    def __repr__(self):
48
        return f"Cohort: {self._cohort_name}" \
49
               "\nExpression: {}" \
50
               "\nAnnotations: {}".format(
51
            {ntype: df.shape for ntype, df in self.data.items()},
52
            {ntype: omic.annotations.shape for ntype, omic in self.__dict__.items() if hasattr(omic, 'annotations')})
53
54
    @classmethod
55
    def load(cls, path):
56
        """
57
        Load a MultiOmics save directory and create a MultiOmics object.
58
        Args:
59
            path (str):
60
61
        Returns:
62
            MultiOmics
63
        """
64
        if isinstance(path, str) and '~' in path:
65
            path = os.path.expanduser(path)
66
67
        with open(join(path, 'metadata.yml'), 'r') as f:
68
            warnings.simplefilter('ignore', yaml.error.UnsafeLoaderWarning)
69
            attrs: Dict = yaml.load(f)
70
71
        logger.info(f"Loading MultiOmics {attrs} from {path}")
72
73
        omics = []
74
        for name in attrs['_omics']:
75
            if exists(join(path, f'{name}.expressions.pickle')):
76
                df = pd.read_pickle(join(path, f'{name}.expressions.pickle'))
77
                if name == MessengerRNA.name():
78
                    OmicsClass = MessengerRNA
79
                elif name == MicroRNA.name():
80
                    OmicsClass = MicroRNA
81
                elif name == LncRNA.name():
82
                    OmicsClass = LncRNA
83
                elif name == Protein.name():
84
                    OmicsClass = Protein
85
                elif name == Expression.name():
86
                    OmicsClass = Expression
87
88
                omic = OmicsClass(df, transpose=False)
89
            else:
90
                continue
91
92
            # annotation data
93
            if exists(join(path, f'{name}.annotations.pickle')):
94
                omic.annotations = pd.read_pickle(join(path, f'{name}.annotations.pickle'))
95
96
            omics.append(omic)
97
98
        self = cls(cohort_name=attrs['_cohort_name'], omics_data=omics)
99
        return self
100
101
    def save(self, path):
102
        """
103
        Serialize all omics data to disk.
104
        Args:
105
            path (str): A path to a directory where the MultiOmics data will be serialized into files.
106
        """
107
        if isinstance(path, str) and '~' in path:
108
            path = os.path.expanduser(path)
109
110
        if not exists(path):
111
            os.makedirs(path)
112
113
        # Write metadata to YAML so can be readable
114
        attrs = {
115
            '_omics': getattr(self, '_omics', None),
116
            '_cohort_name': getattr(self, '_cohort_name', None),
117
        }
118
        with open(join(path, 'metadata.yml'), 'w') as outfile:
119
            yaml.dump(attrs, outfile, default_flow_style=False)
120
121
        # Write each omics type
122
        for name, expressions_df in self.data.items():
123
            # expressions data
124
            if not exists(join(path, f'{name}.expressions.pickle')):
125
                expressions_df.to_pickle(join(path, f'{name}.expressions.pickle'))
126
127
            # annotation data
128
            expression_obj = self.__getitem__(name)
129
            if not exists(join(path, f'{name}.annotations.pickle')):
130
                expression_obj.annotations.to_pickle(join(path, f'{name}.annotations.pickle'))
131
132
    def add_omic(self,
133
                 omic_data: Union[Expression, Annotatable],
134
                 init_annotations: bool = True):
135
        """Adds an omic object to the Multiomics such that the samples in omic
136
        matches the samples existing in the other omics.
137
138
        Args:
139
            omic_data (Expression): The omic to add, e.g., MessengerRNA,
140
                MicroRNA, LncRNA, etc.
141
            init_annotations (bool): default True. If true, initializes
142
                the annotation dataframe in the omic object
143
        """
144
        self.__dict__[omic_data.name()] = omic_data
145
146
        if omic_data.name() not in self._omics:
147
            self._omics.append(omic_data.name())
148
149
        # dictionary as data accessor to the expression data
150
        self.data[omic_data.name()] = omic_data.expressions
151
152
        # Initialize annotation
153
        if init_annotations:
154
            omic_data.init_annotations(index=omic_data.get_genes_list())
155
156
        logging.info(
157
            "{} {} , indexed by: {}".format(omic_data.name(),
158
                                            self.data[omic_data.name()].shape if hasattr(
159
                                                self.data[omic_data.name()], "shape") else ": None",
160
                                            omic_data.annotations.index.name))
161
162
    def add_clinical_data(self, clinical: openomics.clinical.ClinicalData, **kwargs):
163
        """Add a ClinicalData instance to the MultiOmics instance.
164
165
        Args:
166
            clinical (openomics.clinical.ClinicalData):
167
            **kwargs:
168
        """
169
        if not isinstance(clinical, ClinicalData):
170
            raise Exception("Must pass a ClinicalData in, not a file path.")
171
172
        self.clinical = clinical
173
174
        self.data["PATIENTS"] = self.clinical.patient
175
        if hasattr(self.clinical, "biospecimen"):
176
            self.data["BIOSPECIMENS"] = self.clinical.biospecimen
177
        if hasattr(self.clinical, "drugs"):
178
            self.data["DRUGS"] = self.clinical.drugs
179
180
        self.build_samples(**kwargs)
181
182
    def get_omics_list(self):
183
        return self._omics
184
185
    def __getitem__(self, item: str) -> Union[Expression, Annotatable]:
186
        """This function allows the MultiOmicData class objects to access
187
        individual omics by a dictionary lookup, e.g. openomics["MicroRNA"]
188
189
        Args:
190
            item (str): a string of the class name
191
        """
192
        if item.lower() == MessengerRNA.name().lower():
193
            return self.__getattribute__(MessengerRNA.name())
194
195
        elif item.lower() == MicroRNA.name().lower():
196
            return self.__getattribute__(MicroRNA.name())
197
198
        elif item.lower() == LncRNA.name().lower():
199
            return self.__getattribute__(LncRNA.name())
200
201
        elif item.lower() == WholeSlideImage.name().lower():
202
            return self.__getattribute__(WholeSlideImage.name())
203
204
        elif item.lower() == SomaticMutation.name().lower():
205
            return self.__getattribute__(SomaticMutation.name())
206
207
        elif item.lower() == CopyNumberVariation.name().lower():
208
            return self.__getattribute__(CopyNumberVariation.name())
209
210
        elif item.lower() == DNAMethylation.name().lower():
211
            return self.__getattribute__(DNAMethylation.name())
212
213
        elif item.lower() == Protein.name().lower():
214
            return self.__getattribute__(Protein.name())
215
216
        elif item.lower() == "patients":
217
            return self.clinical.patient
218
        elif item.lower() == "samples":
219
            if hasattr(self, "clinical"):
220
                return self.clinical.samples
221
            else:
222
                return self.samples
223
        elif item.lower() == "drugs":
224
            return self.clinical.drugs
225
        else:
226
            raise Exception(
227
                'String accessor must be one of {"MessengerRNA", "MicroRNA", "LncRNA", "Protein", etc.}'
228
            )
229
230
    def remove_duplicate_genes(self):
231
        """Removes duplicate genes between any omics such that the gene index
232
        across all omics has no duplicates.
233
        """
234
        for omic_A in self._omics:
235
            for omic_B in self._omics:
236
                if omic_A != omic_B:
237
                    self.__getattribute__(omic_A).drop_genes(
238
                        set(self.__getattribute__(omic_A).get_genes_list())
239
                        & set(self.__getattribute__(omic_B).get_genes_list()))
240
241
    def build_samples(self, agg_by="union"):
242
        """Running this function will build a dataframe for all samples across
243
        the different omics (either by a union or intersection). Then,
244
245
        Args:
246
            agg_by (str): ["union", "intersection"]
247
        """
248
        # make sure at least one ExpressionData present
249
        if len(self._omics) < 1:
250
            logging.debug(
251
                "build_samples() does nothing. Must add at least one omic to this MultiOmics object."
252
            )
253
            return
254
255
        all_samples = pd.Index([])
256
        for omic in self._omics:
257
            if agg_by == "union":
258
                all_samples = all_samples.union(self.data[omic].index)
259
            elif agg_by == "intersection":
260
                all_samples = all_samples.intersection(self.data[omic].index)
261
262
        if hasattr(self, "clinical"):
263
            self.clinical.build_clinical_samples(all_samples)
264
            self.samples = self.clinical.samples.index
265
        else:
266
            self.samples = all_samples
267
268
    def __dir__(self):
269
        return list(self.data.keys())
270
271
    def match_samples(self, omics) -> pd.Index:
272
        """Return the index of sample IDs of the intersection of
273
        samples from all modalities
274
275
        Args:
276
            omics: An array of modalities
277
278
        Returns:
279
            matched_sapmles: An pandas Index list
280
        """
281
        # TODO check that for single modalities, this fetch all patients
282
        matched_samples = self.data[omics[0]].index.copy()
283
284
        for omic in omics:
285
            matched_samples = matched_samples.join(self.data[omic].index,
286
                                                   how="inner")
287
288
        return matched_samples
289
290
    def load_data(self,
291
                  omics: Union[List[str], str],
292
                  target: List[str] = ["pathologic_stage"],
293
                  pathologic_stages=None,
294
                  histological_subtypes=None,
295
                  predicted_subtypes=None,
296
                  tumor_normal=None,
297
                  samples_barcode=None,
298
                  remove_duplicates=True):
299
        """ Prepare the multiomics data in format
300
301
        Args:
302
            omics (list): A list of the data modalities to load. Default "all"
303
                to select all modalities
304
            target (list): The clinical data fields to include in the
305
            pathologic_stages (list): Only fetch samples having certain stages
306
                in their corresponding patient's clinical data. For instance,
307
                ["Stage I", "Stage II"] will only fetch samples from Stage I and
308
                Stage II patients. Default is [] which fetches all pathologic
309
                stages.
310
            histological_subtypes: A list specifying the histological subtypes
311
                to fetch. Default is [] which fetches all histological sybtypes.
312
            predicted_subtypes: A list specifying the histological subtypes to
313
                fetch. Default is [] which fetches all histological sybtypes.
314
            tumor_normal: ["Tumor", "Normal"]. Default is [], which fetches all
315
                tumor or normal sample types.
316
            samples_barcode: A list of sample's barcode. If not None, only fetch
317
                data with matching samples provided in this list.
318
            remove_duplicates (bool): If True, only selects samples with non-duplicated index.
319
320
        Returns:
321
            Tuple[Dict[str, pd.DataFrame], pd.DataFrame]: Returns (X, y), where
322
            X is a dictionary containing the multiomics data with matched
323
            samples, and y contain the :param target: labels for those samples.
324
        """
325
        if omics == "all" or omics is None:
326
            omics = self._omics
327
328
        matched_samples = self.match_samples(omics)
329
330
        if samples_barcode is not None:
331
            matched_samples = samples_barcode
332
333
        if hasattr(self, "clinical") and isinstance(self.clinical,
334
                                                    ClinicalData):
335
            # Build targets clinical data
336
            y = self.get_sample_attributes(matched_samples)
337
338
            # Select only samples with certain cancer stage or subtype
339
            if pathologic_stages:
340
                y = y[y[PATHOLOGIC_STAGE_COL].isin(pathologic_stages)]
341
            if histological_subtypes:
342
                y = y[y[HISTOLOGIC_SUBTYPE_COL].isin(histological_subtypes)]
343
            if predicted_subtypes:
344
                y = y[y[PREDICTED_SUBTYPE_COL].isin(predicted_subtypes)]
345
            if tumor_normal:
346
                y = y[y[TUMOR_NORMAL_COL].isin(tumor_normal)]
347
348
            # Filter y target column labels
349
            y = y.filter(target)
350
            y.dropna(axis=0, inplace=True)
351
            matched_samples = y.index
352
        else:
353
            y = None
354
355
        # Build expression matrix for each omic, indexed by matched_samples
356
        X_multiomics = {}
357
        for omic in omics:
358
            X_multiomics[omic] = self.data[omic].loc[matched_samples, self[omic].get_genes_list()]
359
360
            if remove_duplicates:
361
                X_multiomics[omic] = X_multiomics[omic].loc[~X_multiomics[omic].index.duplicated(keep="first")]
362
363
        return X_multiomics, y
364
365
    def get_sample_attributes(self, matched_samples):
366
        """Fetch patient's clinical data for each given samples barcodes in the
367
        matched_samples
368
369
        Returns
370
            samples_index: Index of samples
371
372
        Args:
373
            matched_samples: A list of sample barcodes
374
        """
375
        return self.data["SAMPLES"].reindex(matched_samples)
376
377
    def print_sample_sizes(self):
378
        for omic in self.data:
379
            print(
380
                omic,
381
                self.data[omic].shape
382
                if hasattr(self.data[omic], "shape") else "None",
383
            )
384
385
    def annotate_samples(self, dictionary):
386
        """This function adds a "predicted_subtype" field to the patients
387
        clinical data. For instance, patients were classified into subtypes
388
        based on their expression profile using k-means, then, to use this
389
        function, do:
390
391
        annotate_patients(dict(zip(patient index>, <list of corresponding
392
        patient's subtypes>)))
393
394
        Adding a field to the patients clinical data allows openomics to
395
        query the patients data through the .load_data(subtypes=[]) parameter,
396
397
        Args:
398
            dictionary: A dictionary mapping patient's index to a subtype
399
        """
400
        self.data["PATIENTS"] = self.data["PATIENTS"].assign(
401
            subtypes=self.data["PATIENTS"][
402
                self.clinical.patient_column].map(dictionary))