openomics/multiomics.py

|
import logging
import os
import warnings
from os.path import exists, join
from typing import List, Dict, Union

import pandas as pd
from logzero import logger
from ruamel import yaml

import openomics
from .clinical import (
    ClinicalData,
    HISTOLOGIC_SUBTYPE_COL,
    PATHOLOGIC_STAGE_COL,
    TUMOR_NORMAL_COL,
    PREDICTED_SUBTYPE_COL,
)
from .database.base import Annotatable
from .genomics import SomaticMutation, CopyNumberVariation, DNAMethylation
from .imageomics import WholeSlideImage
from .proteomics import Protein
from .transcriptomics import MessengerRNA, MicroRNA, LncRNA, Expression

__all__ = ['MultiOmics']

class MultiOmics:
    """A data object which holds multiple -omics data for a single clinical
    cohort.
    """

    def __init__(self, cohort_name, omics_data=None):
        """
        Args:
            cohort_name (str): The clinical cohort name.
            omics_data: An optional list of omics (Expression) objects to add
                at construction; each is passed to add_omic().
        """
        self._cohort_name = cohort_name
        self._omics = []

        # A data dictionary accessor to retrieve individual -omic dataframes
        self.data: Dict[str, pd.DataFrame] = {}

        if omics_data:
            for omics in omics_data:
                self.add_omic(omics)

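    # Usage sketch (hypothetical data; assumes `mrna_df` and `mirna_df` are
    # sample-by-gene pandas DataFrames loaded elsewhere, and uses the same
    # two-argument omic constructor seen in load() below):
    #
    #     mrna = MessengerRNA(mrna_df, transpose=False)
    #     mirna = MicroRNA(mirna_df, transpose=False)
    #     cohort = MultiOmics(cohort_name="TCGA-LUAD", omics_data=[mrna, mirna])
    #     cohort[MessengerRNA.name()].expressions.shape
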
    def __repr__(self):
        expressions = {ntype: df.shape for ntype, df in self.data.items()}
        annotations = {ntype: omic.annotations.shape for ntype, omic in self.__dict__.items()
                       if hasattr(omic, 'annotations')}
        return f"Cohort: {self._cohort_name}" \
               f"\nExpression: {expressions}" \
               f"\nAnnotations: {annotations}"

    @classmethod
    def load(cls, path):
        """
        Load a MultiOmics save directory and create a MultiOmics object.

        Args:
            path (str): Path to a directory previously written by save().

        Returns:
            MultiOmics
        """
        if isinstance(path, str) and '~' in path:
            path = os.path.expanduser(path)

        with open(join(path, 'metadata.yml'), 'r') as f:
            warnings.simplefilter('ignore', yaml.error.UnsafeLoaderWarning)
            attrs: Dict = yaml.load(f)

        logger.info(f"Loading MultiOmics {attrs} from {path}")

        omics = []
        for name in attrs['_omics']:
            # expressions data
            if exists(join(path, f'{name}.expressions.pickle')):
                df = pd.read_pickle(join(path, f'{name}.expressions.pickle'))
                if name == MessengerRNA.name():
                    OmicsClass = MessengerRNA
                elif name == MicroRNA.name():
                    OmicsClass = MicroRNA
                elif name == LncRNA.name():
                    OmicsClass = LncRNA
                elif name == Protein.name():
                    OmicsClass = Protein
                elif name == Expression.name():
                    OmicsClass = Expression
                else:
                    # Skip unrecognized omic types instead of reusing a stale class
                    logger.warning(f"Unrecognized omic type '{name}', skipping.")
                    continue

                omic = OmicsClass(df, transpose=False)
            else:
                continue

            # annotation data
            if exists(join(path, f'{name}.annotations.pickle')):
                omic.annotations = pd.read_pickle(join(path, f'{name}.annotations.pickle'))

            omics.append(omic)

        return cls(cohort_name=attrs['_cohort_name'], omics_data=omics)

    def save(self, path):
        """
        Serialize all omics data to disk.

        Args:
            path (str): A path to a directory where the MultiOmics data will be serialized into files.
        """
        if isinstance(path, str) and '~' in path:
            path = os.path.expanduser(path)

        if not exists(path):
            os.makedirs(path)

        # Write metadata to YAML so it stays human-readable
        attrs = {
            '_omics': getattr(self, '_omics', None),
            '_cohort_name': getattr(self, '_cohort_name', None),
        }
        with open(join(path, 'metadata.yml'), 'w') as outfile:
            yaml.dump(attrs, outfile, default_flow_style=False)

        # Write each omics type (clinical tables in self.data are not serialized here)
        for name, expressions_df in self.data.items():
            if name not in self._omics:
                continue

            # expressions data
            if not exists(join(path, f'{name}.expressions.pickle')):
                expressions_df.to_pickle(join(path, f'{name}.expressions.pickle'))

            # annotation data
            expression_obj = self.__getitem__(name)
            if not exists(join(path, f'{name}.annotations.pickle')):
                expression_obj.annotations.to_pickle(join(path, f'{name}.annotations.pickle'))

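    # Usage sketch (hypothetical path), showing the save/load round trip:
    #
    #     cohort.save("~/data/tcga_luad_multiomics")
    #     restored = MultiOmics.load("~/data/tcga_luad_multiomics")
    #     restored.get_omics_list()
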
    def add_omic(self,
                 omic_data: Union[Expression, Annotatable],
                 init_annotations: bool = True):
        """Adds an omic object to the MultiOmics such that the samples in the
        omic match the samples existing in the other omics.

        Args:
            omic_data (Expression): The omic to add, e.g., MessengerRNA,
                MicroRNA, LncRNA, etc.
            init_annotations (bool): default True. If True, initializes
                the annotations dataframe in the omic object.
        """
        self.__dict__[omic_data.name()] = omic_data

        if omic_data.name() not in self._omics:
            self._omics.append(omic_data.name())

        # Dictionary accessor to the expression data
        self.data[omic_data.name()] = omic_data.expressions

        # Initialize annotations
        if init_annotations:
            omic_data.init_annotations(index=omic_data.get_genes_list())

        logging.info(
            "{} {}, indexed by: {}".format(
                omic_data.name(),
                self.data[omic_data.name()].shape
                if hasattr(self.data[omic_data.name()], "shape") else "None",
                omic_data.annotations.index.name))

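    # Usage sketch (hypothetical; assumes `protein_df` is a sample-by-protein
    # DataFrame built elsewhere):
    #
    #     cohort.add_omic(Protein(protein_df, transpose=False))
    #     cohort[Protein.name()].annotations.head()
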
    def add_clinical_data(self, clinical: openomics.clinical.ClinicalData, **kwargs):
        """Add a ClinicalData instance to the MultiOmics instance.

        Args:
            clinical (openomics.clinical.ClinicalData): The cohort's clinical data.
            **kwargs: Keyword arguments passed through to build_samples().
        """
        if not isinstance(clinical, ClinicalData):
            raise Exception("Must pass a ClinicalData in, not a file path.")

        self.clinical = clinical

        self.data["PATIENTS"] = self.clinical.patient
        if hasattr(self.clinical, "biospecimen"):
            self.data["BIOSPECIMENS"] = self.clinical.biospecimen
        if hasattr(self.clinical, "drugs"):
            self.data["DRUGS"] = self.clinical.drugs

        self.build_samples(**kwargs)

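    # Usage sketch (hypothetical; assumes `clinical` is a ClinicalData built from
    # a patient table elsewhere):
    #
    #     cohort.add_clinical_data(clinical, agg_by="intersection")
    #     cohort["patients"].head()   # patient-level clinical dataframe
    #     cohort.samples[:5]          # sample index built across the omics
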
    def get_omics_list(self):
        return self._omics

    def __getitem__(self, item: str) -> Union[Expression, Annotatable]:
        """Allows a MultiOmics object to access its individual omics by a
        dictionary-style lookup, e.g. multiomics["MicroRNA"].

        Args:
            item (str): A string matching the omic's class name.
        """
        if item.lower() == MessengerRNA.name().lower():
            return self.__getattribute__(MessengerRNA.name())

        elif item.lower() == MicroRNA.name().lower():
            return self.__getattribute__(MicroRNA.name())

        elif item.lower() == LncRNA.name().lower():
            return self.__getattribute__(LncRNA.name())

        elif item.lower() == WholeSlideImage.name().lower():
            return self.__getattribute__(WholeSlideImage.name())

        elif item.lower() == SomaticMutation.name().lower():
            return self.__getattribute__(SomaticMutation.name())

        elif item.lower() == CopyNumberVariation.name().lower():
            return self.__getattribute__(CopyNumberVariation.name())

        elif item.lower() == DNAMethylation.name().lower():
            return self.__getattribute__(DNAMethylation.name())

        elif item.lower() == Protein.name().lower():
            return self.__getattribute__(Protein.name())

        elif item.lower() == "patients":
            return self.clinical.patient
        elif item.lower() == "samples":
            if hasattr(self, "clinical"):
                return self.clinical.samples
            else:
                return self.samples
        elif item.lower() == "drugs":
            return self.clinical.drugs
        else:
            raise Exception(
                'String accessor must be one of {"MessengerRNA", "MicroRNA", "LncRNA", "Protein", etc.}'
            )

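    # Usage sketch: string accessors return either an omic object or a clinical
    # table (the "drugs" table only exists if drug data was loaded):
    #
    #     cohort[MicroRNA.name()].expressions   # expressions DataFrame of the MicroRNA omic
    #     cohort["drugs"]                       # clinical drug table, if present
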
    def remove_duplicate_genes(self):
        """Removes duplicate genes between any omics such that the gene index
        across all omics has no duplicates.
        """
        for omic_A in self._omics:
            for omic_B in self._omics:
                if omic_A != omic_B:
                    self.__getattribute__(omic_A).drop_genes(
                        set(self.__getattribute__(omic_A).get_genes_list())
                        & set(self.__getattribute__(omic_B).get_genes_list()))

    def build_samples(self, agg_by="union"):
        """Builds the index of all samples across the different omics, either
        by a union or an intersection of their sample indices.

        Args:
            agg_by (str): Either "union" or "intersection".
        """
        # make sure at least one ExpressionData is present
        if len(self._omics) < 1:
            logging.debug(
                "build_samples() does nothing. Must add at least one omic to this MultiOmics object."
            )
            return

        # Seed the "intersection" case with the first omic's index, since
        # intersecting with an empty index would always yield an empty result.
        all_samples = pd.Index([]) if agg_by == "union" else self.data[self._omics[0]].index
        for omic in self._omics:
            if agg_by == "union":
                all_samples = all_samples.union(self.data[omic].index)
            elif agg_by == "intersection":
                all_samples = all_samples.intersection(self.data[omic].index)

        if hasattr(self, "clinical"):
            self.clinical.build_clinical_samples(all_samples)
            self.samples = self.clinical.samples.index
        else:
            self.samples = all_samples

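    # Usage sketch: rebuild the sample index after adding omics, keeping only
    # samples present in every omic.
    #
    #     cohort.build_samples(agg_by="intersection")
    #     len(cohort.samples)
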
    def __dir__(self):
        return list(self.data.keys())

    def match_samples(self, omics) -> pd.Index:
        """Return the index of sample IDs in the intersection of samples from
        all the given modalities.

        Args:
            omics: A list of omic names (modalities).

        Returns:
            matched_samples: A pandas Index of the matched sample IDs.
        """
        # TODO check that for a single modality, this fetches all patients
        matched_samples = self.data[omics[0]].index.copy()

        for omic in omics:
            matched_samples = matched_samples.join(self.data[omic].index,
                                                   how="inner")

        return matched_samples

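    # Usage sketch: samples present in both the mRNA and miRNA modalities.
    #
    #     shared = cohort.match_samples([MessengerRNA.name(), MicroRNA.name()])
    #     shared[:5]
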
    def load_data(self,
                  omics: Union[List[str], str],
                  target: List[str] = ["pathologic_stage"],
                  pathologic_stages=None,
                  histological_subtypes=None,
                  predicted_subtypes=None,
                  tumor_normal=None,
                  samples_barcode=None,
                  remove_duplicates=True):
        """Prepare the multi-omics data with samples matched across modalities.

        Args:
            omics (list): A list of the data modalities to load, or "all" to
                select all modalities.
            target (list): The clinical data fields to include as the target
                labels.
            pathologic_stages (list): Only fetch samples having certain stages
                in their corresponding patient's clinical data. For instance,
                ["Stage I", "Stage II"] will only fetch samples from Stage I and
                Stage II patients. Default is None, which fetches all pathologic
                stages.
            histological_subtypes: A list specifying the histological subtypes
                to fetch. Default is None, which fetches all histological
                subtypes.
            predicted_subtypes: A list specifying the predicted subtypes to
                fetch. Default is None, which fetches all predicted subtypes.
            tumor_normal: ["Tumor", "Normal"]. Default is None, which fetches
                all tumor and normal sample types.
            samples_barcode: A list of sample barcodes. If not None, only fetch
                data for the samples provided in this list.
            remove_duplicates (bool): If True, only selects samples with a
                non-duplicated index.

        Returns:
            Tuple[Dict[str, pd.DataFrame], pd.DataFrame]: Returns (X, y), where
            X is a dictionary containing the multi-omics data with matched
            samples, and y contains the `target` labels for those samples.
        """
        if omics == "all" or omics is None:
            omics = self._omics

        matched_samples = self.match_samples(omics)

        if samples_barcode is not None:
            matched_samples = samples_barcode

        if hasattr(self, "clinical") and isinstance(self.clinical, ClinicalData):
            # Build target labels from the clinical data
            y = self.get_sample_attributes(matched_samples)

            # Select only samples with a certain cancer stage or subtype
            if pathologic_stages:
                y = y[y[PATHOLOGIC_STAGE_COL].isin(pathologic_stages)]
            if histological_subtypes:
                y = y[y[HISTOLOGIC_SUBTYPE_COL].isin(histological_subtypes)]
            if predicted_subtypes:
                y = y[y[PREDICTED_SUBTYPE_COL].isin(predicted_subtypes)]
            if tumor_normal:
                y = y[y[TUMOR_NORMAL_COL].isin(tumor_normal)]

            # Filter y to the target column labels
            y = y.filter(target)
            y.dropna(axis=0, inplace=True)
            matched_samples = y.index
        else:
            y = None

        # Build the expression matrix for each omic, indexed by matched_samples
        X_multiomics = {}
        for omic in omics:
            X_multiomics[omic] = self.data[omic].loc[matched_samples, self[omic].get_genes_list()]

            if remove_duplicates:
                X_multiomics[omic] = X_multiomics[omic].loc[~X_multiomics[omic].index.duplicated(keep="first")]

        return X_multiomics, y

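    # Usage sketch (hypothetical stage values; assumes the clinical data carries
    # a "pathologic_stage" field):
    #
    #     X, y = cohort.load_data(omics=[MessengerRNA.name(), MicroRNA.name()],
    #                             target=["pathologic_stage"],
    #                             pathologic_stages=["Stage I", "Stage II"])
    #     X[MessengerRNA.name()].shape          # (n_matched_samples, n_genes)
    #     y["pathologic_stage"].value_counts()
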
    def get_sample_attributes(self, matched_samples):
        """Fetch the patients' clinical data for each of the given sample
        barcodes in matched_samples.

        Args:
            matched_samples: A list of sample barcodes.

        Returns:
            A DataFrame of sample-level clinical attributes, reindexed by the
            given sample barcodes.
        """
        return self.data["SAMPLES"].reindex(matched_samples)

    def print_sample_sizes(self):
        for omic in self.data:
            print(
                omic,
                self.data[omic].shape
                if hasattr(self.data[omic], "shape") else "None",
            )

    def annotate_samples(self, dictionary):
        """Adds a "subtypes" field to the patients' clinical data. For
        instance, if patients were classified into subtypes based on their
        expression profile using k-means, then, to use this function, do:

            annotate_samples(dict(zip(<patient index>, <list of corresponding
            patient's subtypes>)))

        Adding this field to the patients' clinical data allows openomics to
        query the patients through the subtype-filtering parameters of
        .load_data().

        Args:
            dictionary: A dictionary mapping a patient's index to a subtype.
        """
        self.data["PATIENTS"] = self.data["PATIENTS"].assign(
            subtypes=self.data["PATIENTS"][
                self.clinical.patient_column].map(dictionary))
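
    # Usage sketch (hypothetical names; assumes `cluster_labels` holds one
    # subtype label per patient, e.g. from k-means on an expression matrix):
    #
    #     patient_ids = cohort.data["PATIENTS"][cohort.clinical.patient_column]
    #     cohort.annotate_samples(dict(zip(patient_ids, cluster_labels)))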