|
a |
|
b/pyMultiOmics/base.py |
|
|
1 |
from collections import OrderedDict |
|
|
2 |
|
|
|
3 |
import pandas as pd |
|
|
4 |
from loguru import logger |
|
|
5 |
|
|
|
6 |
from .constants import MEASUREMENT_DF_LABEL, DESIGN_DF_LABEL, PADJ_COL_PREFIX, PVALUE_COL_PREFIX, FC_COL_PREFIX |
|
|
7 |
from .info import get_info |
|
|
8 |
|
|
|
9 |
|
|
|
10 |
class SingleOmicsData():
    """Hold the measurements and sample metadata of a single omics modality.

    The measurement dataframe is expected to have features as rows and
    samples as columns, while the design dataframe has samples as rows.
    On construction the two are cleaned (duplicate sample names removed)
    and restricted to the samples present in both.
    """

    def __init__(self, data_type, measurement_df, design_df, feature_annot_df=None, significant_df=None):
        """Clean the inputs and store them on the instance.

        Args:
            data_type: label identifying this modality.
            measurement_df: dataframe of measurements (features x samples).
            design_df: dataframe of sample metadata, indexed by sample name.
            feature_annot_df: optional feature annotation dataframe.
            significant_df: optional dataframe of p-values / fold changes.
                When omitted, such columns are looked for in
                ``measurement_df`` and split out of it.

        Raises:
            AssertionError: if the cleaned measurement and design dataframes
                do not describe the same number of samples.
        """
        self.data_type = data_type

        # Keep the inputs as they were given. The two mandatory dataframes
        # are copied; the optional ones are stored by reference.
        self.original_measurement_df = measurement_df.copy()
        self.original_design_df = design_df.copy()
        self.original_feature_annot_df = feature_annot_df
        self.original_significant_df = significant_df

        # Always define the attribute (None when there is nothing to store)
        # so that reading it never raises AttributeError.
        self.significant_df = significant_df
        if significant_df is None:
            # try to extract columns containing p-values and fold changes
            # from the measurement data into their own dataframe
            keep_df, drop_df = self._get_significant_df(measurement_df)
            if not drop_df.empty:
                self.significant_df = drop_df
                measurement_df = keep_df

        # clean data
        cleaned_measurement_df, cleaned_design_df = self._clean_data(measurement_df, design_df)
        msg = 'cleaned_measurement_df = %s, cleaned_design_df = %s' % (
            cleaned_measurement_df.shape,
            cleaned_design_df.shape
        )
        # internal invariant: one design row per measurement column
        assert cleaned_measurement_df.shape[1] == cleaned_design_df.shape[0], msg

        self.initial_measurement_df = cleaned_measurement_df
        self.design_df = cleaned_design_df
        self.feature_annot_df = feature_annot_df

        # An ordered dict of dataframes as it's processed through the pipeline.
        # The ordering is important since data is processed sequentially
        # The last entry is the current result to use
        self.processed_dfs = OrderedDict()

    @property
    def data_df(self):
        """Return the most recently processed dataframe, or the cleaned
        initial measurements when nothing has been processed yet."""
        od = self.processed_dfs
        if len(od) > 0:
            return od[next(reversed(od))]
        return self.initial_measurement_df

    def get_initial_measurement_df(self):
        """Return a copy of the cleaned initial measurement dataframe."""
        return self.initial_measurement_df.copy()

    def _get_significant_df(self, df):
        """Split ``df`` into measurement columns and significance columns.

        Returns:
            A ``(keep_df, drop_df)`` tuple, where ``drop_df`` holds the columns
            whose name starts (case-insensitively) with ``FC_COL_PREFIX`` or
            ``PVALUE_COL_PREFIX`` and ``keep_df`` holds all the others. Both
            are taken from a copy of ``df`` in which ``PADJ_COL_PREFIX`` has
            been renamed to ``PVALUE_COL_PREFIX``.
        """
        # old data in GraphOmics was using PADJ_COL_PREFIX rather than PVALUE_COL_PREFIX
        df = df.copy()
        # regex=False: treat the prefix as literal text regardless of the
        # pandas version's default for this parameter.
        df.columns = df.columns.str.replace(PADJ_COL_PREFIX, PVALUE_COL_PREFIX, regex=False)

        # find significant information in columns starting with FC_COL_PREFIX, PVALUE_COL_PREFIX
        prefixes = tuple(prefix.lower() for prefix in (FC_COL_PREFIX, PVALUE_COL_PREFIX))
        to_drop = [col for col in df.columns if col.lower().startswith(prefixes)]
        dropped = set(to_drop)  # set membership keeps the split linear
        to_keep = [col for col in df.columns if col not in dropped]

        keep_df = df[to_keep]
        drop_df = df[to_drop]
        return keep_df, drop_df

    def _clean_data(self, measurement_df, design_df):
        """Remove duplicated sample names and keep only the shared samples.

        Returns:
            A ``(measurement_df, design_df)`` tuple of cleaned dataframes.
        """
        # NOTE: de-duplication by values (_drop_dupes_by_values) is not
        # applied here; the original code had it commented out and marked
        # "don't do this!" for the design dataframe.

        # drop duplicate sample names: columns of the measurements, rows of
        # the design. The design is de-duplicated on its index directly,
        # because a transpose round-trip would turn mixed-dtype metadata
        # columns into object dtype.
        measurement_df = self._drop_dupes_by_colnames(measurement_df, MEASUREMENT_DF_LABEL)
        design_df = self._drop_dupes_by_rownames(design_df, DESIGN_DF_LABEL)

        # keep common samples having both measurements and metadata
        measurement_df, design_df = self._keep_common_samples(measurement_df, design_df)
        return measurement_df, design_df

    def _drop_dupes_by_values(self, df, label):
        """Drop rows, then columns, that duplicate an earlier one by value.

        The first occurrence is kept. A warning is logged when anything
        was dropped; ``label`` names the dataframe in that message.
        """
        # drop duplicate rows, keeping the first occurrence
        no_dupe_rows = df.drop_duplicates(keep='first')

        # drop duplicate columns, keeping the first occurrence
        no_dupe = no_dupe_rows.transpose().drop_duplicates(keep='first').transpose()

        # log a message if something has been dropped
        if df.shape != no_dupe.shape:
            logger.warning('Dropped duplicate from %s by values: %d rows and %d cols' % (
                label,
                df.shape[0] - no_dupe.shape[0],
                df.shape[1] - no_dupe.shape[1]
            ))
        return no_dupe

    def _drop_dupes_by_colnames(self, df, label):
        """Keep only the first column for every repeated column name.

        A warning is logged when anything was dropped; ``label`` names the
        dataframe in that message.
        """
        # find columns that have the same name
        # https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
        cleaned_df = df.loc[:, ~df.columns.duplicated()]

        diff = df.shape[1] - cleaned_df.shape[1]
        if diff > 0:
            logger.warning('Dropped %d duplicate sample names from %s' % (diff, label))
        return cleaned_df

    def _drop_dupes_by_rownames(self, df, label):
        """Keep only the first row for every repeated index label.

        Row-wise counterpart of ``_drop_dupes_by_colnames``; it selects rows
        without transposing, so column dtypes are left as they are.
        """
        cleaned_df = df.loc[~df.index.duplicated()]

        diff = df.shape[0] - cleaned_df.shape[0]
        if diff > 0:
            logger.warning('Dropped %d duplicate sample names from %s' % (diff, label))
        return cleaned_df

    def _keep_common_samples(self, measurement_df, design_df):
        """Restrict both dataframes to the samples they have in common.

        Sample names are the columns of ``measurement_df`` and the index of
        ``design_df``. The original ordering within each dataframe is kept,
        and a warning is logged for each side that lost samples.

        Returns:
            A ``(cleaned_measurement_df, cleaned_design_df)`` tuple.
        """
        # find common sample names (rows and columns) in measurement and design dfs
        cols = measurement_df.columns.values
        rows = design_df.index.values
        common = set(cols).intersection(rows)

        # select the common row and col names
        selected_cols = [col for col in cols if col in common]
        selected_rows = [row for row in rows if row in common]
        cleaned_measurement_df = measurement_df[selected_cols]
        cleaned_design_df = design_df.loc[selected_rows]

        diff = measurement_df.shape[1] - cleaned_measurement_df.shape[1]
        if diff > 0:
            logger.warning('Dropped %d columns from measurement dataframe due to missing metadata' % diff)

        diff = design_df.shape[0] - cleaned_design_df.shape[0]
        if diff > 0:
            # samples are rows of the design dataframe
            logger.warning('Dropped %d rows from sample metadata due to missing measurements' % diff)

        return cleaned_measurement_df, cleaned_design_df

    def __repr__(self):
        """Describe the data type and the shape of the current dataframe."""
        dtype_str = self.data_type
        shape = self.data_df.shape
        return '%s data with (%d, %d) measurements' % (dtype_str, shape[0], shape[1])
|
|
139 |
|
|
|
140 |
|
|
|
141 |
class MultiOmicsData():
    """Collection of single-omics views, keyed by their ``data_type``."""

    def __init__(self, publication=None, url=None):
        """Create an empty container.

        Args:
            publication: optional publication reference shown in ``repr``.
            url: optional URL shown in ``repr``.
        """
        self.views = {}
        self.publication = publication
        self.url = url

    def add_data(self, omics_data):
        """Register one view, or a list/tuple of views.

        Each item is stored under its ``data_type`` attribute; an item with
        a ``data_type`` that is already present replaces the earlier one.
        """
        # wrap a single item so that both cases are handled by one loop
        if isinstance(omics_data, (list, tuple)):
            items = omics_data
        else:
            items = [omics_data]

        # push list items to dictionary
        for item in items:
            self.views[item.data_type] = item

    def has_data(self, data_type):
        """Return True if a view was added for ``data_type``."""
        return data_type in self.views

    def get_data(self, data_type):
        """Return the view stored for ``data_type``, or None if absent."""
        return self.views.get(data_type)

    def get_dfs(self, data_type):
        """Return ``(data_df, design_df)`` of the view for ``data_type``.

        Returns ``(None, None)`` when no such view has been added.
        """
        if not self.has_data(data_type):
            return None, None
        data = self.get_data(data_type)
        return data.data_df, data.design_df

    def get_info(self, entity_id, data_type):
        """Delegate to the module-level ``get_info`` helper."""
        return get_info(entity_id, data_type)

    def to_mofa(self):
        """Return all views stacked into one long-format dataframe.

        For every view the current ``data_df`` is melted into one row per
        (feature, sample) pair with the columns ``feature``, ``sample`` and
        ``value``; a ``view`` column holding the view's data type is added
        and the view's design dataframe is joined on ``sample``. The
        per-view results are concatenated, each keeping its own row index.
        An empty dataframe is returned when there are no views.
        """
        frames = []
        for data in self.views.values():
            # Copy first: adding the helper column to data.data_df itself
            # would leave a stray 'feature' column in the stored view.
            df = data.data_df.copy()
            df['feature'] = df.index
            df = df.melt(id_vars='feature', var_name='sample')
            df['view'] = data.data_type
            df = df.join(data.design_df, on='sample')
            frames.append(df)

        # pd.concat rejects an empty list, so handle that case explicitly
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)

    def __repr__(self):
        """Summarise the publication, URL and views, one entry per line."""
        lines = ['Multi-omics data container']
        if self.publication is not None:
            lines.append('- publication: %s' % self.publication)
        if self.url is not None:
            lines.append('- URL: %s' % self.url)
        if len(self.views) > 0:
            lines.append('- Views: %d modalities' % len(self.views))
            lines.extend('\t - %s' % view for view in self.views.values())
        return '\n'.join(lines)