Diff of /pyMultiOmics/base.py [000000] .. [7d5693]

Switch to unified view

a b/pyMultiOmics/base.py
1
from collections import OrderedDict
2
3
import pandas as pd
4
from loguru import logger
5
6
from .constants import MEASUREMENT_DF_LABEL, DESIGN_DF_LABEL, PADJ_COL_PREFIX, PVALUE_COL_PREFIX, FC_COL_PREFIX
7
from .info import get_info
8
9
10
class SingleOmicsData():
    """Measurements and sample metadata for a single omics data type.

    Sample names are the columns of the measurement dataframe and the index
    of the design dataframe; only samples present in both are retained.
    """

    def __init__(self, data_type, measurement_df, design_df, feature_annot_df=None, significant_df=None):
        """Clean the inputs and store them on the instance.

        Args:
            data_type: label identifying this data type; stored as-is.
            measurement_df: dataframe whose columns are sample names.
            design_df: dataframe whose index holds sample names.
            feature_annot_df: optional annotation dataframe, stored as-is.
            significant_df: optional dataframe of p-values / fold changes.
                When omitted, columns whose names start with FC_COL_PREFIX or
                PVALUE_COL_PREFIX (case-insensitive) are split out of
                measurement_df instead.
        """
        self.data_type = data_type

        # keep the inputs as they were handed in
        self.original_measurement_df = measurement_df.copy()
        self.original_design_df = design_df.copy()
        self.original_feature_annot_df = feature_annot_df
        self.original_significant_df = significant_df

        # Always define the attribute so that reading it never raises
        # AttributeError; it stays None when no significance data exists.
        self.significant_df = significant_df
        if significant_df is None:
            # try to extract p-value and fold-change columns from the data
            keep_df, drop_df = self._get_significant_df(measurement_df)
            if not drop_df.empty:
                self.significant_df = drop_df
                measurement_df = keep_df

        # clean data
        cleaned_measurement_df, cleaned_design_df = self._clean_data(measurement_df, design_df)
        msg = 'cleaned_measurement_df = %s, cleaned_design_df = %s' % (
            cleaned_measurement_df.shape,
            cleaned_design_df.shape
        )
        # internal invariant: one design row per measurement column
        assert cleaned_measurement_df.shape[1] == cleaned_design_df.shape[0], msg

        self.initial_measurement_df = cleaned_measurement_df
        self.design_df = cleaned_design_df
        self.feature_annot_df = feature_annot_df

        # An ordered dict of dataframes as it's processed through the pipeline.
        # The ordering is important since data is processed sequentially
        # The last entry is the current result to use
        self.processed_dfs = OrderedDict()

    @property
    def data_df(self):
        """Return the most recently processed dataframe.

        Falls back to the cleaned initial measurements when nothing has been
        added to ``processed_dfs`` yet.
        """
        od = self.processed_dfs
        if len(od) > 0:
            # next(reversed(...)) yields the key that was inserted last
            return od[next(reversed(od))]
        return self.initial_measurement_df

    def get_initial_measurement_df(self):
        """Return a copy of the cleaned initial measurement dataframe."""
        return self.initial_measurement_df.copy()

    def _get_significant_df(self, df):
        """Split ``df`` into measurement columns and significance columns.

        Returns:
            A ``(keep_df, drop_df)`` tuple. ``drop_df`` holds the columns
            whose names start (case-insensitively) with FC_COL_PREFIX or
            PVALUE_COL_PREFIX; ``keep_df`` holds the remaining columns. Both
            carry the renamed column labels (PADJ_COL_PREFIX replaced by
            PVALUE_COL_PREFIX).
        """
        # old data in GraphOmics was using PADJ_COL_PREFIX rather than PVALUE_COL_PREFIX
        df = df.copy()
        # regex=False: treat the prefix as a literal string. The default value
        # of `regex` differs between pandas versions, so state it explicitly.
        df.columns = df.columns.str.replace(PADJ_COL_PREFIX, PVALUE_COL_PREFIX, regex=False)

        # find significant information in columns starting with FC_COL_PREFIX, PVALUE_COL_PREFIX
        prefixes = tuple(prefix.lower() for prefix in (FC_COL_PREFIX, PVALUE_COL_PREFIX))
        is_significant = [col.lower().startswith(prefixes) for col in df.columns]
        is_measurement = [not flag for flag in is_significant]

        # Select by position (boolean mask) rather than by label so that
        # repeated column names are not multiplied by the selection.
        keep_df = df.loc[:, is_measurement]
        drop_df = df.loc[:, is_significant]
        return keep_df, drop_df

    def _clean_data(self, measurement_df, design_df):
        """Remove duplicated sample names and keep only the shared samples.

        Returns:
            A ``(measurement_df, design_df)`` tuple of cleaned dataframes.
        """
        # NOTE: de-duplication by values (_drop_dupes_by_values) is currently
        # not applied here; only duplicated sample names are removed.

        # drop duplicate rows and columns by sample names
        measurement_df = self._drop_dupes_by_colnames(measurement_df, MEASUREMENT_DF_LABEL)
        # sample names are the index of design_df, hence the round-trip transpose
        design_df = self._drop_dupes_by_colnames(design_df.transpose(), DESIGN_DF_LABEL).transpose()

        # keep common samples having both measurements and metadata
        measurement_df, design_df = self._keep_common_samples(measurement_df, design_df)
        return measurement_df, design_df

    def _drop_dupes_by_values(self, df, label):
        """Return ``df`` without rows or columns that repeat earlier values.

        The first occurrence of each duplicated row / column is kept. A
        warning mentioning ``label`` is logged when anything was removed.
        """
        # drop duplicate rows, keeping the first occurrence
        no_dupe_rows = df.drop_duplicates(keep='first')

        # drop duplicate columns, keeping the first occurrence
        no_dupe = no_dupe_rows.transpose().drop_duplicates(keep='first').transpose()

        # log a message if something has been dropped
        if df.shape != no_dupe.shape:
            logger.warning('Dropped duplicate from %s by values: %d rows and %d cols' % (
                label,
                df.shape[0] - no_dupe.shape[0],
                df.shape[1] - no_dupe.shape[1]
            ))
        return no_dupe

    def _drop_dupes_by_colnames(self, df, label):
        """Return ``df`` keeping only the first column for each column name.

        A warning mentioning ``label`` is logged when columns were removed.
        """
        # find columns that have the same name
        # https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
        cleaned_df = df.loc[:, ~df.columns.duplicated()]

        diff = df.shape[1] - cleaned_df.shape[1]
        if diff > 0:
            logger.warning('Dropped %d duplicate sample names from %s' % (diff, label))
        return cleaned_df

    def _keep_common_samples(self, measurement_df, design_df):
        """Restrict both dataframes to the sample names they share.

        Sample names are read from the columns of ``measurement_df`` and the
        index of ``design_df``. Each dataframe keeps its own ordering.

        Returns:
            A ``(cleaned_measurement_df, cleaned_design_df)`` tuple.
        """
        # find common sample names (rows and columns) in measurement and design dfs
        cols = measurement_df.columns.values
        rows = design_df.index.values
        common = set(cols) & set(rows)

        # select the common row and col names
        selected_cols = [col for col in cols if col in common]
        selected_rows = [row for row in rows if row in common]
        cleaned_measurement_df = measurement_df[selected_cols]
        cleaned_design_df = design_df.loc[selected_rows]

        diff = measurement_df.shape[1] - cleaned_measurement_df.shape[1]
        if diff > 0:
            logger.warning('Dropped %d columns from measurement dataframe due to missing metadata' % diff)

        diff = design_df.shape[0] - cleaned_design_df.shape[0]
        if diff > 0:
            logger.warning('Dropped %d columns from sample metadata due to missing measurements' % diff)

        return cleaned_measurement_df, cleaned_design_df

    def __repr__(self):
        """Describe the data type and the shape of the current dataframe."""
        dtype_str = self.data_type
        shape = self.data_df.shape
        return '%s data with (%d, %d) measurements' % (dtype_str, shape[0], shape[1])
139
140
141
class MultiOmicsData():
    """Container holding several omics views, keyed by their data type."""

    def __init__(self, publication=None, url=None):
        """Create an empty container.

        Args:
            publication: optional publication reference, shown in the repr.
            url: optional URL, shown in the repr.
        """
        self.views = {}
        self.publication = publication
        self.url = url

    def add_data(self, omics_data):
        """Register one omics data object, or a list of them.

        Each item is stored under its ``data_type`` attribute; an item with
        an already-registered data type replaces the previous one.
        """
        # wrap a single item in a list so both cases share the same loop
        items = omics_data if isinstance(omics_data, list) else [omics_data]

        # push list items to dictionary
        for item in items:
            self.views[item.data_type] = item

    def has_data(self, data_type):
        """Return True when a view is registered for ``data_type``."""
        return data_type in self.views

    def get_data(self, data_type):
        """Return the view registered for ``data_type``, or None if absent."""
        return self.views.get(data_type)

    def get_dfs(self, data_type):
        """Return ``(data_df, design_df)`` for ``data_type``.

        Returns ``(None, None)`` when no view is registered for it.
        """
        if not self.has_data(data_type):
            return None, None
        data = self.get_data(data_type)
        return data.data_df, data.design_df

    def get_info(self, entity_id, data_type):
        """Delegate to the module-level ``get_info`` helper."""
        return get_info(entity_id, data_type)

    def to_mofa(self):
        """Return all views stacked into one long-format dataframe.

        Each view's ``data_df`` is melted to one row per (feature, sample)
        pair with the columns ``feature``, ``sample``, ``value`` and ``view``,
        and the view's ``design_df`` columns are joined on the sample name.
        The stored dataframes are left untouched.

        Returns:
            The concatenation of the per-view frames, or an empty dataframe
            when no views are registered.
        """
        frames = []
        for data in self.views.values():
            # Work on a copy: adding the helper column to data.data_df itself
            # would permanently change the view's measurement dataframe.
            df = data.data_df.copy()
            df['feature'] = df.index
            df = df.melt(id_vars='feature', var_name='sample')
            df['view'] = data.data_type
            df = df.join(data.design_df, on='sample')
            frames.append(df)

        # pd.concat raises on an empty list, so handle "no views" explicitly
        if not frames:
            return pd.DataFrame()
        # DataFrame.append was removed in pandas 2.0; concatenate once instead
        return pd.concat(frames)

    def __repr__(self):
        """Summarise the publication, URL and registered views."""
        lines = ['Multi-omics data container']
        if self.publication is not None:
            lines.append('- publication: %s' % self.publication)
        if self.url is not None:
            lines.append('- URL: %s' % self.url)
        if len(self.views) > 0:
            lines.append('- Views: %d modalities' % len(self.views))
            for v in self.views:
                lines.append('\t - %s' % self.views[v])
        return '\n'.join(lines)