# preprocessing/FIDDLE_helpers.py
import argparse

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

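# Example usage (an illustrative sketch, not part of the original module): str2bool is meant to be
# passed as an argparse `type` so that command-line flags accept yes/no-style values.
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--discretize', type=str2bool, default=True)   # flag name is hypothetical
#   parser.parse_args(['--discretize', 'no']).discretize               # -> False
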
try:
    from .FIDDLE_config import *
except ImportError:
    from FIDDLE_config import *
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
import sparse
from collections import defaultdict

from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm

from sklearn.feature_selection import VarianceThreshold
import sklearn

def print_header(*content, char='='):
    print()
    print(char * 80)
    print(*content)
    print(char * 80, flush=True)


######
# Transform
######

def get_unique_variables(df):
    return sorted(df[var_col].unique())

def get_frequent_numeric_variables(df_time_series, variables, threshold, args):
    data_path = args.data_path
    df_population = args.df_population
    T, dt = args.T, args.dt

    df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type']
    numeric_vars = [col for col in variables if df_types[col] == 'Numeric']
    df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars]  # count of each variable for each patient
    variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)]
    return variables_num_freq

def calculate_variable_counts(df_data, df_population):
    """
    df_data is in the raw long format with four columns.
    Returns a per-ID count of recordings for each variable, aligned to df_population.
    """
    df = df_data.copy()
    df['count'] = 1
    df_count = df[[ID_col, var_col, 'count']].groupby([ID_col, var_col]).count().unstack(1, fill_value=0)
    df_count.columns = df_count.columns.droplevel()
    df_count = df_count.reindex(df_population.index, fill_value=0)
    ## Slower version
    # df_count = df[['ID', 'variable_name', 'count']].pivot_table(index='ID', columns='variable_name', aggfunc='count', fill_value=0)
    return df_count

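# Example (an illustrative sketch; column names follow ID_col/var_col from FIDDLE_config):
#   given raw rows                          the result has one row per ID in df_population:
#     ID   t    variable   value
#     1    0.5  HR         70                 df_count.loc[1, 'HR']  == 2
#     1    2.0  HR         72                 df_count.loc[1, 'SBP'] == 1
#     1    1.0  SBP        118                IDs with no recordings get all-zero counts.
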
def select_dtype(df, dtype, dtypes=None):
    if dtypes is None:
        ## Need to assert dtypes are not all objects
        assert not all(df.dtypes == 'object')
        if dtype == 'mask':
            return df.select_dtypes('bool')
        elif dtype == '~mask':
            return df.select_dtypes(exclude='bool')
    else:
        ## Need to assert df.columns and dtypes.index are the same
        if dtype == 'mask':
            return df.loc[:, (dtypes == 'bool')].astype(bool)
        elif dtype == '~mask':
            return df.loc[:, (dtypes != 'bool')]
        else:
            assert False
    return

# Older versions, kept for reference:
# def smart_qcut_dummify(x, q):
#     z = smart_qcut(x, q)
#     return pd.get_dummies(z, prefix=z.name)

# def smart_qcut(x, q):
#     # ignore strings when performing qcut
#     x = x.copy()
#     x = x.apply(make_float)
#     m = x.apply(np.isreal)
#     if x.loc[m].dropna().nunique() > 1: # when more than one numeric values
#         if x.loc[m].dropna().nunique() == 2:
#             pass
#         else:
#             x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop')
# #             bins = np.unique(np.percentile(x.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100]))
# #             x.loc[m] = pd.cut(x, bins)
#     return x

def smart_qcut_dummify(x, q):
    z, bins = smart_qcut(x, q)
    return pd.get_dummies(z, prefix=z.name), bins

def dummify(z):
    return pd.get_dummies(z, prefix=z.name)

def smart_qcut(x, q=5):
    # Ignore strings when binning; only numeric entries are cut
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    bins = None
    if x.loc[m].dropna().nunique() > 1: # when there is more than one numeric value
        if x.loc[m].dropna().nunique() == 2:
            pass
        else:
#             x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop')
            # NOTE: bin edges are fixed quintile percentiles; the q argument is not used here
            bins = np.unique(np.nanpercentile(x.loc[m].astype(float).values, [0, 20, 40, 60, 80, 100]))
            x.loc[m] = pd.cut(x.loc[m], bins, duplicates='drop', include_lowest=True)
            bins = list(bins)
    return x, (x.name, bins)

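# Example (an illustrative sketch): for a Series named 'HR' holding [60, 72, 80, 95, 110, 'ERROR'],
# smart_qcut cuts the five numeric entries into quintile bins and leaves the string value as-is,
# returning the binned series together with ('HR', <list of bin edges>); smart_qcut_dummify then
# one-hot encodes the resulting categories into columns prefixed with 'HR', and smart_qcut_bins
# re-applies previously learned edges (e.g. from a training split) to new data.
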
def smart_qcut_bins(first_args):
    (x, bins) = first_args
    # Re-apply previously computed bin edges; strings are left untouched
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    if bins is not None:
        x.loc[m] = pd.cut(x.loc[m], bins, duplicates='drop', include_lowest=True)
    else:
        pass
    return x, (x.name, bins)

def smart_dummify_impute(x):
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    if x.loc[m].dropna().nunique() == 0: # all string values
        return pd.get_dummies(x, prefix=x.name, prefix_sep=':')
    else:
        x = pd.DataFrame(x)
#         x = x.fillna(x.mean()) # simple mean imputation
        return x

def make_float(v):
    try:
        return float(v)
    except ValueError:
        return v

def is_numeric(v):
    try:
        float(v)
        return True
    except ValueError:
        return False


######
# Time-series internals
######

def _get_time_bins(T, dt):
    # Defines the boundaries of time bins [0, dt, 2*dt, ..., k*dt]
    # where k*dt <= T and (k+1)*dt > T
    return np.arange(0, dt*(np.floor(T/dt)+1), dt)

def _get_time_bins_index(T, dt):
    return pd.cut([], _get_time_bins(T, dt), right=False).categories

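# Example (an illustrative sketch): with T=48 and dt=1.0,
#   _get_time_bins(48, 1.0)        -> array([0., 1., 2., ..., 48.])   # 49 bin edges
#   _get_time_bins_index(48, 1.0)  -> IntervalIndex of the 48 left-closed bins [0, 1), [1, 2), ..., [47, 48)
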
def pivot_event_table(df):
    df = df.copy()

    # Handle cases where the same variable is recorded multiple times with the same timestamp
    # Adjust the timestamps by epsilon so that all timestamps are unique
    eps = 1e-6
    m_dups = df.duplicated([t_col, var_col], keep=False)
    df_dups = df[m_dups].copy()
    for v, df_v in df_dups.groupby(var_col):
        df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v))

    df = pd.concat([df[~m_dups], df_dups])
    assert not df.duplicated([t_col, var_col], keep=False).any()

    return pd.pivot_table(df, val_col, t_col, var_col, 'first')

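# Example (an illustrative sketch) for a single patient's events (columns named via t_col/var_col/val_col):
#     t     variable  value
#     0.5   HR        70
#     0.5   HR        72      <- duplicate timestamp, nudged to 0.5 + 1e-6
#     1.2   SBP       118
# the result is a (timestamp x variable) table with at most one value per cell, from which the
# per-bin masks, most recent values, and summary statistics below are computed.
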
def presence_mask(df_i, variables, T, dt):
    # For each variable and each time bin, indicate whether a real measurement is present
    if len(df_i) == 0:
        mask_i = pd.DataFrame().reindex(index=_get_time_bins_index(T, dt), columns=list(variables), fill_value=False)
    else:
        mask_i = df_i.groupby(
            pd.cut(df_i.index, _get_time_bins(T, dt), right=False)
        ).apply(lambda x: x.notnull().any())
        mask_i = mask_i.reindex(columns=variables, fill_value=False)

    mask_i.columns = [str(col) + '_mask' for col in mask_i.columns]
    return mask_i

def get_delta_time(mask_i):
    a = 1 - mask_i          # 1 where the bin has no measurement
    b = a.cumsum()          # running count of empty bins
    c = mask_i.cumsum()     # running count of observed bins
    # Subtract the running count at the most recent observed bin, i.e. count bins since the last measurement
    dt_i = b - b.where(~a.astype(bool)).ffill().fillna(0).astype(int)

    # the delta time for itemid's for which there are no measurements must be 0
    # or where there's no previous measurement and no imputation
    dt_i[c == 0] = 0

    dt_i.columns = [str(col).replace('_mask', '_delta_time') for col in dt_i.columns]
    return dt_i

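# Worked example (an illustrative sketch) for a single '*_mask' column over 5 time bins:
#   mask       = [False, False, True, False, True]
#   delta_time = [0,     0,     0,    1,     0]
# i.e. the number of bins since the most recent measurement, with 0 before the first measurement
# and 0 in bins that contain a measurement.
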
def impute_ffill(df, columns, T, dt, mask=None):
    if len(df) == 0:
        return pd.DataFrame().reindex(columns=columns, fill_value=np.nan)

    if mask is None:
        mask = presence_mask(df, columns, T, dt)

    # Calculate time bins, sorted by time
    df_bin = df.copy()
    df_bin.index = pd.cut(df_bin.index, _get_time_bins(T, dt), right=False)

    # Compute the values used for imputation
    ## Collapse duplicate time bins, keeping latest values for each time bin
    df_imp = df_bin.ffill()
    df_imp = df_imp[~df_imp.index.duplicated(keep='last')]
    ## Reindex to make sure every time bin exists
    df_imp = df_imp.reindex(_get_time_bins_index(T, dt))
    ## Forward fill the missing time bins
    df_imp = df_imp.ffill()

    df_ff = df_imp
    df_ff[mask.to_numpy()] = np.nan   # drop imputed values where a real measurement exists
    df_ff.index = df_ff.index.mid     ## Imputed values lie at the middle of a time bin
    df_ff = pd.concat([df, df_ff]).dropna(how='all')
    df_ff.sort_index(inplace=True)
    return df_ff

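# Note (an illustrative reading of the function above): the returned frame contains the original
# measurements plus carried-forward values placed at the midpoints of bins that had no real
# measurement, so that most_recent_values() below picks up the forward-filled value in those bins.
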
def most_recent_values(df_i, columns, T, dt):
    df_bin = df_i.copy()
    df_bin.index = pd.cut(df_bin.index, _get_time_bins(T, dt), right=False)
    df_v = df_bin.groupby(level=0).last()
    df_v.columns = [str(col) + '_value' for col in df_v.columns]
    df_v = df_v.reindex(_get_time_bins_index(T, dt))
    return df_v

def summary_statistics(df_i, columns, stats_functions, T, dt):
    # e.g. stats_functions=['mean', 'min', 'max']
    if len(columns) == 0:
        return pd.DataFrame().reindex(_get_time_bins_index(T, dt))
    else:
        # Encode statistics for numeric, frequent variables
        df_numeric = df_i[columns]
        df = df_numeric.copy().astype(float)
        df.index = pd.cut(df.index, _get_time_bins(T, dt), right=False)
        df_v = df.reset_index().groupby('index').agg(stats_functions)
        df_v.columns = list(map('_'.join, df_v.columns.values))
        df_v = df_v.reindex(_get_time_bins_index(T, dt))
        return df_v

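# Example (an illustrative sketch): with columns=['HR'] and stats_functions=['mean', 'min', 'max'],
# the output has one row per time bin and columns ['HR_mean', 'HR_min', 'HR_max']; bins with no
# measurements are left as NaN.
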
def check_imputed_output(df_v):
    # Check imputation is successful
    ## If column is all null -> OK
    ## If column is all non-null -> OK
    ## If column has some null -> should only occur at the beginning
    not_null = df_v.notnull().all()
    all_null = df_v.isnull().all()
    cols_to_check = list(df_v.columns[~(not_null | all_null)])

    for col in cols_to_check:
        x = df_v[col].to_numpy()
        last_null_idx = np.where(pd.isnull(x))[0].max() # Find index of last nan
        assert pd.isnull(x[:(last_null_idx+1)]).all() # all values up to here are nan
        assert (~pd.isnull(x[(last_null_idx+1):])).all() # all values after here are not nan
    return


######
# Post-filter: feature selection classes
######
try:
    from sklearn.feature_selection._base import SelectorMixin
except ImportError:
    from sklearn.feature_selection.base import SelectorMixin

class FrequencyThreshold_temporal(
    sklearn.base.BaseEstimator,
    SelectorMixin,
):
    def __init__(self, threshold=0., L=None):
        assert L is not None
        self.threshold = threshold
        self.L = L

    def fit(self, X, y=None):
        # Reshape to a 3-dimensional array of shape (N, L, D)
        NL, D = X.shape
        X = X.reshape((int(NL/self.L), self.L, D))

        # Collapse the time dimension, generating an NxD matrix
        X_notalways0 = X.any(axis=1)
        X_notalways1 = (1-X).any(axis=1)

        self.freqs_notalways0 = np.mean(X_notalways0, axis=0)
        self.freqs_notalways1 = np.mean(X_notalways1, axis=0)
        return self

    def _get_support_mask(self):
        mask = np.logical_and(
            self.freqs_notalways0 > self.threshold,
            self.freqs_notalways1 > self.threshold,
        )
        if hasattr(mask, "toarray"):
            mask = mask.toarray()
        if hasattr(mask, "todense"):
            mask = mask.todense()
        return mask

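# Usage (an illustrative sketch, assuming X is a binary feature matrix of shape (N*L, D) with the
# L time bins of each example stacked consecutively):
#   sel = FrequencyThreshold_temporal(threshold=0.01, L=48)
#   sel.fit(X)
#   X_kept = sel.transform(X)   # transform() is inherited from SelectorMixin
# Features that stay constant (all 0 or all 1) over time for nearly every example are dropped.
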
# Keep only the first feature in each group of pairwise perfectly correlated features
class CorrelationSelector(
    sklearn.base.BaseEstimator,
    SelectorMixin,
):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        if hasattr(X, "to_scipy_sparse"):   # pydata sparse matrix
            X = X.to_scipy_sparse()

        # Calculate the correlation matrix and keep only its lower triangle
        if scipy.sparse.issparse(X):
            self.corr_matrix = sparse_corrcoef(X.T)
        else:
            self.corr_matrix = np.corrcoef(X.T)
        np.fill_diagonal(self.corr_matrix, 0)
        self.corr_matrix *= np.tri(*self.corr_matrix.shape)

        # get absolute value
        corr = abs(self.corr_matrix)

        # A coefficient close to 1 means perfectly correlated.
        # Compare each feature to features with a smaller index to see if they have correlation of 1.
        to_drop = np.isclose(corr, 1.0).sum(axis=1).astype(bool)
        self.to_keep = ~to_drop

        return self

    def _get_support_mask(self):
        return self.to_keep

    def get_feature_aliases(self, feature_names):
        feature_names = [str(n) for n in feature_names]
        corr_matrix = self.corr_matrix
        flags = np.isclose(abs(corr_matrix), 1.0)
        alias_map = defaultdict(list)
        for i in range(1, corr_matrix.shape[0]):
            for j in range(i):
                if flags[i,j]:
                    if np.isclose(corr_matrix[i,j], 1.0):
                        alias_map[feature_names[j]].append(feature_names[i])
                    elif np.isclose(corr_matrix[i,j], -1.0):
                        alias_map[feature_names[j]].append('~{' + feature_names[i] + '}')
                    else:
                        assert False

                    # Only save alias for first in the list
                    break
        return dict(alias_map)

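# Usage (an illustrative sketch, assuming X is the (n_samples, n_features) matrix produced upstream):
#   sel = CorrelationSelector()
#   sel.fit(X)                                  # X may be dense or scipy-sparse
#   X_dedup = sel.transform(X)                  # drops features perfectly correlated with an earlier one
#   aliases = sel.get_feature_aliases(names)    # maps each kept feature to its dropped duplicates
#                                               # ('~{...}' marks a perfectly anti-correlated feature)
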
# Adapted from https://stackoverflow.com/questions/19231268/correlation-coefficients-for-sparse-matrix-in-python
# Computes correlation coefficients between the *rows* of a scipy sparse matrix.
def sparse_corrcoef(A, B=None):
    if B is not None:
        # Note: uses scipy.sparse here; the top-level `sparse` import is the pydata sparse package
        A = scipy.sparse.vstack((A, B), format='csr')

    A = A.astype(np.float64)
    n = A.shape[1]

    # Compute the covariance matrix
    rowsum = A.sum(1)
    centering = rowsum.dot(rowsum.T.conjugate()) / n
    C = (A.dot(A.T.conjugate()) - centering) / (n - 1)

    # The correlation coefficients are given by
    # C_{i,j} / sqrt(C_{i} * C_{j})
    d = np.diag(C)
    coeffs = C / np.sqrt(np.outer(d, d))

    return np.array(coeffs)
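
# Rough check (an illustrative sketch, not part of the original module): the result should agree
# with np.corrcoef on the densified matrix, up to floating-point error.
#   X = scipy.sparse.random(100, 5, density=0.5, format='csr')
#   np.allclose(sparse_corrcoef(X.T), np.corrcoef(X.toarray().T))   # -> True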