preprocessing/FIDDLE_helpers.py

import argparse

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
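# Illustrative usage with argparse (added; the flag name is hypothetical):
#     parser.add_argument('--some_flag', type=str2bool, default=True)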

try:
    from .FIDDLE_config import *
except ImportError:
    from FIDDLE_config import *
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
import sparse
from collections import defaultdict

from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm

from sklearn.feature_selection import VarianceThreshold
import sklearn

def print_header(*content, char='='):
    print()
    print(char * 80)
    print(*content)
    print(char * 80, flush=True)


######
# Transform
######

def get_unique_variables(df):
    return sorted(df[var_col].unique())

def get_frequent_numeric_variables(df_time_series, variables, threshold, args):
    data_path = args.data_path
    df_population = args.df_population
    T, dt = args.T, args.dt
    
    df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type']
    numeric_vars = [col for col in variables if df_types[col] == 'Numeric']
    df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars] # gets the count of each variable for each patient
    variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)]
    return variables_num_freq

def calculate_variable_counts(df_data, df_population):
    """
    df_data is in the raw long format with four columns: ID, time, variable name, and variable value
    """
    df = df_data.copy()
    df['count'] = 1
    df_count = df[[ID_col, var_col, 'count']].groupby([ID_col, var_col]).count().unstack(1, fill_value=0)
    df_count.columns = df_count.columns.droplevel()
    df_count = df_count.reindex(df_population.index, fill_value=0)
    ## Slower version
    # df_count = df[['ID', 'variable_name', 'count']].pivot_table(index='ID', columns='variable_name', aggfunc='count', fill_value=0)
    return df_count

def select_dtype(df, dtype, dtypes=None):
    if dtypes is None:
        ## Need to assert dtypes are not all objects
        assert not all(df.dtypes == 'object')
        if dtype == 'mask':
            return df.select_dtypes('bool')
        elif dtype == '~mask':
            return df.select_dtypes(exclude='bool')
    else:
        ## Need to assert df.columns and dtypes.index are the same
        if dtype == 'mask':
            return df.loc[:, (dtypes == 'bool')].astype(bool)
        elif dtype == '~mask':
            return df.loc[:, (dtypes != 'bool')]
        else:
            assert False
    return

def smart_qcut_dummify(x, q):
    z, bins = smart_qcut(x, q)
    return pd.get_dummies(z, prefix=z.name), bins

def dummify(z):
    return pd.get_dummies(z, prefix=z.name)

def smart_qcut(x, q=5):
    # ignore strings when performing qcut
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    bins = None
    if x.loc[m].dropna().nunique() > 1: # when there is more than one numeric value
        if x.loc[m].dropna().nunique() == 2:
            pass
        else:
            # x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop')
            bins = np.unique(np.nanpercentile(x.loc[m].astype(float).values, [0, 20, 40, 60, 80, 100]))
            x.loc[m] = pd.cut(x.loc[m], bins, duplicates='drop', include_lowest=True)
            bins = list(bins)
    return x, (x.name, bins)
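# Illustrative example (added; the values are hypothetical): for a Series named 'HR' with
# values [60, 70, 80, 90, 100, 'ERROR'], the numeric entries are binned into quintile-based
# intervals using the edges [60, 68, 76, 84, 92, 100], the string 'ERROR' is left as-is,
# and the returned metadata is ('HR', [60.0, 68.0, 76.0, 84.0, 92.0, 100.0]).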

def smart_qcut_bins(first_args):
    (x, bins) = first_args
    # ignore strings when performing qcut
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    if bins is not None:
        x.loc[m] = pd.cut(x.loc[m], bins, duplicates='drop', include_lowest=True)
    else:
        pass
    return x, (x.name, bins)

def smart_dummify_impute(x):
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    if x.loc[m].dropna().nunique() == 0: # all string values
        return pd.get_dummies(x, prefix=x.name, prefix_sep=':')
    else:
        x = pd.DataFrame(x)
        # x = x.fillna(x.mean()) # simple mean imputation
        return x

def make_float(v):
    try:
        return float(v)
    except ValueError:
        return v

def is_numeric(v):
    try:
        float(v)
        return True
    except ValueError:
        return False


######
# Time-series internals
######

def _get_time_bins(T, dt):
    # Defines the boundaries of time bins [0, dt, 2*dt, ..., k*dt]
    # where k*dt <= T and (k+1)*dt > T
    return np.arange(0, dt*(np.floor(T/dt)+1), dt)

def _get_time_bins_index(T, dt):
    return pd.cut([], _get_time_bins(T, dt), right=False).categories
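# Illustrative example (added; T and dt values are hypothetical): with T=48 and dt=1.0,
# _get_time_bins(48, 1.0) returns the 49 bin edges [0., 1., ..., 48.], and
# _get_time_bins_index(48, 1.0) returns the 48 left-closed intervals [0, 1), [1, 2), ..., [47, 48).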

def pivot_event_table(df):
    df = df.copy()
    
    # Handle cases where the same variable is recorded multiple times with the same timestamp
    # Adjust the timestamps by epsilon so that all timestamps are unique
    eps = 1e-6
    m_dups = df.duplicated([t_col, var_col], keep=False)
    df_dups = df[m_dups].copy()
    for v, df_v in df_dups.groupby(var_col):
        df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v))
    
    df = pd.concat([df[~m_dups], df_dups])
    assert not df.duplicated([t_col, var_col], keep=False).any()
    
    return pd.pivot_table(df, val_col, t_col, var_col, 'first')

def presence_mask(df_i, variables, T, dt):
    # for each itemid,
    # for each time bin, whether there is a real measurement
    if len(df_i) == 0:
        mask_i = pd.DataFrame().reindex(index=_get_time_bins_index(T, dt), columns=list(variables), fill_value=False)
    else:
        mask_i = df_i.groupby(
            pd.cut(df_i.index, _get_time_bins(T, dt), right=False)
        ).apply(lambda x: x.notnull().any())
        mask_i = mask_i.reindex(columns=variables, fill_value=False)
    
    mask_i.columns = [str(col) + '_mask' for col in mask_i.columns]
    return mask_i

def get_delta_time(mask_i):
    # Time (in number of bins) since the most recent observed measurement of each variable
    a = 1 - mask_i
    b = a.cumsum()
    c = mask_i.cumsum()
    # b counts unobserved bins cumulatively; subtracting its (forward-filled) value at the
    # most recent observed bin gives the number of bins since that observation
    dt_i = b - b.where(~a.astype(bool)).ffill().fillna(0).astype(int)
    
    # the delta time for itemids with no measurements at all must be 0,
    # as must bins with no previous measurement and no imputation
    dt_i[c == 0] = 0
    
    dt_i.columns = [str(col).replace('_mask', '_delta_time') for col in dt_i.columns]
    return dt_i
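# Worked example (added for clarity, not from the original source): for a single mask column
#     [False, True, False, False, True]
# the computation above yields delta times
#     [0, 0, 1, 2, 0]
# i.e., 0 at each observed bin and in the bins before the first observation,
# counting up in unobserved bins that follow an observation.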

def impute_ffill(df, columns, T, dt, mask=None):
    if len(df) == 0:
        return pd.DataFrame().reindex(columns=columns, fill_value=np.nan)
    
    if mask is None:
        mask = presence_mask(df, columns, T, dt)
    
    # Calculate time bins, sorted by time
    df_bin = df.copy()
    df_bin.index = pd.cut(df_bin.index, _get_time_bins(T, dt), right=False)
    
    # Compute the values used for imputation
    ## Collapse duplicate time bins, keeping the latest values for each time bin
    df_imp = df_bin.ffill()
    df_imp = df_imp[~df_imp.index.duplicated(keep='last')]
    ## Reindex to make sure every time bin exists
    df_imp = df_imp.reindex(_get_time_bins_index(T, dt))
    ## Forward fill the missing time bins
    df_imp = df_imp.ffill()
    
    # Keep only imputed values; time bins that contain real measurements are set back to NaN
    df_ff = df_imp
    df_ff[mask.to_numpy()] = np.nan
    df_ff.index = df_ff.index.mid ## Imputed values lie at the middle of a time bin
    df_ff = pd.concat([df, df_ff]).dropna(how='all')
    df_ff.sort_index(inplace=True)
    return df_ff

def most_recent_values(df_i, columns, T, dt):
    df_bin = df_i.copy()
    df_bin.index = pd.cut(df_bin.index, _get_time_bins(T, dt), right=False)
    df_v = df_bin.groupby(level=0).last()
    df_v.columns = [str(col) + '_value' for col in df_v.columns]
    df_v = df_v.reindex(_get_time_bins_index(T, dt))
    return df_v

def summary_statistics(df_i, columns, stats_functions, T, dt):
    # e.g. stats_functions=['mean', 'min', 'max']
    if len(columns) == 0:
        return pd.DataFrame().reindex(_get_time_bins_index(T, dt))
    else:
        # Encode statistics for numeric, frequent variables
        df_numeric = df_i[columns]
        df = df_numeric.copy().astype(float)
        df.index = pd.cut(df.index, _get_time_bins(T, dt), right=False)
        df_v = df.reset_index().groupby('index').agg(stats_functions)
        df_v.columns = list(map('_'.join, df_v.columns.values))
        df_v = df_v.reindex(_get_time_bins_index(T, dt))
        return df_v

def check_imputed_output(df_v):
    # Check that imputation is successful
    ## If a column is all null -> OK
    ## If a column is all non-null -> OK
    ## If a column has some nulls -> they should only occur at the beginning
    not_null = df_v.notnull().all()
    all_null = df_v.isnull().all()
    cols_to_check = list(df_v.columns[~(not_null | all_null)])
    
    for col in cols_to_check:
        x = df_v[col].to_numpy()
        last_null_idx = np.where(pd.isnull(x))[0][-1] # Find the index of the last nan
        assert pd.isnull(x[:(last_null_idx+1)]).all() # all values up to here are nan
        assert (~pd.isnull(x[(last_null_idx+1):])).all() # all values after here are not nan
    return


######
# Post-filter: feature selection classes
######
try:
    from sklearn.feature_selection._base import SelectorMixin
except ImportError:
    from sklearn.feature_selection.base import SelectorMixin

class FrequencyThreshold_temporal(
    sklearn.base.BaseEstimator,
    SelectorMixin
):
    def __init__(self, threshold=0., L=None):
        assert L is not None
        self.threshold = threshold
        self.L = L
    
    def fit(self, X, y=None):
        # Reshape the (N*L) x D input into a 3-dimensional N x L x D array
        NL, D = X.shape
        X = X.reshape((int(NL/self.L), self.L, D))
        
        # Collapse the time dimension, generating N x D matrices
        X_notalways0 = X.any(axis=1)
        X_notalways1 = (1-X).any(axis=1)
        
        self.freqs_notalways0 = np.mean(X_notalways0, axis=0)
        self.freqs_notalways1 = np.mean(X_notalways1, axis=0)
        return self
    
    def _get_support_mask(self):
        # Keep a feature only if the fraction of examples in which it is not always 0
        # exceeds `threshold`, and likewise for not always 1
        mask = np.logical_and(
            self.freqs_notalways0 > self.threshold,
            self.freqs_notalways1 > self.threshold,
        )
        if hasattr(mask, "toarray"):
            mask = mask.toarray()
        if hasattr(mask, "todense"):
            mask = mask.todense()
        return mask
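# Illustrative usage (added; parameter values are hypothetical, not from the original code):
#     selector = FrequencyThreshold_temporal(threshold=0.05, L=48)
#     selector.fit(X)                # X: binary feature matrix of shape (N*48, D)
#     keep = selector.get_support()  # boolean mask over the D feature columns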

# Keep only the first feature in each group of pairwise, perfectly correlated features
class CorrelationSelector(
    sklearn.base.BaseEstimator,
    SelectorMixin,
):
    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        if hasattr(X, "to_scipy_sparse"): # sparse matrix
            X = X.to_scipy_sparse()
        
        # Calculate the correlation matrix
        # Keep only the lower triangular part
        if scipy.sparse.issparse(X):
            self.corr_matrix = sparse_corrcoef(X.T)
        else:
            self.corr_matrix = np.corrcoef(X.T)
        np.fill_diagonal(self.corr_matrix, 0)
        self.corr_matrix *= np.tri(*self.corr_matrix.shape)
        
        # get absolute values
        corr = abs(self.corr_matrix)
        
        # a coefficient close to 1 means perfectly correlated
        # Compare each feature to previous features (smaller index) to see if they have a correlation of 1
        to_drop = np.isclose(corr, 1.0).sum(axis=1).astype(bool)
        self.to_keep = ~to_drop
        
        return self
    
    def _get_support_mask(self):
        return self.to_keep
    
    def get_feature_aliases(self, feature_names):
        feature_names = [str(n) for n in feature_names]
        corr_matrix = self.corr_matrix
        flags = np.isclose(abs(corr_matrix), 1.0)
        alias_map = defaultdict(list)
        for i in range(1, corr_matrix.shape[0]):
            for j in range(i):
                if flags[i,j]:
                    if np.isclose(corr_matrix[i,j], 1.0):
                        alias_map[feature_names[j]].append(feature_names[i])
                    elif np.isclose(corr_matrix[i,j], -1.0):
                        alias_map[feature_names[j]].append('~{' + feature_names[i] + '}')
                    else:
                        assert False
                    
                    # Only save the alias for the first correlated feature in the list
                    break
        return dict(alias_map)

# https://stackoverflow.com/questions/19231268/correlation-coefficients-for-sparse-matrix-in-python
def sparse_corrcoef(A, B=None):
    if B is not None:
        A = scipy.sparse.vstack((A, B), format='csr')
    
    A = A.astype(np.float64)
    n = A.shape[1]
    
    # Compute the covariance matrix
    rowsum = A.sum(1)
    centering = rowsum.dot(rowsum.T.conjugate()) / n
    C = (A.dot(A.T.conjugate()) - centering) / (n - 1)
    
    # The correlation coefficients are given by
    # C_{i,j} / sqrt(C_{i,i} * C_{j,j})
    d = np.diag(C)
    coeffs = C / np.sqrt(np.outer(d, d))
    
    return np.array(coeffs)
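# Note (added): sparse_corrcoef computes Pearson correlations between the rows of A without
# densifying A itself (the resulting correlation matrix is dense); CorrelationSelector.fit
# calls it as sparse_corrcoef(X.T), so correlations are taken between feature columns.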