b/preprocessing/FIDDLE_steps_2.py
"""
FIDDLE preprocessing steps:
    1. Pre-filter
    2. Transform
    3. Post-filter
"""
try:
    from FIDDLE_helpers import *
except ImportError:
    from .FIDDLE_helpers import *
import time
import json
import pickle5 as pickle
import joblib
from datetime import datetime
import multiprocessing

# import pickle4reducer
# import multiprocessing
# ctx = multiprocessing.get_context()
# ctx.reducer = pickle4reducer.Pickle4Reducer()

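# Note: the wildcard import above is expected to provide pandas as pd, numpy as np, the
# `sparse` package, tqdm, defaultdict, and the column-name constants ID_col, t_col,
# var_col, and val_col used throughout this module, along with the FIDDLE helper functions.
# A minimal sketch of the assumed long-format input table (names and values are illustrative):
#
#     ID      t     variable_name   variable_value
#     1001    0.5   heart_rate      87
#     1001    NaN   sex             F
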
def detect_variable_data_type(df_data, value_type_override, args):
    data_path = args.data_path
    print_header('*) Detecting value types', char='-')
    
    data_types = []
    df = df_data
    assert val_col in df.columns
    
    # Collect the unique values of each variable
    # values_by_variable: dict(variable_name -> [value1, value2, ...])
    d = df[[var_col, val_col]].drop_duplicates().sort_values(by=[var_col, val_col])
    values_by_variable = defaultdict(list)
    for n, v in zip(d[var_col], d[val_col]):
        values_by_variable[n].append(v)
    
    # Determine type of each variable
    for variable, values in sorted(values_by_variable.items()):
        # Manual override type in config
        if variable in value_type_override:
            data_types.append((variable, value_type_override[variable]))
            # Force categorical values to be a string
            if value_type_override[variable] == 'Categorical' and \
                    any(is_numeric(v) for v in values if not pd.isnull(v)):
                m_var = df[var_col] == variable
                df.loc[m_var, val_col] = df.loc[m_var, val_col].apply(lambda s: '_' + str(s))
        else:
            if len(values) == 1 and pd.isnull(values[0]):
                data_types.append((variable, 'None'))
            elif all(is_numeric(v) for v in values if not pd.isnull(v)):
                data_types.append((variable, 'Numeric'))
            elif any(is_numeric(v) for v in values if not pd.isnull(v)):
                data_types.append((variable, 'Numeric + Categorical'))
            else:
                data_types.append((variable, 'Categorical'))
    
    df_types = pd.DataFrame(data_types, columns=['variable_name', 'value_type'])
    df_types[var_col] = df_types[var_col].astype(str)
    df_types = df_types.set_index(var_col)
    fpath = data_path + 'value_types.csv'
    df_types.to_csv(fpath, quoting=1)
    print('Saved as:', fpath)
    return df, df_types['value_type']
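# Illustrative contents of value_types.csv (variable names are hypothetical; quoting=1
# means every field is quoted):
#
#     "variable_name","value_type"
#     "heart_rate","Numeric"
#     "admission_type","Categorical"
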
def split_by_timestamp_type(df):
    print_header('*) Separate time-invariant and time-dependent', char='-')
    
    variables_inv = df[pd.isnull(df[t_col])][var_col].unique()  # Invariant variables have t = NULL
    df_time_invariant = df[df[var_col].isin(variables_inv)]
    df_time_series = df[~df[var_col].isin(variables_inv)]
    
    print('Variables (time-invariant):', len(variables_inv))
    print('Variables (time-dependent):', df[var_col].nunique() - len(variables_inv))
    print('# rows (time-invariant):', len(df_time_invariant))
    print('# rows (time-dependent):', len(df_time_series))
    return df_time_invariant, df_time_series

def process_time_dependent(df_data_time_series, args):
    data_path = args.data_path
    theta_2 = args.theta_2
    
    print_header('2-B) Transform time-dependent data', char='-')
    dir_path = data_path + '/'
    start_time = time.time()
    
    ## Create NxLxD^ table
    df_time_series, dtypes_time_series = transform_time_series_table(df_data_time_series, args)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)
    
    ##############
    joblib.dump(df_time_series, args.data_path + 'df_time_series,{}.joblib'.format(datetime.now().isoformat()))
    joblib.dump(dtypes_time_series, args.data_path + 'dtypes_time_series,{}.joblib'.format(datetime.now().isoformat()))
    ##############
    
    ## Map variables to features
    X_all, X_all_feature_names = map_time_series_features(df_time_series, dtypes_time_series, args)
    sparse.save_npz(dir_path + 'X_all.npz', X_all)
    with open(dir_path + 'X_all.feature_names.json', 'w') as f:
        json.dump(list(X_all_feature_names), f, sort_keys=True)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)
    
    ## Filter features
    if not args.postfilter:
        return X_all, X_all_feature_names, {}
    print_header('3-B) Post-filter time-dependent data', char='-')
    print(X_all.shape, X_all.density)
    X, X_feature_names, X_feature_aliases = post_filter_time_series(X_all, X_all_feature_names, theta_2, args)
    print(X.shape, X.density)
    print('Time elapsed: %f seconds' % (time.time() - start_time))
    
    ## Save output
    print()
    print('Output')
    print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
    sparse.save_npz(dir_path + 'X.npz', X)
    with open(dir_path + 'X.feature_names.json', 'w') as f:
        json.dump(list(X_feature_names), f, sort_keys=True)
    with open(dir_path + 'X.feature_aliases.json', 'w') as f:
        json.dump(X_feature_aliases, f, sort_keys=True)
    
    print('Total time: %f seconds' % (time.time() - start_time))
    print('', flush=True)
    return X, X_feature_names, X_feature_aliases


######
# Time-series routines
######
def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, stats_functions, impute=True):
    try:
        assert g.index.nunique() == 1
        assert g.index.unique()[0] == i
        # Non-frequent variables
        variables_non = sorted(set(variables) - set(variables_num_freq))
        if len(variables_non) > 0:
            df_j = pivot_event_table(g).reindex(columns=variables_non).sort_index()
            df_values_j = most_recent_values(df_j, variables, T, dt)
            df_out = df_values_j
        
        if len(variables_num_freq) > 0:
            # Frequent variables
            # We only produce the mask, ffill, and statistics if the data is measured frequently enough
            df_i = pivot_event_table(g).reindex(columns=variables_num_freq).sort_index()
            mask_i = presence_mask(df_i, variables_num_freq, T, dt)
            # delta_t_i = get_delta_time(mask_i)
            # df_i = impute_ffill(df_i, variables_num_freq, T, dt, mask_i)
            df_stats_i = summary_statistics(df_i, variables_num_freq, stats_functions, T, dt)
            df_values_i = most_recent_values(df_i, variables, T, dt)
            # if impute:
            #     check_imputed_output(df_values_i)
            #     check_imputed_output(df_stats_i)
            if len(variables_non) > 0:
                df_out = df_out.join([mask_i, df_values_i, df_stats_i])
            else:
                df_out = mask_i.join([df_values_i, df_stats_i])
            # df_out = df_out.join([mask_i, delta_t_i, df_values_i, df_stats_i])
    except:
        print(i)
        print(g)
        raise
    return i, df_out
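# A sketch of the per-example output above, assuming the helpers follow FIDDLE's column naming:
# df_out has one row per time bin; every variable contributes a "<var>_value" column (most
# recent value in the bin), and frequent numeric variables additionally contribute a
# "<var>_mask" presence column plus one column per summary statistic in stats_functions.
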
def divide_chunks(l, n):
    # Yield successive chunks of size n from the list l
    for i in range(0, len(l), n):
        yield l[i:i + n]
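# Illustrative usage:
#     >>> list(divide_chunks([1, 2, 3, 4, 5], 2))
#     [[1, 2], [3, 4], [5]]
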
def form_batches_of_examples(df_in, args, batch_size=1000):
    grouped = df_in.set_index(ID_col)
    IDs = list(grouped.index.unique())
    batches_IDs = list(divide_chunks(IDs, batch_size))
    batches = [grouped.loc[chunk] for chunk in batches_IDs]
    return batches, batches_IDs

def process_batch_time_series(first_arg):
    batch, batch_IDs, args = first_arg
    variables, variables_num_freq = args.variables, args.variables_num_freq
    out = dict(
        func_encode_single_time_series(i, batch.loc[i:i], variables, variables_num_freq, args.T, args.dt, args.stats_functions)
        for i in batch_IDs
    )
    return out

def transform_time_series_table(df_in, args):
    data_path = args.data_path
    theta_freq = args.theta_freq
    stats_functions = args.stats_functions
    N, L = args.N, args.L
    df_population = args.df_population
    parallel = args.parallel
    
    ## TODO: assert the shape of df_in
    
    # Determine all unique variable names
    variables = args.variables
    # variables = get_unique_variables(df_in)
    # assert df_in[var_col].nunique() == len(variables)
    print('Total variables :', len(variables), flush=True)
    
    # Determine frequent variables -> we'll calculate statistics, mask, and delta_time only on these
    variables_num_freq = args.variables_num_freq
    # variables_num_freq = get_frequent_numeric_variables(df_in, variables, theta_freq, args)
    print('Frequent variables :', list(variables_num_freq))
    print('{} = {}'.format('M\u2081', len(variables_num_freq)))
    print('{} = {}'.format('M\u2082', len(variables) - len(variables_num_freq)))
    print('{} = {} {}'.format('k ', len(stats_functions), stats_functions))
    
    print()
    print('Transforming each example...', flush=True)
    args.variables = variables
    args.variables_num_freq = variables_num_freq
    
    # Encode the time-series table for each patient, in batches
    batches, batches_IDs = form_batches_of_examples(df_in, args, batch_size=1000)
    print('Batches of size 1000: ', len(batches), flush=True)
    
    ### TEST
    # for batch, batch_IDs in zip(batches, batches_IDs):
    #     process_batch_time_series((batch, batch_IDs, args))
    #     print('done one')
    #     break
    # exit()
    ### TEST
    
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    out = list(tqdm(pool.imap_unordered(
        process_batch_time_series,
        zip(batches, batches_IDs, [args]*len(batches))), total=len(batches)
    ))
    pool.close()
    pool.join()
    
    out = dict((key, d[key]) for d in out for key in d)
    print()
    print('Parallel processing done', flush=True)
    
    # Handle IDs not in the table
    # df_original = list(out.values())[0]
    # df_copy = pd.DataFrame().reindex_like(df_original)
    # for i, j in df_original.dtypes.iteritems():
    #     if i.endswith('_mask'):
    #         assert j == bool
    #         df_copy[i] = False
    #         df_copy[i] = df_copy[i].astype(bool)
    #     if i.endswith('_delta_time'):
    #         df_copy[i] = 0
    #         df_copy[i] = df_copy[i].astype(int)
    #     if j == 'object':
    #         df_copy[i] = df_copy[i].astype('object')
    
    # for ID in tqdm(df_population.index.values[:N]):
    #     if ID not in out:
    #         out[ID] = df_copy.copy()
    
    # out = {ID: out[ID] for ID in df_population.index.values}
    N = len(out)
    # assert len(out) == N
    D_timeseries = out
    
    print('Filled no-data examples done', flush=True)
    # # Check that each example has an identical LxD table structure
    # ID0 = sorted(D_timeseries.keys())[0]
    # df0 = D_timeseries[ID0]
    # for ID, df_i in D_timeseries.items():
    #     pd.testing.assert_index_equal(df_i.index, df0.index)
    #     pd.testing.assert_index_equal(df_i.columns, df0.columns)
    
    D_timeseries = out
    D_ = len(list(D_timeseries.values())[0].columns)
    
    ########
    # joblib.dump(D_timeseries, args.data_path + 'D_timeseries,{}.joblib'.format(datetime.now().isoformat()))
    ########
    
    # # (N*L)xD^ table
    # ## Create MultiIndex of (ID, time_bin)
    # index = sum([
    #     [(ID, t_) for t_ in list(df_.index)]
    #     for ID, df_ in sorted(D_timeseries.items())
    # ], [])
    # index = pd.Index(index)
    index = [ID for ID, df_ in sorted(D_timeseries.items())]
    index = pd.Index(index)
    # assert len(index) == N * L
    
    ## Assume all dataframes have the same columns, used after concatenation
    columns = list(sorted(D_timeseries.items())[0][1].columns)
    columns = np.array(columns)
    dtypes = sorted(D_timeseries.items())[0][1].dtypes
    
    ## Convert each df to a numpy array
    ## Concatenate **sorted** numpy arrays (faster than calling pd.concat)
    feature_values = [(ID, df_.to_numpy()) for ID, df_ in sorted(D_timeseries.items())]
    time_series = np.concatenate([feat_val[1] for feat_val in feature_values])
    # assert time_series.shape == (len(index), len(columns))
    
    df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns)
    
    print()
    print('(N \u00D7 L \u00D7 ^D) table :\t', (N, L, len(columns)))
    return df_time_series, dtypes
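# Note (inferred from usage below): the returned df_time_series stacks each example's
# per-time-bin table, and the returned dtypes (taken from one example) are what
# map_time_series_features passes to select_dtype() to separate the boolean "_mask"
# columns from the remaining mixed-type columns.
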
def map_time_invariant_features(df, bin_numeric=True):
    # Categorical -> binary features
    # Numeric     -> binary/float-valued features
    if bin_numeric:
        # df_mixed = df.apply(smart_qcut, q=5)
        # features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
        # time_invariant_features = features_mixed
        # assert time_invariant_features.astype(int).dtypes.nunique() == 1
        print('start discretization', flush=True)
        # out = [smart_qcut_dummify(df[col], q=5) for col in tqdm(df.columns)]
        
        # pool = multiprocessing.Pool(multiprocessing.cpu_count())
        # out = list(tqdm(pool.imap_unordered(
        #     smart_qcut_dummify_5,
        #     [df[col] for col in tqdm(df.columns)]), total=len(df.columns)
        # ))
        # pool.close()
        # pool.join()
        
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        out_0 = list(tqdm(pool.imap_unordered(
            smart_qcut,
            [df[col] for col in df.columns]), total=len(df.columns)
        ))
        cols_data, discretization_bins = zip(*out_0)
        out = list(tqdm(pool.imap_unordered(dummify, cols_data), total=len(df.columns)))
        pool.close()
        pool.join()
        
        time_invariant_features = pd.concat(out, axis=1).sort_index(axis=1)
        feature_names_all = time_invariant_features.columns.values
        sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0))
        s_ = sparse.COO(sdf.sparse.to_coo())
    else:
        raise NotImplementedError
        # NOTE: unreachable reference implementation kept below
        # Split a mixed column into numeric and string columns
        for col in df.columns:
            col_data = df[col]
            col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
            if not all(col_is_numeric) and any(col_is_numeric):  # have mixed-type values
                numeric_mask = col_data.apply(is_numeric)
                df[col+'_str'] = df[col].copy()
                df.loc[~numeric_mask, col] = np.nan
                df.loc[numeric_mask, col+'_str'] = np.nan
        
        out = [smart_dummify_impute(df[col]) for col in tqdm(df.columns)]
        time_invariant_features = pd.concat(out, axis=1)
        feature_names_all = time_invariant_features.columns.values
        sdf = time_invariant_features.astype(pd.SparseDtype(float, fill_value=0))
        s_ = sparse.COO(sdf.sparse.to_coo())
    
    print()
    print('Output')
    print('s_all, binary features :\t', s_.shape)
    return s_, feature_names_all, dict(discretization_bins)

##########

def map_time_series_features(df_time_series, dtypes, args):
    N, L = args.N, args.L
    parallel = args.parallel
    
    df_time_series = df_time_series.dropna(axis='columns', how='all').sort_index()
    
    print('Discretizing features...', flush=True)
    ts_mask = select_dtype(df_time_series, 'mask', dtypes)
    ts_mixed = select_dtype(df_time_series, '~mask', dtypes)
    assert len(ts_mixed.columns) + len(ts_mask.columns) == len(df_time_series.columns)
    ts_feature_mask = ts_mask.astype(int)
    ts_mixed_cols = [ts_mixed[col] for col in ts_mixed.columns]
    
    print()
    if args.binarize:
        dtype = int
        print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')
        
        print(' Binning numeric variables by quintile...')
        print(' Converting variables to binary features')
        if parallel:
            # out = Parallel(n_jobs=n_jobs, verbose=10)(  # Need to share global variables
            #     delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols
            # )
            # out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
            
            # smart_qcut and dummify each take a single column, so pool.map is used here and
            # the (column, bins) pairs are unpacked with zip(*...), mirroring map_time_invariant_features
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            out_0 = pool.map(smart_qcut, ts_mixed_cols)
            cols_data, discretization_bins = zip(*out_0)
            out = pool.map(dummify, cols_data)
            pool.close()
            pool.join()
        else:
            # NOTE: this serial path does not populate discretization_bins
            out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
    else:
        raise NotImplementedError
        # NOTE: unreachable reference implementation kept below
        dtype = float
        df = ts_mixed.copy()
        
        # Split a mixed column into numeric and string columns
        for col in df.columns:
            col_data = df[col]
            col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
            if not all(col_is_numeric) and any(col_is_numeric):  # have mixed-type values
                numeric_mask = col_data.apply(is_numeric)
                df[col+'_str'] = df[col].copy()
                df.loc[~numeric_mask, col] = np.nan
                df.loc[numeric_mask, col+'_str'] = np.nan
        
        ts_mixed_cols = [df[col] for col in df.columns]
        
        print('Discretizing categorical features...')
        if parallel:
            # out = Parallel(n_jobs=n_jobs, verbose=10)(  # Need to share global variables?
            #     delayed(smart_dummify_impute)(col_data) for col_data in ts_mixed_cols
            # )
            out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)]
        else:
            out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)]
    
    out = [ts_feature_mask, *out]
    D_all = sum(len(df_i.columns) for df_i in out)
    X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], []))
    X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(dtype)
    X_all = sparse.COO(X_dense)
    
    print('Finished discretizing features', flush=True)
    assert X_all.shape[0] == N * L
    X_all = X_all.reshape((N, L, D_all))
    
    print()
    print('Output')
    print('X_all: shape={}, density={:.3f}'.format(X_all.shape, X_all.density))
    return X_all, X_all_feature_names, dict(discretization_bins)

def print_metadata():
    # Print metadata
    # NOTE: kept for reference only; relies on names (df_time_series, N, L, data_path,
    # variables, variables_num_freq) that are not defined inside this function
    print('DONE: Transforming each example...')
    ## Freq: count missing entries using the mask
    ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]]
    ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns]
    print('(freq) number of missing entries :\t',
          '{} out of {}={} total'.format(
              (1 - ts_mask).astype(int).sum().sum(),
              '\u00D7'.join(str(i) for i in [N, L, ts_mask.shape[1]]), ts_mask.size))
    
    ## Freq: count imputed entries using the mask and delta_time
    ts_delta_time = df_time_series[[col for col in df_time_series if col.endswith('_delta_time')]]
    ts_delta_time.columns = [col.replace('_delta_time', '') for col in ts_delta_time.columns]
    
    imputed = (1 - ts_mask).astype(bool) & (ts_delta_time > 0)
    print('(freq) number of imputed entries :\t',
          '{}'.format(imputed.sum().sum(), ts_delta_time.size))
    imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_imputed.csv')
    
    not_imputed = (1 - ts_mask).astype(bool) & (ts_delta_time == 0)
    print('(freq) number of not imputed entries :\t',
          '{}'.format(not_imputed.sum().sum(), ts_delta_time.size))
    not_imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_not_imputed.csv')
    
    ## Non-freq: count missing entries
    non_freq_cols = sorted([c + '_value' for c in set(variables) - set(variables_num_freq)])
    non_freqs = df_time_series[non_freq_cols]
    print('(non-freq) number of missing entries :\t',
          '{} out of {}={} total'.format(
              non_freqs.isna().sum().sum(),
              '\u00D7'.join(str(i) for i in [N, L, non_freqs.shape[1]]), non_freqs.size))

def post_filter_time_series(X_all, feature_names_all, threshold, args):
    N, L = args.N, args.L
    assert X_all.shape[0] == N
    assert X_all.shape[1] == L
    # assert X_all.dtype == int
    start_time = time.time()
    
    X0 = X_all
    feature_names_0 = feature_names_all
    print('Original :', len(feature_names_0))
    
    ## Remove nearly-constant features (with low variance)
    sel_const = FrequencyThreshold_temporal(threshold=threshold, L=L)
    sel_const.fit(X0.reshape((N*L, -1)))
    m_ts_const = sel_const.get_support()
    assert len(m_ts_const) == X0.shape[-1]
    X1 = X0[:, :, m_ts_const]
    feature_names_1 = feature_names_0[m_ts_const]
    print('Nearly-constant:', len(feature_names_0) - len(feature_names_1))
    print('*** time: ', time.time() - start_time)
    
    ## Keep only first of pairwise perfectly correlated features
    sel_ts_corr = CorrelationSelector()
    sel_ts_corr.fit(X1.reshape((N*L, -1)))
    m_ts_corr = sel_ts_corr.get_support()
    assert len(m_ts_corr) == X1.shape[-1]
    X2 = X1[:, :, m_ts_corr]
    feature_names_2 = feature_names_1[m_ts_corr]
    feature_aliases = sel_ts_corr.get_feature_aliases(feature_names_1)
    print('Correlated :', len(feature_names_1) - len(feature_names_2))
    print('*** time: ', time.time() - start_time)
    
    X = sparse.COO(X2)
    feature_names = feature_names_2
    assert X.shape == (N, L, len(feature_names))
    
    ## Save output
    print()
    print('Output')
    print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
    
    return X, feature_names, feature_aliases

def pre_filter(df, threshold, df_population, args):
    T = int(args.T)
    theta_1 = args.theta_1
    df_population = args.df_population
    
    # Remove rows not in population
    print('Remove rows not in population')
    df = df[df['ID'].isin(df_population.index)]
    
    # Remove rows with t outside of [0, T)
    print('Remove rows with t outside of [0, {})'.format(T))
    df = df[pd.isnull(df[t_col]) | ((0 <= df[t_col]) & (df[t_col] < T))]
    
    # Data tables should not contain duplicate rows
    # Check for inconsistencies
    dups = df.duplicated(subset=[ID_col, t_col, var_col], keep=False)
    if any(dups):
        print(df[dups].head())
        raise Exception('Inconsistent values recorded')
    
    # Remove variables that occur too rarely, as defined by the threshold
    print('Remove rare variables (<= {})'.format(threshold))
    
    ## Calculate the overall occurrence rate of each variable based on IDs
    df_count = calculate_variable_counts(df, df_population)  # (N x |var|) table of counts
    df_bool = df_count.astype(bool)  # convert counts to boolean
    
    ## Keep variables that are recorded for more than a threshold fraction of IDs
    variables_keep = df_bool.columns[df_bool.mean(axis=0) > threshold]
    df_out = df[df[var_col].isin(variables_keep)]
    assert set(variables_keep) == set(df_out[var_col].unique())
    
    variables = sorted(df_bool.columns)
    variables_remove = sorted(set(variables) - set(variables_keep))
    print('Total variables     :', len(variables))
    print('Rare variables      :', len(variables_remove))
    print('Remaining variables :', len(variables_keep))
    print('# rows (original)   :', len(df))
    print('# rows (filtered)   :', len(df_out))
    return df_out
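# Worked example (illustrative numbers): with threshold = 0.001 and 10,000 IDs in
# df_population, a variable is kept only if it is recorded for more than
# 0.001 * 10,000 = 10 distinct IDs.
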
######
# Time-invariant routines
######
def transform_time_invariant_table(df_in, df_population):
    df_in = df_in.copy()
    
    # Recorded value (np.nan if not recorded)
    df_value = pd.pivot_table(df_in, val_col, ID_col, var_col, 'last', np.nan)
    df_value = df_value.reindex(index=df_population.index, fill_value=np.nan)
    df_value.columns = [str(col) + '_value' for col in df_value.columns]
    
    print('(N \u00D7 ^d) table :\t', df_value.shape)
    print('number of missing entries :\t', '{} out of {} total'.format(df_value.isna().sum().sum(), df_value.size))
    return df_value

def smart_qcut_dummify_5(x):
    return smart_qcut_dummify(x, q=5)

def post_filter(s_, s_feature_names_all, threshold):
    # Filter features (optional)
    assert s_.shape[1] == len(s_feature_names_all)
    feature_names_0 = s_feature_names_all
    s0 = s_.to_scipy_sparse()
    print('Original :', len(feature_names_0))
    
    ## Remove nearly-constant features (with low variance):
    ## a binary feature is removed if it equals 0 (or 1) for at least a (1 - th) fraction
    ## of examples, i.e., if its variance is <= th * (1 - th)
    sel_rare = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    s1 = sel_rare.fit_transform(s0)
    feature_names_1 = feature_names_0[sel_rare.get_support()]
    print('Nearly-constant:', len(feature_names_0) - len(feature_names_1))
    
    ## Keep only first of pairwise perfectly correlated features
    sel_corr = CorrelationSelector()
    s2 = sel_corr.fit_transform(s1)
    feature_names_2 = feature_names_1[sel_corr.get_support()]
    feature_aliases = sel_corr.get_feature_aliases(feature_names_1)
    print('Correlated :', len(feature_names_1) - len(feature_names_2))
    
    s = sparse.COO(s2)
    feature_names = feature_names_2
    assert s.shape[1] == len(feature_names)
    
    return s, feature_names, feature_aliases
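# Worked example (illustrative): with threshold = 0.001 the variance cutoff is
# 0.001 * 0.999 = 0.000999, so a binary feature that is 1 for roughly 0.1% of rows or fewer
# (or for roughly 99.9% of rows or more) is dropped.
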
def process_time_invariant(df_data_time_invariant, args):
    data_path = args.data_path
    df_population = args.df_population
    theta_2 = args.theta_2
    
    print_header('2-A) Transform time-invariant data', char='-')
    dir_path = data_path + '/'
    start_time = time.time()
    
    ## Create Nxd^ table
    df_time_invariant = transform_time_invariant_table(df_data_time_invariant, df_population)
    print('Time elapsed: %f seconds' % (time.time() - start_time))
    
    ## Discretize
    s_all, s_all_feature_names, s_discretization = map_time_invariant_features(df_time_invariant, args.binarize)
    sparse.save_npz(dir_path + 's_all.npz', s_all)
    with open(dir_path + 's_all.feature_names.json', 'w') as f:
        json.dump(list(s_all_feature_names), f, sort_keys=True)
    print('Time elapsed: %f seconds' % (time.time() - start_time))
    with open(dir_path + 'discretization.json', 'w') as f:
        json.dump(s_discretization, f)
    
    print_header('3-A) Post-filter time-invariant data', char='-')
    
    ## Filter
    s, s_feature_names, s_feature_aliases = post_filter(s_all, s_all_feature_names, theta_2)
    print('Time elapsed: %f seconds' % (time.time() - start_time))
    
    ## Save output
    print()
    print('Output')
    print('s: shape={}, density={:.3f}'.format(s.shape, s.density))
    sparse.save_npz(dir_path + 's.npz', s)
    
    with open(dir_path + 's.feature_names.json', 'w') as f:
        json.dump(list(s_feature_names), f, sort_keys=True)
    with open(dir_path + 's.feature_aliases.json', 'w') as f:
        json.dump(s_feature_aliases, f, sort_keys=True)
    
    print('Total time: %f seconds' % (time.time() - start_time))
    print('', flush=True)
    return s, s_feature_names, s_feature_aliases