# preprocessing/FIDDLE_steps_2.py
"""
2
FIDDLE Preprocessing steps
3
1. Pre-filter
4
2. Transform
5
3. Post-filter
6
"""
7
try:
8
    from FIDDLE_helpers import *
9
except:
10
    from .FIDDLE_helpers import *
11
import time
12
import json
13
import pickle5 as pickle
14
import joblib
15
from datetime import datetime
16
import multiprocessing
17
18
# import pickle4reducer
19
# import multiprocessing
20
# ctx = multiprocessing.get_context()
21
# ctx.reducer = pickle4reducer.Pickle4Reducer()
22
23
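# Expected call order (a sketch inferred from the functions in this module; the actual
# driver script lives outside this file and its argument handling may differ):
#
#     df, df_types = detect_variable_data_type(df_data, value_type_override, args)
#     df = pre_filter(df, args.theta_1, args.df_population, args)
#     df_inv, df_ts = split_by_timestamp_type(df)
#     s, s_names, s_aliases = process_time_invariant(df_inv, args)
#     X, X_names, X_aliases = process_time_dependent(df_ts, args)
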
def detect_variable_data_type(df_data, value_type_override, args):
    data_path = args.data_path
    print_header('*) Detecting value types', char='-')

    data_types = []
    df = df_data
    assert val_col in df.columns

    # Collect the unique values of each variable
    # values_by_variable: dict(variable_name -> [value1, value2, ...])
    d = df[[var_col, val_col]].drop_duplicates().sort_values(by=[var_col, val_col])
    values_by_variable = defaultdict(list)
    for n, v in zip(d[var_col], d[val_col]):
        values_by_variable[n].append(v)

    # Determine type of each variable
    for variable, values in sorted(values_by_variable.items()):
        # Manual override type in config
        if variable in value_type_override:
            data_types.append((variable, value_type_override[variable]))
            # Force categorical values to be a string
            if value_type_override[variable] == 'Categorical' and \
                any(is_numeric(v) for v in values if not pd.isnull(v)):
                m_var = df[var_col] == variable
                df.loc[m_var, val_col] = df.loc[m_var, val_col].apply(lambda s: '_' + str(s))
        else:
            if len(values) == 1 and pd.isnull(values[0]):
                data_types.append((variable, 'None'))
            elif all(is_numeric(v) for v in values if not pd.isnull(v)):
                data_types.append((variable, 'Numeric'))
            elif any(is_numeric(v) for v in values if not pd.isnull(v)):
                data_types.append((variable, 'Numeric + Categorical'))
            else:
                data_types.append((variable, 'Categorical'))

    df_types = pd.DataFrame(data_types, columns=['variable_name', 'value_type'])
    df_types[var_col] = df_types[var_col].astype(str)
    df_types = df_types.set_index(var_col)
    fpath = data_path + 'value_types.csv'
    df_types.to_csv(fpath, quoting=1)
    print('Saved as:', fpath)
    return df, df_types['value_type']


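# Illustrative behaviour of the type detection above (toy values, not from the code):
#   observed values              -> detected value_type
#   [1.0, 2.5, 7]                -> 'Numeric'
#   ['A', 'B']                   -> 'Categorical'
#   [0.4, 'ERROR']               -> 'Numeric + Categorical'
#   [NaN]                        -> 'None'
# A 'Categorical' override on numeric-looking values prefixes them with '_' so they are
# treated as strings downstream.
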
def split_by_timestamp_type(df):
    print_header('*) Separate time-invariant and time-dependent', char='-')

    variables_inv = df[pd.isnull(df[t_col])][var_col].unique() # Invariant variables have t = NULL
    df_time_invariant = df[df[var_col].isin(variables_inv)]
    df_time_series = df[~df[var_col].isin(variables_inv)]

    print('Variables (time-invariant):', len(variables_inv))
    print('Variables (time-dependent):', df[var_col].nunique() - len(variables_inv))
    print('# rows    (time-invariant):', len(df_time_invariant))
    print('# rows    (time-dependent):', len(df_time_series))
    return df_time_invariant, df_time_series


def process_time_dependent(df_data_time_series, args):
    data_path = args.data_path
    theta_2 = args.theta_2

    print_header('2-B) Transform time-dependent data', char='-')
    dir_path = data_path + '/'
    start_time = time.time()

    ## Create NxLxD^ table
    df_time_series, dtypes_time_series = transform_time_series_table(df_data_time_series, args)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)

    ##############
    joblib.dump(df_time_series, args.data_path + 'df_time_series,{}.joblib'.format(datetime.now().isoformat()))
    joblib.dump(dtypes_time_series, args.data_path + 'dtypes_time_series,{}.joblib'.format(datetime.now().isoformat()))
    ##############

    ## Map variables to features
    # map_time_series_features also returns the discretization bins
    # (cf. the time-invariant path, which saves its counterpart to discretization.json)
    X_all, X_all_feature_names, X_discretization = map_time_series_features(df_time_series, dtypes_time_series, args)
    sparse.save_npz(dir_path + 'X_all.npz', X_all)
    with open(dir_path + 'X_all.feature_names.json', 'w') as f:
        json.dump(list(X_all_feature_names), f, sort_keys=True)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)

    ## Filter features
    if not args.postfilter:
        return X_all, X_all_feature_names, {}
    print_header('3-B) Post-filter time-dependent data', char='-')
    print(X_all.shape, X_all.density)
    X, X_feature_names, X_feature_aliases = post_filter_time_series(X_all, X_all_feature_names, theta_2, args)
    print(X.shape, X.density)
    print('Time elapsed: %f seconds' % (time.time() - start_time))

    ## Save output
    print()
    print('Output')
    print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
    sparse.save_npz(dir_path + 'X.npz', X)
    with open(dir_path + 'X.feature_names.json', 'w') as f:
        json.dump(list(X_feature_names), f, sort_keys=True)
    with open(dir_path + 'X.feature_aliases.json', 'w') as f:
        json.dump(X_feature_aliases, f, sort_keys=True)

    print('Total time: %f seconds' % (time.time() - start_time))
    print('', flush=True)
    return X, X_feature_names, X_feature_aliases

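# Files written by process_time_dependent (relative to args.data_path):
#   X_all.npz, X_all.feature_names.json                      - all time-dependent features
#   X.npz, X.feature_names.json, X.feature_aliases.json      - post-filtered features
#   df_time_series,<timestamp>.joblib, dtypes_time_series,<timestamp>.joblib - intermediate dumps
# (the post-filtered files are only written when args.postfilter is set).
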
######
# Time-series routines
######
def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, stats_functions, impute=True):
    try:
        assert g.index.nunique() == 1
        assert g.index.unique()[0] == i
        # non-frequent
        variables_non = sorted(set(variables) - set(variables_num_freq))
        if len(variables_non) > 0:
            df_j = pivot_event_table(g).reindex(columns=variables_non).sort_index()
            df_values_j = most_recent_values(df_j, variables, T, dt)
            df_out = df_values_j

        if len(variables_num_freq) > 0:
            # frequent
            # only produce mask, ffill, and statistics for variables measured frequently enough
            df_i = pivot_event_table(g).reindex(columns=variables_num_freq).sort_index()
            mask_i = presence_mask(df_i, variables_num_freq, T, dt)
#             delta_t_i = get_delta_time(mask_i)
#             df_i = impute_ffill(df_i, variables_num_freq, T, dt, mask_i)
            df_stats_i = summary_statistics(df_i, variables_num_freq, stats_functions, T, dt)
            df_values_i = most_recent_values(df_i, variables, T, dt)
#             if impute:
#                 check_imputed_output(df_values_i)
#                 check_imputed_output(df_stats_i)
            if len(variables_non) > 0:
                df_out = df_out.join([mask_i, df_values_i, df_stats_i])
            else:
                df_out = mask_i.join([df_values_i, df_stats_i])
#             df_out = df_out.join([mask_i, delta_t_i, df_values_i, df_stats_i])
    except Exception:
        # Print the offending example before re-raising to aid debugging
        print(i)
        print(g)
        raise
    return i, df_out

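# Sketch of the per-example table produced above (column naming is an assumption based on
# the FIDDLE_helpers routines used; the exact names come from those helpers):
#   index   : the L time bins of width dt covering [0, T)
#   columns : '<var>_value' for every variable (most recent value in each bin),
#             '<var>_mask' for frequent numeric variables (was it measured in the bin?),
#             one column per (frequent variable, statistic) for args.stats_functions.
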
def divide_chunks(l, n):
    # Yield successive n-sized chunks of l
    for i in range(0, len(l), n):
        yield l[i:i + n]

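# e.g. list(divide_chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]
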
def form_batches_of_examples(df_in, args, batch_size=1000):
    grouped = df_in.set_index(ID_col)
    IDs = list(grouped.index.unique())
    batches_IDs = list(divide_chunks(IDs, batch_size))
    batches = [grouped.loc[chunk] for chunk in batches_IDs]
    return batches, batches_IDs

def process_batch_time_series(first_arg):
    batch, batch_IDs, args = first_arg
    variables, variables_num_freq = args.variables, args.variables_num_freq
    out = dict(
        func_encode_single_time_series(i, batch.loc[i:i], variables, variables_num_freq, args.T, args.dt, args.stats_functions)
        for i in batch_IDs
    )
    return out

def transform_time_series_table(df_in, args):
    data_path = args.data_path
    theta_freq = args.theta_freq
    stats_functions = args.stats_functions
    N, L = args.N, args.L
    df_population = args.df_population
    parallel = args.parallel

    ## TODO: assert shape of df_in

    # Determine all unique variable names
    variables = args.variables
#     variables = get_unique_variables(df_in) ############
#     assert df_in[var_col].nunique() == len(variables)
    print('Total variables    :', len(variables), flush=True)

    # Determine frequent variables -> we'll calculate statistics, mask, and delta_time only on these
    variables_num_freq = args.variables_num_freq
#     variables_num_freq = get_frequent_numeric_variables(df_in, variables, theta_freq, args)
    print('Frequent variables :', list(variables_num_freq))
    print('{} = {}'.format('M\u2081', len(variables_num_freq)))
    print('{} = {}'.format('M\u2082', len(variables) - len(variables_num_freq)))
    print('{} = {} {}'.format('k ', len(stats_functions), stats_functions))

    print()
    print('Transforming each example...', flush=True)
    args.variables = variables
    args.variables_num_freq = variables_num_freq

    # Encode time series table for each patient
    batches, batches_IDs = form_batches_of_examples(df_in, args, batch_size=1000)
    print('Batches of size 1000: ', len(batches), flush=True)

### TEST
#     for batch, batch_IDs in zip(batches, batches_IDs):
#         process_batch_time_series((batch, batch_IDs, args))
#         print('done one')
#         break
#     exit()
### TEST

    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    out = list(tqdm(pool.imap_unordered(
        process_batch_time_series,
        zip(batches, batches_IDs, [args]*len(batches))), total=len(batches)
    ))
    pool.close()
    pool.join()

    out = dict((key, d[key]) for d in out for key in d)
    print()
    print('Parallel processing done', flush=True)

    # Handle IDs not in the table
#     df_original = list(out.values())[0]
#     df_copy = pd.DataFrame().reindex_like(df_original)
#     for i, j in df_original.dtypes.iteritems():
#         if i.endswith('_mask'):
#             assert j == bool
#             df_copy[i] = False
#             df_copy[i] = df_copy[i].astype(bool)
#         if i.endswith('_delta_time'):
#             df_copy[i] = 0
#             df_copy[i] = df_copy[i].astype(int)
#         if j == 'object':
#             df_copy[i] = df_copy[i].astype('object')

#     for ID in tqdm(df_population.index.values[:N]):
#         if ID not in out:
#             out[ID] = df_copy.copy()

#     out = {ID: out[ID] for ID in df_population.index.values}
    N = len(out)
#     assert len(out) == N
    D_timeseries = out

    print('Filled no-data examples done', flush=True)
#     # check that each example has an identical LxD table structure
#     ID0 = sorted(D_timeseries.keys())[0]
#     df0 = D_timeseries[ID0]
#     for ID, df_i in D_timeseries.items():
#         pd.testing.assert_index_equal(df_i.index, df0.index)
#         pd.testing.assert_index_equal(df_i.columns, df0.columns)

    D_timeseries = out
    D_ = len(list(D_timeseries.values())[0].columns)

    ########
#     joblib.dump(D_timeseries, args.data_path + 'D_timeseries,{}.joblib'.format(datetime.now().isoformat()))
    ########

    # (N*L)xD^ table
    ## Create MultiIndex of (ID, time_bin) so the stacked array has N*L rows
    index = sum([
        [(ID, t_) for t_ in list(df_.index)]
        for ID, df_ in sorted(D_timeseries.items())
    ], [])
    index = pd.Index(index)
#     assert len(index) == N * L

    ## Assume all dataframes have the same columns, used after concatenation
    columns = list(sorted(D_timeseries.items())[0][1].columns)
    columns = np.array(columns)
    dtypes = sorted(D_timeseries.items())[0][1].dtypes

    ## Convert each df to a numpy array
    ## Concatenate **sorted** numpy arrays (faster than calling pd.concat)
    feature_values = [(ID, df_.to_numpy()) for ID, df_ in sorted(D_timeseries.items())]
    time_series = np.concatenate([feat_val[1] for feat_val in feature_values])
#     assert time_series.shape == (len(index), len(columns))

    df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns)

    print()
    print('(N \u00D7 L \u00D7 ^D) table :\t', (N, L, len(columns)))
    return df_time_series, dtypes

def map_time_invariant_features(df, bin_numeric=True):
    # Categorical -> binary features
    # Numeric -> binary/float-valued features
    if bin_numeric:
#         df_mixed = df.apply(smart_qcut, q=5)
#         features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
#         time_invariant_features = features_mixed
#         assert time_invariant_features.astype(int).dtypes.nunique() == 1
        print('start discretization', flush=True)
#         out = [smart_qcut_dummify(df[col], q=5) for col in tqdm(df.columns)]

#         pool = multiprocessing.Pool(multiprocessing.cpu_count())
#         out = list(tqdm(pool.imap_unordered(
#             smart_qcut_dummify_5,
#             [df[col] for col in tqdm(df.columns)]), total=len(df.columns)
#         ))
#         pool.close()
#         pool.join()

        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        out_0 = list(tqdm(pool.imap_unordered(
            smart_qcut,
            [df[col] for col in df.columns]), total=len(df.columns)
        ))
        cols_data, discretization_bins = zip(*out_0)
        out = list(tqdm(pool.imap_unordered(dummify, cols_data), total=len(df.columns)
        ))
        pool.close()
        pool.join()

        time_invariant_features = pd.concat(out, axis=1).sort_index(axis=1)
        feature_names_all = time_invariant_features.columns.values
        sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0))
        s_ = sparse.COO(sdf.sparse.to_coo())
    else:
        raise NotImplementedError
        # Split a mixed column into numeric and string columns
        for col in df.columns:
            col_data = df[col]
            col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
            if not all(col_is_numeric) and any(col_is_numeric): # have mixed type values
                numeric_mask = col_data.apply(is_numeric)
                df[col+'_str'] = df[col].copy()
                df.loc[~numeric_mask, col] = np.nan
                df.loc[numeric_mask, col+'_str'] = np.nan

        out = [smart_dummify_impute(df[col]) for col in tqdm(df.columns)]
        time_invariant_features = pd.concat(out, axis=1)
        feature_names_all = time_invariant_features.columns.values
        sdf = time_invariant_features.astype(pd.SparseDtype(float, fill_value=0))
        s_ = sparse.COO(sdf.sparse.to_coo())

    print()
    print('Output')
    print('s_all, binary features    :\t', s_.shape)
    return s_, feature_names_all, dict(discretization_bins)

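# Illustration of the idea behind smart_qcut/dummify above (not the helpers' exact
# behaviour): numeric columns are binned into up to 5 quantiles and then one-hot encoded.
#
#     col = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], name='HR')
#     binned = pd.qcut(col, q=5, duplicates='drop')
#     dummies = pd.get_dummies(binned, prefix='HR', prefix_sep=':')
#
# yields binary columns such as 'HR:(0.999, 2.8]', ..., 'HR:(8.2, 10.0]'.
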
##########

def map_time_series_features(df_time_series, dtypes, args):
    N, L = args.N, args.L

    df_time_series = df_time_series.dropna(axis='columns', how='all').sort_index()

    print('Discretizing features...', flush=True)
    ts_mask = select_dtype(df_time_series, 'mask', dtypes)
    ts_mixed = select_dtype(df_time_series, '~mask', dtypes)
    assert len(ts_mixed.columns) + len(ts_mask.columns) == len(df_time_series.columns)
    ts_feature_mask = ts_mask.astype(int)
    ts_mixed_cols = [ts_mixed[col] for col in ts_mixed.columns]

    print()
    if args.binarize:
        dtype = int
        print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')

        print('    Binning numeric variables by quintile...')
        print('    Converting variables to binary features')
        if args.parallel:
#             out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
#                 delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols
#             )
#             out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]

            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            out_0 = pool.map(smart_qcut, ts_mixed_cols)
            cols_data, discretization_bins = zip(*out_0)
            out = pool.map(dummify, cols_data)
            pool.close()
            pool.join()
        else:
            # Mirror the parallel branch so that discretization_bins is also defined here
            out_0 = [smart_qcut(col_data) for col_data in tqdm(ts_mixed_cols)]
            cols_data, discretization_bins = zip(*out_0)
            out = [dummify(col_data) for col_data in cols_data]
    else:
        raise NotImplementedError
        dtype = float
        df = ts_mixed.copy()

        # Split a mixed column into numeric and string columns
        for col in df.columns:
            col_data = df[col]
            col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
            if not all(col_is_numeric) and any(col_is_numeric): # have mixed type values
                numeric_mask = col_data.apply(is_numeric)
                df[col+'_str'] = df[col].copy()
                df.loc[~numeric_mask, col] = np.nan
                df.loc[numeric_mask, col+'_str'] = np.nan

        ts_mixed_cols = [df[col] for col in df.columns]

        print('Discretizing categorical features...')
        if args.parallel:
#             out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables?
#                 delayed(smart_dummify_impute)(col_data) for col_data in ts_mixed_cols
#             )
            out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)]
        else:
            out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)]

    out = [ts_feature_mask, *out]
    D_all = sum(len(df_i.columns) for df_i in out)
    X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], []))
    X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(dtype)
    X_all = sparse.COO(X_dense)

    print('Finished discretizing features', flush=True)
    assert X_all.shape[0] == N * L
    X_all = X_all.reshape((N, L, D_all))

    print()
    print('Output')
    print('X_all: shape={}, density={:.3f}'.format(X_all.shape, X_all.density))
    return X_all, X_all_feature_names, dict(discretization_bins)


def print_metadata(df_time_series, variables, variables_num_freq, N, L, data_path):
    # Print metadata about missing/imputed entries (takes its inputs explicitly)
    print('DONE: Transforming each example...')
    ## Freq: Count missing entries using mask
    ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]]
    ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns]
    print('(freq) number of missing entries :\t',
          '{} out of {}={} total'.format(
              (1-ts_mask).astype(int).sum().sum(),
              '\u00D7'.join(str(i) for i in [N,L,ts_mask.shape[1]]), ts_mask.size))

    ## Freq: Count imputed entries using mask and dt
    ts_delta_time = df_time_series[[col for col in df_time_series if col.endswith('_delta_time')]]
    ts_delta_time.columns = [col.replace('_delta_time', '') for col in ts_delta_time.columns]

    imputed = (1-ts_mask).astype(bool) & (ts_delta_time > 0)
    print('(freq) number of imputed entries :\t',
          '{} out of {} total'.format(imputed.sum().sum(), ts_delta_time.size))
    imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_imputed.csv')

    not_imputed = (1-ts_mask).astype(bool) & (ts_delta_time == 0)
    print('(freq) number of not imputed entries :\t',
          '{} out of {} total'.format(not_imputed.sum().sum(), ts_delta_time.size))
    not_imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_not_imputed.csv')

    ## Non-Freq: Count missing entries
    non_freq_cols = sorted([c + '_value' for c in set(variables) - set(variables_num_freq)])
    non_freqs = df_time_series[non_freq_cols]
    print('(non-freq) number of missing entries :\t',
          '{} out of {}={} total'.format(
              non_freqs.isna().sum().sum(),
              '\u00D7'.join(str(i) for i in [N,L,non_freqs.shape[1]]), non_freqs.size))


def post_filter_time_series(X_all, feature_names_all, threshold, args):
    N, L = args.N, args.L
    assert X_all.shape[0] == N
    assert X_all.shape[1] == L
#     assert X_all.dtype == int
    start_time = time.time()

    X0 = X_all
    feature_names_0 = feature_names_all
    print('Original :', len(feature_names_0))

    ## Remove nearly-constant features (with low variance)
    sel_const = FrequencyThreshold_temporal(threshold=threshold, L=L)
    sel_const.fit(X0.reshape((N*L, -1)))
    m_ts_const = sel_const.get_support()
    assert len(m_ts_const) == X0.shape[-1]
    X1 = X0[:, :, m_ts_const]
    feature_names_1 = feature_names_0[m_ts_const]
    print('Nearly-constant:', len(feature_names_0) - len(feature_names_1))
    print('*** time: ', time.time() - start_time)

    ## Keep only first of pairwise perfectly correlated features
    sel_ts_corr = CorrelationSelector()
    sel_ts_corr.fit(X1.reshape((N*L, -1)))
    m_ts_corr = sel_ts_corr.get_support()
    assert len(m_ts_corr) == X1.shape[-1]
    X2 = X1[:, :, m_ts_corr]
    feature_names_2 = feature_names_1[m_ts_corr]
    feature_aliases = sel_ts_corr.get_feature_aliases(feature_names_1)
    print('Correlated     :', len(feature_names_1) - len(feature_names_2))
    print('*** time: ', time.time() - start_time)

    X = sparse.COO(X2)
    feature_names = feature_names_2
    assert X.shape == (N, L, len(feature_names))

    ## Save output
    print()
    print('Output')
    print('X: shape={}, density={:.3f}'.format(X.shape, X.density))

    return X, feature_names, feature_aliases


def pre_filter(df, threshold, df_population, args):
    T = int(args.T)
    theta_1 = args.theta_1
    df_population = args.df_population

    # Remove rows not in population
    print('Remove rows not in population')
    df = df[df[ID_col].isin(df_population.index)]

    # Remove rows with t outside of [0, T)
    print('Remove rows with t outside of [0, {})'.format(T))
    df = df[pd.isnull(df[t_col]) | ((0 <= df[t_col]) & (df[t_col] < T))]

    # Data tables should not contain duplicate rows
    # Check for inconsistencies
    dups = df.duplicated(subset=[ID_col, t_col, var_col], keep=False)
    if any(dups):
        print(df[dups].head())
        raise Exception('Inconsistent values recorded')

    # Remove variables that occur too rarely as defined by the threshold
    print('Remove rare variables (<= {})'.format(threshold))

    ## Calculate overall occurrence rate of each variable based on IDs
    df_count = calculate_variable_counts(df, df_population) # (N x |var|) table of counts
    df_bool = df_count.astype(bool) # convert counts to boolean

    ## Keep variables that are recorded for more than threshold fraction of IDs
    variables_keep = df_bool.columns[df_bool.mean(axis=0) > threshold]
    df_out = df[df[var_col].isin(variables_keep)]
    assert set(variables_keep) == set(df_out[var_col].unique())

    variables = sorted(df_bool.columns)
    variables_remove = sorted(set(variables) - set(variables_keep))
    print('Total variables     :', len(variables))
    print('Rare variables      :', len(variables_remove))
    print('Remaining variables :', len(variables_keep))
    print('# rows (original)   :', len(df))
    print('# rows (filtered)   :', len(df_out))
    return df_out

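# Illustrative: with threshold = 0.001 and 10,000 IDs in the population, a variable must
# be recorded for more than 10 distinct IDs (> 0.1% of examples) to survive the pre-filter.
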
######
# Time-invariant routines
######
def transform_time_invariant_table(df_in, df_population):
    df_in = df_in.copy()

    # Recorded Value (np.nan if not recorded)
    df_value = pd.pivot_table(df_in, val_col, ID_col, var_col, 'last', np.nan)
    df_value = df_value.reindex(index=df_population.index, fill_value=np.nan)
    df_value.columns = [str(col) + '_value' for col in df_value.columns]

    print('(N \u00D7 ^d) table            :\t', df_value.shape)
    print('number of missing entries :\t', '{} out of {} total'.format(df_value.isna().sum().sum(), df_value.size))
    return df_value

def smart_qcut_dummify_5(x):
    return smart_qcut_dummify(x, q=5)


def post_filter(s_, s_feature_names_all, threshold):
    # Filter features (optional)
    assert s_.shape[1] == len(s_feature_names_all)
    feature_names_0 = s_feature_names_all
    s0 = s_.to_scipy_sparse()
    print('Original       :', len(feature_names_0))

    ## Remove nearly-constant features (with low variance)
    ## a binary feature is removed if it is constant (all 0 or all 1) for more than a
    ## (1 - threshold) fraction of examples, i.e., its variance is <= threshold * (1 - threshold)
    sel_rare = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    s1 = sel_rare.fit_transform(s0)
    feature_names_1 = feature_names_0[sel_rare.get_support()]
    print('Nearly-constant:', len(feature_names_0) - len(feature_names_1))

    ## Keep only first of pairwise perfectly correlated features
    sel_corr = CorrelationSelector()
    s2 = sel_corr.fit_transform(s1)
    feature_names_2 = feature_names_1[sel_corr.get_support()]
    feature_aliases = sel_corr.get_feature_aliases(feature_names_1)
    print('Correlated     :', len(feature_names_1) - len(feature_names_2))

    s = sparse.COO(s2)
    feature_names = feature_names_2
    assert s.shape[1] == len(feature_names)

    return s, feature_names, feature_aliases

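# Worked example (illustrative): with threshold = 0.05, a binary feature that equals 1 for
# at most 5% (or at least 95%) of examples has Bernoulli variance p*(1-p) <= 0.05*0.95
# = 0.0475, so the VarianceThreshold above removes it.
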
def process_time_invariant(df_data_time_invariant, args):
    data_path = args.data_path
    df_population = args.df_population
    theta_2 = args.theta_2

    print_header('2-A) Transform time-invariant data', char='-')
    dir_path = data_path + '/'
    start_time = time.time()

    ## Create Nxd^ table
    df_time_invariant = transform_time_invariant_table(df_data_time_invariant, df_population)
    print('Time elapsed: %f seconds' % (time.time() - start_time))

    ## Discretize
    s_all, s_all_feature_names, s_discretization = map_time_invariant_features(df_time_invariant, args.binarize)
    sparse.save_npz(dir_path + 's_all.npz', s_all)
    with open(dir_path + 's_all.feature_names.json', 'w') as f:
        json.dump(list(s_all_feature_names), f, sort_keys=True)
    print('Time elapsed: %f seconds' % (time.time() - start_time))
    with open(dir_path + 'discretization.json', 'w') as f:
        json.dump(s_discretization, f)

    print_header('3-A) Post-filter time-invariant data', char='-')

    ## Filter
    s, s_feature_names, s_feature_aliases = post_filter(s_all, s_all_feature_names, theta_2)
    print('Time elapsed: %f seconds' % (time.time() - start_time))

    ## Save output
    print()
    print('Output')
    print('s: shape={}, density={:.3f}'.format(s.shape, s.density))
    sparse.save_npz(dir_path + 's.npz', s)

    with open(dir_path + 's.feature_names.json', 'w') as f:
        json.dump(list(s_feature_names), f, sort_keys=True)
    with open(dir_path + 's.feature_aliases.json', 'w') as f:
        json.dump(s_feature_aliases, f, sort_keys=True)

    print('Total time: %f seconds' % (time.time() - start_time))
    print('', flush=True)
    return s, s_feature_names, s_feature_aliases