Diff of /preprocessing/2-labs.py [000000] .. [0ae801]

Switch to unified view

a b/preprocessing/2-labs.py
1
from FIDDLE_config import *

import argparse
import os
import pickle
import time

import joblib
import numpy as np
import pandas as pd

from FIDDLE_helpers import str2bool
10
11
if __name__ == '__main__':

    # Command-line interface for the labs preprocessing step.
    parser = argparse.ArgumentParser(description='')

    # (flag, keyword-arguments) specs, in the original declaration order.
    _arg_specs = [
        ('--T',               dict(type=float, required=True)),
        ('--dt',              dict(type=float, required=True)),
        ('--theta_1',         dict(type=float, default=0.001)),
        ('--theta_2',         dict(type=float, default=0.001)),
        ('--theta_freq',      dict(type=float, default=1.0)),
        ('--stats_functions', dict(nargs='+', default=['min', 'max', 'mean'])),
        ('--binarize',        dict(type=str2bool, default=True, nargs='?', const=True)),
        ('--data_path',       dict(type=str, required=True)),
        ('--input_fname',     dict(type=str, required=False)),
        ('--population',      dict(type=str, required=True)),
        ('--N',               dict(type=int, required=False)),
        ('--Ds',              dict(nargs='+', type=int)),
        ('--no_prefilter',    dict(dest='prefilter', action='store_false')),
        ('--no_postfilter',   dict(dest='postfilter', action='store_false')),
    ]
    for _flag, _kwargs in _arg_specs:
        parser.add_argument(_flag, **_kwargs)
    parser.set_defaults(prefilter=True, postfilter=True)

    # NOTE: arguments are hard-coded here rather than taken from sys.argv —
    # this script is wired to the bundled sample data.
    args = parser.parse_args([
        "--data_path=./sample_output/out_labs/", 
        "--input_fname=./sample_input/labs.csv", 
        "--population=./sample_input/windows.csv", 
        "--T=240", "--dt=240", 
        "--no_prefilter", "--no_postfilter", "--theta_freq=1",
        "--stats_functions", 'min', 'max', 'mean',
    ])

    # All lab variable names, sorted; no variables are treated as
    # frequent-numeric in this step.
    args.variables = sorted(pd.read_csv('metadata/labs/value_types.csv')['variable_name'])
    args.variables_num_freq = []
42
43
    #########
    # Unpack CLI arguments into local configuration variables.
    data_path = args.data_path
    if not data_path.endswith('/'):
        data_path += '/'

    population = args.population
    T = int(args.T)
    dt = args.dt
    theta_1 = args.theta_1
    theta_2 = args.theta_2
    theta_freq = args.theta_freq
    stats_functions = args.stats_functions
    binarize = args.binarize

    df_population = pd.read_csv(population).set_index('ID')
    N = args.N or len(df_population)             # args.N may be None -> use full population
    df_population = df_population.iloc[:args.N]  # iloc[:None] keeps all rows
    L = int(np.floor(T/dt))                      # number of time bins per window

    # Stash derived values back on `args` for downstream FIDDLE steps.
    args.df_population = df_population
    args.N = N
    args.L = L
    args.parallel = parallel  # `parallel` presumably comes from FIDDLE_config's star import — TODO confirm

    if args.input_fname and os.path.isfile(args.input_fname):
        input_fname = args.input_fname
        # BUG FIX: the original used endswith('.p' or '.pickle'), which
        # evaluates to endswith('.p') and silently never matched '.pickle'.
        # str.endswith accepts a tuple of suffixes.
        if input_fname.endswith(('.p', '.pickle')):
            df_data = pd.read_pickle(input_fname)
        elif input_fname.endswith('.csv'):
            df_data = pd.read_csv(input_fname)
        else:
            # Raise instead of `assert False`: asserts are stripped under -O.
            raise ValueError('Unsupported input file extension: {}'.format(input_fname))
    else:
        raise NotImplementedError
77
78
    ##########
    from FIDDLE_steps_2 import *

    # Echo the run configuration.
    print('Input data file:', input_fname)
    print()
    print('Input arguments:')
    print('    {:<6} = {}'.format('T', T))
    print('    {:<6} = {}'.format('dt', dt))
    print('    {:<6} = {}'.format('\u03B8\u2081', theta_1))
    print('    {:<6} = {}'.format('\u03B8\u2082', theta_2))
    print('    {:<6} = {}'.format('\u03B8_freq', theta_freq))
    print('    {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
    print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
    print()
    print('N = {}'.format(N))
    print('L = {}'.format(L))
    print('', flush=True)

    # Restrict to the study population and keep each (ID, variable)'s most
    # recent recording — labs are treated as time-invariant in this step.
    df_data = df_data[df_data['ID'].isin(df_population.index)]
    df_data = df_data.sort_values(by=['ID', 't'])
    df_data_time_invariant = df_data.drop_duplicates(subset=['ID', 'variable_name'], keep='last') # Keep most recent recording
    df_data_time_invariant = df_data_time_invariant.drop(columns=['t'])

    dir_path = data_path + '/'  # NOTE(review): unused below and doubles the trailing slash
    start_time = time.time()

    ## Create NxLxD^ table: one row per ID, one column per variable,
    ## reindexed to the full variable list (missing variables become NaN).
    df_time_invariant = df_data_time_invariant.set_index(['ID', 'variable_name']).unstack()
    print('Done unstacking', flush=True)
    df_time_invariant.columns = df_time_invariant.columns.droplevel(0)
    df_time_invariant = df_time_invariant.reindex(columns=args.variables, fill_value=np.nan)
    df_time_invariant = df_time_invariant.add_suffix('_value')
    print('Done reindexing', flush=True)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)

    # BUG FIX: joblib was first imported *after* this dump in the original
    # script, so this line raised NameError unless a star import happened to
    # re-export it. Import it explicitly before use.
    import joblib
    sdf = df_time_invariant.astype('Sparse[object]')
    joblib.dump(sdf, args.data_path + 'sdf.joblib')
115
116
    #####
    # Second phase: discretize the saved table into binary indicator features.
    import joblib
    import pandas as pd
    import numpy as np
    import sparse
    import json

    # Reload the sparse table written above plus precomputed metadata:
    # per-variable bin edges and the canonical output feature ordering.
    df_time_series = joblib.load('sample_output/out_labs/sdf.joblib')
    # Use context managers so the metadata file handles are always closed
    # (the original leaked them via json.load(open(...))).
    with open('metadata/labs/discretization.json', 'r') as f:
        discretization_bins = json.load(f)
    with open('metadata/labs/X_all.feature_names.json', 'r') as f:
        feature_names = json.load(f)

    # Keep only the variables that have discretization bins defined.
    cols_useful = list(discretization_bins.keys())
    df_time_series = df_time_series[cols_useful]
    ddf = df_time_series.sparse.to_dense()

    # smart_qcut_bins / dummify / multiprocessing / tqdm are presumably
    # provided by these star imports — TODO confirm.
    from FIDDLE_steps_2 import *
    from FIDDLE_helpers import *

    df = ddf

    # Discretize every column into its precomputed bins, then one-hot encode.
    # imap_unordered may yield columns in any order; each task carries its own
    # (column, bins) pair so pairing stays correct, and sort_index(axis=1)
    # below restores a deterministic column order.
    # The `with` block guarantees worker cleanup even if a task raises
    # (the original never released the pool on error).
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        out_0 = list(tqdm(pool.imap_unordered(
            smart_qcut_bins,
            [(df[col], discretization_bins[col]) for col in df.columns]), total=len(df.columns)
        ))
        cols_data, dis_bins = zip(*out_0)  # dis_bins is unused below
        out = list(tqdm(pool.imap_unordered(dummify, cols_data), total=len(df.columns)
        ))
        pool.close()
        pool.join()

    df_features = pd.concat(out, axis=1).sort_index(axis=1)

    # Drop any values='missing' for now
    df_features = df_features.loc[:, [col for col in df_features.columns if 'missing' not in col]]

    # Align to the canonical feature set (absent features become all-zero
    # columns) and store as a COO sparse tensor plus the row-ID index.
    df_features = df_features.reindex(columns=feature_names, fill_value=0)
    sdf = df_features.astype(pd.SparseDtype(int, fill_value=0))
    X_all = sparse.COO(sdf.sparse.to_coo())

    sparse.save_npz('sample_output/out_labs/X_all.npz', X_all)
    df_time_series[[]].to_csv('sample_output/out_labs/X_all.IDs.csv')