preprocessing/2-labs.py
from FIDDLE_config import *
import pickle
import pandas as pd
import numpy as np
import time
import os

import argparse
import joblib
from FIDDLE_helpers import str2bool

if __name__ == '__main__':
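
    # Hyperparameters of the FIDDLE feature-extraction pipeline: T is the
    # length of the observation window, dt the bin width (so L = T/dt bins),
    # theta_1 / theta_2 / theta_freq are the filtering and frequency
    # thresholds, and stats_functions are the per-bin summary statistics.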
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--T', type=float, required=True)
    parser.add_argument('--dt', type=float, required=True)
    parser.add_argument('--theta_1', type=float, default=0.001)
    parser.add_argument('--theta_2', type=float, default=0.001)
    parser.add_argument('--theta_freq', type=float, default=1.0)
    parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
    parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True)

    parser.add_argument('--data_path', type=str, required=True)
    parser.add_argument('--input_fname', type=str, required=False)
    parser.add_argument('--population', type=str, required=True)
    parser.add_argument('--N', type=int, required=False)
    parser.add_argument('--Ds', nargs='+', type=int)

    parser.add_argument('--no_prefilter', dest='prefilter', action='store_false')
    parser.add_argument('--no_postfilter', dest='postfilter', action='store_false')
    parser.set_defaults(prefilter=True, postfilter=True)

    # Arguments are hard-coded here so the script runs as-is on the sample
    # data; pass sys.argv instead to use it from the command line.
    args = parser.parse_args([
        "--data_path=./sample_output/out_labs/",
        "--input_fname=./sample_input/labs.csv",
        "--population=./sample_input/windows.csv",
        "--T=240", "--dt=240",
        "--no_prefilter", "--no_postfilter", "--theta_freq=1",
        "--stats_functions", 'min', 'max', 'mean',
    ])

    # The set of lab variables to featurize comes from precomputed metadata.
    args.variables = sorted(pd.read_csv('metadata/labs/value_types.csv')['variable_name'])
    args.variables_num_freq = []

    #########
    data_path = args.data_path
    if not data_path.endswith('/'):
        data_path += '/'

    population = args.population
    T = int(args.T)
    dt = args.dt
    theta_1 = args.theta_1
    theta_2 = args.theta_2
    theta_freq = args.theta_freq
    stats_functions = args.stats_functions
    binarize = args.binarize

    df_population = pd.read_csv(population).set_index('ID')
    N = args.N or len(df_population)
    df_population = df_population.iloc[:args.N]
    L = int(np.floor(T / dt))  # number of time bins

    args.df_population = df_population
    args.N = N
    args.L = L
    args.parallel = parallel  # `parallel` is expected from FIDDLE_config's wildcard import
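
    # Load the raw event table (with at least 'ID', 't', and 'variable_name'
    # columns); both pickled DataFrames and CSV files are accepted.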
    if args.input_fname and os.path.isfile(args.input_fname):
        input_fname = args.input_fname
        if input_fname.endswith(('.p', '.pickle')):
            df_data = pd.read_pickle(input_fname)
        elif input_fname.endswith('.csv'):
            df_data = pd.read_csv(input_fname)
        else:
            raise ValueError('Unrecognized input file extension: ' + input_fname)
    else:
        raise NotImplementedError

    ##########
    from FIDDLE_steps_2 import *

    print('Input data file:', input_fname)
    print()
    print('Input arguments:')
    print('    {:<6} = {}'.format('T', T))
    print('    {:<6} = {}'.format('dt', dt))
    print('    {:<6} = {}'.format('\u03B8\u2081', theta_1))
    print('    {:<6} = {}'.format('\u03B8\u2082', theta_2))
    print('    {:<6} = {}'.format('\u03B8_freq', theta_freq))
    print('    {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
    print('    {:<6} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
    print()
    print('N = {}'.format(N))
    print('L = {}'.format(L))
    print('', flush=True)
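
    # Restrict events to the cohort and sort them; labs are treated here as
    # time-invariant, so only the latest recording per (ID, variable) is kept.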
    df_data = df_data[df_data['ID'].isin(df_population.index)]
    df_data = df_data.sort_values(by=['ID', 't'])
    df_data_time_invariant = df_data.drop_duplicates(subset=['ID', 'variable_name'], keep='last')  # keep most recent recording
    df_data_time_invariant = df_data_time_invariant.drop(columns=['t'])

    dir_path = data_path
    start_time = time.time()

    ## Create the N x D^ table of most recent values
    df_time_invariant = df_data_time_invariant.set_index(['ID', 'variable_name']).unstack()
    print('Done unstacking', flush=True)
    df_time_invariant.columns = df_time_invariant.columns.droplevel(0)
    df_time_invariant = df_time_invariant.reindex(columns=args.variables, fill_value=np.nan)
    df_time_invariant = df_time_invariant.add_suffix('_value')
    print('Done reindexing', flush=True)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)
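
    # Persist the wide table in a sparse representation; the second stage
    # below reloads it from disk before discretization.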
    sdf = df_time_invariant.astype('Sparse[object]')
    joblib.dump(sdf, args.data_path + 'sdf.joblib')

    #####
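    # Second stage: turn the saved table into a binary feature matrix using
    # precomputed discretization bins and a fixed feature-name list.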
    import sparse
    import json

    df_time_series = joblib.load('sample_output/out_labs/sdf.joblib')
    discretization_bins = json.load(open('metadata/labs/discretization.json', 'r'))
    feature_names = json.load(open('metadata/labs/X_all.feature_names.json', 'r'))

    # Keep only the columns that have precomputed discretization bins.
    cols_useful = list(discretization_bins.keys())
    df_time_series = df_time_series[cols_useful]
    ddf = df_time_series.sparse.to_dense()

    from FIDDLE_steps_2 import *
    from FIDDLE_helpers import *

    df = ddf

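    # Discretize each column into its precomputed bins (smart_qcut_bins), then
    # one-hot encode the binned values (dummify), mapping over columns in parallel.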
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    out_0 = list(tqdm(
        pool.imap_unordered(smart_qcut_bins, [(df[col], discretization_bins[col]) for col in df.columns]),
        total=len(df.columns),
    ))
    cols_data, dis_bins = zip(*out_0)
    out = list(tqdm(pool.imap_unordered(dummify, cols_data), total=len(df.columns)))
    pool.close()
    pool.join()

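    # Assemble the one-hot columns into a single feature matrix with a fixed schema.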
    df_features = pd.concat(out, axis=1).sort_index(axis=1)

    # Drop any values='missing' for now
    df_features = df_features.loc[:, [col for col in df_features.columns if 'missing' not in col]]

    df_features = df_features.reindex(columns=feature_names, fill_value=0)
    sdf = df_features.astype(pd.SparseDtype(int, fill_value=0))
    X_all = sparse.COO(sdf.sparse.to_coo())

    sparse.save_npz('sample_output/out_labs/X_all.npz', X_all)
    df_time_series[[]].to_csv('sample_output/out_labs/X_all.IDs.csv')  # row IDs only, aligned with X_all