preprocessing/2-labs.py

from FIDDLE_config import *
import pickle
import pandas as pd
import numpy as np
import time
import os
import argparse
import json
import joblib
import sparse
import multiprocessing
from tqdm import tqdm
from FIDDLE_helpers import str2bool
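# This script featurizes the sample lab data in two stages: (1) reduce the raw
# event table to each ID's most recent value per lab variable and cache it as a
# sparse frame (sdf.joblib); (2) discretize those values with pre-computed bins
# and one-hot encode them into the binary feature tensor X_all.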
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--T', type=float, required=True)
    parser.add_argument('--dt', type=float, required=True)
    parser.add_argument('--theta_1', type=float, default=0.001)
    parser.add_argument('--theta_2', type=float, default=0.001)
    parser.add_argument('--theta_freq', type=float, default=1.0)
    parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
    parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True)
    parser.add_argument('--data_path', type=str, required=True)
    parser.add_argument('--input_fname', type=str, required=False)
    parser.add_argument('--population', type=str, required=True)
    parser.add_argument('--N', type=int, required=False)
    parser.add_argument('--Ds', nargs='+', type=int)
    parser.add_argument('--no_prefilter', dest='prefilter', action='store_false')
    parser.add_argument('--no_postfilter', dest='postfilter', action='store_false')
    parser.set_defaults(prefilter=True, postfilter=True)
    args = parser.parse_args([
        "--data_path=./sample_output/out_labs/",
        "--input_fname=./sample_input/labs.csv",
        "--population=./sample_input/windows.csv",
        "--T=240", "--dt=240",
        "--no_prefilter", "--no_postfilter", "--theta_freq=1",
        "--stats_functions", 'min', 'max', 'mean',
    ])
    args.variables = sorted(pd.read_csv('metadata/labs/value_types.csv')['variable_name'])
    args.variables_num_freq = []
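    # NOTE: the argument list is hard-coded so the script runs as a demo on the
    # bundled sample data; remove the list passed to parse_args() to read the
    # arguments from the command line instead.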
    #########
    data_path = args.data_path
    if not data_path.endswith('/'):
        data_path += '/'
    population = args.population
    T = int(args.T)
    dt = args.dt
    theta_1 = args.theta_1
    theta_2 = args.theta_2
    theta_freq = args.theta_freq
    stats_functions = args.stats_functions
    binarize = args.binarize
    df_population = pd.read_csv(population).set_index('ID')
    N = args.N or len(df_population)
    df_population = df_population.iloc[:N]  # optionally restrict to the first N examples
    L = int(np.floor(T / dt))
    args.df_population = df_population
    args.N = N
    args.L = L
    args.parallel = parallel  # `parallel` comes from FIDDLE_config
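    # N is the number of examples in the population file and L = floor(T/dt) is
    # the number of time bins per example; here T = dt = 240, so L = 1 and each
    # ID contributes a single observation window.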
    if args.input_fname and os.path.isfile(args.input_fname):
        input_fname = args.input_fname
        if input_fname.endswith(('.p', '.pickle')):  # endswith takes a tuple of suffixes
            df_data = pd.read_pickle(input_fname)
        elif input_fname.endswith('.csv'):
            df_data = pd.read_csv(input_fname)
        else:
            assert False, 'Unsupported input format: ' + input_fname
    else:
        raise NotImplementedError
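    # The input table is expected in FIDDLE's long event format, one measurement
    # per row: an ID column matching the population file, a timestamp t, a
    # variable_name, and a variable_value.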
    ##########
    from FIDDLE_steps_2 import *
    print('Input data file:', input_fname)
    print()
    print('Input arguments:')
    print('    {:<6} = {}'.format('T', T))
    print('    {:<6} = {}'.format('dt', dt))
    print('    {:<6} = {}'.format('\u03B8\u2081', theta_1))
    print('    {:<6} = {}'.format('\u03B8\u2082', theta_2))
    print('    {:<6} = {}'.format('\u03B8_freq', theta_freq))
    print('    {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
    print('    {:<6} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
    print()
    print('N = {}'.format(N))
    print('L = {}'.format(L))
    print('', flush=True)
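    # In FIDDLE, theta_1 sets the pre-filter threshold (rarely recorded
    # variables are dropped), theta_2 the post-filter threshold for
    # uninformative features, and theta_freq separates "frequent" variables
    # (summarized by the k stats functions) from infrequent ones. Both filters
    # are disabled above via --no_prefilter / --no_postfilter.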
    df_data = df_data[df_data['ID'].isin(df_population.index)]
    df_data = df_data.sort_values(by=['ID', 't'])
    df_data_time_invariant = df_data.drop_duplicates(subset=['ID', 'variable_name'], keep='last')  # keep most recent recording
    df_data_time_invariant = df_data_time_invariant.drop(columns=['t'])
    dir_path = data_path + '/'
    start_time = time.time()

    ## Create the N x D table of most recent values (with dt = T there is a
    ## single time bin, so no L axis is needed)
    df_time_invariant = df_data_time_invariant.set_index(['ID', 'variable_name']).unstack()
    print('Done unstacking', flush=True)
    df_time_invariant.columns = df_time_invariant.columns.droplevel(0)
    df_time_invariant = df_time_invariant.reindex(columns=args.variables, fill_value=np.nan)
    df_time_invariant = df_time_invariant.add_suffix('_value')
    print('Done reindexing', flush=True)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)
    sdf = df_time_invariant.astype('Sparse[object]')
    joblib.dump(sdf, data_path + 'sdf.joblib')
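    # Stage 1 output: sdf.joblib holds a sparse N x D frame of each ID's most
    # recent value per lab variable (columns suffixed '_value'), with NaN where
    # a variable was never recorded. Stage 2 below turns this table into binary
    # features.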
    #####
    df_time_series = joblib.load('sample_output/out_labs/sdf.joblib')
    discretization_bins = json.load(open('metadata/labs/discretization.json', 'r'))
    feature_names = json.load(open('metadata/labs/X_all.feature_names.json', 'r'))
    cols_useful = list(discretization_bins.keys())
    df_time_series = df_time_series[cols_useful]  # keep only variables with known bins
    ddf = df_time_series.sparse.to_dense()
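    # discretization.json is assumed to map each '<variable>_value' column to
    # the pre-computed bin edges (or category labels) consumed by
    # smart_qcut_bins, so the same cut points can be reused across datasets.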
    from FIDDLE_steps_2 import *
    from FIDDLE_helpers import *
    df = ddf
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # Discretize each column according to its saved bins
    out_0 = list(tqdm(pool.imap_unordered(
        smart_qcut_bins,
        [(df[col], discretization_bins[col]) for col in df.columns]), total=len(df.columns)
    ))
    cols_data, dis_bins = zip(*out_0)
    # One-hot encode each discretized column
    out = list(tqdm(pool.imap_unordered(dummify, cols_data), total=len(df.columns)))
    pool.close()
    pool.join()
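    # imap_unordered yields results in completion order, which is safe here:
    # each returned Series keeps its column name, and the concat below sorts
    # columns by name.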
    df_features = pd.concat(out, axis=1).sort_index(axis=1)
    # Drop any value='missing' indicator columns for now
    df_features = df_features.loc[:, [col for col in df_features.columns if 'missing' not in col]]
    # Align columns to the saved feature list; features absent here become all-zero
    df_features = df_features.reindex(columns=feature_names, fill_value=0)
    sdf = df_features.astype(pd.SparseDtype(int, fill_value=0))
    X_all = sparse.COO(sdf.sparse.to_coo())
    sparse.save_npz('sample_output/out_labs/X_all.npz', X_all)
    df_time_series[[]].to_csv('sample_output/out_labs/X_all.IDs.csv')  # write only the ID index
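    # Final outputs: X_all.npz is an N x D_features COO matrix of 0/1 values,
    # and X_all.IDs.csv records the row order so the features can be joined
    # back to the population table.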