# preprocessing/3-vitals.py
#
# FIDDLE vitals preprocessing script:
#   1) loads the raw vitals table and a population window file,
#   2) transforms it into an N x L x D time-series table (FIDDLE_steps_2),
#   3) discretizes each variable with pre-computed bins, one-hot encodes,
#   4) saves the result as a sparse feature matrix (X_all.npz) plus row IDs.
from FIDDLE_config import *
import pickle
import pandas as pd
import numpy as np
import time
import os

import argparse
import json

# NOTE(review): joblib was originally imported *after* its first use
# (joblib.dump below) — hoisted here so the script runs top-to-bottom.
import joblib
import sparse

from FIDDLE_helpers import str2bool

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--T',               type=float,   required=True)
    parser.add_argument('--dt',              type=float,   required=True)
    parser.add_argument('--theta_1',         type=float,   default=0.001)
    parser.add_argument('--theta_2',         type=float,   default=0.001)
    parser.add_argument('--theta_freq',      type=float,   default=1.0)
    parser.add_argument('--stats_functions', nargs='+',    default=['min', 'max', 'mean'])
    parser.add_argument('--binarize',        type=str2bool, default=True, nargs='?', const=True)

    parser.add_argument('--data_path',       type=str,     required=True)
    parser.add_argument('--input_fname',     type=str,     required=False)
    parser.add_argument('--population',      type=str,     required=True)
    parser.add_argument('--N',               type=int,     required=False)
    parser.add_argument('--Ds',              nargs='+',    type=int)

    parser.add_argument('--no_prefilter',    dest='prefilter',  action='store_false')
    parser.add_argument('--no_postfilter',   dest='postfilter', action='store_false')
    parser.set_defaults(prefilter=True, postfilter=True)

    # NOTE(review): arguments are hard-coded (parse_args is given an explicit
    # list), so command-line arguments are ignored — presumably intentional
    # for the sample pipeline; confirm before reusing this script generally.
    args = parser.parse_args([
        "--data_path=./sample_output/out_vitals/",
        "--input_fname=./sample_input/vitals.csv",
        "--population=./sample_input/windows.csv",
        "--T=240", "--dt=240",
        "--no_prefilter", "--no_postfilter", "--theta_freq=1",
        "--stats_functions", 'min', 'max', 'mean',
    ])
    args.variables = sorted(pd.read_csv('metadata/vitals/value_types.csv')['variable_name'])
    args.variables_num_freq = ['respiratoryrate', 'heartrate', 'temperature', 'sbp', 'dbp', 'spo2']

    #########
    data_path = args.data_path
    if not data_path.endswith('/'):
        data_path += '/'

    population = args.population
    T = int(args.T)
    dt = args.dt
    theta_1 = args.theta_1
    theta_2 = args.theta_2
    theta_freq = args.theta_freq
    stats_functions = args.stats_functions
    binarize = args.binarize

    df_population = pd.read_csv(population).set_index('ID')
    # If --N is not given, use the full population; iloc[:None] keeps all rows.
    N = args.N or len(df_population)
    df_population = df_population.iloc[:args.N]
    L = int(np.floor(T/dt))  # number of time bins per window

    args.df_population = df_population
    args.N = N
    args.L = L
    args.parallel = parallel  # `parallel` is provided by FIDDLE_config

    if args.input_fname and os.path.isfile(args.input_fname):
        input_fname = args.input_fname
        # BUG FIX: original used endswith('.p' or '.pickle'), which evaluates
        # to endswith('.p') and silently never matched '.pickle' files.
        if input_fname.endswith(('.p', '.pickle')):
            df_data = pd.read_pickle(input_fname)
        elif input_fname.endswith('.csv'):
            df_data = pd.read_csv(input_fname)
        else:
            # Was `assert False` — asserts are stripped under -O; raise instead.
            raise ValueError('Unsupported input file extension: {}'.format(input_fname))
    else:
        raise NotImplementedError

    # Rows with a missing ID cannot be assigned to a population window.
    if df_data['ID'].isnull().sum():
        print('Some IDs are NULL')
        df_data = df_data.dropna(subset=['ID'], axis=0)

    ##########
    # Imported here (after config/data setup) as in the original script.
    from FIDDLE_steps_2 import *

    print('Input data file:', input_fname)
    print()
    print('Input arguments:')
    print('    {:<6} = {}'.format('T', T))
    print('    {:<6} = {}'.format('dt', dt))
    print('    {:<6} = {}'.format('\u03B8\u2081', theta_1))
    print('    {:<6} = {}'.format('\u03B8\u2082', theta_2))
    print('    {:<6} = {}'.format('\u03B8_freq', theta_freq))
    print('    {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
    print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
    print()
    print('N = {}'.format(N))
    print('L = {}'.format(L))
    print('', flush=True)

    #######
    # Keep only variables present in the metadata value-types table.
    df_data = df_data[df_data['variable_name'].isin(args.variables)]
    df_time_series = df_data

    print_header('2-B) Transform time-dependent data', char='-')
    dir_path = data_path + '/'
    start_time = time.time()

    # Create NxLxD^ table
    df_time_series, dtypes_time_series = transform_time_series_table(df_time_series, args)
    print('Time elapsed: %f seconds' % (time.time() - start_time), flush=True)

    ##############
    joblib.dump(dtypes_time_series, args.data_path + 'dtypes_time_series.joblib')
    joblib.dump(df_time_series, args.data_path + 'df_time_series.joblib')
    ##############

    ### Vitals
    df_time_series = joblib.load('sample_output/out_vitals/df_time_series.joblib')
    # BUG FIX: json.load(open(...)) leaked file handles; use context managers.
    with open('metadata/vitals/discretization.json', 'r') as f:
        discretization_bins = json.load(f)
    with open('metadata/vitals/X_all.feature_names.json', 'r') as f:
        feature_names = json.load(f)

    # Restrict to the columns that have pre-computed discretization bins.
    cols_useful = list(discretization_bins.keys())
    df_time_series = df_time_series[cols_useful]

    from FIDDLE_steps_2 import *
    from FIDDLE_helpers import *

    df = df_time_series

    # Discretize each column with its pre-computed bins, then one-hot encode,
    # fanned out across all CPU cores. try/finally guarantees the pool is
    # cleaned up even if a worker raises.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        out_0 = list(tqdm(pool.imap_unordered(
            smart_qcut_bins,
            [(df[col], discretization_bins[col]) for col in df.columns]), total=len(df.columns)
        ))
        cols_data, dis_bins = zip(*out_0)
        out = list(tqdm(pool.imap_unordered(dummify, cols_data), total=len(df.columns)))
    finally:
        pool.close()
        pool.join()

    df_features = pd.concat(out, axis=1).sort_index(axis=1)

    # Drop any values='missing' for now
    df_features = df_features.loc[:, [col for col in df_features.columns if 'missing' not in col]]

    # Align to the canonical feature-name order, zero-filling absent columns.
    df_features = df_features.reindex(columns=feature_names, fill_value=0)
    sdf = df_features.astype(pd.SparseDtype(int, fill_value=0))
    X_all = sparse.COO(sdf.sparse.to_coo())
    sparse.save_npz('sample_output/out_vitals/X_all.npz', X_all)
    # Save row IDs only (empty column selection keeps just the index).
    df_time_series[[]].to_csv('sample_output/out_vitals/X_all.IDs.csv')