Diff of /preprocessing/1-demog.py [000000] .. [0ae801]

Switch to unified view

a b/preprocessing/1-demog.py
1
import json
2
import sparse
3
import pandas as pd
4
import numpy as np
5
import joblib
6
7
if __name__ == '__main__':
    # Demographics table, indexed by `hosp_id`.
    df_static = pd.read_csv('sample_input/demog.csv').set_index('hosp_id')

    # Load metadata through context managers so the file handles are closed
    # deterministically instead of being left to the garbage collector.
    with open('metadata/demog/discretization.json', 'r') as f:
        discretization_bins = json.load(f)
    with open('metadata/demog/s_all.feature_names.json', 'r') as f:
        feature_names = json.load(f)

    # Keep only the columns for which discretization bins are defined.
    cols_useful = list(discretization_bins)
    df_static = df_static[cols_useful]

    ##### FIDDLE
    # The wildcard imports supply `smart_qcut_bins`, `dummify` and `tqdm`.
    # NOTE(review): which module provides which name is not visible from this
    # file, so the imports are kept as wildcards; they must stay at module
    # level because `import *` is not allowed inside a function.
    from FIDDLE_steps_2 import *
    from FIDDLE_helpers import *

    df_by_cols = [df_static[col] for col in df_static.columns]

    # Presumably bins each column using its precomputed edges and returns a
    # (column, bins) pair -- confirm against the FIDDLE helper's definition.
    out_0 = [
        smart_qcut_bins((x, discretization_bins[x.name]))
        for x in tqdm(df_by_cols, desc='pd.qcut')
    ]
    # Only the transformed columns are used below; the second element of each
    # pair is discarded.
    cols_data, _dis_bins = zip(*out_0)

    out = [dummify(z) for z in tqdm(cols_data, desc='dummify')]
    df_features = pd.concat(out, axis=1).sort_index(axis=1)

    # Drop any values='missing'
    df_features = df_features.loc[
        :, [col for col in df_features.columns if 'missing' not in col]
    ]

    # Make sure same features: every generated column must be a known feature.
    # An explicit raise is used instead of `assert` so the check still runs
    # under `python -O`; otherwise the reindex below would silently drop any
    # unexpected column.
    unexpected = set(df_features.columns) - set(feature_names)
    if unexpected:
        raise ValueError(
            'Generated features not present in s_all.feature_names.json: '
            + ', '.join(sorted(map(str, unexpected)))
        )

    # Align to the reference feature order; features that were not generated
    # for this input become all-zero columns.
    df_features = df_features.reindex(columns=feature_names, fill_value=0)
    sdf = df_features.astype(pd.SparseDtype(int, fill_value=0))
    X_all = sparse.COO(sdf.sparse.to_coo())

    sparse.save_npz('sample_output/out_demog/X_all.npz', X_all)
    # Selecting zero columns writes only the index (the IDs) to the CSV.
    df_features[[]].to_csv('sample_output/out_demog/X_all.IDs.csv')

    # Reload the array that was just written and export it as a dense,
    # labelled CSV. Reading it back from disk (rather than reusing `X_all`)
    # means the CSV reflects what was actually saved.
    s = sparse.load_npz('sample_output/out_demog/X_all.npz').todense()
    df_s = pd.DataFrame(s, columns=feature_names)
    df_s.index = df_static.index
    df_s.to_csv('sample_output/out_demog/static-features.csv')