|
a |
|
b/preprocessing/1-demog.py |
|
|
1 |
import json |
|
|
2 |
import sparse |
|
|
3 |
import pandas as pd |
|
|
4 |
import numpy as np |
|
|
5 |
import joblib |
|
|
6 |
|
|
|
7 |
if __name__ == '__main__':
    # Featurize the static demographics table using stored discretization
    # bins, then write the result as a sparse array, an ID list, and a dense
    # CSV under sample_output/out_demog/.

    # Raw table, indexed by `hosp_id`.
    df_static = pd.read_csv('sample_input/demog.csv').set_index('hosp_id')

    # Context managers close the metadata files deterministically instead of
    # leaving the handles open until garbage collection.
    with open('metadata/demog/discretization.json', 'r', encoding='utf-8') as f:
        discretization_bins = json.load(f)
    with open('metadata/demog/s_all.feature_names.json', 'r', encoding='utf-8') as f:
        feature_names = json.load(f)

    # Keep only the columns that have an entry in the discretization metadata.
    cols_useful = list(discretization_bins.keys())
    df_static = df_static[cols_useful]

    ##### FIDDLE
    # NOTE(review): smart_qcut_bins, dummify and tqdm are not defined in this
    # file; they are expected to come from these star imports -- confirm.
    from FIDDLE_steps_2 import *
    from FIDDLE_helpers import *

    # Process one column at a time; each column is paired with the bins
    # stored under its own name.
    df_by_cols = [df_static[col] for col in df_static.columns]
    out_0 = [
        smart_qcut_bins((x, discretization_bins[x.name]))
        for x in tqdm(df_by_cols, desc='pd.qcut')
    ]
    # Each result is a pair; only the first element (the column data) is used
    # below, the second is discarded.
    cols_data, _ = zip(*out_0)
    out = [dummify(z) for z in tqdm(cols_data, desc='dummify')]
    df_features = pd.concat(out, axis=1).sort_index(axis=1)

    # Drop any values='missing'
    df_features = df_features.loc[
        :, [col for col in df_features.columns if 'missing' not in col]
    ]

    # Make sure same features: every generated column must be one of the
    # expected feature names. An explicit raise (rather than `assert`) keeps
    # the check active under `python -O` and reports the offending columns.
    unexpected = set(df_features.columns) - set(feature_names)
    if unexpected:
        raise ValueError(
            'Generated features not present in feature_names: '
            + ', '.join(sorted(map(str, unexpected)))
        )

    # Align to the expected column order; expected features that were not
    # generated are added as all-zero columns.
    df_features = df_features.reindex(columns=feature_names, fill_value=0)
    sdf = df_features.astype(pd.SparseDtype(int, fill_value=0))
    X_all = sparse.COO(sdf.sparse.to_coo())

    sparse.save_npz('sample_output/out_demog/X_all.npz', X_all)
    # Selecting zero columns writes only the index (the IDs) to the CSV.
    df_features[[]].to_csv('sample_output/out_demog/X_all.IDs.csv')

    # Read the saved array back and export it as a dense, labelled CSV.
    s = sparse.load_npz('sample_output/out_demog/X_all.npz').todense()
    df_s = pd.DataFrame(s, columns=feature_names)
    df_s.index = df_static.index
    df_s.to_csv('sample_output/out_demog/static-features.csv')