Diff of /preprocessing/1-demog.py [000000] .. [0ae801]

Switch to side-by-side view

--- a
+++ b/preprocessing/1-demog.py
@@ -0,0 +1,65 @@
+"""Build the static (demographic) feature matrix for the sample input.
+
+Reads ``sample_input/demog.csv``, runs each column that has stored bins
+through the FIDDLE helpers ``smart_qcut_bins`` and ``dummify``, aligns the
+resulting columns to the stored feature-name list, and writes a sparse
+array, an ID list and a dense CSV under ``sample_output/out_demog/``.
+"""
+import json
+import os
+import sparse
+import pandas as pd
+import numpy as np
+import joblib
+
+if __name__ == '__main__':
+
+    df_static = pd.read_csv('sample_input/demog.csv').set_index('hosp_id')
+    # Context managers close the metadata files instead of leaking handles.
+    with open('metadata/demog/discretization.json', 'r') as f:
+        discretization_bins = json.load(f)
+    with open('metadata/demog/s_all.feature_names.json', 'r') as f:
+        feature_names = json.load(f)
+
+    # Keep only the columns that have an entry in the discretization metadata.
+    df_static = df_static[list(discretization_bins)]
+
+    ##### FIDDLE
+    # NOTE(review): smart_qcut_bins, dummify and tqdm come from these wildcard
+    # imports; which module exports each is not visible here, so the imports
+    # stay as they were.
+    from FIDDLE_steps_2 import *
+    from FIDDLE_helpers import *
+
+    df_by_cols = [df_static[col] for col in df_static.columns]
+    out_0 = [smart_qcut_bins((x, discretization_bins[x.name])) for x in tqdm(df_by_cols, desc='pd.qcut')]
+    # Each result is a 2-tuple; only the first element (the column data) is used.
+    cols_data, _ = zip(*out_0)
+    out = [dummify(z) for z in tqdm(cols_data, desc='dummify')]
+    df_features = pd.concat(out, axis=1).sort_index(axis=1)
+
+    # Drop any values='missing'
+    df_features = df_features.loc[:, [col for col in df_features.columns if 'missing' not in col]]
+
+    # Make sure same features. An explicit raise (rather than assert) keeps the
+    # check active under `python -O`; otherwise the reindex below would silently
+    # drop the unexpected columns.
+    unexpected = set(df_features.columns) - set(feature_names)
+    if unexpected:
+        raise ValueError('Features not in feature_names: {}'.format(sorted(map(str, unexpected))))
+
+    # Align to the stored feature order; features absent from this input become 0.
+    df_features = df_features.reindex(columns=feature_names, fill_value=0)
+    sdf = df_features.astype(pd.SparseDtype(int, fill_value=0))
+    X_all = sparse.COO(sdf.sparse.to_coo())
+
+    out_dir = 'sample_output/out_demog'
+    os.makedirs(out_dir, exist_ok=True)
+    sparse.save_npz(out_dir + '/X_all.npz', X_all)
+    # Selecting no columns writes just the index (the IDs) to the CSV.
+    df_features[[]].to_csv(out_dir + '/X_all.IDs.csv')
+
+    # Reload the saved array and write it out densely with labelled rows/columns.
+    s = sparse.load_npz(out_dir + '/X_all.npz').todense()
+    df_s = pd.DataFrame(s, columns=feature_names, index=df_static.index)
+    df_s.to_csv(out_dir + '/static-features.csv')