Diff of /lib/features_selection.py [000000] .. [793d90]

Switch to side-by-side view

--- a
+++ b/lib/features_selection.py
@@ -0,0 +1,52 @@
+# -*- coding:utf-8 -*-
+import pandas as pd
+from utils import avg_importance
+
+from sklearn.model_selection import StratifiedKFold
+
+import gcforest.data_load as load
+from gcforest.gcforest import GCForest
+
+import utils
+
+cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # seeded 5-fold CV
+# A single GCForest is constructed from the config returned by utils.load_json.
+config = utils.load_json("demo_ca.json")
+gc = GCForest(config)  # NOTE(review): this name shadows the stdlib `gc` module
+# Dataset names; the loop below dispatches on these exact strings.
+datasets = ['cirrhosis', 'obesity', 't2d']
+
+# Per dataset: accumulate GCForest feature importances over the CV folds
+# (via avg_importance) and write them, sorted descending, to output/<name>.
+loaders = {
+    'cirrhosis': load.cirrhosis_data,
+    'obesity': load.obesity_data,
+    't2d': load.t2d_data,
+}
+for name in datasets:
+    if name not in loaders:
+        raise Exception('the dataset is not defined!!!')
+    X, Y = loaders[name]()
+    n_features = X.shape[1]
+
+    # Explicit float dtype: a bare pd.Series() defaults to object in pandas >= 2
+    # (and warns in 1.x), which would change the accumulator's arithmetic.
+    output_features = pd.Series(dtype='float64')
+    for train, test in cv.split(X, Y):
+        # GCForest is fed 3-D input here: (samples, 1, features).
+        X_train = X.iloc[train].values.reshape(-1, 1, n_features)
+        X_test = X.iloc[test].values.reshape(-1, 1, n_features)
+        # Only the second return value is used; the first one is discarded.
+        _, fold_importance = gc.fit_transform(X_train, Y[train])
+        # NOTE(review): the probabilities are never used; the call is kept in
+        # case it has side effects (e.g. logging) -- confirm and drop if not.
+        gc.predict_proba(X_test)
+        output_features = avg_importance(output_features, fold_importance)
+
+    # The accumulated index is assumed to hold column positions -- map them
+    # back to the original column labels after sorting by importance.
+    output_features = output_features.sort_values(ascending=False)
+    positions = [int(i) for i in output_features.index]
+    output_features.index = X.columns[positions]
+    # NOTE: to_csv does not create directories; "output/" must already exist.
+    output_features.to_csv("output/" + name)