b/lib/utils.py
+# -*- coding:utf-8 -*-
+import math
+import numpy as np
+import tensorflow as tf
+from sklearn import metrics
+from sklearn.utils import shuffle
+import pymrmr
+from sklearn.svm import SVR
+from sklearn.feature_selection import RFE
+from skrebate import ReliefF
+from gcforest.gcforest import GCForest
+def load_json(path):
+    import json
+    """
+    """
+    lines = []
+    with open(path) as f:
+        for row in f.readlines():
+            if row.strip().startswith("//"):
+                continue
+            lines.append(row)
+    return json.loads("\n".join(lines))
+def consistency_index(sel1, sel2, num_features):
+    """ Compute the consistency index between two sets of features.
+    Parameters
+    ----------
+    sel1: set
+        First set of indices of selected features
+    sel2: set
+        Second set of indices of selected features
+    num_features: int
+        Total number of features
+    Returns
+    -------
+    cidx: float
+        Consistency index between the two sets.
+    Reference
+    ---------
+    Kuncheva, L.I. (2007). A Stability Index for Feature Selection.
+    AIAC, pp. 390--395.
+    """
+    observed = float(len(sel1.intersection(sel2)))
+    expected = len(sel1) * len(sel2) / float(num_features)
+    maxposbl = float(min(len(sel1), len(sel2)))
+    cidx = -1.
+    # It's 0 and not 1 as expected if num_features == len(sel1) == len(sel2) => observed = n
+    # Because "take everything" and "take nothing" are trivial solutions we don't want to select
+    if expected != maxposbl:
+        cidx = (observed - expected) / (maxposbl - expected)
+    return cidx
+def consistency_index_k(sel_list, num_features):
+    """ Compute the consistency index between more than 2 sets of features.
+    This is done by averaging over all pairwise consistency indices.
+    Parameters
+    ----------
+    sel_list: list of lists
+        List of k lists of indices of selected features
+    num_features: int
+        Total number of features
+    Returns
+    -------
+    cidx: float
+        Consistency index between the k sets.
+    Reference
+    ---------
+    Kuncheva, L.I. (2007). A Stability Index for Feature Selection.
+    AIAC, pp. 390--395.
+    """
+    cidx = 0.
+    for k1, sel1 in enumerate(sel_list[:-1]):
+        # sel_list[:-1] to not take into account the last list.
+        # avoid a problem with sel_list[k1+1:] when k1 is the last element,
+        # that give an empty list overwise
+        # the work is done at the second to last element anyway
+        for sel2 in sel_list[k1+1:]:
+            cidx += consistency_index(set(sel1), set(sel2), num_features)
+    cidx = 2.  * cidx / (len(sel_list) * (len(sel_list) - 1))
+    return "{0:.4f}".format(cidx)
+def avg_importance(sa, sb):
+    sc = sa.add(sb, fill_value=None).dropna() / 2
+    sd = sa.add(sb, fill_value=0).drop(sc.index)
+    return sc.append(sd)
+def weight_variable(shape):
+    initial = tf.truncated_normal(shape, stddev=0.1)
+    return tf.Variable(initial)
+def bias_variable(shape):
+    initial = tf.constant(0.1, shape=shape)
+    return tf.Variable(initial)
+def conv2d(x, w):
+    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
+def max_pool_2x2(x):
+    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
+def cnn(x_train, x_test, y_train, y_test):
+    L1 = 32  # number of convolutions for first layer
+    L2 = 64  # number of convolutions for second layer
+    L3 = 512  # number of neurons for dense layer
+    learning_date = 1e-4  # learning rate
+    epochs = 50  # number of times we loop through training data
+    batch_size = 16  # number of data per batch
+    display_step = 1
+    loss_rec = np.zeros([epochs, 1])
+    training_eval = np.zeros([epochs, 2])
+    features = x_train.shape[1]
+    classes = y_train.shape[1]
+    xs = tf.placeholder(tf.float32, [None, features])
+    ys = tf.placeholder(tf.float32, [None, classes])
+    keep_prob = tf.placeholder(tf.float32)
+    x_shape = tf.reshape(xs, [-1, 1, features, 1])
+    # first conv
+    w_conv1 = weight_variable([5, 5, 1, L1])
+    b_conv1 = bias_variable([L1])
+    h_conv1 = tf.nn.relu(conv2d(x_shape, w_conv1) + b_conv1)
+    h_pool1 = max_pool_2x2(h_conv1)
+    # second conv
+    w_conv2 = weight_variable([5, 5, L1, L2])
+    b_conv2 = bias_variable([L2])
+    h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
+    h_pool2 = max_pool_2x2(h_conv2)
+    tmp_shape = (int)(math.ceil(features / 4.0))
+    h_pool2_flat = tf.reshape(h_pool2, [-1, 1 * tmp_shape * L2])
+    # third dense layer,full connected
+    w_fc1 = weight_variable([1 * tmp_shape * L2, L3])
+    b_fc1 = bias_variable([L3])
+    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
+    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
+    # fourth layer, output
+    w_fc2 = weight_variable([L3, classes])
+    b_fc2 = bias_variable([classes])
+    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, w_fc2) + b_fc2)
+    cost = tf.reduce_mean(-tf.reduce_sum(ys * tf.log(y_conv), reduction_indices=[1]))
+    optimizer = tf.train.AdamOptimizer(learning_date).minimize(cost)
+    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(ys, 1))
+    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+    with tf.Session() as sess:
+        sess.run(tf.global_variables_initializer())
+        total_batch = int(np.shape(x_train)[0] / batch_size)
+        for epoch in range(epochs):
+            avg_cost = 0.
+            x_tmp, y_tmp = shuffle(x_train, y_train)
+            for i in range(total_batch - 1):
+                batch_x, batch_y = x_tmp[i * batch_size:i * batch_size + batch_size], \
+                                   y_tmp[i * batch_size:i * batch_size + batch_size]
+                _, c, acc = sess.run([optimizer, cost, accuracy],
+                                     feed_dict={xs: batch_x, ys: batch_y, keep_prob: 0.8})
+                avg_cost += c / total_batch
+            del x_tmp
+            del y_tmp
+            ## Display logs per epoch step
+            if epoch % display_step == 0:
+                loss_rec[epoch] = avg_cost
+                acc, y_s = sess.run([accuracy, y_conv],
+                                    feed_dict={xs: x_train, ys: y_train, keep_prob: 1})
+                auc = metrics.roc_auc_score(y_train, y_s)
+                training_eval[epoch] = [acc, auc]
+                print("Epoch:", '%d' % (epoch + 1), "cost =", "{:.9f}".format(avg_cost),
+                      "Training accuracy:", round(acc, 3), " Training auc:", round(auc, 3))
+        y_pred = y_conv.eval(feed_dict={xs: x_test, ys: y_test, keep_prob: 1.0})[:, 1]
+        return y_pred
+def mRMR(x_train, y_train, n_features):
+    x_train.insert(loc=0, column='class', value=y_train)
+    features = pymrmr.mRMR(x_train, 'MIQ', n_features)
+    column_name = x_train.columns.tolist()
+    results = []
+    for feature_index in features:
+        idx = column_name.index(feature_index)
+        results.append(idx)
+    return results
+def svm_rfe(x_train, y_train, n_features):
+    estimator = SVR(kernel="linear")
+    selector = RFE(estimator, n_features_to_select=n_features)
+    selector = selector.fit(x_train.values, y_train)
+    column_name = x_train.columns.tolist()
+    features = []
+    for feature_name, feature_ind in zip(column_name, selector.ranking_):
+        if feature_ind == 1:
+            features.append(column_name.index(feature_name))
+    return features
+def reliefF(x_train, y_train, n_features):
+    fs = ReliefF(n_features_to_select=n_features)
+    fs.fit(x_train.values, y_train)
+    return list(fs.top_features_)[:n_features]
+def df(x_train, y_train, n_features):
+    config = load_json("demo_ca.json")
+    gc = GCForest(config)
+    X_train = x_train.values.reshape(-1, 1, len(x_train.columns))
+    _, _features = gc.fit_transform(X_train, y_train)
+    _features = _features.sort_values(ascending=False)
+    return _features.index.values.tolist()[:n_features]