# lib/utils.py
# -*- coding:utf-8 -*-
import json
import math

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.svm import SVR
from sklearn.feature_selection import RFE

import pymrmr
from skrebate import ReliefF

from gcforest.gcforest import GCForest


def load_json(path):
    """Load a JSON config file, ignoring lines that start with // comments."""
    lines = []
    with open(path) as f:
        for row in f.readlines():
            if row.strip().startswith("//"):
                continue
            lines.append(row)
    return json.loads("\n".join(lines))
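
# Example usage (same call as df() below makes; // comment lines are tolerated):
#   config = load_json("demo_ca.json")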


def consistency_index(sel1, sel2, num_features):
    """ Compute the consistency index between two sets of features.

    Parameters
    ----------
    sel1: set
        First set of indices of selected features
    sel2: set
        Second set of indices of selected features
    num_features: int
        Total number of features

    Returns
    -------
    cidx: float
        Consistency index between the two sets.

    Reference
    ---------
    Kuncheva, L.I. (2007). A Stability Index for Feature Selection.
    AIAC, pp. 390--395.
    """
    observed = float(len(sel1.intersection(sel2)))
    expected = len(sel1) * len(sel2) / float(num_features)
    maxposbl = float(min(len(sel1), len(sel2)))
    cidx = -1.
    # If num_features == len(sel1) == len(sel2), expected equals maxposbl and the
    # index is undefined; keep -1 because "take everything" and "take nothing"
    # are trivial solutions we do not want to reward.
    if expected != maxposbl:
        cidx = (observed - expected) / (maxposbl - expected)
    return cidx
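
# Worked example: with num_features = 10, sel1 = {0, 1, 2} and sel2 = {1, 2, 3},
# observed = 2, expected = 3 * 3 / 10 = 0.9, maxposbl = 3, and
# cidx = (2 - 0.9) / (3 - 0.9) ≈ 0.524.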


def consistency_index_k(sel_list, num_features):
    """ Compute the consistency index between more than 2 sets of features.

    This is done by averaging over all pairwise consistency indices.

    Parameters
    ----------
    sel_list: list of lists
        List of k lists of indices of selected features
    num_features: int
        Total number of features

    Returns
    -------
    cidx: float
        Consistency index between the k sets.

    Reference
    ---------
    Kuncheva, L.I. (2007). A Stability Index for Feature Selection.
    AIAC, pp. 390--395.
    """
    cidx = 0.
    for k1, sel1 in enumerate(sel_list[:-1]):
        # sel_list[:-1] skips the last list: sel_list[k1+1:] would otherwise be
        # empty on the last element, and every pair is already covered by the
        # time we reach the second-to-last one.
        for sel2 in sel_list[k1+1:]:
            cidx += consistency_index(set(sel1), set(sel2), num_features)
    # Average over the k * (k - 1) / 2 pairwise comparisons; return the float
    # documented above and leave formatting to the caller.
    cidx = 2. * cidx / (len(sel_list) * (len(sel_list) - 1))
    return cidx
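
# Example (hypothetical selections from three CV folds, ten features total):
#   consistency_index_k([[0, 1, 2], [1, 2, 3], [0, 2, 3]], num_features=10)
# averages the three pairwise indices (each ≈ 0.524 here).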


def avg_importance(sa, sb):
    """Average two Series of feature importances, keeping features present in only one."""
    # Features present in both series: element-wise mean.
    sc = sa.add(sb, fill_value=None).dropna() / 2
    # Features present in only one series: keep their single importance value.
    sd = sa.add(sb, fill_value=0).drop(sc.index)
    # Series.append was removed in pandas 2.0; concat is the supported equivalent.
    return pd.concat([sc, sd])
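
# Example (hypothetical importances):
#   avg_importance(pd.Series({"g1": 0.4, "g2": 0.2}), pd.Series({"g2": 0.6, "g3": 0.1}))
# averages the shared g2 to 0.4 and keeps g1 and g3 unchanged.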


def weight_variable(shape):
    # 0.1-stddev truncated-normal init (TF 1.x API).
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def conv2d(x, w):
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')


def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
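
# Shape sketch for the helpers above: cnn() below feeds 1 x features
# single-channel "images", so max_pool_2x2 with SAME padding halves only the
# feature axis (height stays 1), leaving ceil(features / 4) after two rounds.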


def cnn(x_train, x_test, y_train, y_test):
    L1 = 32  # number of convolution filters in the first layer
    L2 = 64  # number of convolution filters in the second layer
    L3 = 512  # number of neurons in the dense layer
    learning_rate = 1e-4
    epochs = 50  # number of passes through the training data
    batch_size = 16  # number of samples per batch
    display_step = 1

    loss_rec = np.zeros([epochs, 1])
    training_eval = np.zeros([epochs, 2])

    features = x_train.shape[1]
    classes = y_train.shape[1]

    xs = tf.placeholder(tf.float32, [None, features])
    ys = tf.placeholder(tf.float32, [None, classes])
    keep_prob = tf.placeholder(tf.float32)
    # Treat each sample as a 1 x features single-channel "image".
    x_shape = tf.reshape(xs, [-1, 1, features, 1])

    # first convolutional layer
    w_conv1 = weight_variable([5, 5, 1, L1])
    b_conv1 = bias_variable([L1])
    h_conv1 = tf.nn.relu(conv2d(x_shape, w_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

    # second convolutional layer
    w_conv2 = weight_variable([5, 5, L1, L2])
    b_conv2 = bias_variable([L2])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

    # two rounds of 2x2 SAME pooling shrink the feature axis to ceil(features / 4)
    tmp_shape = int(math.ceil(features / 4.0))
    h_pool2_flat = tf.reshape(h_pool2, [-1, 1 * tmp_shape * L2])

    # third layer: fully connected
    w_fc1 = weight_variable([1 * tmp_shape * L2, L3])
    b_fc1 = bias_variable([L3])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)

    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # fourth layer: softmax output
    w_fc2 = weight_variable([L3, classes])
    b_fc2 = bias_variable([classes])
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, w_fc2) + b_fc2)

    # Cross-entropy over the explicit softmax output; tf.log(0) yields NaN, so
    # tf.nn.softmax_cross_entropy_with_logits would be the numerically safer form.
    cost = tf.reduce_mean(-tf.reduce_sum(ys * tf.log(y_conv), reduction_indices=[1]))
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(ys, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        total_batch = int(np.shape(x_train)[0] / batch_size)
        for epoch in range(epochs):
            avg_cost = 0.
            # Reshuffle the training data each epoch.
            x_tmp, y_tmp = shuffle(x_train, y_train)
            # range(total_batch) covers every full batch; the original
            # range(total_batch - 1) dropped one batch while still averaging
            # the cost over total_batch.
            for i in range(total_batch):
                batch_x = x_tmp[i * batch_size:(i + 1) * batch_size]
                batch_y = y_tmp[i * batch_size:(i + 1) * batch_size]
                _, c, acc = sess.run([optimizer, cost, accuracy],
                                     feed_dict={xs: batch_x, ys: batch_y, keep_prob: 0.8})
                avg_cost += c / total_batch

            del x_tmp
            del y_tmp

            ## Display logs per epoch step
            if epoch % display_step == 0:
                loss_rec[epoch] = avg_cost
                acc, y_s = sess.run([accuracy, y_conv],
                                    feed_dict={xs: x_train, ys: y_train, keep_prob: 1})
                auc = metrics.roc_auc_score(y_train, y_s)
                training_eval[epoch] = [acc, auc]
                print("Epoch:", '%d' % (epoch + 1), "cost =", "{:.9f}".format(avg_cost),
                      "Training accuracy:", round(acc, 3), " Training auc:", round(auc, 3))

        # Probability of the positive class for each test sample.
        y_pred = y_conv.eval(feed_dict={xs: x_test, ys: y_test, keep_prob: 1.0})[:, 1]

        return y_pred
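
# Example usage (hypothetical arrays): x_* are float matrices of shape
# (n_samples, n_features), y_* are one-hot labels of shape (n_samples, 2);
# the returned vector holds P(class 1) per test sample, e.g. for
#   metrics.roc_auc_score(y_test[:, 1], cnn(x_train, x_test, y_train, y_test))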


def mRMR(x_train, y_train, n_features):
    # Work on a copy so the caller's DataFrame is not mutated; pymrmr expects
    # the class column first.
    data = x_train.copy()
    data.insert(loc=0, column='class', value=y_train)
    features = pymrmr.mRMR(data, 'MIQ', n_features)

    # Map selected column names back to positions in the original frame (the
    # inserted 'class' column would otherwise shift every index by one).
    column_name = x_train.columns.tolist()
    results = []
    for feature_name in features:
        results.append(column_name.index(feature_name))

    return results
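
# Example usage (hypothetical DataFrame): mRMR(expression_df, labels, 50)
# returns positional indices into expression_df.columns. Note that pymrmr
# treats the first column as the class and expects discretized feature values.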


def svm_rfe(x_train, y_train, n_features):
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, n_features_to_select=n_features)
    selector = selector.fit(x_train.values, y_train)

    # ranking_ is 1 for every selected feature; collect their column positions
    # directly instead of looking each name up again.
    features = []
    for idx, rank in enumerate(selector.ranking_):
        if rank == 1:
            features.append(idx)

    return features
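
# Example usage (hypothetical data): svm_rfe(expression_df, labels, 50)
# recursively eliminates features with a linear SVR and returns the column
# positions of the n_features survivors (ranking_ == 1).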


def reliefF(x_train, y_train, n_features):
    fs = ReliefF(n_features_to_select=n_features)
    fs.fit(x_train.values, y_train)

    # top_features_ lists feature indices sorted by decreasing ReliefF score.
    return list(fs.top_features_)[:n_features]
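
# Example usage (hypothetical data): reliefF(expression_df, labels, 50)
# scores features by how well they separate nearest neighbors of different
# classes and returns the column positions of the 50 best.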


def df(x_train, y_train, n_features):
    # Feature selection with a deep forest (gcForest); the JSON config drives
    # the cascade structure.
    config = load_json("demo_ca.json")
    gc = GCForest(config)
    X_train = x_train.values.reshape(-1, 1, len(x_train.columns))

    # This project's gcforest returns feature importances alongside the
    # transformed data; rank them in decreasing order.
    _, _features = gc.fit_transform(X_train, y_train)
    _features = _features.sort_values(ascending=False)
    return _features.index.values.tolist()[:n_features]
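
# Example usage (hypothetical data): df(expression_df, labels, 50) ranks
# features by the importances reported by the modified gcforest and returns
# the identifiers of the 50 top-ranked ones.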