diff_sex/DeepFM.py
import numpy as np
import tensorflow as tf
import pandas as pd
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
from tensorflow.contrib.layers.python.layers import batch_norm
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from yellowfin import YFOptimizer
from numpy.random import seed
seed(2020)
from tensorflow import set_random_seed
set_random_seed(2020)

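# NOTE: this module targets TensorFlow 1.x (tf.contrib, tf.placeholder,
# tf.Session/FileWriter APIs); it will not run unmodified on TensorFlow 2.x.
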
class DeepFM(BaseEstimator, TransformerMixin):
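    """DeepFM model (Guo et al., 2017): a factorization-machine part (first- and
    second-order feature interactions) combined with a deep feed-forward part,
    both built on shared feature embeddings and trained jointly on one loss."""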

    def __init__(self,
                 feature_size,
                 field_size,
                 embedding_size=32,
                 dropout_fm=[1.0, 1.0],
                 deep_layers=[32, 32],
                 dropout_deep=[1.0, 1.0, 1.0],
                 deep_layer_activation=tf.nn.relu,
                 epoch=10,
                 batch_size=50,
                 learning_rate=0.0001,
                 optimizer="adam",
                 batch_norm=0.6,
                 batch_norm_decay=0.90,
                 verbose=False,
                 random_seed=2016,
                 use_fm=True,
                 use_deep=True,
                 loss_type="logloss",
                 eval_metric=roc_auc_score,
                 l2_reg=0.001,
                 greater_is_better=True):
        assert (use_fm or use_deep)
        assert loss_type in ["logloss", "mse"], \
            "loss_type can be either 'logloss' for classification task or 'mse' for regression task"

        self.feature_size = feature_size
        self.field_size = field_size
        self.embedding_size = embedding_size

        self.dropout_fm = dropout_fm
        self.deep_layers = deep_layers
        self.dropout_dep = dropout_deep
        self.deep_layers_activation = deep_layer_activation
        self.use_fm = use_fm
        self.use_deep = use_deep
        self.l2_reg = l2_reg

        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer

        self.batch_norm = batch_norm
        self.batch_norm_decay = batch_norm_decay

        self.verbose = verbose
        self.random_seed = random_seed
        self.loss_type = loss_type
        self.eval_metric = eval_metric
        self.greater_is_better = greater_is_better
        self.train_result, self.valid_result, self.test_result = [], [], []

        self._init_graph()

    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)
            # input data
            with tf.name_scope('inputs'):
                self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name='feat_index')
                self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name='feat_value')
                self.label = tf.placeholder(tf.float32, shape=[None, 1], name='label')
            self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name='dropout_keep_fm')
            self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name='dropout_keep_deep')
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')

            # weights
            self.weights = self._initialize_weights()

            # model: look up embeddings and scale them by the raw feature values
            self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.feat_index)  # None * F * K
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # first-order term
            self.y_first_order = tf.nn.embedding_lookup(self.weights['feature_bias'], self.feat_index)
            self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2)
            self.y_first_order = tf.nn.dropout(self.y_first_order, self.dropout_keep_fm[0])

            # second-order term
            # sum-square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
            self.summed_features_emb_square = tf.square(self.summed_features_emb)  # None * K

            # square-sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1)  # None * K

            # second order
            self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb)
            self.y_second_order = tf.nn.dropout(self.y_second_order, self.dropout_keep_fm[1])
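            # The second-order term above uses the standard FM identity
            #   sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_k [ (sum_i v_ik x_i)^2 - sum_i (v_ik x_i)^2 ],
            # kept here per embedding dimension k (shape None * K; the combination over k
            # is left to the final projection), so no explicit O(F^2) pairwise loop is needed.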

            # deep component: flatten the embeddings and pass them through a
            # stack of fully-connected layers
            self.y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])

            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["layer_%d" % i]), self.weights["bias_%d" % i])
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" % i)  # None * layer[i]
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[i + 1])

            # ---- DeepFM: concatenate the FM first-order (F), FM second-order (K) and
            # deep (deep_layers[-1]) outputs, then apply a final linear projection ----
            if self.use_fm and self.use_deep:
                concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)
            elif self.use_fm:
                concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep

            self.out = tf.add(tf.matmul(concat_input, self.weights['concat_projection']), self.weights['concat_bias'])

            # loss
            with tf.name_scope('loss'):
                if self.loss_type == "logloss":
                    self.out = tf.nn.sigmoid(self.out)
                    self.loss = tf.losses.log_loss(self.label, self.out)
                elif self.loss_type == "mse":
                    self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
                # l2 regularization on weights
                if self.l2_reg > 0:
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["concat_projection"])
                    if self.use_deep:
                        for i in range(len(self.deep_layers)):
                            self.loss += tf.contrib.layers.l2_regularizer(
                                self.l2_reg)(self.weights["layer_%d" % i])
                tf.summary.scalar('loss', self.loss)

            with tf.name_scope('train_optimizer'):
                if self.optimizer_type == "adam":
                    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                            epsilon=1e-8).minimize(self.loss)
                elif self.optimizer_type == "adagrad":
                    self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
                                                               initial_accumulator_value=1e-8).minimize(self.loss)
                elif self.optimizer_type == "gd":
                    self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
                elif self.optimizer_type == "momentum":
                    self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss)
                elif self.optimizer_type == "yellowfin":
                    self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(self.loss)

            # init session, saver and summary writers
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = tf.Session()
            self.merged = tf.summary.merge_all()
            self.train_writer = tf.summary.FileWriter("logs/20210506_lasso_xgboost", self.sess.graph)
            self.test_writer = tf.summary.FileWriter("logs/20210506_lasso_xgboost", self.sess.graph)
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)

    def _initialize_weights(self):
        weights = dict()

        with tf.name_scope('layer'):

            # embeddings
            with tf.name_scope('feature_embeddings'):
                weights['feature_embeddings'] = tf.Variable(tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01), name='feature_embeddings')
                tf.summary.histogram('feature_embeddings', weights['feature_embeddings'])

            with tf.name_scope('_feature_bias'):
                weights['feature_bias'] = tf.Variable(tf.random_normal([self.feature_size, 1], 0.0, 1.0), name='feature_bias')
                tf.summary.histogram('feature_bias', weights['feature_bias'])

            # deep layers, initialized with Glorot scaling: std = sqrt(2 / (fan_in + fan_out))
            num_layer = len(self.deep_layers)
            input_size = self.field_size * self.embedding_size
            glorot = np.sqrt(2.0 / (input_size + self.deep_layers[0]))

            with tf.name_scope('layer_0'):
                weights['layer_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(input_size, self.deep_layers[0])), dtype=np.float32, name="weights_layer_0")
                tf.summary.histogram('layer_0', weights['layer_0'])
            with tf.name_scope('bias_0'):
                weights['bias_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])), dtype=np.float32, name="weights_bias_0")
                tf.summary.histogram('bias_0', weights['bias_0'])

            for i in range(1, num_layer):
                layer_names = "layer_%d" % i
                glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
                with tf.name_scope("layer_%d" % i):
                    weights["layer_%d" % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])), name="layer_", dtype=np.float32)  # layers[i-1] * layers[i]
                    tf.summary.histogram(layer_names + 'layer', weights["layer_%d" % i])
                with tf.name_scope("bias_%d" % i):
                    weights["bias_%d" % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])), name="bias_", dtype=np.float32)  # 1 * layer[i]
                    tf.summary.histogram(layer_names + 'bias', weights["bias_%d" % i])

            # final concat projection layer
            if self.use_fm and self.use_deep:
                input_size = self.field_size + self.embedding_size + self.deep_layers[-1]
            elif self.use_fm:
                input_size = self.field_size + self.embedding_size
            elif self.use_deep:
                input_size = self.deep_layers[-1]

            glorot = np.sqrt(2.0 / (input_size + 1))
            with tf.name_scope("weights_concat_projection"):
                weights['concat_projection'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(input_size, 1)), dtype=np.float32, name="concat_projection")
                tf.summary.histogram('concat_projection', weights['concat_projection'])
            with tf.name_scope("weights_concat_bias"):
                weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32, name="concat_bias")
                tf.summary.histogram('concat_bias', weights['concat_bias'])

        return weights

    def batch_norm_layer(self, x, train_phase, scope_bn):
        # two batch-norm ops share the same variables (reuse=True); tf.cond selects the
        # training-mode op (which updates the moving statistics) or the inference-mode op
        bn_train = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
                              is_training=True, reuse=None, trainable=True, scope=scope_bn)
        bn_inference = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
                                  is_training=False, reuse=True, trainable=True, scope=scope_bn)
        z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
        return z

    def get_batch(self, Xi, Xv, y, batch_size, index):
        start = index * batch_size
        end = (index + 1) * batch_size
        end = end if end < len(y) else len(y)
        return Xi[start:end], Xv[start:end], [[y_] for y_ in y[start:end]]

    # shuffle three lists simultaneously (same permutation for all three)
    def shuffle_in_unison_scary(self, a, b, c):
        rng_state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(rng_state)
        np.random.shuffle(b)
        np.random.set_state(rng_state)
        np.random.shuffle(c)

    def evaluate(self, Xi, Xv, y):
        """
        :param Xi: list of list of feature indices of each sample in the dataset
        :param Xv: list of list of feature values of each sample in the dataset
        :param y: label of each sample in the dataset
        :return: metric of the evaluation
        """
        y_pred = self.predict(Xi, Xv)
        return self.eval_metric(y, y_pred)  # ROC AUC by default

    def predict(self, Xi, Xv):
        """
        :param Xi: list of list of feature indices of each sample in the dataset
        :param Xv: list of list of feature values of each sample in the dataset
        :return: predicted probability of each sample
        """
        # dummy y, only used to drive get_batch
        dummy_y = [1] * len(Xi)
        batch_index = 0
        Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)
        y_pred = None
        while len(Xi_batch) > 0:
            num_batch = len(y_batch)
            feed_dict = {self.feat_index: Xi_batch,
                         self.feat_value: Xv_batch,
                         # self.label: y_batch,  # label not needed for prediction
                         self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
                         self.dropout_keep_deep: [1.0] * len(self.dropout_dep),  # no dropout at inference
                         self.train_phase: False}

            batch_out = self.sess.run(self.out, feed_dict=feed_dict)

            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch,))
            else:
                y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,))))

            batch_index += 1
            Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)

        return y_pred

    def fit_on_batch(self, Xi, Xv, y):
        feed_dict = {self.feat_index: Xi,
                     self.feat_value: Xv,
                     self.label: y,
                     self.dropout_keep_fm: self.dropout_fm,
                     self.dropout_keep_deep: self.dropout_dep,
                     self.train_phase: True}

        # one optimizer step, then a second run on the same batch to collect the merged summaries
        loss, opt = self.sess.run([self.loss, self.optimizer], feed_dict=feed_dict)
        rs = self.sess.run(self.merged, feed_dict=feed_dict)

        return loss, rs

    def fit(self, Xi_train, Xv_train, y_train,
            Xi_valid=None, Xv_valid=None, y_valid=None,
            Xi_test=None, Xv_test=None, y_test=None,
            early_stopping=True, refit=False):
        """
        :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
                         indi_j is the feature index of feature field j of sample i in the training set
        :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
                         vali_j is the feature value of feature field j of sample i in the training set
                         vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
        :param y_train: label of each sample in the training set
        :param Xi_valid: list of list of feature indices of each sample in the validation set
        :param Xv_valid: list of list of feature values of each sample in the validation set
        :param y_valid: label of each sample in the validation set
        :param Xi_test, Xv_test, y_test: optional held-out test set (only used if the commented-out test evaluation below is re-enabled)
        :param early_stopping: perform early stopping or not
        :param refit: refit the model on the train+valid dataset or not
        :return: list of per-batch training losses
        """
        loss_batch = []
        has_valid = Xv_valid is not None

# =============================================================================
#         Xv_resampled, y_resampled1 = RandomOverSampler(random_state=42).fit_sample(pd.DataFrame(Xv_train), pd.DataFrame(y_train))
#         Xi_resampled, y_resampled2 = RandomOverSampler(random_state=42).fit_sample(pd.DataFrame(Xi_train), pd.DataFrame(y_train))
#         if all(y_resampled1 == y_resampled2) :
#             print(Xv_resampled.shape)
#             print(pd.DataFrame(Xv_train).shape)
#             Xv_train = Xv_resampled.tolist()
#             Xi_train = Xi_resampled.tolist()
#             y_train = y_resampled1.tolist()
# =============================================================================

        for epoch in range(self.epoch):
            t1 = time()
            self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
            total_batch = int(len(y_train) / self.batch_size)

            for i in range(total_batch):
                Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
                loss, rs = self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
                loss_batch.append(loss)
                # print('loss = %.4f' % loss)
                self.train_writer.add_summary(rs, epoch)

            # evaluate training and validation datasets
            train_result = self.evaluate(Xi_train, Xv_train, y_train)
            self.train_result.append(train_result)
            if has_valid:
                valid_result = self.evaluate(Xi_valid, Xv_valid, y_valid)
                self.valid_result.append(valid_result)
                # test_result = self.evaluate(Xi_test, Xv_test, y_test)
                # self.test_result.append(test_result)

            if self.verbose > 0 and epoch % self.verbose == 0:
                if has_valid:
                    print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]"
                          % (epoch + 1, train_result, valid_result, time() - t1))
                else:
                    print("[%d] train-result=%.4f [%.1f s]"
                          % (epoch + 1, train_result, time() - t1))

            # early stopping
            if has_valid and early_stopping and self.training_termination(self.valid_result):
                break

        # optionally refit on train+valid for a few more epochs, until the result
        # reaches the best train score observed during training
        if has_valid and refit:
            if self.greater_is_better:
                best_valid_score = max(self.valid_result)
            else:
                best_valid_score = min(self.valid_result)
            best_epoch = self.valid_result.index(best_valid_score)
            best_train_score = self.train_result[best_epoch]
            Xi_train = Xi_train + Xi_valid
            Xv_train = Xv_train + Xv_valid
            y_train = y_train + y_valid
            for epoch in range(100):
                self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
                total_batch = int(len(y_train) / self.batch_size)
                for i in range(total_batch):
                    Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
                    loss, rs = self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
                    # print('loss2 = %.4f' % loss)
                # check whether the refit result has reached the best train score
                train_result = self.evaluate(Xi_train, Xv_train, y_train)
                if abs(train_result - best_train_score) < 0.001 or \
                    (self.greater_is_better and train_result > best_train_score) or \
                    ((not self.greater_is_better) and train_result < best_train_score):
                    break
        return loss_batch

    def training_termination(self, valid_result):
        # stop when the validation metric has worsened for four consecutive epochs
        if len(valid_result) > 5:
            if self.greater_is_better:
                if valid_result[-1] < valid_result[-2] and \
                    valid_result[-2] < valid_result[-3] and \
                    valid_result[-3] < valid_result[-4] and \
                    valid_result[-4] < valid_result[-5]:
                    return True
            else:
                if valid_result[-1] > valid_result[-2] and \
                    valid_result[-2] > valid_result[-3] and \
                    valid_result[-3] > valid_result[-4] and \
                    valid_result[-4] > valid_result[-5]:
                    return True
        return False
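

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original training pipeline): builds a
# tiny synthetic dataset in the Xi/Xv "index + value" format the class expects
# and runs one epoch. The feature sizes, hyper-parameters and data below are
# illustrative assumptions, not values used in the original experiments.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    n_samples, n_fields, n_features = 200, 4, 20
    rng = np.random.RandomState(0)
    # Xi: one active feature index per field; Xv: the matching feature values
    # (1.0 for the one-hot fields assumed here).
    Xi = [[rng.randint(f * 5, (f + 1) * 5) for f in range(n_fields)] for _ in range(n_samples)]
    Xv = [[1.0] * n_fields for _ in range(n_samples)]
    y = [int(rng.rand() > 0.5) for _ in range(n_samples)]

    model = DeepFM(feature_size=n_features,
                   field_size=n_fields,
                   embedding_size=8,
                   deep_layers=[16, 16],
                   epoch=1,
                   batch_size=32,
                   verbose=1)
    model.fit(Xi, Xv, y, early_stopping=False)
    print("train AUC:", model.evaluate(Xi, Xv, y))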