import numpy as np
import tensorflow as tf
import pandas as pd
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
from tensorflow.contrib.layers.python.layers import batch_norm
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from yellowfin import YFOptimizer
from numpy.random import seed
seed(2020)
from tensorflow import set_random_seed
set_random_seed(2020)
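# seed NumPy and TensorFlow up front so weight initialization and data shuffling are reproducible across runs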
class DeepFM(BaseEstimator, TransformerMixin):
def __init__(self,
feature_size,
field_size,
embedding_size=32,
dropout_fm=[1.0, 1.0],
deep_layers=[32, 32],
dropout_deep=[1.0,1.0,1.0,],
deep_layer_activation=tf.nn.relu,
epoch=10,
batch_size=50,
learning_rate=0.0001,
optimizer="adam",
batch_norm=0.6,  # any truthy value enables batch normalization in _init_graph
batch_norm_decay=0.90,
verbose=False,
random_seed=2016,
use_fm=True,
use_deep=True,
loss_type="logloss",
eval_metric=roc_auc_score,
l2_reg=0.001,
greater_is_better=True):
assert (use_fm or use_deep)
assert loss_type in ["logloss", "mse"], "loss_type can be either 'logloss' for classification task or 'mse' for regression task"
self.feature_size = feature_size
self.field_size = field_size
self.embedding_size = embedding_size
self.dropout_fm = dropout_fm
self.deep_layers = deep_layers
self.dropout_dep = dropout_deep
self.deep_layers_activation = deep_layer_activation
self.use_fm = use_fm
self.use_deep = use_deep
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.greater_is_better = greater_is_better
self.train_result,self.valid_result,self.test_result = [],[],[]
self._init_graph()
def _init_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
tf.set_random_seed(self.random_seed)
#input data
with tf.name_scope('inputs'):
self.feat_index = tf.placeholder(tf.int32,shape=[None,None],name='feat_index')
self.feat_value = tf.placeholder(tf.float32,shape=[None,None], name='feat_value')
self.label = tf.placeholder(tf.float32,shape=[None,1],name='label')
self.dropout_keep_fm = tf.placeholder(tf.float32,shape=[None],name='dropout_keep_fm')
self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name='dropout_keep_deep')
self.train_phase = tf.placeholder(tf.bool,name='train_phase')
#weight
self.weights = self._initialize_weights()
# model
self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'],self.feat_index) # N * F * K
feat_value = tf.reshape(self.feat_value,shape=[-1,self.field_size,1])
self.embeddings = tf.multiply(self.embeddings,feat_value)
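# scale each field's embedding by its feature value: the resulting v_i * x_i tensors
# (N * F * K) feed both the FM second-order term and the deep component below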
# first order term
self.y_first_order = tf.nn.embedding_lookup(self.weights['feature_bias'],self.feat_index)
self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order,feat_value),2)
self.y_first_order = tf.nn.dropout(self.y_first_order,self.dropout_keep_fm[0])
# second order term
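# The second-order term uses the standard FM identity
#   sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * [ (sum_i v_i x_i)^2 - sum_i (v_i x_i)^2 ],
# applied element-wise over the K embedding dimensions, which avoids forming
# all O(F^2) pairwise products explicitly.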
# sum-square-part
self.summed_features_emb = tf.reduce_sum(self.embeddings,1) # None * K
self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K
# square-sum part
self.squared_features_emb = tf.square(self.embeddings)
self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1) # None * K
#second order
self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square,self.squared_sum_features_emb)
self.y_second_order = tf.nn.dropout(self.y_second_order,self.dropout_keep_fm[1])
# Deep component
self.y_deep = tf.reshape(self.embeddings,shape=[-1,self.field_size * self.embedding_size])
self.y_deep = tf.nn.dropout(self.y_deep,self.dropout_keep_deep[0])
#deep_layers_activation
for i in range(0,len(self.deep_layers)):
self.y_deep = tf.add(tf.matmul(self.y_deep,self.weights["layer_%d" %i]), self.weights["bias_%d"%i])
if self.batch_norm:
self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" % i) # None * layer[i]
#print("norm yes")
self.y_deep = self.deep_layers_activation(self.y_deep)
self.y_deep = tf.nn.dropout(self.y_deep,self.dropout_keep_deep[i+1])
#----DeepFM---------
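# combine the FM first-order (F dims), FM second-order (K dims) and deep (last hidden layer)
# outputs, then apply a single linear projection to produce the prediction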
if self.use_fm and self.use_deep:
concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)
elif self.use_fm:
concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)
elif self.use_deep:
concat_input = self.y_deep
self.out = tf.add(tf.matmul(concat_input,self.weights['concat_projection']),self.weights['concat_bias'])
# loss
with tf.name_scope('loss'):
if self.loss_type == "logloss":
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
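# note: tf.nn.l2_loss returns 0.5 * the sum of squared errors (not a batch mean),
# so this "mse" loss scales with batch size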
# l2 regularization on weights
if self.l2_reg > 0:
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights["concat_projection"])
if self.use_deep:
for i in range(len(self.deep_layers)):
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights["layer_%d" % i])
tf.summary.scalar('loss', self.loss)
with tf.name_scope('train_optimizer'):
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss)
elif self.optimizer_type == "yellowfin":
self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(self.loss)
#init
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.merged = tf.summary.merge_all()
# keep train and test summaries in separate subdirectories so their event files do not interleave
self.train_writer = tf.summary.FileWriter("logs/20210506_lasso_xgboost/train", self.sess.graph)
self.test_writer = tf.summary.FileWriter("logs/20210506_lasso_xgboost/test", self.sess.graph)
self.sess.run(init)
# number of params
total_parameters = 0
for variable in self.weights.values():
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
if self.verbose > 0:
print("#params: %d" % total_parameters)
def _initialize_weights(self):
weights = dict()
with tf.name_scope('layer'):
#embeddings
with tf.name_scope('feature_embeddings'):
weights['feature_embeddings'] = tf.Variable(tf.random_normal([self.feature_size,self.embedding_size],0.0,0.01),name='feature_embeddings')
tf.summary.histogram('feature_embeddings', weights['feature_embeddings'])
with tf.name_scope('_feature_bias'):
weights['feature_bias'] = tf.Variable(tf.random_normal([self.feature_size,1],0.0,1.0),name='feature_bias')
tf.summary.histogram('feature_bias', weights['feature_bias'])
#deep layers
num_layer = len(self.deep_layers)
input_size = self.field_size * self.embedding_size
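# Glorot/Xavier-style initialization scale: std = sqrt(2 / (fan_in + fan_out))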
glorot = np.sqrt(2.0/(input_size + self.deep_layers[0]))
with tf.name_scope('layer_0'):
weights['layer_0'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(input_size,self.deep_layers[0])),dtype=np.float32,name="weights_layer_0")
tf.summary.histogram('layer_0', weights['layer_0'])
with tf.name_scope('bias_0'):
weights['bias_0'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(1,self.deep_layers[0])),dtype=np.float32,name="weights_bias_0")
tf.summary.histogram('bias_0', weights['bias_0'])
for i in range(1,num_layer):
layer_names="layer_%d" % i
glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
with tf.name_scope("layer_%d" % i):
weights["layer_%d" % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),name="layer_",dtype=np.float32) # layers[i-1] * layers[i]
tf.summary.histogram(layer_names+'layer' , weights["layer_%d" % i])
with tf.name_scope("bias_%d" % i):
weights["bias_%d" % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),name="bias_",dtype=np.float32) # 1 * layer[i]
tf.summary.histogram(layer_names+'bias' , weights["bias_%d" % i])
# final concat projection layer
if self.use_fm and self.use_deep:
input_size = self.field_size + self.embedding_size + self.deep_layers[-1]
elif self.use_fm:
input_size = self.field_size + self.embedding_size
elif self.use_deep:
input_size = self.deep_layers[-1]
glorot = np.sqrt(2.0/(input_size + 1))
with tf.name_scope("weights_concat_projection"):
weights['concat_projection'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(input_size,1)),dtype=np.float32,name="concat_projection")
tf.summary.histogram('concat_projection' , weights['concat_projection'])
with tf.name_scope("weights_concat_bias"):
weights['concat_bias'] = tf.Variable(tf.constant(0.01),dtype=np.float32,name="concat_bias")
tf.summary.histogram('concat_bias' , weights['concat_bias'])
return weights
def batch_norm_layer(self, x, train_phase, scope_bn):
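# two batch-norm ops share variables through the same scope (reuse=True on the second):
# the training branch updates the moving statistics, the inference branch only reads them,
# and tf.cond selects between them via the train_phase placeholder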
bn_train = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
is_training=True, reuse=None, trainable=True, scope=scope_bn)
bn_inference = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
is_training=False, reuse=True, trainable=True, scope=scope_bn)
z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
return z
def get_batch(self,Xi,Xv,y,batch_size,index):
start = index * batch_size
end = (index + 1) * batch_size
end = end if end < len(y) else len(y)
return Xi[start:end],Xv[start:end],[[y_] for y_ in y[start:end]]
# shuffle three lists simultaneously
def shuffle_in_unison_scary(self, a, b, c):
rng_state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(rng_state)
np.random.shuffle(b)
np.random.set_state(rng_state)
np.random.shuffle(c)
def evaluate(self, Xi, Xv, y):
"""
:param Xi: list of list of feature indices of each sample in the dataset
:param Xv: list of list of feature values of each sample in the dataset
:param y: label of each sample in the dataset
:return: metric of the evaluation
"""
y_pred = self.predict(Xi, Xv)
return self.eval_metric(y, y_pred)  # e.g. ROC AUC for classification
def predict(self, Xi, Xv):
"""
:param Xi: list of list of feature indices of each sample in the dataset
:param Xv: list of list of feature values of each sample in the dataset
:return: predicted probability of each sample
"""
# dummy y
dummy_y = [1] * len(Xi)
batch_index = 0
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)
y_pred = None
while len(Xi_batch) > 0:
num_batch = len(y_batch)
feed_dict = {self.feat_index: Xi_batch,
self.feat_value: Xv_batch,
#self.label: y_batch,
self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
self.dropout_keep_deep: [1.0] * len(self.dropout_dep),  # disable all dropout at inference time
self.train_phase: False}
batch_out = self.sess.run(self.out, feed_dict=feed_dict)
if batch_index == 0:
y_pred = np.reshape(batch_out, (num_batch,))
else:
y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,))))
batch_index += 1
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)
return y_pred
def fit_on_batch(self,Xi,Xv,y):
feed_dict = {self.feat_index:Xi,
self.feat_value:Xv,
self.label:y,
self.dropout_keep_fm: self.dropout_fm,
self.dropout_keep_deep: self.dropout_dep,
self.train_phase:True}
loss,opt = self.sess.run([self.loss,self.optimizer],feed_dict=feed_dict)
rs = self.sess.run(self.merged,feed_dict = feed_dict)
return loss,rs
def fit(self, Xi_train, Xv_train, y_train,
Xi_valid=None, Xv_valid=None, y_valid=None,
Xi_test=None, Xv_test=None,y_test=None,
early_stopping=True, refit=False):
"""
:param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
indi_j is the feature index of feature field j of sample i in the training set
:param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
vali_j is the feature value of feature field j of sample i in the training set
vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
:param y_train: label of each sample in the training set
:param Xi_valid: list of list of feature indices of each sample in the validation set
:param Xv_valid: list of list of feature values of each sample in the validation set
:param y_valid: label of each sample in the validation set
:param early_stopping: perform early stopping or not
:param refit: refit the model on the train+valid dataset or not
:return: None
"""
loss_batch = []
has_valid = Xv_valid is not None
# =============================================================================
# Xv_resampled, y_resampled1 = RandomOverSampler(random_state=42).fit_sample(pd.DataFrame(Xv_train), pd.DataFrame(y_train))
# Xi_resampled, y_resampled2 = RandomOverSampler(random_state=42).fit_sample(pd.DataFrame(Xi_train), pd.DataFrame(y_train))
# if all(y_resampled1 == y_resampled2) :
# print(Xv_resampled.shape)
# print(pd.DataFrame(Xv_train).shape)
# Xv_train = Xv_resampled.tolist()
# Xi_train = Xi_resampled.tolist()
# y_train = y_resampled1.tolist()
# =============================================================================
for epoch in range(self.epoch):
t1 = time()
self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
total_batch = int(len(y_train) / self.batch_size)
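# note: samples beyond the last full batch are skipped for this epoch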
for i in range(total_batch):
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
loss,rs = self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
loss_batch.append(loss)
# print('loss = %.4f' % loss)
self.train_writer.add_summary(rs,epoch)
# evaluate training and validation datasets
train_result = self.evaluate(Xi_train, Xv_train, y_train)
self.train_result.append(train_result)
if has_valid:
valid_result = self.evaluate(Xi_valid, Xv_valid, y_valid)
self.valid_result.append(valid_result)
# test_result = self.evaluate(Xi_test, Xv_test, y_test)
# self.test_result.append(test_result)
if self.verbose > 0 and epoch % self.verbose == 0:
if has_valid:
    print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]"
          % (epoch + 1, train_result, valid_result, time() - t1))
else:
    print("[%d] train-result=%.4f [%.1f s]"
          % (epoch + 1, train_result, time() - t1))
#early_stopping
if has_valid and early_stopping and self.training_termination(self.valid_result):
break
# fit a few more epoch on train+valid until result reaches the best_train_score
if has_valid and refit:
if self.greater_is_better:
best_valid_score = max(self.valid_result)
else:
best_valid_score = min(self.valid_result)
best_epoch = self.valid_result.index(best_valid_score)
best_train_score = self.train_result[best_epoch]
Xi_train = Xi_train + Xi_valid
Xv_train = Xv_train + Xv_valid
y_train = y_train + y_valid
for epoch in range(100):
self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
total_batch = int(len(y_train) / self.batch_size)
for i in range(total_batch):
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train,self.batch_size, i)
loss,rs = self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
# print('loss2 = %.4f' % loss)
# check
train_result = self.evaluate(Xi_train, Xv_train, y_train)
if abs(train_result - best_train_score) < 0.001 or \
(self.greater_is_better and train_result > best_train_score) or \
((not self.greater_is_better) and train_result < best_train_score):
break
return loss_batch
def training_termination(self, valid_result):
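# stop once the validation metric has worsened for four consecutive epochs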
if len(valid_result) > 5:
if self.greater_is_better:
if valid_result[-1] < valid_result[-2] and \
valid_result[-2] < valid_result[-3] and \
valid_result[-3] < valid_result[-4] and \
valid_result[-4] < valid_result[-5]:
return True
else:
if valid_result[-1] > valid_result[-2] and \
valid_result[-2] > valid_result[-3] and \
valid_result[-3] > valid_result[-4] and \
valid_result[-4] > valid_result[-5]:
return True
return False
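# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original script).
# It assumes a toy dataset with `field_size` categorical fields whose indices
# have already been mapped into a global feature space of size `feature_size`;
# the Xi/Xv lists follow the format documented in fit(). All sizes and
# hyperparameters below are made up for demonstration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    n_samples, field_size, feature_size = 200, 5, 50

    # each sample: one active index per field, value 1.0 (one-hot style fields)
    Xi = np.random.randint(0, feature_size, size=(n_samples, field_size)).tolist()
    Xv = [[1.0] * field_size for _ in range(n_samples)]
    y = np.random.randint(0, 2, size=n_samples).tolist()

    split = int(0.8 * n_samples)
    model = DeepFM(feature_size=feature_size,
                   field_size=field_size,
                   embedding_size=8,
                   deep_layers=[16, 16],
                   epoch=2,
                   batch_size=32,
                   verbose=1)
    model.fit(Xi[:split], Xv[:split], y[:split],
              Xi_valid=Xi[split:], Xv_valid=Xv[split:], y_valid=y[split:],
              early_stopping=False, refit=False)
    print("validation AUC: %.4f" % model.evaluate(Xi[split:], Xv[split:], y[split:]))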