|
a |
|
b/diff_sex/main.py |
|
|
1 |
import os |
|
|
2 |
import numpy as np |
|
|
3 |
import pandas as pd |
|
|
4 |
import random |
|
|
5 |
import sklearn |
|
|
6 |
import tensorflow as tf |
|
|
7 |
from sklearn.metrics import make_scorer,roc_auc_score,confusion_matrix |
|
|
8 |
from sklearn.model_selection import StratifiedKFold |
|
|
9 |
from DataReader import FeatureDictionary, DataParser |
|
|
10 |
from matplotlib import pyplot as plt |
|
|
11 |
from imblearn.over_sampling import RandomOverSampler, SMOTE |
|
|
12 |
import config_20210816 as config |
|
|
13 |
from metrics import gini_norm |
|
|
14 |
from DeepFM import DeepFM |
|
|
15 |
from numpy.random import seed |
|
|
16 |
from scipy import interp |
|
|
17 |
from sklearn.metrics import roc_curve, auc, accuracy_score |
|
|
18 |
from sklearn.manifold import TSNE |
|
|
19 |
#seed(2020) |
|
|
20 |
from tensorflow import set_random_seed |
|
|
21 |
#set_random_seed(2020) |
|
|
22 |
from matplotlib import pyplot as plt |
|
|
23 |
import matplotlib.pylab as pl |
|
|
24 |
import shap |
|
|
25 |
plt.rc('font',family='Times New Roman') |
|
|
26 |
|
|
|
27 |
|
|
|
28 |
## In[*] |
|
|
29 |
def load_data():
    """Load the train/test CSVs, keep a single sex, and sub-sample the test set.

    Both files named by ``config.TRAIN_FILE`` / ``config.TEST_FILE`` are
    filtered to rows whose ``SEX`` equals ``config.sex`` and the ``SEX``
    column is then dropped.  Up to 50 test rows are drawn at random (without
    replacement, unseeded) and kept in their original order.

    Returns:
        tuple: ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test,
        cat_features_indices, ex_list, cols)`` where ``cols`` are all train
        columns except ``ID``/``target``, ``ex_list`` is the list of sampled
        test IDs, and ``cat_features_indices`` are the positions in ``cols``
        of the columns listed in ``config.CATEGORICAL_COLS``.
    """
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    # Keep only the configured sex; the column is constant afterwards, so drop it.
    dfTrain = dfTrain[dfTrain['SEX'] == config.sex].drop("SEX", axis=1)
    dfTest = dfTest[dfTest['SEX'] == config.sex].drop("SEX", axis=1)

    # Draw at most 50 test rows; capping the size avoids the ValueError that
    # np.random.choice raises when fewer than 50 rows are available.
    n_sample = min(50, dfTest.shape[0])
    row_sequence = np.random.choice(dfTest.shape[0], n_sample, replace=False, p=None)
    # np.sort keeps an integer dtype even for an empty draw.
    dfTest = dfTest.iloc[np.sort(row_sequence), :]
    # IDs of the sampled rows; callers use them to select matching labels.
    ex_list = list(dfTest.ID)

    cols = [c for c in dfTrain.columns if c not in ['ID', 'target']]

    X_train = dfTrain[cols].values
    y_train = dfTrain['target'].values

    X_test = dfTest[cols].values
    ids_test = dfTest['ID'].values

    cat_features_indices = [i for i, c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices, ex_list, cols
|
|
54 |
|
|
|
55 |
def run_base_model_dfm(dfTrain,dfTest,folds,dfm_params,ex_list):
    """Train one DeepFM model per fold and evaluate it on validation and test data.

    Args:
        dfTrain: training frame (must contain ``ID`` and ``target``).
        dfTest: test frame (must contain ``ID``).
        folds: sequence of ``(train_idx, valid_idx)`` index pairs into dfTrain.
        dfm_params: keyword arguments for ``DeepFM``.  ``feature_size`` and
            ``field_size`` are written into this dict in place.
        ex_list: test IDs used to select the matching rows of the external
            test-label file.

    Returns:
        A 27-tuple, in this order: ``threshold, cm, sensitivity, specificity,
        acc`` (fold-averaged test prediction at its Youden threshold),
        ``ROC_val, ROC1, ROC2, loss_batch, y_train_meta, y_test_meta``, then
        mean/std of AUC, sensitivity, specificity and accuracy over the folds
        for validation (8 values) and for test (8 values).

    Raises:
        ValueError: if neither ``use_fm`` nor ``use_deep`` is enabled.

    Side effects: prints dtypes and the CV gini score, and writes submission
    CSVs and two PDF plots under ``config.SUB_DIR``.
    """
    # Fail before training instead of hitting an unbound ``clf_str`` afterwards.
    if not (dfm_params["use_fm"] or dfm_params["use_deep"]):
        raise ValueError("dfm_params must enable at least one of 'use_fm' / 'use_deep'")

    # Raw test features for the t-SNE plot, derived from the arguments rather
    # than from a module-level global so the plot always matches ``dfTest``.
    feature_cols = [c for c in dfTrain.columns if c not in ['ID', 'target']]
    X_test = dfTest[feature_cols].values

    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    print(dfTrain.dtypes)

    dfm_params['feature_size'] = fd.feat_dim
    dfm_params['field_size'] = len(Xi_train[0])

    n_folds = len(folds)
    n_epochs = dfm_params['epoch']

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    # NOTE: these three accumulators start with one all-zero row; that row is
    # kept so the exported CSV layout stays the same as before.
    y_test_meta_all = np.zeros((dfTest.shape[0], 1), dtype=float).T
    Threshold1 = np.zeros((1, 1), dtype=float)
    Threshold2 = np.zeros((1, 1), dtype=float)

    _get = lambda x, l: [x[i] for i in l]

    gini_results_cv = np.zeros(n_folds, dtype=float)
    gini_results_cv_test = np.zeros(n_folds, dtype=float)
    gini_results_epoch_train = np.zeros((n_folds, n_epochs), dtype=float)
    gini_results_epoch_valid = np.zeros((n_folds, n_epochs), dtype=float)
    gini_results_epoch_test = np.zeros((n_folds, n_epochs), dtype=float)

    # External test labels, restricted to the sampled test IDs.
    # NOTE(review): assumes the file's row order matches dfTest - confirm.
    y_test = pd.read_table("D:\\anaconda-python\\learn_DL\\Basic-DeepFM-model\\data\\new_1126\\20210817deepfm_pdata_sex_test.txt", sep=' ')
    y_test = y_test[y_test.ID.isin(ex_list)]
    y_test = y_test['JHU_DMP_new.chf']

    random.seed(1000)
    loss_batch = []
    aucs, sensitivitys, specificitys, accs = [], [], [], []
    aucs_val, sensitivitys_val, specificitys_val, accs_val = [], [], [], []

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        random.seed(1000)
        loss = dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_,
                       Xi_test, Xv_test, y_test, early_stopping=False, refit=False)
        loss_batch.append(loss[-1])  # last element returned by fit for this fold
        random.seed(1000)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        random.seed(1000)
        y_test_meta_ = dfm.predict(Xi_test, Xv_test)
        y_test_meta[:, 0] += y_test_meta_
        y_test_meta_all = np.vstack((y_test_meta_all, y_test_meta_))
        random.seed(1000)

        # Validation metrics for this fold.
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx, 0])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
        gini_results_epoch_test[i] = dfm.test_result

        roc_auc, threshold1, _, _, sensitivity, specificity, acc = _youden_stats(
            y_valid_, y_train_meta[valid_idx, 0])
        aucs_val.append(roc_auc)
        Threshold1 = np.vstack((Threshold1, threshold1))
        specificitys_val.append(specificity)
        sensitivitys_val.append(sensitivity)
        accs_val.append(acc)

        # Test metrics for this fold.
        gini_results_cv_test[i] = gini_norm(y_test, y_test_meta_)
        roc_auc, threshold2, _, _, sensitivity, specificity, acc = _youden_stats(
            y_test, y_test_meta_)
        aucs.append(roc_auc)
        Threshold2 = np.vstack((Threshold2, threshold2))
        specificitys.append(specificity)
        sensitivitys.append(sensitivity)
        accs.append(acc)

    # Fold-level summaries: validation.
    mean_auc_val = np.mean(aucs_val)
    std_auc_val = np.std(aucs_val)
    mean_sensitivity_val = np.mean(sensitivitys_val)
    std_sensitivity_val = np.std(sensitivitys_val)
    mean_specificity_val = np.mean(specificitys_val)
    std_specificity_val = np.std(specificitys_val)
    mean_acc_val = np.mean(accs_val)
    std_acc_val = np.std(accs_val)

    ROC_val = sklearn.metrics.roc_auc_score(y_train, y_train_meta)

    # Fold-level summaries: test.
    mean_auc = np.mean(aucs)
    std_auc = np.std(aucs)
    mean_sensitivity = np.mean(sensitivitys)
    std_sensitivity = np.std(sensitivitys)
    mean_specificity = np.mean(specificitys)
    std_specificity = np.std(specificitys)
    mean_acc = np.mean(accs)
    std_acc = np.std(accs)

    # Average the per-fold test predictions and score the averaged prediction.
    y_test_meta /= float(n_folds)
    y_test_meta = np.array(y_test_meta)[:, 0]
    _, threshold, pre, cm, sensitivity, specificity, acc = _youden_stats(y_test, y_test_meta)
    ROC1 = sklearn.metrics.roc_auc_score(y_test, y_test_meta)
    ROC2 = sklearn.metrics.roc_auc_score(y_test, pre)

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "deepFM %s"%(config.names)
    elif dfm_params["use_fm"]:
        clf_str = "1126_nolasso_xgboost_FM"
    else:
        clf_str = "1126_nolasso_xgboost_DNN"
    print("%s: %.5f (%.5f)"%(clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, gini_results_cv_test.mean(), gini_results_cv_test.std())
    _make_submission(ids_test, y_test_meta, pre, y_test_meta_all, threshold, Threshold1, Threshold2, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, gini_results_epoch_test, filename)
    _plot_tsne(X_test, y_test, filename)

    return (threshold, cm, sensitivity, specificity, acc, ROC_val, ROC1, ROC2,
            loss_batch, y_train_meta, y_test_meta,
            mean_auc_val, std_auc_val, mean_sensitivity_val, std_sensitivity_val,
            mean_specificity_val, std_specificity_val, mean_acc_val, std_acc_val,
            mean_auc, std_auc, mean_sensitivity, std_sensitivity,
            mean_specificity, std_specificity, mean_acc, std_acc)


def _youden_stats(y_true, y_score):
    """Return ROC statistics at the threshold that maximises ``tpr - fpr``.

    Returns:
        tuple: ``(roc_auc, threshold, pre, cm, sensitivity, specificity, acc)``
        where ``pre`` is the 0/1 prediction at ``threshold`` and ``cm`` is the
        confusion matrix built with ``labels=[1, 0]`` (row 0 = true positives
        class, row 1 = true negatives class).
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    threshold = thresholds[np.argmax(tpr - fpr)]
    pre = (np.array(y_score) >= threshold) * 1
    cm = confusion_matrix(y_true, pre, labels=[1, 0])
    sensitivity = cm[0, 0] * 1.0 / (cm[0, 0] + cm[0, 1])
    specificity = cm[1, 1] * 1.0 / (cm[1, 1] + cm[1, 0])
    acc = sklearn.metrics.accuracy_score(y_true, pre)
    return auc(fpr, tpr), threshold, pre, cm, sensitivity, specificity, acc
|
|
225 |
|
|
|
226 |
def _make_submission(ids, y_pred, pre, y_test_meta_all,threshold,Threshold1,Threshold2,filename="submission.csv"):
    """Write the prediction and threshold tables as CSV files in ``config.SUB_DIR``.

    Four files are produced, all without an index and with five-decimal
    floats: the per-ID predictions (``filename``), the transposed per-fold
    prediction matrix, the single overall threshold, and the per-fold
    validation/test thresholds.
    """
    out_dir = config.SUB_DIR
    csv_opts = dict(index=False, float_format="%.5f")

    # Averaged probability and binarised prediction per test ID.
    predictions = pd.DataFrame({"ID": ids, "target": y_pred.flatten(), "y_target": pre.flatten()})
    predictions.to_csv(os.path.join(out_dir, filename), **csv_opts)

    # One column per row of y_test_meta_all (transposed on export).
    per_fold = pd.DataFrame(y_test_meta_all.T)
    per_fold.to_csv(os.path.join(out_dir, "traget_10_%s"%filename), **csv_opts)

    overall = pd.DataFrame({"Threshold": threshold.flatten()})
    overall.to_csv(os.path.join(out_dir, "Threshold_%s"%filename), **csv_opts)

    by_fold = pd.DataFrame({"Threshold_val": Threshold1.flatten(), "Threshold_test": Threshold2.flatten()})
    by_fold.to_csv(os.path.join(out_dir, "Threshold_valandtest_%s"%filename), **csv_opts)
|
|
235 |
|
|
|
236 |
|
|
|
237 |
def _plot_fig(train_results, valid_results, test_results,model_name):
    """Plot first-to-last-epoch trend lines of the per-fold scores and save a PDF.

    For every fold (row) a straight line is drawn from the first epoch's
    score to the last epoch's score, separately for the training, validation
    and test results.  The figure is saved as
    ``<config.SUB_DIR>/Plot_<model_name>.pdf``.

    Args:
        train_results, valid_results, test_results: 2-D arrays indexed as
            ``[fold][epoch]``; the epoch count is taken from
            ``train_results.shape[1]``.
        model_name: used for the plot title and the output file name.
    """
    colors = ["g", "b", "darkorange","c", "m", "grey","k", "pink", "y","r"]
    # Take the epoch count from the data itself (an int), not from a global
    # parameter dict, so np.linspace always receives a matching integer size.
    n_epochs = train_results.shape[1]
    xs = np.arange(1, n_epochs + 1)
    plt.figure()
    legends = []
    series = (
        (train_results, "solid", "o", "training set %d"),
        (valid_results, "dashed", "+", "valid set %d"),
        (test_results, "dotted", ".", "test set %d"),
    )
    for i in range(train_results.shape[0]):
        # Cycle through the palette so more than len(colors) folds still plot.
        color = colors[i % len(colors)]
        for results, linestyle, marker, legend_fmt in series:
            trend = np.linspace(results[i][0], results[i][n_epochs - 1], n_epochs)
            plt.plot(xs, trend, color=color, linestyle=linestyle, marker=marker)
            legends.append(legend_fmt % (i + 1))
    plt.xlabel("Epoch")
    plt.ylabel("Normalized Gini")
    plt.title("%s"%model_name)
    plt.legend(legends,loc="lower right")
    plt.savefig("%s/Plot_%s.pdf"%(config.SUB_DIR,model_name))
    plt.close()
|
|
260 |
|
|
|
261 |
#Visualizing the Training Set |
|
|
262 |
def _plot_tsne(X_train,y_train,model_name):
    """Embed the samples in 2-D with t-SNE and save a class-coloured scatter plot.

    Samples whose label equals 0 are drawn as green squares ("No-HF") and
    samples whose label equals 1 as red circles ("HF").  The figure is saved
    as ``<config.SUB_DIR>/Tsne_<model_name>.pdf``.
    """
    embedding = TSNE(n_components=2, random_state=0).fit_transform(X_train)

    plt.figure(figsize=(10, 8))
    # (label value, marker, colour, legend text) for each class.
    classes = (
        (0, 's', 'g', 'No-HF'),
        (1, 'o', 'r', 'HF'),
    )
    for value, marker, colour, legend_text in classes:
        selected = (y_train == value)
        # Second embedding column on the x axis, first on the y axis.
        plt.scatter(embedding[selected, 1], embedding[selected, 0], marker=marker,
                    c=colour, label=legend_text, edgecolor='k', alpha=0.7)

    plt.title('t-SNE plot of the testing data')
    plt.xlabel('1st embedding axis')
    plt.ylabel('2nd embedding axis')
    plt.legend(loc='best', frameon=True, shadow=True)

    plt.tight_layout()
    plt.savefig("%s/Tsne_%s.pdf"%(config.SUB_DIR,model_name))
    plt.close()
|
|
284 |
|
|
|
285 |
## In[*] |
|
|
286 |
# Keyword arguments passed to DeepFM(**dfm_params); run_base_model_dfm adds
# 'feature_size' and 'field_size' to this dict before building the model.
dfm_params = dict(
    use_fm=True,
    use_deep=True,
    embedding_size=8,
    dropout_fm=[1.0, 1.0],
    deep_layers=[256, 256],
    dropout_deep=[0.6, 0.6, 0.6],
    deep_layer_activation=tf.nn.relu,
    epoch=400,
    batch_size=300,
    learning_rate=0.0001,
    optimizer="adam",
    # NOTE(review): 0.5 is truthy; confirm how DeepFM interprets this value.
    batch_norm=0.5,
    batch_norm_decay=0.9,
    l2_reg=0.0001,
    verbose=True,
    eval_metric=gini_norm,
    random_seed=config.RANDOM_SEED,
)
|
|
305 |
|
|
|
306 |
# load data |
|
|
307 |
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices,ex_list,cols = load_data()

# folds: bootstrap resampling. Each fold trains on a with-replacement sample
# of the training rows and validates on the rows that sample left out.
from sklearn.utils import resample
y = np.arange(len(y_train))
folds = []
for i in range(config.NUM_SPLITS):
    # Vary the seed per fold; a constant seed would make every fold the same
    # resample. Fold 0 keeps the original seed of 1000.
    train = resample(y, n_samples=len(y), replace=True, random_state=1000 + i)
    # Out-of-bag indices, sorted and deterministic.
    test = np.setdiff1d(y, train)
    # Store the pair as a tuple: the two index arrays have different lengths,
    # so stacking them into one ndarray is not a valid rectangular array.
    folds.append((train, test))
#folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True, random_state=config.RANDOM_SEED).split(X_train, y_train))
## In[*]
(threshold, cm, sensitivity, specificity, acc, ROC_val, ROC1, ROC2,
 loss_batch, y_train_meta, y_test_meta,
 mean_auc_val, std_auc_val, mean_sensitivity_val, std_sensitivity_val,
 mean_specificity_val, std_specificity_val, mean_acc_val, std_acc_val,
 mean_auc, std_auc, mean_sensitivity, std_sensitivity,
 mean_specificity, std_specificity, mean_acc, std_acc) = run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, ex_list)
|
|
330 |
# In[*] |
|
|
331 |
#save |
|
|
332 |
# Append the validation summary: one header line, then one space-separated row.
# ``with`` guarantees the file is closed even if a write fails.
with open("D:\\anaconda-python\\UMN_JHU_alldata\\trainUMN_testJHU\\new_result\\deepfm\\output\\new_1126\\1126_val.txt",'a') as f:
    f.write("mean_auc,std_auc,mean_sensitivity,std_sensitivity,mean_specificity,std_specificity,mean_acc,std_acc")
    f.write("\n")
    f.write(" ".join(str(value) for value in (
        mean_auc_val, std_auc_val,
        mean_sensitivity_val, std_sensitivity_val,
        mean_specificity_val, std_specificity_val,
        mean_acc_val, std_acc_val,
    )))
    f.write("\n")

# Append the test summary, followed by the confusion matrix on its own lines.
# The header now also names the two trailing ROC columns that are written.
with open("D:\\anaconda-python\\UMN_JHU_alldata\\trainUMN_testJHU\\new_result\\deepfm\\output\\new_1126\\1126_test.txt",'a') as f:
    f.write("mean_auc,std_auc,sensitivity,mean_sensitivity,std_sensitivity,specificity,mean_specificity,std_specificity,acc,mean_acc,std_acc,ROC1,ROC2")
    f.write("\n")
    f.write(" ".join(str(value) for value in (
        mean_auc, std_auc,
        sensitivity, mean_sensitivity, std_sensitivity,
        specificity, mean_specificity, std_specificity,
        acc, mean_acc, std_acc,
        ROC1, ROC2,
    )))
    f.write("\n")
    f.write(str(cm))
    f.write("\n")
|
|
385 |
|
|
|
386 |
# In[*] |
|
|
387 |
|
|
|
388 |
# Accumulators for a parameter sweep: one entry is appended per completed run.
# Validation-side summaries ("train" lists) ...
roc_train, sensitivity_train, specificity_train, acc_train = [], [], [], []
# ... test-side summaries ...
roc_test, sensitivity_test, specificity_test, acc_test = [], [], [], []
# ... and the per-fold final losses of each run.
loss_score = []

# In[*]
loss_score += [loss_batch]

# Fold-averaged validation metrics of the run that just finished.
roc_train += [mean_auc_val]
sensitivity_train += [mean_sensitivity_val]
specificity_train += [mean_specificity_val]
acc_train += [mean_acc_val]

# Fold-averaged test metrics of the same run.
roc_test += [mean_auc]
sensitivity_test += [mean_sensitivity]
specificity_test += [mean_specificity]
acc_test += [mean_acc]
|
|
413 |
# In[*] |
|
|
414 |
# Plot the accumulated sweep results against the swept parameter values.
param_range1 = np.array([2,4,8,16])
na = "Embedding size"
li = "D:\\anaconda-python\\UMN_JHU_alldata\\trainUMN_testJHU\\new_result\\deepfm\\output\\new_1126\\"

# Only as many points as runs were actually accumulated can be plotted;
# trimming both axes avoids a length mismatch when the sweep is incomplete.
n_points = min(len(param_range1), len(roc_train))
xs = param_range1[:n_points]

# (train-side values, test-side values, colour, metric name). The legend text
# names the metric that is really plotted.
metric_curves = [
    (roc_train, roc_test, "g", "ROC"),
    (sensitivity_train, sensitivity_test, "darkorange", "Sensitivity"),
    (specificity_train, specificity_test, "b", "Specificity"),
    (acc_train, acc_test, "pink", "Accuracy"),
]
for train_vals, test_vals, colour, metric in metric_curves:
    plt.plot(xs, train_vals[:n_points], 'o-', color=colour, label="Train:%s" % metric, lw=1, alpha=.8, markersize=2)
    # Dashed style is given once, in the format string.
    plt.plot(xs, test_vals[:n_points], 'o--', color=colour, label="Test:%s" % metric, lw=1, alpha=.8, markersize=2)
plt.xlabel(na)
plt.ylabel("Value")
plt.legend(fontsize = "x-small",loc="lower center",ncol=2)
plt.savefig('%s%s.pdf'%(li,na))
plt.close()

# Final loss per fold for each run: rows are runs, columns are folds.
# NOTE(review): assumes each loss_batch holds one scalar per fold - confirm.
losses = np.array(loss_score)[:len(param_range1)]
fold_colors = ["g", "b", "darkorange", "c", "m", "grey", "k", "pink", "y", "r"]
n_fold_curves = min(len(fold_colors), losses.shape[1]) if losses.ndim == 2 else 0
for k in range(n_fold_curves):
    plt.plot(param_range1[:losses.shape[0]], losses[:, k], 'o-', color=fold_colors[k], lw=1, alpha=.8, markersize=2, label="Fold%d" % (k + 1))
plt.xlabel(na)
plt.ylabel("Loss")
plt.legend(fontsize = "x-small",loc=1)
plt.savefig('%s%s_Loss.pdf'%(li,na))
plt.close()
|
|
446 |
# In[*] |
|
|
447 |
# ------------------ FM Model ------------------ |
|
|
448 |
# FM-only variant: same settings with the deep component switched off.
fm_params = dfm_params.copy()
fm_params["use_deep"] = False
# run_base_model_dfm requires ex_list and returns a 27-tuple; the out-of-fold
# training predictions and averaged test predictions are items 9 and 10.
fm_results = run_base_model_dfm(dfTrain, dfTest, folds, fm_params, ex_list)
y_train_fm, y_test_fm = fm_results[9], fm_results[10]

# In[*]
# ------------------ DNN Model ------------------
# Deep-only variant: same settings with the FM component switched off.
dnn_params = dfm_params.copy()
dnn_params["use_fm"] = False
dnn_results = run_base_model_dfm(dfTrain, dfTest, folds, dnn_params, ex_list)
y_train_dnn, y_test_dnn = dnn_results[9], dnn_results[10]