[59083a]: / utils / model.py

Download this file

116 lines (90 with data), 4.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import mlflow
import numpy as np
from mlflow.metrics import f1_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pickle
class MLModelSelector:
    """Build, tune and persist one of several scikit-learn / XGBoost classifiers.

    The supported model names are ``'SVM'``, ``'logistic_regression'``,
    ``'random_forest'`` and ``'XGBoost'``; each is also the name of the
    factory method that creates the corresponding un-fitted estimator.

    Args:
        class_weight: Value forwarded as ``class_weight`` to the scikit-learn
            estimators. It is not forwarded to ``XGBClassifier``; for XGBoost
            the imbalance is handled through ``scale_pos_weight`` instead.
    """

    # Names accepted by ``train_model`` / ``final_model``. Each one matches
    # the factory method of the same name on this class.
    _MODEL_NAMES = ('SVM', 'logistic_regression', 'random_forest', 'XGBoost')

    def __init__(self, class_weight='balanced'):
        self.class_weight = class_weight

    def SVM(self, C=1.0, kernel='rbf', **kwargs):
        """Return an un-fitted ``SVC`` using this selector's ``class_weight``."""
        return SVC(C=C, kernel=kernel, class_weight=self.class_weight, **kwargs)

    def logistic_regression(self, C=1.0, solver='lbfgs', max_iter=1000, **kwargs):
        """Return an un-fitted ``LogisticRegression`` using this selector's ``class_weight``."""
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=self.class_weight,
            **kwargs,
        )

    def random_forest(self, n_estimators=100, max_depth=None, **kwargs):
        """Return an un-fitted ``RandomForestClassifier`` using this selector's ``class_weight``."""
        return RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            class_weight=self.class_weight,
            **kwargs,
        )

    def XGBoost(self, n_estimators=100, learning_rate=0.3, **kwargs):
        """Return an un-fitted ``XGBClassifier``.

        ``self.class_weight`` is intentionally not passed here; callers
        balance classes via the ``scale_pos_weight`` keyword instead.
        """
        return XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, **kwargs)

    @staticmethod
    def _default_scale_pos_weight(y):
        """Return ``neg / pos`` for integer labels ``y`` that contain only 0 and 1.

        Raises:
            ValueError: If ``np.bincount(y)`` does not yield exactly two
                counts, i.e. the largest label is not 1.
        """
        counts = np.bincount(y)
        if len(counts) != 2:
            raise ValueError(
                "Cannot derive a default scale_pos_weight: expected binary "
                f"labels 0/1 but got label counts {counts.tolist()}; "
                "pass scale_pos_weight explicitly"
            )
        neg, pos = counts
        return neg / pos

    def _build_model(self, model_name, y, params):
        """Validate ``model_name`` and return the matching un-fitted estimator.

        For ``'XGBoost'`` a ``scale_pos_weight`` of ``neg / pos`` is derived
        from ``y`` unless the caller already supplied one in ``params``.

        Raises:
            ValueError: If ``model_name`` is not supported, or if a default
                ``scale_pos_weight`` is needed but ``y`` is not binary.
        """
        if model_name not in self._MODEL_NAMES:
            raise ValueError(f"Model '{model_name}' not supported")
        if model_name == 'XGBoost' and 'scale_pos_weight' not in params:
            # Copy so the caller's dict is never mutated.
            params = dict(params, scale_pos_weight=self._default_scale_pos_weight(y))
        return getattr(self, model_name)(**params)

    def train_model(self, X_train, y_train, X_test, y_test, model_name, param_grid=None, **kwargs):
        """Grid-search a model on the training split and score it on the test split.

        Features are standardised with a ``StandardScaler`` fitted on
        ``X_train``. The grid search uses 5-fold cross-validation with
        ``f1_weighted`` scoring, and the best estimator is evaluated on the
        test split with a weighted F1 score. Best parameters, optional
        preprocessing parameters and the test score are logged to an MLflow
        run named ``Best_<model_name>``.

        Args:
            X_train: Training features.
            y_train: Training labels; must support ``.astype(int)``.
            X_test: Test features.
            y_test: Test labels.
            model_name: One of the supported model names.
            param_grid: Grid passed to ``GridSearchCV``. ``None`` means
                "no tuning": the estimator is cross-validated once with the
                parameters given in ``kwargs``.
            **kwargs: Keyword arguments for the model factory. An optional
                ``preprocess_parameters`` mapping is removed from them and
                logged to MLflow instead.

        Returns:
            A ``(best_params, f1)`` tuple.

        Raises:
            ValueError: If ``model_name`` is not supported.
        """
        # Optional: a missing key used to raise KeyError.
        preprocess_parameters = kwargs.pop('preprocess_parameters', None) or {}

        y_train = y_train.astype(int)
        # Build the estimator first so bad arguments fail before any fitting.
        model = self._build_model(model_name, y_train, kwargs)

        # NOTE(review): the scaler is fitted on the whole training split
        # before cross-validation, so each CV fold sees statistics from its
        # own validation part. Wrapping scaler + model in a Pipeline would
        # avoid that but changes the keys expected in ``param_grid``.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # GridSearchCV rejects None; an empty grid evaluates one candidate.
        grid = {} if param_grid is None else param_grid
        grid_search = GridSearchCV(model, grid, cv=5, scoring='f1_weighted')
        grid_search.fit(X_train, y_train)

        best_params = grid_search.best_params_
        y_pred = grid_search.best_estimator_.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')

        with mlflow.start_run(run_name=f"Best_{model_name}"):
            mlflow.log_params(best_params)
            if preprocess_parameters:
                mlflow.log_params(preprocess_parameters)
            mlflow.log_metric('best_f1_weighted_score', f1)
        return best_params, f1

    def final_model(self, X, y, model_name, **kwargs):
        """Fit a model on all of ``X``/``y`` and pickle it together with its scaler.

        The fitted estimator is written to ``model/model.pkl`` and the fitted
        ``StandardScaler`` to ``model/scaler.pkl``. The ``model`` directory is
        created if it does not exist.

        Args:
            X: Features.
            y: Labels; must support ``.astype(int)``.
            model_name: One of the supported model names.
            **kwargs: Keyword arguments for the model factory.

        Raises:
            ValueError: If ``model_name`` is not supported.
        """
        import os  # local import: only needed to create the output directory

        y = y.astype(int)
        # Build the estimator first so bad arguments fail before any fitting.
        model = self._build_model(model_name, y, kwargs)

        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        model.fit(X, y)

        # Writing used to fail with FileNotFoundError when 'model/' was absent.
        os.makedirs('model', exist_ok=True)
        with open('model/model.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
        with open('model/scaler.pkl', 'wb') as scaler_file:
            pickle.dump(scaler, scaler_file)