|
a |
|
b/scripts/model/training_validation.py |
|
|
1 |
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier |
|
|
2 |
from sklearn.tree import DecisionTreeClassifier |
|
|
3 |
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest, AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, BaggingClassifier |
|
|
4 |
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier |
|
|
5 |
from sklearn.svm import SVC, LinearSVC |
|
|
6 |
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB |
|
|
7 |
from sklearn.neural_network import MLPClassifier |
|
|
8 |
from sklearn.model_selection import cross_validate |
|
|
9 |
|
|
|
10 |
from time import time |
|
|
11 |
import warnings |
|
|
12 |
import pandas as pd |
|
|
13 |
import joblib |
|
|
14 |
import os |
|
|
15 |
|
|
|
16 |
warnings.filterwarnings('ignore') |
|
|
17 |
|
|
|
18 |
# Load holdout training set |
|
|
19 |
# Load the training CSV, located relative to this script rather than the
# current working directory.
try:
    # The context manager closes the file handle as soon as pandas has
    # finished parsing, instead of leaving it open for the process lifetime.
    with open(
        os.path.join(os.path.dirname(__file__), '../../data/input/train.csv'),
        'r',
    ) as train_file:
        data = pd.read_csv(train_file)
except FileNotFoundError as err:
    print(f'Ann error occoured: {err}')
    # Keep the module-level name bound so that importing this module and
    # referencing `data` does not raise NameError after a failed load.
    data = None
|
|
23 |
|
|
|
24 |
# Collect every candidate classifier |
|
|
25 |
def classifiers():
    """Build the pool of candidate estimators to compare.

    Returns:
        dict: Short estimator name mapped to a newly constructed, unfitted
        scikit-learn estimator. Each call creates fresh instances, and the
        insertion order is the order in which candidates are evaluated.
    """
    linear_models = {
        'logreg': LogisticRegression(solver='lbfgs', max_iter=200),
        'ridge': RidgeClassifier(),
        'passive_agressive': PassiveAggressiveClassifier(),
        'sgd': SGDClassifier(),
    }

    tree_models = {
        'tree': DecisionTreeClassifier(),
        'extra_tree': ExtraTreesClassifier(),
        'random_forest': RandomForestClassifier(),
        # NOTE(review): IsolationForest is an outlier detector, not a
        # classifier - confirm it is meant to take part in this comparison.
        'isolation_forest': IsolationForest(),
        'adaboost': AdaBoostClassifier(),
        # Stacking and voting each wrap their own single decision tree.
        'stacking': StackingClassifier(estimators=[('tree', DecisionTreeClassifier())]),
        'gradient_boosting': GradientBoostingClassifier(),
        'hist_gradient_boosting': HistGradientBoostingClassifier(),
        'voting': VotingClassifier(estimators=[('tree', DecisionTreeClassifier())]),
        'bagging': BaggingClassifier(),
    }

    neighbor_models = {
        'radius_neighbors': RadiusNeighborsClassifier(),
        'kneighbors': KNeighborsClassifier(),
    }

    svm_models = {
        'svm': SVC(),
        'linear_svm': LinearSVC(),
    }

    bayes_models = {
        'categorical': CategoricalNB(),
        'bernoulli': BernoulliNB(),
        'gaussian': GaussianNB(),
    }

    neural_models = {
        'neural_net': MLPClassifier(),
    }

    # Merge family by family so the overall ordering stays stable.
    return {
        **linear_models,
        **tree_models,
        **neighbor_models,
        **svm_models,
        **bayes_models,
        **neural_models,
    }
|
|
52 |
|
|
|
53 |
# Train and cross-validate every candidate to select the best-performing model |
|
|
54 |
def train_validate(data):
    """Cross-validate every candidate estimator and pick the most accurate.

    Args:
        data: DataFrame holding the feature columns plus a 'LUNG_CANCER'
            label column.

    Returns:
        list: ``[name, accuracy, precision, recall, f1, estimator]`` for the
        candidate with the highest mean test accuracy over 10 folds. When no
        candidate scores a mean accuracy above 0, the result is
        ``[None, 0, 0, 0, 0, None]``.
    """
    target_column = 'LUNG_CANCER'
    feature = data.drop(target_column, axis='columns')
    label = data[target_column]

    metric_names = ['accuracy', 'precision', 'recall', 'f1']

    # Layout mirrors the return value: name, the four scores, the estimator.
    winner = [None, 0, 0, 0, 0, None]

    for candidate_name, candidate in classifiers().items():
        fold_results = cross_validate(candidate, feature, label, scoring=metric_names, cv=10)
        mean_scores = [fold_results[f'test_{metric}'].mean() for metric in metric_names]

        # Strict comparison: on a tie the earlier candidate is kept.
        if mean_scores[0] > winner[1]:
            winner = [candidate_name, *mean_scores, candidate]

    return winner
|
|
82 |
|
|
|
83 |
# Save the model |
|
|
84 |
def save_model(data):
    """Select the best model, refit it on all of ``data`` and persist it.

    Args:
        data: DataFrame holding the feature columns plus a 'LUNG_CANCER'
            label column.

    Returns:
        str: ``'Model Saved: ...'`` message embedding the value returned by
        ``joblib.dump``.

    Raises:
        ValueError: If model selection did not yield any estimator.
    """
    model_name, accuracy, precision, recall, f1, best_model = train_validate(data)

    print(f'\nBest Model: {model_name}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'Measure: {f1}')

    # Fail with an explicit message instead of an AttributeError on None.fit.
    if best_model is None:
        raise ValueError('No estimator was selected during validation; nothing to save.')

    fitted_model = best_model.fit(data.drop('LUNG_CANCER', axis='columns'), data['LUNG_CANCER'])

    # Anchor the output directory to this script, the same way the training
    # CSV is located, so the destination does not depend on the working
    # directory the script is launched from.
    models_dir = os.path.join(os.path.dirname(__file__), '../../models')
    os.makedirs(models_dir, exist_ok=True)

    saved_files = joblib.dump(fitted_model, os.path.join(models_dir, f'{model_name}.joblib'))
    return f'Model Saved: {saved_files}'
|
|
97 |
|
|
|
98 |
# Script entry point: run selection and persistence, then report the outcome.
if __name__ == '__main__':
    outcome = save_model(data)
    print(outcome)