a b/scripts/model/training_validation.py
1
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
2
from sklearn.tree import DecisionTreeClassifier
3
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest, AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, BaggingClassifier
4
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
5
from sklearn.svm import SVC, LinearSVC
6
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB
7
from sklearn.neural_network import MLPClassifier
8
from sklearn.model_selection import cross_validate
9
10
from time import time 
11
import warnings
12
import pandas as pd
13
import joblib
14
import os
15
16
# Suppress every warning process-wide for the rest of the script.
warnings.filterwarnings('ignore')

# Load holdout training set.
# The CSV path is resolved relative to this script, not the working directory.
try:
    # The context manager guarantees the file handle is closed after parsing.
    with open(os.path.join(os.path.dirname(__file__), '../../data/input/train.csv'), 'r') as train_file:
        data = pd.read_csv(train_file)
except FileNotFoundError as err:
    print(f'An error occurred: {err}')
    # Keep `data` bound so later code sees an explicit None instead of
    # failing with an unrelated NameError.
    data = None
23
24
# Build every candidate classifier
25
def classifiers():
    """Return a dict mapping a short name to a fresh, unfitted estimator.

    Every call constructs new estimator instances, so callers never share
    fitted state. Insertion order follows the list below.
    """
    candidates = [
        # Linear models
        ('logreg', LogisticRegression(solver='lbfgs', max_iter=200)),
        ('ridge', RidgeClassifier()),
        ('passive_agressive', PassiveAggressiveClassifier()),
        ('sgd', SGDClassifier()),
        # Trees and ensembles
        ('tree', DecisionTreeClassifier()),
        ('extra_tree', ExtraTreesClassifier()),
        ('random_forest', RandomForestClassifier()),
        # NOTE(review): IsolationForest is an outlier detector rather than a
        # classifier -- confirm it belongs in this candidate list.
        ('isolation_forest', IsolationForest()),
        ('adaboost', AdaBoostClassifier()),
        ('stacking', StackingClassifier(estimators=[('tree', DecisionTreeClassifier())])),
        ('gradient_boosting', GradientBoostingClassifier()),
        ('hist_gradient_boosting', HistGradientBoostingClassifier()),
        ('voting', VotingClassifier(estimators=[('tree', DecisionTreeClassifier())])),
        ('bagging', BaggingClassifier()),
        # Neighbors
        ('radius_neighbors', RadiusNeighborsClassifier()),
        ('kneighbors', KNeighborsClassifier()),
        # Support vector machines
        ('svm', SVC()),
        ('linear_svm', LinearSVC()),
        # Naive Bayes
        ('categorical', CategoricalNB()),
        ('bernoulli', BernoulliNB()),
        ('gaussian', GaussianNB()),
        # Neural network
        ('neural_net', MLPClassifier()),
    ]

    return dict(candidates)
52
53
# Train and cross-validate every candidate to select the best-performing model
54
def train_validate(data):
    """Cross-validate every candidate and return the one with the best accuracy.

    Args:
        data: DataFrame containing a 'LUNG_CANCER' label column; all other
            columns are used as features.

    Returns:
        list: [model_name, accuracy, precision, recall, f1, estimator] for the
        candidate with the highest mean accuracy over 10 folds. If no
        candidate scores above 0, the name and estimator are None and the
        scores are 0.
    """
    target_column = 'LUNG_CANCER'
    label = data[target_column]
    feature = data.drop(target_column, axis='columns')

    # Running record of the winner, laid out exactly like the return value.
    winner = [None, 0, 0, 0, 0, None]

    for estimator_name, estimator in classifiers().items():
        scores = cross_validate(
            estimator,
            feature,
            label,
            scoring=['accuracy', 'precision', 'recall', 'f1'],
            cv=10,
        )

        mean_accuracy = scores['test_accuracy'].mean()
        mean_precision = scores['test_precision'].mean()
        mean_recall = scores['test_recall'].mean()
        mean_f1 = scores['test_f1'].mean()

        # Only a strictly higher accuracy replaces the current winner, so
        # ties keep the earlier candidate.
        if not (mean_accuracy > winner[1]):
            continue

        winner = [
            estimator_name,
            mean_accuracy,
            mean_precision,
            mean_recall,
            mean_f1,
            estimator,
        ]

    return winner
82
83
# Save the model 
84
def save_model(data):
    """Select the best classifier, refit it on all of ``data`` and persist it.

    The selected estimator's cross-validation scores are printed, the
    estimator is fitted on the full dataset, and the fitted model is written
    with joblib to ``../../models/<model name>.joblib`` relative to this
    script's directory (created if missing).

    Args:
        data: DataFrame containing a 'LUNG_CANCER' label column; all other
            columns are used as features.

    Returns:
        str: 'Model Saved: ' followed by the value returned by ``joblib.dump``.

    Raises:
        ValueError: if model selection did not pick any estimator.
    """
    model_name, accuracy, precision, recall, f1, estimator = train_validate(data)

    print(f'\nBest Model: {model_name}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'Measure: {f1}')

    # train_validate leaves the estimator as None when no candidate scored
    # above its initial accuracy of 0; fail with a clear message instead of
    # an AttributeError on None.fit.
    if estimator is None:
        raise ValueError('No model was selected: no candidate reached an accuracy above 0')

    model = estimator.fit(data.drop('LUNG_CANCER', axis='columns'), data['LUNG_CANCER'])

    # Resolve the output directory relative to this script, like the training
    # data path, so the result does not depend on the working directory.
    models_dir = os.path.join(os.path.dirname(__file__), '../../models')
    os.makedirs(models_dir, exist_ok=True)

    saved_files = joblib.dump(model, os.path.join(models_dir, f'{model_name}.joblib'))
    return f'Model Saved: {saved_files}'
97
98
if __name__ == '__main__':
    # Script entry point: run selection, fitting and saving on the data
    # loaded at module level, then report the outcome.
    outcome = save_model(data)
    print(outcome)