Switch to side-by-side view

--- a
+++ b/scripts/model/training_validation.py
@@ -0,0 +1,99 @@
+from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest, AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, BaggingClassifier
+from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
+from sklearn.svm import SVC, LinearSVC
+from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB
+from sklearn.neural_network import MLPClassifier
+from sklearn.model_selection import cross_validate
+
+from time import time 
+import warnings
+import pandas as pd
+import joblib
+import os
+
+warnings.filterwarnings('ignore')
+
+# Load holdout training set
+try:
+    data = pd.read_csv(open(os.path.join(os.path.dirname(__file__), '../../data/input/train.csv'), 'r'))
+except FileNotFoundError as err:
+    print(f'Ann error occoured: {err}')
+
+# Select every classifiers
+def classifiers():
+    estimators = {
+        'logreg': LogisticRegression(solver='lbfgs', max_iter=200),
+        'ridge': RidgeClassifier(),
+        'passive_agressive': PassiveAggressiveClassifier(),
+        'sgd': SGDClassifier(),
+        'tree': DecisionTreeClassifier(),
+        'extra_tree': ExtraTreesClassifier(),
+        'random_forest': RandomForestClassifier(),
+        'isolation_forest': IsolationForest(),
+        'adaboost': AdaBoostClassifier(),
+        'stacking': StackingClassifier(estimators=[('tree', DecisionTreeClassifier())]), 
+        'gradient_boosting': GradientBoostingClassifier(), 
+        'hist_gradient_boosting': HistGradientBoostingClassifier(), 
+        'voting': VotingClassifier(estimators=[('tree', DecisionTreeClassifier())]), 
+        'bagging': BaggingClassifier(),
+        'radius_neighbors': RadiusNeighborsClassifier(), 
+        'kneighbors': KNeighborsClassifier(),
+        'svm': SVC(),
+        'linear_svm': LinearSVC(),
+        'categorical': CategoricalNB(),
+        'bernoulli': BernoulliNB(), 
+        'gaussian': GaussianNB(),
+        'neural_net': MLPClassifier()
+    }
+
+    return estimators
+
+# Train and perform validation to select the performing model
+def train_validate(data):
+    feature = data.drop('LUNG_CANCER', axis='columns')
+    label = data['LUNG_CANCER']
+
+    best_accuracy = 0
+    best_precision = 0
+    best_recall = 0
+    best_f1 = 0
+    best_model = None
+    model_name = None
+
+    for estimator_name, estimator in classifiers().items():
+        cv = cross_validate(estimator, feature, label, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=10)
+
+        accuracy = cv['test_accuracy'].mean()
+        precision = cv['test_precision'].mean()
+        recall = cv['test_recall'].mean()
+        f1 = cv['test_f1'].mean()
+ 
+        if accuracy > best_accuracy:
+           best_accuracy = accuracy
+           best_precision = precision
+           best_recall = recall
+           best_f1 = f1
+           model_name = estimator_name
+           best_model = estimator
+
+    return [model_name, best_accuracy, best_precision, best_recall, best_f1, best_model]
+
+# Save the model 
+def save_model(data):
+    model_result = train_validate(data)
+
+    print(f'\nBest Model: {model_result[0]}')
+    print(f'Accuracy: {model_result[1]}')
+    print(f'Precision: {model_result[2]}')
+    print(f'Recall: {model_result[3]}')
+    print(f'Measure: {model_result[4]}')
+    
+    model = model_result[5].fit(data.drop('LUNG_CANCER', axis='columns'), data['LUNG_CANCER'])
+
+    model = joblib.dump(model, f'../../models/{model_result[0]}.joblib')
+    return f'Model Saved: {model}'
+
+if __name__ == '__main__':
+   print(save_model(data))
\ No newline at end of file