Switch to side-by-side view

--- a
+++ b/notebooks/model/evaluation/training_validation.ipynb
@@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest, AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, BaggingClassifier\n",
+    "from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier\n",
+    "from sklearn.svm import SVC, LinearSVC\n",
+    "from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB\n",
+    "from sklearn.neural_network import MLPClassifier\n",
+    "from sklearn.model_selection import cross_validate\n",
+    "\n",
+    "from time import time \n",
+    "import warnings\n",
+    "import pandas as pd\n",
+    "import joblib\n",
+    "\n",
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(filepath_or_buffer='../../../data/input/train.csv')  # path is relative to the kernel's working directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label = data['LUNG_CANCER']  # target column handed to the estimators as y\n",
+    "feature = data.drop(columns='LUNG_CANCER')  # every remaining column is a model input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "logreg -> ( Accuracy: 92%) - (Precision: 91%) - (Recall: 94%) - (Measure: 93%)\n",
+      "ridge -> ( Accuracy: 90%) - (Precision: 93%) - (Recall: 89%) - (Measure: 91%)\n",
+      "passive_agressive -> ( Accuracy: 52%) - (Precision: 38%) - (Recall: 70%) - (Measure: 49%)\n",
+      "sgd -> ( Accuracy: 69%) - (Precision: 68%) - (Recall: 93%) - (Measure: 77%)\n",
+      "tree -> ( Accuracy: 89%) - (Precision: 91%) - (Recall: 88%) - (Measure: 90%)\n",
+      "extra_tree -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 92%) - (Measure: 92%)\n",
+      "random_forest -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 93%) - (Measure: 93%)\n",
+      "isolation_forest -> ( Accuracy: 4%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n",
+      "adaboost -> ( Accuracy: 91%) - (Precision: 91%) - (Recall: 93%) - (Measure: 92%)\n",
+      "stacking -> ( Accuracy: 89%) - (Precision: 90%) - (Recall: 89%) - (Measure: 89%)\n",
+      "gradient_boosting -> ( Accuracy: 93%) - (Precision: 94%) - (Recall: 93%) - (Measure: 93%)\n",
+      "hist_gradient_boosting -> ( Accuracy: 91%) - (Precision: 93%) - (Recall: 90%) - (Measure: 92%)\n",
+      "voting -> ( Accuracy: 88%) - (Precision: 90%) - (Recall: 88%) - (Measure: 89%)\n",
+      "bagging -> ( Accuracy: 91%) - (Precision: 93%) - (Recall: 89%) - (Measure: 91%)\n",
+      "radius_neighbors -> ( Accuracy: nan%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n",
+      "kneighbors -> ( Accuracy: 85%) - (Precision: 91%) - (Recall: 79%) - (Measure: 85%)\n",
+      "svm -> ( Accuracy: 57%) - (Precision: 56%) - (Recall: 95%) - (Measure: 70%)\n",
+      "linear_svm -> ( Accuracy: 81%) - (Precision: 89%) - (Recall: 81%) - (Measure: 82%)\n",
+      "categorical -> ( Accuracy: nan%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n",
+      "bernoulli -> ( Accuracy: 53%) - (Precision: 53%) - (Recall: 100%) - (Measure: 69%)\n",
+      "gaussian -> ( Accuracy: 91%) - (Precision: 90%) - (Recall: 93%) - (Measure: 91%)\n",
+      "neural_net -> ( Accuracy: 83%) - (Precision: 83%) - (Recall: 86%) - (Measure: 84%)\n",
+      "\n",
+      "Best Model: gradient_boosting\n",
+      "Accuracy: 0.9266666666666665\n",
+      "Precision: 0.937217062263502\n",
+      "Recall: 0.9254385964912281\n",
+      "Measure: 0.930505907874329\n",
+      "Model Saved: ['../../../models/gradient_boosting.joblib']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Cross-validate every candidate estimator, keep the most accurate one, refit it and save it\n",
+    "SEED = 42  # fixed seed so that Restart & Run All reproduces the same scores and the same winner\n",
+    "estimators = {\n",
+    "    'logreg': LogisticRegression(solver='lbfgs', max_iter=200, random_state=SEED),\n",
+    "    'ridge': RidgeClassifier(random_state=SEED),\n",
+    "    'passive_agressive': PassiveAggressiveClassifier(random_state=SEED),\n",
+    "    'sgd': SGDClassifier(random_state=SEED),\n",
+    "    'tree': DecisionTreeClassifier(random_state=SEED),\n",
+    "    'extra_tree': ExtraTreesClassifier(random_state=SEED),\n",
+    "    'random_forest': RandomForestClassifier(random_state=SEED),\n",
+    "    'isolation_forest': IsolationForest(random_state=SEED),  # NOTE(review): outlier detector, not a classifier - confirm it belongs here\n",
+    "    'adaboost': AdaBoostClassifier(random_state=SEED),\n",
+    "    'stacking': StackingClassifier(estimators=[('tree', DecisionTreeClassifier(random_state=SEED))]),\n",
+    "    'gradient_boosting': GradientBoostingClassifier(random_state=SEED),\n",
+    "    'hist_gradient_boosting': HistGradientBoostingClassifier(random_state=SEED),\n",
+    "    'voting': VotingClassifier(estimators=[('tree', DecisionTreeClassifier(random_state=SEED))]),\n",
+    "    'bagging': BaggingClassifier(random_state=SEED),\n",
+    "    'radius_neighbors': RadiusNeighborsClassifier(),\n",
+    "    'kneighbors': KNeighborsClassifier(),\n",
+    "    'svm': SVC(random_state=SEED),\n",
+    "    'linear_svm': LinearSVC(random_state=SEED),\n",
+    "    'categorical': CategoricalNB(),\n",
+    "    'bernoulli': BernoulliNB(),\n",
+    "    'gaussian': GaussianNB(),\n",
+    "    'neural_net': MLPClassifier(random_state=SEED)\n",
+    "}\n",
+    "\n",
+    "best_accuracy = best_precision = best_recall = best_f1 = 0\n",
+    "best_model = None\n",
+    "model_name = None\n",
+    "\n",
+    "for estimator_name, estimator in estimators.items():\n",
+    "    # 10-fold CV; a fold whose fit fails is scored NaN by cross_validate's default error_score\n",
+    "    cv = cross_validate(estimator, feature, label, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=10)\n",
+    "\n",
+    "    accuracy = cv['test_accuracy'].mean()\n",
+    "    precision = cv['test_precision'].mean()\n",
+    "    recall = cv['test_recall'].mean()\n",
+    "    f1 = cv['test_f1'].mean()\n",
+    "\n",
+    "    # Keep the most accurate model; NaN never compares greater, so failed estimators are skipped\n",
+    "    if accuracy > best_accuracy:\n",
+    "        best_accuracy = accuracy\n",
+    "        best_precision = precision\n",
+    "        best_recall = recall\n",
+    "        best_f1 = f1\n",
+    "        model_name = estimator_name\n",
+    "        best_model = estimator\n",
+    "\n",
+    "    print(f'{estimator_name} -> ( Accuracy: {(accuracy * 100):.0f}%) - (Precision: {(precision * 100):.0f}%) - (Recall: {(recall * 100):.0f}%) - (Measure: {(f1 * 100):.0f}%)')\n",
+    "\n",
+    "# Fail loudly instead of hitting an AttributeError on None when no estimator scored above zero\n",
+    "if best_model is None:\n",
+    "    raise RuntimeError('No estimator produced a valid accuracy; nothing to train or save')\n",
+    "\n",
+    "print(f'\\nBest Model: {model_name}')\n",
+    "print(f'Accuracy: {best_accuracy}')\n",
+    "print(f'Precision: {best_precision}')\n",
+    "print(f'Recall: {best_recall}')\n",
+    "print(f'Measure: {best_f1}')\n",
+    "\n",
+    "# Refit the winner on the full training set: cross_validate only fitted per-fold copies of it\n",
+    "best_model.fit(feature, label)\n",
+    "saved_files = joblib.dump(best_model, f'../../../models/{model_name}.joblib')  # dump returns the list of written file names\n",
+    "print(f'Model Saved: {saved_files}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}