[0241e6]: / notebooks / model / evaluation / training_validation.ipynb

Download this file

177 lines (176 with data), 7.3 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest, AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, BaggingClassifier\n",
    "from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier\n",
    "from sklearn.svm import SVC, LinearSVC\n",
    "from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.model_selection import cross_validate\n",
    "\n",
    "from time import time \n",
    "import warnings\n",
    "import pandas as pd\n",
    "import joblib\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the training table; the relative path is resolved against the kernel's working directory.\n",
    "data = pd.read_csv(filepath_or_buffer='../../../data/input/train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate the target column from the remaining columns, which are used as predictors.\n",
    "label = data['LUNG_CANCER']\n",
    "feature = data.drop(columns='LUNG_CANCER')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "logreg -> ( Accuracy: 92%) - (Precision: 91%) - (Recall: 94%) - (Measure: 93%)\n",
      "ridge -> ( Accuracy: 90%) - (Precision: 93%) - (Recall: 89%) - (Measure: 91%)\n",
      "passive_agressive -> ( Accuracy: 52%) - (Precision: 38%) - (Recall: 70%) - (Measure: 49%)\n",
      "sgd -> ( Accuracy: 69%) - (Precision: 68%) - (Recall: 93%) - (Measure: 77%)\n",
      "tree -> ( Accuracy: 89%) - (Precision: 91%) - (Recall: 88%) - (Measure: 90%)\n",
      "extra_tree -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 92%) - (Measure: 92%)\n",
      "random_forest -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 93%) - (Measure: 93%)\n",
      "isolation_forest -> ( Accuracy: 4%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n",
      "adaboost -> ( Accuracy: 91%) - (Precision: 91%) - (Recall: 93%) - (Measure: 92%)\n",
      "stacking -> ( Accuracy: 89%) - (Precision: 90%) - (Recall: 89%) - (Measure: 89%)\n",
      "gradient_boosting -> ( Accuracy: 93%) - (Precision: 94%) - (Recall: 93%) - (Measure: 93%)\n",
      "hist_gradient_boosting -> ( Accuracy: 91%) - (Precision: 93%) - (Recall: 90%) - (Measure: 92%)\n",
      "voting -> ( Accuracy: 88%) - (Precision: 90%) - (Recall: 88%) - (Measure: 89%)\n",
      "bagging -> ( Accuracy: 91%) - (Precision: 93%) - (Recall: 89%) - (Measure: 91%)\n",
      "radius_neighbors -> ( Accuracy: nan%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n",
      "kneighbors -> ( Accuracy: 85%) - (Precision: 91%) - (Recall: 79%) - (Measure: 85%)\n",
      "svm -> ( Accuracy: 57%) - (Precision: 56%) - (Recall: 95%) - (Measure: 70%)\n",
      "linear_svm -> ( Accuracy: 81%) - (Precision: 89%) - (Recall: 81%) - (Measure: 82%)\n",
      "categorical -> ( Accuracy: nan%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n",
      "bernoulli -> ( Accuracy: 53%) - (Precision: 53%) - (Recall: 100%) - (Measure: 69%)\n",
      "gaussian -> ( Accuracy: 91%) - (Precision: 90%) - (Recall: 93%) - (Measure: 91%)\n",
      "neural_net -> ( Accuracy: 83%) - (Precision: 83%) - (Recall: 86%) - (Measure: 84%)\n",
      "\n",
      "Best Model: gradient_boosting\n",
      "Accuracy: 0.9266666666666665\n",
      "Precision: 0.937217062263502\n",
      "Recall: 0.9254385964912281\n",
      "Measure: 0.930505907874329\n",
      "Model Saved: ['../../../models/gradient_boosting.joblib']\n"
     ]
    }
   ],
   "source": [
    "# Cross-validate every candidate estimator, keep the most accurate one, re-fit it and save it.\n",
    "# NOTE(review): none of the stochastic estimators is given a random_state, so scores can differ between runs.\n",
    "# NOTE(review): IsolationForest is an outlier detector rather than a classifier - confirm it belongs in this comparison.\n",
    "estimators = {\n",
    "    'logreg': LogisticRegression(solver='lbfgs', max_iter=200),\n",
    "    'ridge': RidgeClassifier(),\n",
    "    'passive_agressive': PassiveAggressiveClassifier(),\n",
    "    'sgd': SGDClassifier(),\n",
    "    'tree': DecisionTreeClassifier(),\n",
    "    'extra_tree': ExtraTreesClassifier(),\n",
    "    'random_forest': RandomForestClassifier(),\n",
    "    'isolation_forest': IsolationForest(),\n",
    "    'adaboost': AdaBoostClassifier(),\n",
    "    'stacking': StackingClassifier(estimators=[('tree', DecisionTreeClassifier())]),\n",
    "    'gradient_boosting': GradientBoostingClassifier(),\n",
    "    'hist_gradient_boosting': HistGradientBoostingClassifier(),\n",
    "    'voting': VotingClassifier(estimators=[('tree', DecisionTreeClassifier())]),\n",
    "    'bagging': BaggingClassifier(),\n",
    "    'radius_neighbors': RadiusNeighborsClassifier(),\n",
    "    'kneighbors': KNeighborsClassifier(),\n",
    "    'svm': SVC(),\n",
    "    'linear_svm': LinearSVC(),\n",
    "    'categorical': CategoricalNB(),\n",
    "    'bernoulli': BernoulliNB(),\n",
    "    'gaussian': GaussianNB(),\n",
    "    'neural_net': MLPClassifier()\n",
    "}\n",
    "\n",
    "# Running record of the best estimator seen so far and its mean scores\n",
    "best_accuracy = 0\n",
    "best_precision = 0\n",
    "best_recall = 0\n",
    "best_f1 = 0\n",
    "best_model = None\n",
    "model_name = None\n",
    "\n",
    "for estimator_name, estimator in estimators.items():\n",
    "    # Cross-validate with cv=10 and average each metric over the folds\n",
    "    cv = cross_validate(estimator, feature, label, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=10)\n",
    "\n",
    "    accuracy = cv['test_accuracy'].mean()\n",
    "    precision = cv['test_precision'].mean()\n",
    "    recall = cv['test_recall'].mean()\n",
    "    f1 = cv['test_f1'].mean()\n",
    "\n",
    "    # Keep the first estimator with the strictly highest mean accuracy.\n",
    "    # A NaN accuracy never compares greater, so estimators whose scoring failed are skipped here.\n",
    "    if accuracy > best_accuracy:\n",
    "        best_accuracy = accuracy\n",
    "        best_precision = precision\n",
    "        best_recall = recall\n",
    "        best_f1 = f1\n",
    "        model_name = estimator_name\n",
    "        best_model = estimator\n",
    "\n",
    "    print(f'{estimator_name} -> ( Accuracy: {(accuracy * 100):.0f}%) - (Precision: {(precision * 100):.0f}%) - (Recall: {(recall * 100):.0f}%) - (Measure: {(f1 * 100):.0f}%)')\n",
    "\n",
    "# Stop with a clear message instead of failing later on None.fit(...)\n",
    "if best_model is None:\n",
    "    raise RuntimeError('No estimator reached a mean cross-validated accuracy above 0; nothing to fit or save')\n",
    "\n",
    "print(f'\\nBest Model: {model_name}')\n",
    "print(f'Accuracy: {best_accuracy}')\n",
    "print(f'Precision: {best_precision}')\n",
    "print(f'Recall: {best_recall}')\n",
    "print(f'Measure: {best_f1}')\n",
    "\n",
    "# Re-train the selected estimator on the full training data\n",
    "best_model.fit(feature, label)\n",
    "\n",
    "# Save the trained model; joblib.dump returns the list of file names it wrote, not the model\n",
    "saved_paths = joblib.dump(best_model, f'../../../models/{model_name}.joblib')\n",
    "print(f'Model Saved: {saved_paths}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}