--- a +++ b/notebooks/model/evaluation/training_validation.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest, AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, BaggingClassifier\n", + "from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier\n", + "from sklearn.svm import SVC, LinearSVC\n", + "from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.model_selection import cross_validate\n", + "\n", + "from time import time \n", + "import warnings\n", + "import pandas as pd\n", + "import joblib\n", + "\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('../../../data/input/train.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "feature = data.drop('LUNG_CANCER', axis='columns')\n", + "label = data['LUNG_CANCER']" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "logreg -> ( Accuracy: 92%) - (Precision: 91%) - (Recall: 94%) - (Measure: 93%)\n", + "ridge -> ( Accuracy: 90%) - (Precision: 93%) - (Recall: 89%) - (Measure: 91%)\n", + "passive_agressive -> ( Accuracy: 52%) - (Precision: 38%) - (Recall: 70%) - (Measure: 49%)\n", + "sgd -> ( Accuracy: 69%) - (Precision: 68%) - (Recall: 93%) - (Measure: 77%)\n", + "tree -> ( Accuracy: 89%) - (Precision: 91%) - (Recall: 88%) - (Measure: 90%)\n", + "extra_tree -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 92%) - (Measure: 92%)\n", + "random_forest -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 93%) - (Measure: 93%)\n", + "isolation_forest -> ( Accuracy: 4%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n", + "adaboost -> ( Accuracy: 91%) - (Precision: 91%) - (Recall: 93%) - (Measure: 92%)\n", + "stacking -> ( Accuracy: 89%) - (Precision: 90%) - (Recall: 89%) - (Measure: 89%)\n", + "gradient_boosting -> ( Accuracy: 93%) - (Precision: 94%) - (Recall: 93%) - (Measure: 93%)\n", + "hist_gradient_boosting -> ( Accuracy: 91%) - (Precision: 93%) - (Recall: 90%) - (Measure: 92%)\n", + "voting -> ( Accuracy: 88%) - (Precision: 90%) - (Recall: 88%) - (Measure: 89%)\n", + "bagging -> ( Accuracy: 91%) - (Precision: 93%) - (Recall: 89%) - (Measure: 91%)\n", + "radius_neighbors -> ( Accuracy: nan%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n", + "kneighbors -> ( Accuracy: 85%) - (Precision: 91%) - (Recall: 79%) - (Measure: 85%)\n", + "svm -> ( Accuracy: 57%) - (Precision: 56%) - (Recall: 95%) - (Measure: 70%)\n", + "linear_svm -> ( Accuracy: 81%) - (Precision: 89%) - (Recall: 81%) - (Measure: 82%)\n", + "categorical -> ( Accuracy: nan%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)\n", + "bernoulli -> ( Accuracy: 53%) - (Precision: 53%) - (Recall: 100%) - (Measure: 69%)\n", + "gaussian -> ( Accuracy: 91%) - (Precision: 90%) - (Recall: 93%) - (Measure: 91%)\n", + "neural_net -> ( Accuracy: 83%) - (Precision: 83%) - (Recall: 86%) - (Measure: 84%)\n", + "\n", + "Best Model: gradient_boosting\n", + "Accuracy: 0.9266666666666665\n", + "Precision: 0.937217062263502\n", + "Recall: 0.9254385964912281\n", + "Measure: 0.930505907874329\n", + "Model Saved: ['../../../models/gradient_boosting.joblib']\n" + ] + } + ], + "source": [ + "# Train every classifiers\n", + "estimators = {\n", + " 'logreg': LogisticRegression(solver='lbfgs', max_iter=200),\n", + " 'ridge': RidgeClassifier(),\n", + " 'passive_agressive': PassiveAggressiveClassifier(),\n", + " 'sgd': SGDClassifier(),\n", + " 'tree': DecisionTreeClassifier(),\n", + " 'extra_tree': ExtraTreesClassifier(),\n", + " 'random_forest': RandomForestClassifier(),\n", + " 'isolation_forest': IsolationForest(),\n", + " 'adaboost': AdaBoostClassifier(),\n", + " 'stacking': StackingClassifier(estimators=[('tree', DecisionTreeClassifier())]), \n", + " 'gradient_boosting': GradientBoostingClassifier(), \n", + " 'hist_gradient_boosting': HistGradientBoostingClassifier(), \n", + " 'voting': VotingClassifier(estimators=[('tree', DecisionTreeClassifier())]), \n", + " 'bagging': BaggingClassifier(),\n", + " 'radius_neighbors': RadiusNeighborsClassifier(), \n", + " 'kneighbors': KNeighborsClassifier(),\n", + " 'svm': SVC(),\n", + " 'linear_svm': LinearSVC(),\n", + " 'categorical': CategoricalNB(),\n", + " 'bernoulli': BernoulliNB(), \n", + " 'gaussian': GaussianNB(),\n", + " 'neural_net': MLPClassifier()\n", + "}\n", + "\n", + "best_accuracy = 0\n", + "best_precision = 0\n", + "best_recall = 0\n", + "best_f1 = 0\n", + "best_model = None\n", + "model_name = None\n", + "\n", + "for estimator_name, estimator in estimators.items():\n", + " # Iterate & validate the model\n", + " cv = cross_validate(estimator, feature, label, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=10)\n", + " \n", + " accuracy = cv['test_accuracy'].mean()\n", + " precision = cv['test_precision'].mean()\n", + " recall = cv['test_recall'].mean()\n", + " f1 = cv['test_f1'].mean()\n", + " \n", + " # Select the performing model\n", + " if accuracy > best_accuracy:\n", + " best_accuracy = accuracy\n", + " best_precision = precision\n", + " best_recall = recall\n", + " best_f1 = f1\n", + " model_name = estimator_name\n", + " best_model = estimator\n", + "\n", + " print(f'{estimator_name} -> ( Accuracy: {(accuracy * 100):.0f}%) - (Precision: {(precision * 100):.0f}%) - (Recall: {(recall * 100):.0f}%) - (Measure: {(f1 * 100):.0f}%)')\n", + "\n", + "print(f'\\nBest Model: {model_name}')\n", + "print(f'Accuracy: {best_accuracy}')\n", + "print(f'Precision: {best_precision}')\n", + "print(f'Recall: {best_recall}')\n", + "print(f'Measure: {best_f1}')\n", + "\n", + "# Re-train the performing model\n", + "best_model.fit(feature, label)\n", + "\n", + "# Save the trained model\n", + "model = joblib.dump(best_model, f'../../../models/{model_name}.joblib')\n", + "print(f'Model Saved: {model}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}