{ "cells": [ { "cell_type": "markdown", "id": "9af65cb9-8a84-47e4-8bea-1547afe46a15", "metadata": {}, "source": [ "DIABETIES" ] }, { "cell_type": "code", "execution_count": 2, "id": "3fba3c9b-5e48-4771-a28a-4ec54fc4bc1b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pregnancies 0\n", "Glucose 0\n", "BloodPressure 0\n", "SkinThickness 0\n", "Insulin 0\n", "BMI 0\n", "DiabetesPedigreeFunction 0\n", "Age 0\n", "Outcome 0\n", "dtype: int64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import svm\n", "from sklearn.metrics import accuracy_score\n", "dataset= pd.read_csv(r'C:\\Users\\Pranshu Saini\\Desktop\\disease-prediction-main\\docpat\\datasets\\diabetes.csv')\n", "dataset.head(5)\n", "dataset.isna().sum()" ] }, { "cell_type": "code", "execution_count": 4, "id": "e99dd297-c606-4501-80c8-fa87d89fc237", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The reduced dataframe has 9 columns.\n" ] } ], "source": [ "# removing highly correlated features\n", "\n", "corr_matrix = dataset.corr().abs() \n", "\n", "mask = np.triu(np.ones_like(corr_matrix, dtype = bool))\n", "tri_df = corr_matrix.mask(mask)\n", "\n", "to_drop = [x for x in tri_df.columns if any(tri_df[x] > 0.92)]\n", "\n", "df = dataset.drop(to_drop, axis = 1)\n", "\n", "print(f\"The reduced dataframe has {df.shape[1]} columns.\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "64693a8e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
..............................
76310101764818032.90.171630
76421227027036.80.340270
7655121722311226.20.245300
7661126600030.10.349471
7671937031030.40.315230
\n", "

768 rows × 9 columns

\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148 72 35 0 33.6 \n", "1 1 85 66 29 0 26.6 \n", "2 8 183 64 0 0 23.3 \n", "3 1 89 66 23 94 28.1 \n", "4 0 137 40 35 168 43.1 \n", ".. ... ... ... ... ... ... \n", "763 10 101 76 48 180 32.9 \n", "764 2 122 70 27 0 36.8 \n", "765 5 121 72 23 112 26.2 \n", "766 1 126 60 0 0 30.1 \n", "767 1 93 70 31 0 30.4 \n", "\n", " DiabetesPedigreeFunction Age Outcome \n", "0 0.627 50 1 \n", "1 0.351 31 0 \n", "2 0.672 32 1 \n", "3 0.167 21 0 \n", "4 2.288 33 1 \n", ".. ... ... ... \n", "763 0.171 63 0 \n", "764 0.340 27 0 \n", "765 0.245 30 0 \n", "766 0.349 47 1 \n", "767 0.315 23 0 \n", "\n", "[768 rows x 9 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "id": "ecc668ff-a516-4ee6-8b2f-76b07d9df34f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(768, 8) (614, 8) (154, 8)\n" ] } ], "source": [ "A= dataset.drop(columns = 'Outcome', axis=1)\n", "B= dataset['Outcome']\n", "A_training, A_testing, B_training, B_testing = train_test_split(A,B, test_size = 0.2, stratify=B, random_state=5)\n", "print(A.shape, A_training.shape, A_testing.shape)\n" ] }, { "cell_type": "markdown", "id": "62af783b-d901-479d-bf3f-512a897fceaa", "metadata": {}, "source": [ "LogisticRegression" ] }, { "cell_type": "code", "execution_count": null, "id": "69152bd3-c30b-46b4-bf8e-32dbcd33e163", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7817589576547231\n", "0.7532467532467533\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Dell\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] } ], "source": [ "# fitting data to model\n", "\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "log_reg = LogisticRegression()\n", "log_reg.fit(A_training, B_training)\n", "B_pred = log_reg.predict(A_testing)\n", "# accuracy score\n", "\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "\n", "print(accuracy_score(B_training, log_reg.predict(A_training)))\n", "\n", "log_reg_acc = accuracy_score(B_testing, log_reg.predict(A_testing))\n", "print(log_reg_acc)" ] }, { "cell_type": "markdown", "id": "90967102-86d1-4939-9113-4d06ce5bb054", "metadata": {}, "source": [ "K Neighbors Classifier (KNN)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "2092a5bc-4602-4aa6-adb3-34ee677f8134", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7980456026058632\n", "0.7142857142857143\n" ] } ], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "\n", "knn = KNeighborsClassifier()\n", "knn.fit(A_training, B_training)\n", "# model predictions \n", "\n", "B_pred = knn.predict(A_testing)\n", "# accuracy score\n", "\n", "print(accuracy_score(B_training, knn.predict(A_training)))\n", "\n", "knn_acc = accuracy_score(B_testing, knn.predict(A_testing))\n", "print(knn_acc)" ] }, { "cell_type": "markdown", "id": "da68f26d-4291-4076-bf2c-f20dd524c6e7", "metadata": {}, "source": [ "Support Vector Machine (SVM)" ] }, { "cell_type": "code", "execution_count": null, "id": "a27c4a3c-15e7-492d-88a6-526ddfc968cd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'C': 1, 'gamma': 0.0001}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.svm import SVC\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "svc = SVC(probability=True)\n", "parameters = {\n", " 'gamma' : [0.0001, 0.001, 0.01, 0.1],\n", " 'C' : [0.01, 0.05, 0.5, 0.1, 1, 10, 15, 20]\n", "}\n", "\n", "grid_search = GridSearchCV(svc, parameters)\n", "grid_search.fit(A_training, B_training)\n", "# best parameters\n", "\n", "grid_search.best_params_\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d8da6a6f-eba3-4a66-b61f-04a58313de41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7557643609222977" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# best score \n", "\n", "grid_search.best_score_" ] }, { "cell_type": "code", "execution_count": null, "id": "0627d35c-f004-499b-be77-0fe0380aa002", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0\n", "0.6428571428571429\n", " precision recall f1-score support\n", "\n", " 0 0.66 0.93 0.77 100\n", " 1 0.46 0.11 0.18 54\n", "\n", " accuracy 0.64 154\n", " macro avg 0.56 0.52 0.48 154\n", "weighted avg 0.59 0.64 0.56 154\n", "\n" ] } ], "source": [ "svc = SVC(C = 10, gamma = 0.01, probability=True)\n", "svc.fit(A_training, B_training)\n", "# model predictions \n", "\n", "B_pred = svc.predict(A_testing)\n", "# accuracy score\n", "\n", "print(accuracy_score(B_training, svc.predict(A_training)))\n", "\n", "svc_acc = accuracy_score(B_testing, svc.predict(A_testing))\n", "print(svc_acc)\n", "# classification report\n", "\n", "print(classification_report(B_testing, B_pred))" ] }, { "cell_type": "markdown", "id": "e4b77944-3ceb-4b80-a738-e8e5a99e009d", "metadata": {}, "source": [ "DECISION TREE" ] }, { "cell_type": "code", "execution_count": null, "id": "fe80f32d-ba91-4297-a9d7-6232a48b434c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 8640 candidates, totalling 43200 fits\n" ] }, { "data": { "text/html": [ "
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,\n",
       "             param_grid={'criterion': ['gini', 'entropy'],\n",
       "                         'max_depth': range(2, 32),\n",
       "                         'min_samples_leaf': range(1, 10),\n",
       "                         'min_samples_split': range(2, 10),\n",
       "                         'splitter': ['best', 'random']},\n",
       "             verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,\n", " param_grid={'criterion': ['gini', 'entropy'],\n", " 'max_depth': range(2, 32),\n", " 'min_samples_leaf': range(1, 10),\n", " 'min_samples_split': range(2, 10),\n", " 'splitter': ['best', 'random']},\n", " verbose=1)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.tree import DecisionTreeClassifier\n", "\n", "dtc = DecisionTreeClassifier()\n", "\n", "parameters = {\n", " 'criterion' : ['gini', 'entropy'],\n", " 'max_depth' : range(2, 32, 1),\n", " 'min_samples_leaf' : range(1, 10, 1),\n", " 'min_samples_split' : range(2, 10, 1),\n", " 'splitter' : ['best', 'random']\n", "}\n", "\n", "grid_search_dt = GridSearchCV(dtc, parameters, cv = 5, n_jobs = -1, verbose = 1)\n", "grid_search_dt.fit(A_training, B_training)" ] }, { "cell_type": "code", "execution_count": null, "id": "3500946f-0e9a-4d31-b076-35c851e1ca69", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
DecisionTreeClassifier(criterion='entropy', max_depth=19, min_samples_leaf=4,\n",
       "                       min_samples_split=6, splitter='random')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "DecisionTreeClassifier(criterion='entropy', max_depth=19, min_samples_leaf=4,\n", " min_samples_split=6, splitter='random')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# best score\n", "\n", "grid_search_dt.best_score_\n", "dtc = DecisionTreeClassifier(criterion= 'entropy', max_depth= 19, min_samples_leaf= 4, min_samples_split= 6, splitter= 'random')\n", "dtc.fit(A_training, B_training)" ] }, { "cell_type": "code", "execution_count": null, "id": "73fa053a-be12-46d6-9315-fb8d5d557b7c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8224755700325733\n", "0.6883116883116883\n" ] } ], "source": [ "B_pred = dtc.predict(A_testing)\n", "# accuracy score\n", "\n", "print(accuracy_score(B_training, dtc.predict(A_training)))\n", "\n", "dtc_acc = accuracy_score(B_testing, dtc.predict(A_testing))\n", "print(dtc_acc)" ] }, { "cell_type": "code", "execution_count": null, "id": "f7f0b47c-f612-4fb1-b06f-9c074da06bb7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.74 0.80 0.77 100\n", " 1 0.57 0.48 0.52 54\n", "\n", " accuracy 0.69 154\n", " macro avg 0.65 0.64 0.64 154\n", "weighted avg 0.68 0.69 0.68 154\n", "\n" ] } ], "source": [ "# classification report\n", "\n", "print(classification_report(B_testing, B_pred))" ] }, { "cell_type": "code", "execution_count": null, "id": "55fde2ec-b707-47d5-8556-6b461a71f5dd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelScore
0Logistic Regression75.32
1KNN71.43
3Decision Tree Classifier68.83
2SVM64.29
\n", "
" ], "text/plain": [ " Model Score\n", "0 Logistic Regression 75.32\n", "1 KNN 71.43\n", "3 Decision Tree Classifier 68.83\n", "2 SVM 64.29" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "models = pd.DataFrame({\n", " 'Model': ['Logistic Regression', 'KNN', 'SVM', 'Decision Tree Classifier'],\n", " 'Score': [100*round(log_reg_acc,4), 100*round(knn_acc,4), 100*round(svc_acc,4), 100*round(dtc_acc,4)]\n", "})\n", "models.sort_values(by = 'Score', ascending = False)" ] }, { "cell_type": "code", "execution_count": null, "id": "24684911-cbad-474c-8743-fcf517c7e01c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "41401323-1a4c-4091-8bc2-7797137c0f65", "metadata": {}, "outputs": [], "source": [ "import pickle\n", "filename = 'C:/Users/Dell/OneDrive/Desktop/DM PROJECT/diabetes_model.pkl'\n", "pickle.dump(log_reg, open(filename, 'wb'))" ] }, { "cell_type": "code", "execution_count": null, "id": "b1b7f82b-c26e-437a-8a01-c2bfd34e7268", "metadata": {}, "outputs": [], "source": [ "'''import pickle\n", "def load_model(path):\n", " with open(path, 'rb') as file:\n", " model = pickle.load(file)\n", "diabetes_model = load_model(r'C:\\Users\\DELL\\Desktop\\app\\diabetes_model.pkl')\n", "def predict(inputs):\n", " return diabetes_model.predict(inputs)'''" ] }, { "cell_type": "code", "execution_count": null, "id": "2ef4b8e2-d3d0-4a14-b512-c618d848c8d8", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }