AI-Powered-Preventive-HC / Git / Diff of /Model Buliding/final

Models:
RaymondKing/
AI-Powered-Preventive-HC
Downloads: 1
Diff of /Model Buliding/final_notebook.ipynb [000000] .. [2629da]
Switch to side-by-side view

--- a
+++ b/Model Buliding/final_notebook.ipynb
@@ -0,0 +1,1055 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "import warnings\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from pandas.plotting import scatter_matrix\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "# from sklearn.tree import DecisionTreeClassifier\n",
+    "# from sklearn.svm import SVC\n",
+    "# from sklearn.neighbors import KNeighborsClassifier\n",
+    "# from sklearn.ensemble import RandomForestClassifier\n",
+    "# from imblearn.over_sampling import SMOTE\n",
+    "# from sklearn.model_selection import GridSearchCV\n",
+    "\n",
+    "# Ignore all warnings\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>General_Health</th>\n",
+       "      <th>Checkup</th>\n",
+       "      <th>Exercise</th>\n",
+       "      <th>Heart_Disease</th>\n",
+       "      <th>Skin_Cancer</th>\n",
+       "      <th>Other_Cancer</th>\n",
+       "      <th>Depression</th>\n",
+       "      <th>Diabetes</th>\n",
+       "      <th>Arthritis</th>\n",
+       "      <th>Sex</th>\n",
+       "      <th>Age_Category</th>\n",
+       "      <th>Height_(cm)</th>\n",
+       "      <th>Weight_(kg)</th>\n",
+       "      <th>BMI</th>\n",
+       "      <th>Smoking_History</th>\n",
+       "      <th>Alcohol_Consumption</th>\n",
+       "      <th>Fruit_Consumption</th>\n",
+       "      <th>Green_Vegetables_Consumption</th>\n",
+       "      <th>FriedPotato_Consumption</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Poor</td>\n",
+       "      <td>Within the past 2 years</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>70-74</td>\n",
+       "      <td>150.0</td>\n",
+       "      <td>32.66</td>\n",
+       "      <td>14.54</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>16.0</td>\n",
+       "      <td>12.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Very Good</td>\n",
+       "      <td>Within the past year</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>70-74</td>\n",
+       "      <td>165.0</td>\n",
+       "      <td>77.11</td>\n",
+       "      <td>28.29</td>\n",
+       "      <td>No</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Very Good</td>\n",
+       "      <td>Within the past year</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60-64</td>\n",
+       "      <td>163.0</td>\n",
+       "      <td>88.45</td>\n",
+       "      <td>33.47</td>\n",
+       "      <td>No</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>12.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>16.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Poor</td>\n",
+       "      <td>Within the past year</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>75-79</td>\n",
+       "      <td>180.0</td>\n",
+       "      <td>93.44</td>\n",
+       "      <td>28.73</td>\n",
+       "      <td>No</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>8.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Good</td>\n",
+       "      <td>Within the past year</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>80+</td>\n",
+       "      <td>191.0</td>\n",
+       "      <td>88.45</td>\n",
+       "      <td>24.37</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>8.0</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \\\n",
+       "0           Poor  Within the past 2 years       No            No          No   \n",
+       "1      Very Good     Within the past year       No           Yes          No   \n",
+       "2      Very Good     Within the past year      Yes            No          No   \n",
+       "3           Poor     Within the past year      Yes           Yes          No   \n",
+       "4           Good     Within the past year       No            No          No   \n",
+       "\n",
+       "  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \\\n",
+       "0           No         No       No       Yes  Female        70-74   \n",
+       "1           No         No      Yes        No  Female        70-74   \n",
+       "2           No         No      Yes        No  Female        60-64   \n",
+       "3           No         No      Yes        No    Male        75-79   \n",
+       "4           No         No       No        No    Male          80+   \n",
+       "\n",
+       "   Height_(cm)  Weight_(kg)    BMI Smoking_History  Alcohol_Consumption  \\\n",
+       "0        150.0        32.66  14.54             Yes                  0.0   \n",
+       "1        165.0        77.11  28.29              No                  0.0   \n",
+       "2        163.0        88.45  33.47              No                  4.0   \n",
+       "3        180.0        93.44  28.73              No                  0.0   \n",
+       "4        191.0        88.45  24.37             Yes                  0.0   \n",
+       "\n",
+       "   Fruit_Consumption  Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
+       "0               30.0                          16.0                     12.0  \n",
+       "1               30.0                           0.0                      4.0  \n",
+       "2               12.0                           3.0                     16.0  \n",
+       "3               30.0                          30.0                      8.0  \n",
+       "4                8.0                           4.0                      0.0  "
+      ]
+     },
+     "execution_count": 139,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read CVD_cleaned.csv into df and preview the first rows\n",
+    "DATA_PATH = \"CVD_cleaned.csv\"\n",
+    "df = pd.read_csv(DATA_PATH)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 140,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 308854 entries, 0 to 308853\n",
+      "Data columns (total 19 columns):\n",
+      " #   Column                        Non-Null Count   Dtype  \n",
+      "---  ------                        --------------   -----  \n",
+      " 0   General_Health                308854 non-null  object \n",
+      " 1   Checkup                       308854 non-null  object \n",
+      " 2   Exercise                      308854 non-null  object \n",
+      " 3   Heart_Disease                 308854 non-null  object \n",
+      " 4   Skin_Cancer                   308854 non-null  object \n",
+      " 5   Other_Cancer                  308854 non-null  object \n",
+      " 6   Depression                    308854 non-null  object \n",
+      " 7   Diabetes                      308854 non-null  object \n",
+      " 8   Arthritis                     308854 non-null  object \n",
+      " 9   Sex                           308854 non-null  object \n",
+      " 10  Age_Category                  308854 non-null  object \n",
+      " 11  Height_(cm)                   308854 non-null  float64\n",
+      " 12  Weight_(kg)                   308854 non-null  float64\n",
+      " 13  BMI                           308854 non-null  float64\n",
+      " 14  Smoking_History               308854 non-null  object \n",
+      " 15  Alcohol_Consumption           308854 non-null  float64\n",
+      " 16  Fruit_Consumption             308854 non-null  float64\n",
+      " 17  Green_Vegetables_Consumption  308854 non-null  float64\n",
+      " 18  FriedPotato_Consumption       308854 non-null  float64\n",
+      "dtypes: float64(7), object(12)\n",
+      "memory usage: 44.8+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 141,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer',\n",
+       "       'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',\n",
+       "       'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',\n",
+       "       'Alcohol_Consumption', 'Fruit_Consumption',\n",
+       "       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 141,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 142,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reassign instead of using inplace=True, and ignore columns that are already\n",
+    "# gone, so this cell can be re-run without raising KeyError.\n",
+    "df = df.drop(columns=['Skin_Cancer', 'Other_Cancer'], errors='ignore')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 143,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Diabetes\n",
+       "No     266037\n",
+       "Yes     42817\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 143,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Collapse the four survey answers into a binary Yes/No label:\n",
+    "# pregnancy-only diabetes counts as Yes, pre-/borderline diabetes as No.\n",
+    "diabetes_binary = {\n",
+    "    'Yes': 'Yes',\n",
+    "    'Yes, but female told only during pregnancy': 'Yes',\n",
+    "    'No': 'No',\n",
+    "    'No, pre-diabetes or borderline diabetes': 'No'\n",
+    "}\n",
+    "df['Diabetes'] = df['Diabetes'].replace(diabetes_binary)\n",
+    "df['Diabetes'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 144,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column names (not data) grouped by dtype. This runs before the derived\n",
+    "# 'obesity' column is created, so 'obesity' is not in categorical_columns.\n",
+    "categorical_columns = df.select_dtypes(include=['object', 'category']).columns\n",
+    "# .columns keeps this consistent with categorical_columns (an Index of names)\n",
+    "# instead of holding a second copy of all numeric data.\n",
+    "numerical_columns = df.select_dtypes(include=['number']).columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 145,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "obesity\n",
+       "Overweight       109866\n",
+       "Obesity          106738\n",
+       "Normal weight     87706\n",
+       "Underweight        4544\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 145,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# BMI cut-offs: <18.5 underweight, 18.5 to <25 normal, 25 to <30 overweight, >=30 obesity.\n",
+    "# right=False makes every bin [low, high), so a BMI of 24.95 stays 'Normal weight'\n",
+    "# and exactly 18.5 is no longer counted as 'Underweight'.\n",
+    "bins = [-float('inf'), 18.5, 25.0, 30.0, float('inf')]\n",
+    "labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']\n",
+    "\n",
+    "# Create a new column 'obesity' based on BMI classification\n",
+    "df['obesity'] = pd.cut(df['BMI'], bins=bins, labels=labels, right=False)\n",
+    "df['obesity'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 146,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit one LabelEncoder per categorical column and keep each fitted encoder\n",
+    "# in label_encoders, keyed by column name.\n",
+    "label_encoders = {}\n",
+    "\n",
+    "for column in categorical_columns:\n",
+    "    column_encoder = LabelEncoder()\n",
+    "    # Cast to str first so every value is encoded as text\n",
+    "    df[column] = df[column].astype(str)\n",
+    "    df[column] = column_encoder.fit_transform(df[column])\n",
+    "    label_encoders[column] = column_encoder\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the dict of fitted encoders to label_encoders.pkl\n",
+    "encoders_path = 'label_encoders.pkl'\n",
+    "with open(encoders_path, 'wb') as encoders_file:\n",
+    "    encoders_file.write(pickle.dumps(label_encoders))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 148,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checkup Label Encoder Mappings:\n",
+      "0: No\n",
+      "1: Yes\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show the integer code assigned to each class of one encoded column.\n",
+    "# The column name is kept in one variable so the messages always match the\n",
+    "# encoder that is actually inspected (they previously said 'Checkup').\n",
+    "inspected_column = 'Heart_Disease'\n",
+    "if inspected_column in label_encoders:\n",
+    "    encoder = label_encoders[inspected_column]\n",
+    "    mappings = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))\n",
+    "    print(f\"{inspected_column} Label Encoder Mappings:\")\n",
+    "    for key, value in mappings.items():\n",
+    "        print(f\"{key}: {value}\")\n",
+    "else:\n",
+    "    print(f\"{inspected_column} column not found in label encoders.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 149,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "18"
+      ]
+     },
+     "execution_count": 149,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of columns currently in df\n",
+    "len(df.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 150,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Diabetes\n",
+       "0    266037\n",
+       "1     42817\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 150,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['Diabetes'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 151,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Separate the three prediction targets from the shared feature matrix.\n",
+    "# NOTE(review): 'obesity' is derived from BMI and BMI stays in the features,\n",
+    "# so the obesity model can read its label almost directly - confirm this is intended.\n",
+    "target_columns = ['Heart_Disease', 'Diabetes', 'obesity']\n",
+    "y_heart_disease, y_diabetes, y_obesity = (df[name] for name in target_columns)\n",
+    "X_train = df.drop(columns=target_columns)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 152,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One 80/20 split per target, all drawn from the same feature matrix.\n",
+    "# A fixed seed makes the splits (and every metric reported below) reproducible;\n",
+    "# stratifying keeps the minority classes at the same share in train and test.\n",
+    "RANDOM_STATE = 42\n",
+    "\n",
+    "X_train_ht, X_test_ht, y_train_ht, y_test_ht = train_test_split(\n",
+    "    X_train, y_heart_disease, test_size=0.2,\n",
+    "    random_state=RANDOM_STATE, stratify=y_heart_disease)\n",
+    "X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(\n",
+    "    X_train, y_diabetes, test_size=0.2,\n",
+    "    random_state=RANDOM_STATE, stratify=y_diabetes)\n",
+    "X_train_ob, X_test_ob, y_train_ob, y_test_ob = train_test_split(\n",
+    "    X_train, y_obesity, test_size=0.2,\n",
+    "    random_state=RANDOM_STATE, stratify=y_obesity)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 154,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',\n",
+       "       'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',\n",
+       "       'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',\n",
+       "       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 154,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train_dt.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 155,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.9201\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.92      1.00      0.96     56850\n",
+      "           1       0.33      0.00      0.01      4921\n",
+      "\n",
+      "    accuracy                           0.92     61771\n",
+      "   macro avg       0.63      0.50      0.48     61771\n",
+      "weighted avg       0.87      0.92      0.88     61771\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "### Logistic Regression (heart disease)\n",
+    "# Standardize the features before the solver runs and raise max_iter, since\n",
+    "# convergence warnings are silenced in this notebook. class_weight='balanced'\n",
+    "# counters the class imbalance: the unweighted model had ~0.00 recall on class 1.\n",
+    "lr_ht = make_pipeline(\n",
+    "    StandardScaler(),\n",
+    "    LogisticRegression(max_iter=1000, class_weight='balanced'),\n",
+    ")\n",
+    "lr_ht.fit(X_train_ht, y_train_ht)\n",
+    "y_pred_ht = lr_ht.predict(X_test_ht)\n",
+    "print(f\"Accuracy: {accuracy_score(y_test_ht, y_pred_ht):.4f}\")\n",
+    "print(classification_report(y_test_ht, y_pred_ht))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.8609\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.86      0.99      0.92     53194\n",
+      "           1       0.49      0.04      0.07      8577\n",
+      "\n",
+      "    accuracy                           0.86     61771\n",
+      "   macro avg       0.68      0.52      0.50     61771\n",
+      "weighted avg       0.81      0.86      0.81     61771\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "### Logistic Regression (diabetes)\n",
+    "# Standardize the features before the solver runs and raise max_iter, since\n",
+    "# convergence warnings are silenced in this notebook. class_weight='balanced'\n",
+    "# counters the class imbalance: the unweighted model had 0.04 recall on class 1.\n",
+    "lr_dt = make_pipeline(\n",
+    "    StandardScaler(),\n",
+    "    LogisticRegression(max_iter=1000, class_weight='balanced'),\n",
+    ")\n",
+    "lr_dt.fit(X_train_dt, y_train_dt)\n",
+    "y_pred_dt = lr_dt.predict(X_test_dt)\n",
+    "print(f\"Accuracy: {accuracy_score(y_test_dt, y_pred_dt):.4f}\")\n",
+    "print(classification_report(y_test_dt, y_pred_dt))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.9732\n",
+      "               precision    recall  f1-score   support\n",
+      "\n",
+      "Normal weight       0.97      0.97      0.97     17380\n",
+      "      Obesity       0.99      0.99      0.99     21321\n",
+      "   Overweight       0.98      0.98      0.98     22138\n",
+      "  Underweight       0.63      0.65      0.64       932\n",
+      "\n",
+      "     accuracy                           0.97     61771\n",
+      "    macro avg       0.89      0.89      0.89     61771\n",
+      " weighted avg       0.97      0.97      0.97     61771\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "### Logistic Regression\n",
+    "# Fit the obesity classifier, then report accuracy and per-class metrics on the test split\n",
+    "lr_ob = LogisticRegression().fit(X_train_ob, y_train_ob)\n",
+    "y_pred_ob = lr_ob.predict(X_test_ob)\n",
+    "obesity_accuracy = accuracy_score(y_test_ob, y_pred_ob)\n",
+    "print(f\"Accuracy: {obesity_accuracy:.4f}\")\n",
+    "print(classification_report(y_test_ob, y_pred_ob))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.9732\n",
+      "               precision    recall  f1-score   support\n",
+      "\n",
+      "Normal weight       0.97      0.97      0.97     17380\n",
+      "      Obesity       0.99      0.99      0.99     21321\n",
+      "   Overweight       0.98      0.98      0.98     22138\n",
+      "  Underweight       0.63      0.65      0.64       932\n",
+      "\n",
+      "     accuracy                           0.97     61771\n",
+      "    macro avg       0.89      0.89      0.89     61771\n",
+      " weighted avg       0.97      0.97      0.97     61771\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "### Logistic Regression (obesity) - evaluation only\n",
+    "# lr_ob is already fitted in the previous cell; this cell was a copy-paste\n",
+    "# duplicate that trained the same model a second time. Reuse the fitted model.\n",
+    "y_pred_ob = lr_ob.predict(X_test_ob)\n",
+    "print(f\"Accuracy: {accuracy_score(y_test_ob, y_pred_ob):.4f}\")\n",
+    "print(classification_report(y_test_ob, y_pred_ob))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model saved to lr_ht.pkl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Serialize the heart-disease model and write it to disk\n",
+    "model_filename = 'lr_ht.pkl'\n",
+    "serialized_model = pickle.dumps(lr_ht)\n",
+    "with open(model_filename, 'wb') as model_file:\n",
+    "    model_file.write(serialized_model)\n",
+    "\n",
+    "print(f\"Model saved to {model_filename}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model saved to lr_dt.pkl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Serialize the diabetes model and write it to disk\n",
+    "model_filename = 'lr_dt.pkl'\n",
+    "serialized_model = pickle.dumps(lr_dt)\n",
+    "with open(model_filename, 'wb') as model_file:\n",
+    "    model_file.write(serialized_model)\n",
+    "\n",
+    "print(f\"Model saved to {model_filename}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 161,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model saved to lr_ob.pkl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Serialize the obesity model and write it to disk\n",
+    "model_filename = 'lr_ob.pkl'\n",
+    "serialized_model = pickle.dumps(lr_ob)\n",
+    "with open(model_filename, 'wb') as model_file:\n",
+    "    model_file.write(serialized_model)\n",
+    "\n",
+    "print(f\"Model saved to {model_filename}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded Model Accuracy: 0.9727\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Name the file explicitly: model_filename otherwise holds whichever model was\n",
+    "# saved last, while this check is specific to the obesity test split.\n",
+    "model_filename = 'lr_ob.pkl'\n",
+    "# pickle.load can execute arbitrary code - only load files written by this notebook.\n",
+    "with open(model_filename, 'rb') as file:\n",
+    "    loaded_model = pickle.load(file)\n",
+    "\n",
+    "# Now you can use loaded_model to make predictions\n",
+    "y_pred_loaded = loaded_model.predict(X_test_ob)\n",
+    "print(f\"Loaded Model Accuracy: {accuracy_score(y_test_ob, y_pred_loaded):.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 113,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['General_Health', 'Checkup', 'Exercise', 'Skin_Cancer', 'Other_Cancer',\n",
+       "       'Depression', 'Arthritis', 'Sex', 'Age_Category', 'Height_(cm)',\n",
+       "       'Weight_(kg)', 'BMI', 'Smoking_History', 'Alcohol_Consumption',\n",
+       "       'Fruit_Consumption', 'Green_Vegetables_Consumption',\n",
+       "       'FriedPotato_Consumption'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 113,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_test_ob.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['General_Health',\n",
+       " 'Checkup',\n",
+       " 'Exercise',\n",
+       " 'Heart_Disease',\n",
+       " 'Skin_Cancer',\n",
+       " 'Other_Cancer',\n",
+       " 'Depression',\n",
+       " 'Diabetes',\n",
+       " 'Arthritis',\n",
+       " 'Sex',\n",
+       " 'Age_Category',\n",
+       " 'Smoking_History']"
+      ]
+     },
+     "execution_count": 114,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "categorical_columns.tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Encoded Input DataFrame:\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>General_Health</th>\n",
+       "      <th>Checkup</th>\n",
+       "      <th>Exercise</th>\n",
+       "      <th>Depression</th>\n",
+       "      <th>Arthritis</th>\n",
+       "      <th>Sex</th>\n",
+       "      <th>Age_Category</th>\n",
+       "      <th>Height_(cm)</th>\n",
+       "      <th>Weight_(kg)</th>\n",
+       "      <th>BMI</th>\n",
+       "      <th>Smoking_History</th>\n",
+       "      <th>Alcohol_Consumption</th>\n",
+       "      <th>Fruit_Consumption</th>\n",
+       "      <th>Green_Vegetables_Consumption</th>\n",
+       "      <th>FriedPotato_Consumption</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>3</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>10</td>\n",
+       "      <td>150</td>\n",
+       "      <td>32.66</td>\n",
+       "      <td>14.34</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>16</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   General_Health  Checkup  Exercise  Depression  Arthritis  Sex  \\\n",
+       "0               3        2         0           0          1    0   \n",
+       "\n",
+       "   Age_Category  Height_(cm)  Weight_(kg)    BMI  Smoking_History  \\\n",
+       "0            10          150        32.66  14.34                1   \n",
+       "\n",
+       "   Alcohol_Consumption  Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
+       "0                  0.0               30.0                            16   \n",
+       "\n",
+       "   FriedPotato_Consumption  \n",
+       "0                       12  "
+      ]
+     },
+     "execution_count": 115,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Feature columns, in the order the models are given them\n",
+    "columns = ['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',\n",
+    "           'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',\n",
+    "           'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',\n",
+    "           'Green_Vegetables_Consumption', 'FriedPotato_Consumption']\n",
+    "\n",
+    "# Define the input values for each column (replace these with actual values)\n",
+    "input_values = {\n",
+    "    'General_Health': 'Poor',  # Example values\n",
+    "    'Checkup': 'Within the past 2 years',\n",
+    "    'Exercise': 'No',\n",
+    "    'Depression': 'No',\n",
+    "    'Arthritis': 'Yes',\n",
+    "    'Sex': 'Female',\n",
+    "    'Age_Category': '70-74',\n",
+    "    'Height_(cm)': 150,  # Example numerical values\n",
+    "    'Weight_(kg)': 32.66,\n",
+    "    'BMI': 14.34,\n",
+    "    'Smoking_History': 'Yes',\n",
+    "    'Alcohol_Consumption': 0.0,\n",
+    "    'Fruit_Consumption': 30.0,\n",
+    "    'Green_Vegetables_Consumption': 16,\n",
+    "    'FriedPotato_Consumption': 12\n",
+    "}\n",
+    "\n",
+    "# Fail early if an expected feature was not supplied, instead of passing an\n",
+    "# incomplete row to predict().\n",
+    "missing_features = [name for name in columns if name not in input_values]\n",
+    "if missing_features:\n",
+    "    raise KeyError(f\"Missing input features: {missing_features}\")\n",
+    "\n",
+    "# Create a one-row DataFrame; columns=columns fixes the feature set and order\n",
+    "input_df = pd.DataFrame([input_values], columns=columns)\n",
+    "\n",
+    "# Encode categorical columns using the same LabelEncoders you used during training\n",
+    "for column in categorical_columns:\n",
+    "    if column in input_df.columns:\n",
+    "        # Transform the input values using the stored encoder\n",
+    "        input_df[column] = label_encoders[column].transform(input_df[column].astype(str))\n",
+    "\n",
+    "# Display the input DataFrame after encoding\n",
+    "print(\"Encoded Input DataFrame:\")\n",
+    "input_df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "The feature names should match those that were passed during fit.\nFeature names seen at fit time, yet now missing:\n- Other_Cancer\n- Skin_Cancer\n",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[116], line 5\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;66;03m# Prepare input for prediction (make sure to drop any non-feature columns if necessary)\u001b[39;00m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;66;03m# X_input = input_df[columns]\u001b[39;00m\n\u001b[0;32m      3\u001b[0m \n\u001b[0;32m      4\u001b[0m \u001b[38;5;66;03m# Make the prediction\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m prediction \u001b[38;5;241m=\u001b[39m loaded_model\u001b[38;5;241m.\u001b[39mpredict(input_df)\n\u001b[0;32m      7\u001b[0m \u001b[38;5;66;03m# Output the prediction\u001b[39;00m\n\u001b[0;32m      8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPredicted Class: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mprediction[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_base.py:351\u001b[0m, in \u001b[0;36mLinearClassifierMixin.predict\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    337\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    338\u001b[0m \u001b[39mPredict class labels for samples in X.\u001b[39;00m\n\u001b[0;32m    339\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    348\u001b[0m \u001b[39m    Vector containing the class labels for each sample.\u001b[39;00m\n\u001b[0;32m    349\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    350\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(X)\n\u001b[1;32m--> 351\u001b[0m scores \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdecision_function(X)\n\u001b[0;32m    352\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(scores\u001b[39m.\u001b[39mshape) \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m    353\u001b[0m     indices \u001b[39m=\u001b[39m xp\u001b[39m.\u001b[39mastype(scores \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m, \u001b[39mint\u001b[39m)\n",
+      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_base.py:332\u001b[0m, in \u001b[0;36mLinearClassifierMixin.decision_function\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    329\u001b[0m check_is_fitted(\u001b[39mself\u001b[39m)\n\u001b[0;32m    330\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(X)\n\u001b[1;32m--> 332\u001b[0m X \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_data(X, accept_sparse\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcsr\u001b[39m\u001b[39m\"\u001b[39m, reset\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[0;32m    333\u001b[0m scores \u001b[39m=\u001b[39m safe_sparse_dot(X, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcoef_\u001b[39m.\u001b[39mT, dense_output\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m) \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mintercept_\n\u001b[0;32m    334\u001b[0m \u001b[39mreturn\u001b[39;00m xp\u001b[39m.\u001b[39mreshape(scores, (\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m,)) \u001b[39mif\u001b[39;00m scores\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m] \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m \u001b[39melse\u001b[39;00m scores\n",
+      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:608\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m    537\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_validate_data\u001b[39m(\n\u001b[0;32m    538\u001b[0m     \u001b[39mself\u001b[39m,\n\u001b[0;32m    539\u001b[0m     X\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mno_validation\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    544\u001b[0m     \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_params,\n\u001b[0;32m    545\u001b[0m ):\n\u001b[0;32m    546\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Validate input data and set or check the `n_features_in_` attribute.\u001b[39;00m\n\u001b[0;32m    547\u001b[0m \n\u001b[0;32m    548\u001b[0m \u001b[39m    Parameters\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    606\u001b[0m \u001b[39m        validated.\u001b[39;00m\n\u001b[0;32m    607\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m--> 608\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_feature_names(X, reset\u001b[39m=\u001b[39mreset)\n\u001b[0;32m    610\u001b[0m     \u001b[39mif\u001b[39;00m y \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_tags()[\u001b[39m\"\u001b[39m\u001b[39mrequires_y\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m    611\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m    612\u001b[0m             \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThis \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m estimator \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m    613\u001b[0m             
\u001b[39m\"\u001b[39m\u001b[39mrequires y to be passed, but the target y is None.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m    614\u001b[0m         )\n",
+      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:535\u001b[0m, in \u001b[0;36mBaseEstimator._check_feature_names\u001b[1;34m(self, X, reset)\u001b[0m\n\u001b[0;32m    530\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m missing_names \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m unexpected_names:\n\u001b[0;32m    531\u001b[0m     message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\n\u001b[0;32m    532\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mFeature names must be in the same order as they were in fit.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m    533\u001b[0m     )\n\u001b[1;32m--> 535\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(message)\n",
+      "\u001b[1;31mValueError\u001b[0m: The feature names should match those that were passed during fit.\nFeature names seen at fit time, yet now missing:\n- Other_Cancer\n- Skin_Cancer\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Align the input with the features the model was fitted on. Passing input_df\n",
+    "# unchanged trips scikit-learn's feature-name check whenever columns are missing,\n",
+    "# unexpected, or ordered differently than at fit time.\n",
+    "expected_features = getattr(loaded_model, \"feature_names_in_\", None)\n",
+    "\n",
+    "if expected_features is None:\n",
+    "    # The estimator recorded no feature names, so there is nothing to align against.\n",
+    "    X_input = input_df\n",
+    "else:\n",
+    "    missing_features = [name for name in expected_features if name not in input_df.columns]\n",
+    "    if missing_features:\n",
+    "        # Fail with an actionable message instead of guessing values for a health model.\n",
+    "        raise ValueError(\n",
+    "            f\"input_df is missing features required by the model: {missing_features}. \"\n",
+    "            \"Add them when building input_df, using the same encoding as in training.\"\n",
+    "        )\n",
+    "    # Drop non-feature columns and restore the fit-time column order.\n",
+    "    X_input = input_df[list(expected_features)]\n",
+    "\n",
+    "# Make the prediction\n",
+    "prediction = loaded_model.predict(X_input)\n",
+    "\n",
+    "# Output the prediction\n",
+    "print(f\"Predicted Class: {prediction[0]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Underweight'], dtype=object)"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0.90208773, 0.01696106, 0.06855007, 0.01240114]])"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Reload the pickled model stored in 'lr_dt.pkl'.\n",
+    "# NOTE(review): pickle.load can execute arbitrary code - only load files this project produced.\n",
+    "model_filename = 'lr_dt.pkl'\n",
+    "with open(model_filename, mode='rb') as model_file:\n",
+    "    loaded_model_lr_dt = pickle.load(model_file)\n",
+    "\n",
+    "# Probability estimates for input_df; the trailing expression renders them as the cell output.\n",
+    "y_pred_loaded = loaded_model_lr_dt.predict_proba(input_df)\n",
+    "y_pred_loaded"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preview only the first rows; rendering the whole frame bloats the saved notebook.\n",
+    "df.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.12.4 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "3c06e3e46abf38078fe4dac36a0085ec2b134ebbd73dd076183d243eeca6918f"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}