[2629da]: / Model Buliding / final_notebook.ipynb

Download this file

1056 lines (1055 with data), 37.8 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from pandas.plotting import scatter_matrix\n",
    "import seaborn as sns\n",
    "import pickle\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "# from sklearn.tree import DecisionTreeClassifier\n",
    "# from sklearn.svm import SVC\n",
    "# from sklearn.neighbors import KNeighborsClassifier\n",
    "# from sklearn.ensemble import RandomForestClassifier\n",
    "# from imblearn.over_sampling import SMOTE\n",
    "# from sklearn.model_selection import GridSearchCV\n",
    "import warnings\n",
    "\n",
    "# Ignore all warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Heart_Disease</th>\n",
       "      <th>Skin_Cancer</th>\n",
       "      <th>Other_Cancer</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Diabetes</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past 2 years</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>150.0</td>\n",
       "      <td>32.66</td>\n",
       "      <td>14.54</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>70-74</td>\n",
       "      <td>165.0</td>\n",
       "      <td>77.11</td>\n",
       "      <td>28.29</td>\n",
       "      <td>No</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Very Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>60-64</td>\n",
       "      <td>163.0</td>\n",
       "      <td>88.45</td>\n",
       "      <td>33.47</td>\n",
       "      <td>No</td>\n",
       "      <td>4.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Poor</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>75-79</td>\n",
       "      <td>180.0</td>\n",
       "      <td>93.44</td>\n",
       "      <td>28.73</td>\n",
       "      <td>No</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Good</td>\n",
       "      <td>Within the past year</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Male</td>\n",
       "      <td>80+</td>\n",
       "      <td>191.0</td>\n",
       "      <td>88.45</td>\n",
       "      <td>24.37</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \\\n",
       "0           Poor  Within the past 2 years       No            No          No   \n",
       "1      Very Good     Within the past year       No           Yes          No   \n",
       "2      Very Good     Within the past year      Yes            No          No   \n",
       "3           Poor     Within the past year      Yes           Yes          No   \n",
       "4           Good     Within the past year       No            No          No   \n",
       "\n",
       "  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \\\n",
       "0           No         No       No       Yes  Female        70-74   \n",
       "1           No         No      Yes        No  Female        70-74   \n",
       "2           No         No      Yes        No  Female        60-64   \n",
       "3           No         No      Yes        No    Male        75-79   \n",
       "4           No         No       No        No    Male          80+   \n",
       "\n",
       "   Height_(cm)  Weight_(kg)    BMI Smoking_History  Alcohol_Consumption  \\\n",
       "0        150.0        32.66  14.54             Yes                  0.0   \n",
       "1        165.0        77.11  28.29              No                  0.0   \n",
       "2        163.0        88.45  33.47              No                  4.0   \n",
       "3        180.0        93.44  28.73              No                  0.0   \n",
       "4        191.0        88.45  24.37             Yes                  0.0   \n",
       "\n",
       "   Fruit_Consumption  Green_Vegetables_Consumption  FriedPotato_Consumption  \n",
       "0               30.0                          16.0                     12.0  \n",
       "1               30.0                           0.0                      4.0  \n",
       "2               12.0                           3.0                     16.0  \n",
       "3               30.0                          30.0                      8.0  \n",
       "4                8.0                           4.0                      0.0  "
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"CVD_cleaned.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 308854 entries, 0 to 308853\n",
      "Data columns (total 19 columns):\n",
      " #   Column                        Non-Null Count   Dtype  \n",
      "---  ------                        --------------   -----  \n",
      " 0   General_Health                308854 non-null  object \n",
      " 1   Checkup                       308854 non-null  object \n",
      " 2   Exercise                      308854 non-null  object \n",
      " 3   Heart_Disease                 308854 non-null  object \n",
      " 4   Skin_Cancer                   308854 non-null  object \n",
      " 5   Other_Cancer                  308854 non-null  object \n",
      " 6   Depression                    308854 non-null  object \n",
      " 7   Diabetes                      308854 non-null  object \n",
      " 8   Arthritis                     308854 non-null  object \n",
      " 9   Sex                           308854 non-null  object \n",
      " 10  Age_Category                  308854 non-null  object \n",
      " 11  Height_(cm)                   308854 non-null  float64\n",
      " 12  Weight_(kg)                   308854 non-null  float64\n",
      " 13  BMI                           308854 non-null  float64\n",
      " 14  Smoking_History               308854 non-null  object \n",
      " 15  Alcohol_Consumption           308854 non-null  float64\n",
      " 16  Fruit_Consumption             308854 non-null  float64\n",
      " 17  Green_Vegetables_Consumption  308854 non-null  float64\n",
      " 18  FriedPotato_Consumption       308854 non-null  float64\n",
      "dtypes: float64(7), object(12)\n",
      "memory usage: 44.8+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer',\n",
       "       'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',\n",
       "       'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',\n",
       "       'Alcohol_Consumption', 'Fruit_Consumption',\n",
       "       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.drop(columns=['Skin_Cancer','Other_Cancer'],inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Diabetes\n",
       "No     266037\n",
       "Yes     42817\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Diabetes'] = df['Diabetes'].replace({\n",
    "    'Yes': 'Yes',\n",
    "    'Yes, but female told only during pregnancy': 'Yes',\n",
    "    'No': 'No',\n",
    "    'No, pre-diabetes or borderline diabetes': 'No'\n",
    "})\n",
    "df['Diabetes'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_columns = df.select_dtypes(include=['object', 'category']).columns\n",
    "numerical_columns = df.select_dtypes(include=['number'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "obesity\n",
       "Overweight       109866\n",
       "Obesity          106738\n",
       "Normal weight     87706\n",
       "Underweight        4544\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bins = [-float('inf'), 18.5, 24.9, 29.9, float('inf')]\n",
    "labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']\n",
    "\n",
    "# Create a new column 'obesity' based on BMI classification\n",
    "df['obesity'] = pd.cut(df['BMI'], bins=bins, labels=labels)\n",
    "df['obesity'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_encoders = {}\n",
    "\n",
    "# Apply Label Encoding to categorical columns\n",
    "for column in categorical_columns:\n",
    "    df[column] = df[column].astype(str)\n",
    "    label_encoder = LabelEncoder() \n",
    "    df[column] = label_encoder.fit_transform(df[column])\n",
    "    label_encoders[column] = label_encoder \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('label_encoders.pkl', 'wb') as f:\n",
    "    pickle.dump(label_encoders, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checkup Label Encoder Mappings:\n",
      "0: No\n",
      "1: Yes\n"
     ]
    }
   ],
   "source": [
    "if 'Heart_Disease' in label_encoders:\n",
    "    encoder = label_encoders['Heart_Disease']\n",
    "    mappings = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))\n",
    "    print(\"Checkup Label Encoder Mappings:\")\n",
    "    for key, value in mappings.items():\n",
    "        print(f\"{key}: {value}\")\n",
    "else:\n",
    "    print(\"Checkup column not found in label encoders.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Diabetes\n",
       "0    266037\n",
       "1     42817\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Diabetes'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_heart_disease = df['Heart_Disease']\n",
    "y_diabetes = df['Diabetes']\n",
    "y_obesity = df['obesity']\n",
    "X_train = df.drop(columns=['Heart_Disease','Diabetes','obesity'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature scaling (Standardizing the data)\n",
    "# scaler = StandardScaler()\n",
    "# X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# X_resampled, y_resampled = smote.fit_resample(X_scaled, y)\n",
    "X_train_ht, X_test_ht, y_train_ht, y_test_ht = train_test_split(X_train,y_heart_disease,test_size= 0.2)\n",
    "X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X_train,y_diabetes,test_size= 0.2)\n",
    "X_train_ob, X_test_ob, y_train_ob, y_test_ob = train_test_split(X_train,y_obesity,test_size= 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',\n",
       "       'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',\n",
       "       'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',\n",
       "       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_dt.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9201\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.92      1.00      0.96     56850\n",
      "           1       0.33      0.00      0.01      4921\n",
      "\n",
      "    accuracy                           0.92     61771\n",
      "   macro avg       0.63      0.50      0.48     61771\n",
      "weighted avg       0.87      0.92      0.88     61771\n",
      "\n"
     ]
    }
   ],
   "source": [
    "### Logistic Regression\n",
    "lr_ht = LogisticRegression()\n",
    "lr_ht.fit(X_train_ht,y_train_ht)\n",
    "y_pred_ht = lr_ht.predict(X_test_ht)\n",
    "print(f\"Accuracy: {accuracy_score(y_test_ht, y_pred_ht):.4f}\")\n",
    "print(classification_report(y_test_ht, y_pred_ht))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.8609\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.86      0.99      0.92     53194\n",
      "           1       0.49      0.04      0.07      8577\n",
      "\n",
      "    accuracy                           0.86     61771\n",
      "   macro avg       0.68      0.52      0.50     61771\n",
      "weighted avg       0.81      0.86      0.81     61771\n",
      "\n"
     ]
    }
   ],
   "source": [
    "### Logistic Regression\n",
    "lr_dt = LogisticRegression()\n",
    "lr_dt.fit(X_train_dt,y_train_dt)\n",
    "y_pred_dt = lr_dt.predict(X_test_dt)\n",
    "print(f\"Accuracy: {accuracy_score(y_test_dt, y_pred_dt):.4f}\")\n",
    "print(classification_report(y_test_dt, y_pred_dt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9732\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "Normal weight       0.97      0.97      0.97     17380\n",
      "      Obesity       0.99      0.99      0.99     21321\n",
      "   Overweight       0.98      0.98      0.98     22138\n",
      "  Underweight       0.63      0.65      0.64       932\n",
      "\n",
      "     accuracy                           0.97     61771\n",
      "    macro avg       0.89      0.89      0.89     61771\n",
      " weighted avg       0.97      0.97      0.97     61771\n",
      "\n"
     ]
    }
   ],
   "source": [
    "### Logistic Regression\n",
    "lr_ob = LogisticRegression()\n",
    "lr_ob.fit(X_train_ob,y_train_ob)\n",
    "y_pred_ob = lr_ob.predict(X_test_ob)\n",
    "print(f\"Accuracy: {accuracy_score(y_test_ob, y_pred_ob):.4f}\")\n",
    "print(classification_report(y_test_ob, y_pred_ob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9732\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "Normal weight       0.97      0.97      0.97     17380\n",
      "      Obesity       0.99      0.99      0.99     21321\n",
      "   Overweight       0.98      0.98      0.98     22138\n",
      "  Underweight       0.63      0.65      0.64       932\n",
      "\n",
      "     accuracy                           0.97     61771\n",
      "    macro avg       0.89      0.89      0.89     61771\n",
      " weighted avg       0.97      0.97      0.97     61771\n",
      "\n"
     ]
    }
   ],
   "source": [
    "### Logistic Regression\n",
    "lr_ob = LogisticRegression()\n",
    "lr_ob.fit(X_train_ob,y_train_ob)\n",
    "y_pred_ob = lr_ob.predict(X_test_ob)\n",
    "print(f\"Accuracy: {accuracy_score(y_test_ob, y_pred_ob):.4f}\")\n",
    "print(classification_report(y_test_ob, y_pred_ob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model saved to lr_ht.pkl\n"
     ]
    }
   ],
   "source": [
    "model_filename = 'lr_ht.pkl'\n",
    "with open(model_filename, 'wb') as file:\n",
    "    pickle.dump(lr_ht, file)\n",
    "\n",
    "print(f\"Model saved to {model_filename}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model saved to lr_dt.pkl\n"
     ]
    }
   ],
   "source": [
    "model_filename = 'lr_dt.pkl'\n",
    "with open(model_filename, 'wb') as file:\n",
    "    pickle.dump(lr_dt, file)\n",
    "\n",
    "print(f\"Model saved to {model_filename}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model saved to lr_ob.pkl\n"
     ]
    }
   ],
   "source": [
    "model_filename = 'lr_ob.pkl'\n",
    "with open(model_filename, 'wb') as file:\n",
    "    pickle.dump(lr_ob, file)\n",
    "\n",
    "print(f\"Model saved to {model_filename}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded Model Accuracy: 0.9727\n"
     ]
    }
   ],
   "source": [
    "with open(model_filename, 'rb') as file:\n",
    "    loaded_model = pickle.load(file)\n",
    "\n",
    "# Now you can use loaded_model to make predictions\n",
    "y_pred_loaded = loaded_model.predict(X_test_ob)\n",
    "print(f\"Loaded Model Accuracy: {accuracy_score(y_test_ob, y_pred_loaded):.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['General_Health', 'Checkup', 'Exercise', 'Skin_Cancer', 'Other_Cancer',\n",
       "       'Depression', 'Arthritis', 'Sex', 'Age_Category', 'Height_(cm)',\n",
       "       'Weight_(kg)', 'BMI', 'Smoking_History', 'Alcohol_Consumption',\n",
       "       'Fruit_Consumption', 'Green_Vegetables_Consumption',\n",
       "       'FriedPotato_Consumption'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_ob.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['General_Health',\n",
       " 'Checkup',\n",
       " 'Exercise',\n",
       " 'Heart_Disease',\n",
       " 'Skin_Cancer',\n",
       " 'Other_Cancer',\n",
       " 'Depression',\n",
       " 'Diabetes',\n",
       " 'Arthritis',\n",
       " 'Sex',\n",
       " 'Age_Category',\n",
       " 'Smoking_History']"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "categorical_columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Encoded Input DataFrame:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>General_Health</th>\n",
       "      <th>Checkup</th>\n",
       "      <th>Exercise</th>\n",
       "      <th>Depression</th>\n",
       "      <th>Arthritis</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age_Category</th>\n",
       "      <th>Height_(cm)</th>\n",
       "      <th>Weight_(kg)</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Smoking_History</th>\n",
       "      <th>Alcohol_Consumption</th>\n",
       "      <th>Fruit_Consumption</th>\n",
       "      <th>Green_Vegetables_Consumption</th>\n",
       "      <th>FriedPotato_Consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>150</td>\n",
       "      <td>32.66</td>\n",
       "      <td>14.34</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   General_Health  Checkup  Exercise  Depression  Arthritis  Sex  \\\n",
       "0               3        2         0           0          1    0   \n",
       "\n",
       "   Age_Category  Height_(cm)  Weight_(kg)    BMI  Smoking_History  \\\n",
       "0            10          150        32.66  14.34                1   \n",
       "\n",
       "   Alcohol_Consumption  Fruit_Consumption  Green_Vegetables_Consumption  \\\n",
       "0                  0.0               30.0                            16   \n",
       "\n",
       "   FriedPotato_Consumption  \n",
       "0                       12  "
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Define the columns\n",
    "columns = ['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',\n",
    "           'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',\n",
    "           'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',\n",
    "           'Green_Vegetables_Consumption', 'FriedPotato_Consumption']\n",
    "\n",
    "# Define the input values for each column (replace these with actual values)\n",
    "input_values = {\n",
    "    'General_Health': 'Poor',  # Example values\n",
    "    'Checkup': 'Within the past 2 years',\n",
    "    'Exercise': 'No',\n",
    "    'Depression': 'No',\n",
    "    'Arthritis': 'Yes',\n",
    "    'Sex': 'Female',\n",
    "    'Age_Category': '70-74',\n",
    "    'Height_(cm)': 150,  # Example numerical values\n",
    "    'Weight_(kg)': 32.66,\n",
    "    'BMI': 14.34,\n",
    "    'Smoking_History': 'Yes',\n",
    "    'Alcohol_Consumption': 0.0,\n",
    "    'Fruit_Consumption': 30.0,\n",
    "    'Green_Vegetables_Consumption': 16,\n",
    "    'FriedPotato_Consumption': 12\n",
    "}\n",
    "\n",
    "# Create a DataFrame from input values\n",
    "input_df = pd.DataFrame([input_values])\n",
    "\n",
    "# Encode categorical columns using the same LabelEncoders you used during training\n",
    "for column in categorical_columns:\n",
    "    if column in input_df.columns:\n",
    "        # Transform the input values using the stored encoder\n",
    "        input_df[column] = label_encoders[column].transform(input_df[column].astype(str))\n",
    "\n",
    "# Display the input DataFrame after encoding\n",
    "print(\"Encoded Input DataFrame:\")\n",
    "input_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "The feature names should match those that were passed during fit.\nFeature names seen at fit time, yet now missing:\n- Other_Cancer\n- Skin_Cancer\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[116], line 5\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;66;03m# Prepare input for prediction (make sure to drop any non-feature columns if necessary)\u001b[39;00m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;66;03m# X_input = input_df[columns]\u001b[39;00m\n\u001b[0;32m      3\u001b[0m \n\u001b[0;32m      4\u001b[0m \u001b[38;5;66;03m# Make the prediction\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m prediction \u001b[38;5;241m=\u001b[39m loaded_model\u001b[38;5;241m.\u001b[39mpredict(input_df)\n\u001b[0;32m      7\u001b[0m \u001b[38;5;66;03m# Output the prediction\u001b[39;00m\n\u001b[0;32m      8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPredicted Class: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mprediction[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_base.py:351\u001b[0m, in \u001b[0;36mLinearClassifierMixin.predict\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    337\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    338\u001b[0m \u001b[39mPredict class labels for samples in X.\u001b[39;00m\n\u001b[0;32m    339\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    348\u001b[0m \u001b[39m    Vector containing the class labels for each sample.\u001b[39;00m\n\u001b[0;32m    349\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    350\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(X)\n\u001b[1;32m--> 351\u001b[0m scores \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdecision_function(X)\n\u001b[0;32m    352\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(scores\u001b[39m.\u001b[39mshape) \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m    353\u001b[0m     indices \u001b[39m=\u001b[39m xp\u001b[39m.\u001b[39mastype(scores \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m, \u001b[39mint\u001b[39m)\n",
      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_base.py:332\u001b[0m, in \u001b[0;36mLinearClassifierMixin.decision_function\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    329\u001b[0m check_is_fitted(\u001b[39mself\u001b[39m)\n\u001b[0;32m    330\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(X)\n\u001b[1;32m--> 332\u001b[0m X \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_data(X, accept_sparse\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcsr\u001b[39m\u001b[39m\"\u001b[39m, reset\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[0;32m    333\u001b[0m scores \u001b[39m=\u001b[39m safe_sparse_dot(X, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcoef_\u001b[39m.\u001b[39mT, dense_output\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m) \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mintercept_\n\u001b[0;32m    334\u001b[0m \u001b[39mreturn\u001b[39;00m xp\u001b[39m.\u001b[39mreshape(scores, (\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m,)) \u001b[39mif\u001b[39;00m scores\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m] \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m \u001b[39melse\u001b[39;00m scores\n",
      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:608\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m    537\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_validate_data\u001b[39m(\n\u001b[0;32m    538\u001b[0m     \u001b[39mself\u001b[39m,\n\u001b[0;32m    539\u001b[0m     X\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mno_validation\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    544\u001b[0m     \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_params,\n\u001b[0;32m    545\u001b[0m ):\n\u001b[0;32m    546\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Validate input data and set or check the `n_features_in_` attribute.\u001b[39;00m\n\u001b[0;32m    547\u001b[0m \n\u001b[0;32m    548\u001b[0m \u001b[39m    Parameters\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    606\u001b[0m \u001b[39m        validated.\u001b[39;00m\n\u001b[0;32m    607\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m--> 608\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_feature_names(X, reset\u001b[39m=\u001b[39mreset)\n\u001b[0;32m    610\u001b[0m     \u001b[39mif\u001b[39;00m y \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_tags()[\u001b[39m\"\u001b[39m\u001b[39mrequires_y\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m    611\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m    612\u001b[0m             \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThis \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m estimator \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m    613\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39mrequires y to be passed, but the target y is None.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m    614\u001b[0m         )\n",
      "File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:535\u001b[0m, in \u001b[0;36mBaseEstimator._check_feature_names\u001b[1;34m(self, X, reset)\u001b[0m\n\u001b[0;32m    530\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m missing_names \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m unexpected_names:\n\u001b[0;32m    531\u001b[0m     message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\n\u001b[0;32m    532\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mFeature names must be in the same order as they were in fit.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m    533\u001b[0m     )\n\u001b[1;32m--> 535\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(message)\n",
      "\u001b[1;31mValueError\u001b[0m: The feature names should match those that were passed during fit.\nFeature names seen at fit time, yet now missing:\n- Other_Cancer\n- Skin_Cancer\n"
     ]
    }
   ],
   "source": [
    "# Prepare input for prediction (make sure to drop any non-feature columns if necessary)\n",
    "# X_input = input_df[columns]\n",
    "\n",
    "# Make the prediction\n",
    "prediction = loaded_model.predict(input_df)\n",
    "\n",
    "# Output the prediction\n",
    "print(f\"Predicted Class: {prediction[0]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Underweight'], dtype=object)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.90208773, 0.01696106, 0.06855007, 0.01240114]])"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_filename = 'lr_dt.pkl'\n",
    "\n",
    "with open(model_filename, 'rb') as file:\n",
    "    loaded_model_lr_dt = pickle.load(file)\n",
    "y_pred_loaded = loaded_model_lr_dt.predict_proba(input_df)\n",
    "y_pred_loaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.12.4 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "3c06e3e46abf38078fe4dac36a0085ec2b134ebbd73dd076183d243eeca6918f"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}