# %%
# Imports for the whole notebook: standard library first, then third-party.
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Disabled imports, kept commented out exactly as in the original cell:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from imblearn.over_sampling import SMOTE
# from sklearn.model_selection import GridSearchCV

# Ignore all warnings.
# NOTE(review): this blanket filter also hides any scikit-learn warning raised
# by the model fits below (e.g. a ConvergenceWarning). Consider narrowing it.
warnings.filterwarnings("ignore")

# %%
# Configuration: everything a reader might want to tune lives here.
DATA_FILE = "CVD_cleaned.csv"   # read from the current working directory
SEED = 42                       # makes the train/test splits reproducible
TEST_SIZE = 0.2                 # fraction of rows held out for evaluation

# %%
# Load the dataset and preview the first rows.
df = pd.read_csv(DATA_FILE)
df.head()

# %%
# Column dtypes and null counts (saved run: 308,854 rows, 19 columns, no nulls).
df.info()

# %%
df.columns

# %%
# Drop the two cancer indicator columns so they are neither features nor targets.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it no longer raises KeyError on the missing columns.
df = df.drop(columns=['Skin_Cancer', 'Other_Cancer'], errors='ignore')

# %%
# Collapse the survey answers into a binary Yes/No target. 'Yes' and 'No' are
# left untouched; only the two qualified answers need remapping.
# NOTE(review): borderline/pre-diabetes is counted as 'No' and
# pregnancy-only diabetes as 'Yes' -- confirm this is the intended labelling.
df['Diabetes'] = df['Diabetes'].replace({
    'Yes, but female told only during pregnancy': 'Yes',
    'No, pre-diabetes or borderline diabetes': 'No',
})
df['Diabetes'].value_counts()

# %%
# Column-name indexes by kind. Both are now Index objects of names; previously
# numerical_columns held the numeric sub-DataFrame itself.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
numerical_columns = df.select_dtypes(include=['number']).columns

# %%
# Derive the weight-status target from BMI using the standard adult cut-offs:
#   Underweight < 18.5 <= Normal weight < 25 <= Overweight < 30 <= Obesity
# right=False makes every interval closed on the left, so BMI 18.5 is
# 'Normal weight' and 24.91-24.99 / 29.91-29.99 are no longer pushed up a class
# (the old right-closed edges 18.5 / 24.9 / 29.9 misfiled those values).
bins = [-float('inf'), 18.5, 25, 30, float('inf')]
labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']

df['obesity'] = pd.cut(df['BMI'], bins=bins, labels=labels, right=False)
df['obesity'].value_counts()

# %%
# Label-encode every column listed in categorical_columns (features and the
# Heart_Disease / Diabetes targets). 'obesity' was created after that list was
# built, so it keeps its string categories.
# LabelEncoder assigns codes in sorted order of the string values.
label_encoders = {}

for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column].astype(str))
    label_encoders[column] = label_encoder  # kept so new inputs can be encoded identically

# %%
# Persist the fitted encoders next to the notebook.
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

# %%
# Show the integer code assigned to each Heart_Disease class.
# (The printed label used to say "Checkup" although Heart_Disease is inspected.)
if 'Heart_Disease' in label_encoders:
    encoder = label_encoders['Heart_Disease']
    # classes_ is sorted and the code of a class is its position in classes_.
    mappings = dict(enumerate(encoder.classes_))
    print("Heart_Disease Label Encoder Mappings:")
    for key, value in mappings.items():
        print(f"{key}: {value}")
else:
    print("Heart_Disease column not found in label encoders.")

# %%
len(df.columns)

# %%
df['Diabetes'].value_counts()

# %%
# One target vector per model; all three models share the same feature matrix.
y_heart_disease = df['Heart_Disease']
y_diabetes = df['Diabetes']
y_obesity = df['obesity']
# NOTE: despite its name, X_train is the FULL feature matrix (before splitting).
# The name is kept because later cells refer to it.
# NOTE(review): BMI, Height_(cm) and Weight_(kg) stay in the features while
# 'obesity' is computed directly from BMI, so the obesity model is predicting
# a function of its own input. Its score is not a meaningful measure.
X_train = df.drop(columns=['Heart_Disease', 'Diabetes', 'obesity'])

# %%
# Independent 80/20 split per target. random_state makes the splits (and every
# score below) reproducible; stratify keeps the class ratio the same in train
# and test, which matters because the positive classes are small.
X_train_ht, X_test_ht, y_train_ht, y_test_ht = train_test_split(
    X_train, y_heart_disease,
    test_size=TEST_SIZE, random_state=SEED, stratify=y_heart_disease,
)
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X_train, y_diabetes,
    test_size=TEST_SIZE, random_state=SEED, stratify=y_diabetes,
)
X_train_ob, X_test_ob, y_train_ob, y_test_ob = train_test_split(
    X_train, y_obesity,
    test_size=TEST_SIZE, random_state=SEED, stratify=y_obesity,
)

# %%
X_train_dt.columns

# %%
### Logistic Regression -- heart disease
# max_iter is raised from the default: the features are unscaled and every
# warning is silenced in the import cell, so a fit that stopped at the default
# iteration limit would go unnoticed.
# NOTE(review): in the previously saved run this model had 0.00 recall on
# class 1 (4,921 of 61,771 test rows), so the 0.92 accuracy mostly reflects the
# majority class. Consider class_weight='balanced' or resampling.
lr_ht = LogisticRegression(max_iter=1000)
lr_ht.fit(X_train_ht, y_train_ht)
y_pred_ht = lr_ht.predict(X_test_ht)
print(f"Accuracy: {accuracy_score(y_test_ht, y_pred_ht):.4f}")
print(classification_report(y_test_ht, y_pred_ht))
# %%
def fit_and_report(X_fit, y_fit, X_eval, y_eval):
    """Fit a logistic regression and print its hold-out metrics.

    Parameters
    ----------
    X_fit, y_fit : training features and target.
    X_eval, y_eval : held-out features and target used for the report.

    Returns
    -------
    (model, predictions) : the fitted LogisticRegression and its predictions
    for ``X_eval``.
    """
    # max_iter is raised from the default: the features are unscaled and the
    # import cell silences every warning, so a fit that stopped at the default
    # iteration limit would go unnoticed.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    print(f"Accuracy: {accuracy_score(y_eval, predictions):.4f}")
    print(classification_report(y_eval, predictions))
    return model, predictions


def save_model(model, filename):
    """Pickle ``model`` to ``filename`` and print a confirmation line."""
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved to {filename}")

# %%
### Logistic Regression -- diabetes
lr_dt, y_pred_dt = fit_and_report(X_train_dt, y_train_dt, X_test_dt, y_test_dt)

# %%
### Logistic Regression -- obesity
# (This cell used to appear twice verbatim; the model is now fitted once.)
lr_ob, y_pred_ob = fit_and_report(X_train_ob, y_train_ob, X_test_ob, y_test_ob)

# %%
# Persist the three fitted models under the same file names as before.
for filename, model in (('lr_ht.pkl', lr_ht), ('lr_dt.pkl', lr_dt), ('lr_ob.pkl', lr_ob)):
    save_model(model, filename)

# %%
# Round-trip check: reload the obesity model and score it on the same test set.
# The file name is stated explicitly instead of relying on whatever value
# model_filename last held in an earlier cell.
model_filename = 'lr_ob.pkl'
# pickle.load executes code embedded in the file; only load pickles that this
# notebook (or another trusted source) wrote.
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

# The reloaded model must reproduce the in-memory model's predictions.
y_pred_loaded = loaded_model.predict(X_test_ob)
print(f"Loaded Model Accuracy: {accuracy_score(y_test_ob, y_pred_loaded):.4f}")

# %%
# Feature columns the obesity model is evaluated on.
X_test_ob.columns

# %%
# Columns that were label-encoded during preprocessing.
categorical_columns.tolist()
# %%
# Build a single-row input frame for ad-hoc predictions.

# Feature columns, in the order the models were trained on.
columns = ['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',
           'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',
           'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',
           'Green_Vegetables_Consumption', 'FriedPotato_Consumption']

# Raw (un-encoded) example values for each column; replace with actual values.
input_values = {
    'General_Health': 'Poor',
    'Checkup': 'Within the past 2 years',
    'Exercise': 'No',
    'Depression': 'No',
    'Arthritis': 'Yes',
    'Sex': 'Female',
    'Age_Category': '70-74',
    'Height_(cm)': 150,
    'Weight_(kg)': 32.66,
    'BMI': 14.34,
    'Smoking_History': 'Yes',
    'Alcohol_Consumption': 0.0,
    'Fruit_Consumption': 30.0,
    'Green_Vegetables_Consumption': 16,
    'FriedPotato_Consumption': 12
}

# Fail early, with a readable message, if a feature was left out of the dict;
# otherwise the frame would silently contain NaN for it.
missing_inputs = [name for name in columns if name not in input_values]
if missing_inputs:
    raise ValueError(f"input_values is missing required features: {missing_inputs}")

# columns=columns pins the column order to the training order instead of
# relying on the insertion order of the dict above. `columns` was previously
# defined but never applied.
input_df = pd.DataFrame([input_values], columns=columns)

# Encode categorical features with the encoders fitted during preprocessing.
# Iterating over the input's own columns and looking each one up in
# label_encoders avoids depending on the current value of categorical_columns.
# LabelEncoder.transform raises ValueError for a category it never saw at fit time.
for column in input_df.columns:
    if column in label_encoders:
        input_df[column] = label_encoders[column].transform(input_df[column].astype(str))

# Display the input DataFrame after encoding
print("Encoded Input DataFrame:")
input_df
# %%
def align_to_model(model, frame):
    """Return ``frame`` restricted and ordered to the features ``model`` was fitted on.

    scikit-learn estimators fitted on a DataFrame record the training column
    names in ``feature_names_in_`` and reject inputs whose names differ. This
    helper reorders the frame to match and reports missing features in one
    readable error.

    Parameters
    ----------
    model : fitted estimator; ``feature_names_in_`` is used when present.
    frame : DataFrame holding at least the model's features.

    Returns
    -------
    DataFrame with exactly the model's features, in fit order. If the model
    has no ``feature_names_in_`` the frame is returned unchanged.

    Raises
    ------
    ValueError : if ``frame`` lacks a feature the model was fitted on.
    """
    expected = getattr(model, "feature_names_in_", None)
    if expected is None:
        return frame
    missing = [name for name in expected if name not in frame.columns]
    if missing:
        raise ValueError(
            f"Input is missing features the model was fitted on: {missing}. "
            "If these columns are no longer part of the feature set, re-run the "
            "training and save cells so the pickled model matches the current data."
        )
    return frame[list(expected)]

# %%
# Predict the obesity class for the single example row.
# The saved run of this cell failed with a feature-name ValueError because the
# loaded model had been fitted with columns the preprocessing now drops; the
# input is now aligned explicitly first.
X_input = align_to_model(loaded_model, input_df)

# Make the prediction
prediction = loaded_model.predict(X_input)

# Output the prediction
print(f"Predicted Class: {prediction[0]}")

# %%
prediction

# %%
# Class probabilities from the diabetes model for the same example row.
model_filename = 'lr_dt.pkl'

# pickle.load executes code embedded in the file; only load pickles that this
# notebook (or another trusted source) wrote.
with open(model_filename, 'rb') as file:
    loaded_model_lr_dt = pickle.load(file)

# Stored under its own name rather than overwriting y_pred_loaded, which holds
# class predictions from the obesity model.
diabetes_proba = loaded_model_lr_dt.predict_proba(
    align_to_model(loaded_model_lr_dt, input_df)
)
# Label each probability with its class so a wrong model file is easy to spot
# (the diabetes target is binary, so exactly two columns are expected).
pd.DataFrame(diabetes_proba, columns=loaded_model_lr_dt.classes_)

# %%
# Preview the encoded frame; displaying the full frame would render far more
# output than is useful.
df.head()