1056 lines (1055 with data), 37.8 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pandas.plotting import scatter_matrix\n",
"import seaborn as sns\n",
"import pickle\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from sklearn.linear_model import LogisticRegression\n",
"# from sklearn.tree import DecisionTreeClassifier\n",
"# from sklearn.svm import SVC\n",
"# from sklearn.neighbors import KNeighborsClassifier\n",
"# from sklearn.ensemble import RandomForestClassifier\n",
"# from imblearn.over_sampling import SMOTE\n",
"# from sklearn.model_selection import GridSearchCV\n",
"import warnings\n",
"\n",
"# Ignore all warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>General_Health</th>\n",
" <th>Checkup</th>\n",
" <th>Exercise</th>\n",
" <th>Heart_Disease</th>\n",
" <th>Skin_Cancer</th>\n",
" <th>Other_Cancer</th>\n",
" <th>Depression</th>\n",
" <th>Diabetes</th>\n",
" <th>Arthritis</th>\n",
" <th>Sex</th>\n",
" <th>Age_Category</th>\n",
" <th>Height_(cm)</th>\n",
" <th>Weight_(kg)</th>\n",
" <th>BMI</th>\n",
" <th>Smoking_History</th>\n",
" <th>Alcohol_Consumption</th>\n",
" <th>Fruit_Consumption</th>\n",
" <th>Green_Vegetables_Consumption</th>\n",
" <th>FriedPotato_Consumption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Poor</td>\n",
" <td>Within the past 2 years</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>70-74</td>\n",
" <td>150.0</td>\n",
" <td>32.66</td>\n",
" <td>14.54</td>\n",
" <td>Yes</td>\n",
" <td>0.0</td>\n",
" <td>30.0</td>\n",
" <td>16.0</td>\n",
" <td>12.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Very Good</td>\n",
" <td>Within the past year</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>70-74</td>\n",
" <td>165.0</td>\n",
" <td>77.11</td>\n",
" <td>28.29</td>\n",
" <td>No</td>\n",
" <td>0.0</td>\n",
" <td>30.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Very Good</td>\n",
" <td>Within the past year</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>60-64</td>\n",
" <td>163.0</td>\n",
" <td>88.45</td>\n",
" <td>33.47</td>\n",
" <td>No</td>\n",
" <td>4.0</td>\n",
" <td>12.0</td>\n",
" <td>3.0</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Poor</td>\n",
" <td>Within the past year</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>75-79</td>\n",
" <td>180.0</td>\n",
" <td>93.44</td>\n",
" <td>28.73</td>\n",
" <td>No</td>\n",
" <td>0.0</td>\n",
" <td>30.0</td>\n",
" <td>30.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Good</td>\n",
" <td>Within the past year</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>80+</td>\n",
" <td>191.0</td>\n",
" <td>88.45</td>\n",
" <td>24.37</td>\n",
" <td>Yes</td>\n",
" <td>0.0</td>\n",
" <td>8.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" General_Health Checkup Exercise Heart_Disease Skin_Cancer \\\n",
"0 Poor Within the past 2 years No No No \n",
"1 Very Good Within the past year No Yes No \n",
"2 Very Good Within the past year Yes No No \n",
"3 Poor Within the past year Yes Yes No \n",
"4 Good Within the past year No No No \n",
"\n",
" Other_Cancer Depression Diabetes Arthritis Sex Age_Category \\\n",
"0 No No No Yes Female 70-74 \n",
"1 No No Yes No Female 70-74 \n",
"2 No No Yes No Female 60-64 \n",
"3 No No Yes No Male 75-79 \n",
"4 No No No No Male 80+ \n",
"\n",
" Height_(cm) Weight_(kg) BMI Smoking_History Alcohol_Consumption \\\n",
"0 150.0 32.66 14.54 Yes 0.0 \n",
"1 165.0 77.11 28.29 No 0.0 \n",
"2 163.0 88.45 33.47 No 4.0 \n",
"3 180.0 93.44 28.73 No 0.0 \n",
"4 191.0 88.45 24.37 Yes 0.0 \n",
"\n",
" Fruit_Consumption Green_Vegetables_Consumption FriedPotato_Consumption \n",
"0 30.0 16.0 12.0 \n",
"1 30.0 0.0 4.0 \n",
"2 12.0 3.0 16.0 \n",
"3 30.0 30.0 8.0 \n",
"4 8.0 4.0 0.0 "
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"CVD_cleaned.csv\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 308854 entries, 0 to 308853\n",
"Data columns (total 19 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 General_Health 308854 non-null object \n",
" 1 Checkup 308854 non-null object \n",
" 2 Exercise 308854 non-null object \n",
" 3 Heart_Disease 308854 non-null object \n",
" 4 Skin_Cancer 308854 non-null object \n",
" 5 Other_Cancer 308854 non-null object \n",
" 6 Depression 308854 non-null object \n",
" 7 Diabetes 308854 non-null object \n",
" 8 Arthritis 308854 non-null object \n",
" 9 Sex 308854 non-null object \n",
" 10 Age_Category 308854 non-null object \n",
" 11 Height_(cm) 308854 non-null float64\n",
" 12 Weight_(kg) 308854 non-null float64\n",
" 13 BMI 308854 non-null float64\n",
" 14 Smoking_History 308854 non-null object \n",
" 15 Alcohol_Consumption 308854 non-null float64\n",
" 16 Fruit_Consumption 308854 non-null float64\n",
" 17 Green_Vegetables_Consumption 308854 non-null float64\n",
" 18 FriedPotato_Consumption 308854 non-null float64\n",
"dtypes: float64(7), object(12)\n",
"memory usage: 44.8+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer',\n",
" 'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',\n",
" 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',\n",
" 'Alcohol_Consumption', 'Fruit_Consumption',\n",
" 'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],\n",
" dtype='object')"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"df.drop(columns=['Skin_Cancer','Other_Cancer'],inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Diabetes\n",
"No 266037\n",
"Yes 42817\n",
"Name: count, dtype: int64"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Diabetes'] = df['Diabetes'].replace({\n",
" 'Yes': 'Yes',\n",
" 'Yes, but female told only during pregnancy': 'Yes',\n",
" 'No': 'No',\n",
" 'No, pre-diabetes or borderline diabetes': 'No'\n",
"})\n",
"df['Diabetes'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"categorical_columns = df.select_dtypes(include=['object', 'category']).columns\n",
"numerical_columns = df.select_dtypes(include=['number'])"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"obesity\n",
"Overweight 109866\n",
"Obesity 106738\n",
"Normal weight 87706\n",
"Underweight 4544\n",
"Name: count, dtype: int64"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = [-float('inf'), 18.5, 24.9, 29.9, float('inf')]\n",
"labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']\n",
"\n",
"# Create a new column 'obesity' based on BMI classification\n",
"df['obesity'] = pd.cut(df['BMI'], bins=bins, labels=labels)\n",
"df['obesity'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"label_encoders = {}\n",
"\n",
"# Apply Label Encoding to categorical columns\n",
"for column in categorical_columns:\n",
" df[column] = df[column].astype(str)\n",
" label_encoder = LabelEncoder() \n",
" df[column] = label_encoder.fit_transform(df[column])\n",
" label_encoders[column] = label_encoder \n"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"with open('label_encoders.pkl', 'wb') as f:\n",
" pickle.dump(label_encoders, f)"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checkup Label Encoder Mappings:\n",
"0: No\n",
"1: Yes\n"
]
}
],
"source": [
"if 'Heart_Disease' in label_encoders:\n",
" encoder = label_encoders['Heart_Disease']\n",
" mappings = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))\n",
" print(\"Checkup Label Encoder Mappings:\")\n",
" for key, value in mappings.items():\n",
" print(f\"{key}: {value}\")\n",
"else:\n",
" print(\"Checkup column not found in label encoders.\")"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"18"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df.columns.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Diabetes\n",
"0 266037\n",
"1 42817\n",
"Name: count, dtype: int64"
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Diabetes'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"y_heart_disease = df['Heart_Disease']\n",
"y_diabetes = df['Diabetes']\n",
"y_obesity = df['obesity']\n",
"X_train = df.drop(columns=['Heart_Disease','Diabetes','obesity'])\n"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
"# Feature scaling (Standardizing the data)\n",
"# scaler = StandardScaler()\n",
"# X_scaled = scaler.fit_transform(X)\n",
"\n",
"# X_resampled, y_resampled = smote.fit_resample(X_scaled, y)\n",
"X_train_ht, X_test_ht, y_train_ht, y_test_ht = train_test_split(X_train,y_heart_disease,test_size= 0.2)\n",
"X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X_train,y_diabetes,test_size= 0.2)\n",
"X_train_ob, X_test_ob, y_train_ob, y_test_ob = train_test_split(X_train,y_obesity,test_size= 0.2)"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',\n",
" 'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',\n",
" 'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',\n",
" 'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],\n",
" dtype='object')"
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train_dt.columns"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9201\n",
" precision recall f1-score support\n",
"\n",
" 0 0.92 1.00 0.96 56850\n",
" 1 0.33 0.00 0.01 4921\n",
"\n",
" accuracy 0.92 61771\n",
" macro avg 0.63 0.50 0.48 61771\n",
"weighted avg 0.87 0.92 0.88 61771\n",
"\n"
]
}
],
"source": [
"### Logistic Regression\n",
"lr_ht = LogisticRegression()\n",
"lr_ht.fit(X_train_ht,y_train_ht)\n",
"y_pred_ht = lr_ht.predict(X_test_ht)\n",
"print(f\"Accuracy: {accuracy_score(y_test_ht, y_pred_ht):.4f}\")\n",
"print(classification_report(y_test_ht, y_pred_ht))"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.8609\n",
" precision recall f1-score support\n",
"\n",
" 0 0.86 0.99 0.92 53194\n",
" 1 0.49 0.04 0.07 8577\n",
"\n",
" accuracy 0.86 61771\n",
" macro avg 0.68 0.52 0.50 61771\n",
"weighted avg 0.81 0.86 0.81 61771\n",
"\n"
]
}
],
"source": [
"### Logistic Regression\n",
"lr_dt = LogisticRegression()\n",
"lr_dt.fit(X_train_dt,y_train_dt)\n",
"y_pred_dt = lr_dt.predict(X_test_dt)\n",
"print(f\"Accuracy: {accuracy_score(y_test_dt, y_pred_dt):.4f}\")\n",
"print(classification_report(y_test_dt, y_pred_dt))"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9732\n",
" precision recall f1-score support\n",
"\n",
"Normal weight 0.97 0.97 0.97 17380\n",
" Obesity 0.99 0.99 0.99 21321\n",
" Overweight 0.98 0.98 0.98 22138\n",
" Underweight 0.63 0.65 0.64 932\n",
"\n",
" accuracy 0.97 61771\n",
" macro avg 0.89 0.89 0.89 61771\n",
" weighted avg 0.97 0.97 0.97 61771\n",
"\n"
]
}
],
"source": [
"### Logistic Regression\n",
"lr_ob = LogisticRegression()\n",
"lr_ob.fit(X_train_ob,y_train_ob)\n",
"y_pred_ob = lr_ob.predict(X_test_ob)\n",
"print(f\"Accuracy: {accuracy_score(y_test_ob, y_pred_ob):.4f}\")\n",
"print(classification_report(y_test_ob, y_pred_ob))"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9732\n",
" precision recall f1-score support\n",
"\n",
"Normal weight 0.97 0.97 0.97 17380\n",
" Obesity 0.99 0.99 0.99 21321\n",
" Overweight 0.98 0.98 0.98 22138\n",
" Underweight 0.63 0.65 0.64 932\n",
"\n",
" accuracy 0.97 61771\n",
" macro avg 0.89 0.89 0.89 61771\n",
" weighted avg 0.97 0.97 0.97 61771\n",
"\n"
]
}
],
"source": [
"### Logistic Regression\n",
"lr_ob = LogisticRegression()\n",
"lr_ob.fit(X_train_ob,y_train_ob)\n",
"y_pred_ob = lr_ob.predict(X_test_ob)\n",
"print(f\"Accuracy: {accuracy_score(y_test_ob, y_pred_ob):.4f}\")\n",
"print(classification_report(y_test_ob, y_pred_ob))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model saved to lr_ht.pkl\n"
]
}
],
"source": [
"model_filename = 'lr_ht.pkl'\n",
"with open(model_filename, 'wb') as file:\n",
" pickle.dump(lr_ht, file)\n",
"\n",
"print(f\"Model saved to {model_filename}\")"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model saved to lr_dt.pkl\n"
]
}
],
"source": [
"model_filename = 'lr_dt.pkl'\n",
"with open(model_filename, 'wb') as file:\n",
" pickle.dump(lr_dt, file)\n",
"\n",
"print(f\"Model saved to {model_filename}\")"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model saved to lr_ob.pkl\n"
]
}
],
"source": [
"model_filename = 'lr_ob.pkl'\n",
"with open(model_filename, 'wb') as file:\n",
" pickle.dump(lr_ob, file)\n",
"\n",
"print(f\"Model saved to {model_filename}\")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded Model Accuracy: 0.9727\n"
]
}
],
"source": [
"with open(model_filename, 'rb') as file:\n",
" loaded_model = pickle.load(file)\n",
"\n",
"# Now you can use loaded_model to make predictions\n",
"y_pred_loaded = loaded_model.predict(X_test_ob)\n",
"print(f\"Loaded Model Accuracy: {accuracy_score(y_test_ob, y_pred_loaded):.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['General_Health', 'Checkup', 'Exercise', 'Skin_Cancer', 'Other_Cancer',\n",
" 'Depression', 'Arthritis', 'Sex', 'Age_Category', 'Height_(cm)',\n",
" 'Weight_(kg)', 'BMI', 'Smoking_History', 'Alcohol_Consumption',\n",
" 'Fruit_Consumption', 'Green_Vegetables_Consumption',\n",
" 'FriedPotato_Consumption'],\n",
" dtype='object')"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_ob.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['General_Health',\n",
" 'Checkup',\n",
" 'Exercise',\n",
" 'Heart_Disease',\n",
" 'Skin_Cancer',\n",
" 'Other_Cancer',\n",
" 'Depression',\n",
" 'Diabetes',\n",
" 'Arthritis',\n",
" 'Sex',\n",
" 'Age_Category',\n",
" 'Smoking_History']"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"categorical_columns.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Encoded Input DataFrame:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>General_Health</th>\n",
" <th>Checkup</th>\n",
" <th>Exercise</th>\n",
" <th>Depression</th>\n",
" <th>Arthritis</th>\n",
" <th>Sex</th>\n",
" <th>Age_Category</th>\n",
" <th>Height_(cm)</th>\n",
" <th>Weight_(kg)</th>\n",
" <th>BMI</th>\n",
" <th>Smoking_History</th>\n",
" <th>Alcohol_Consumption</th>\n",
" <th>Fruit_Consumption</th>\n",
" <th>Green_Vegetables_Consumption</th>\n",
" <th>FriedPotato_Consumption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>10</td>\n",
" <td>150</td>\n",
" <td>32.66</td>\n",
" <td>14.34</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>30.0</td>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" General_Health Checkup Exercise Depression Arthritis Sex \\\n",
"0 3 2 0 0 1 0 \n",
"\n",
" Age_Category Height_(cm) Weight_(kg) BMI Smoking_History \\\n",
"0 10 150 32.66 14.34 1 \n",
"\n",
" Alcohol_Consumption Fruit_Consumption Green_Vegetables_Consumption \\\n",
"0 0.0 30.0 16 \n",
"\n",
" FriedPotato_Consumption \n",
"0 12 "
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Define the columns\n",
"columns = ['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',\n",
" 'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',\n",
" 'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',\n",
" 'Green_Vegetables_Consumption', 'FriedPotato_Consumption']\n",
"\n",
"# Define the input values for each column (replace these with actual values)\n",
"input_values = {\n",
" 'General_Health': 'Poor', # Example values\n",
" 'Checkup': 'Within the past 2 years',\n",
" 'Exercise': 'No',\n",
" 'Depression': 'No',\n",
" 'Arthritis': 'Yes',\n",
" 'Sex': 'Female',\n",
" 'Age_Category': '70-74',\n",
" 'Height_(cm)': 150, # Example numerical values\n",
" 'Weight_(kg)': 32.66,\n",
" 'BMI': 14.34,\n",
" 'Smoking_History': 'Yes',\n",
" 'Alcohol_Consumption': 0.0,\n",
" 'Fruit_Consumption': 30.0,\n",
" 'Green_Vegetables_Consumption': 16,\n",
" 'FriedPotato_Consumption': 12\n",
"}\n",
"\n",
"# Create a DataFrame from input values\n",
"input_df = pd.DataFrame([input_values])\n",
"\n",
"# Encode categorical columns using the same LabelEncoders you used during training\n",
"for column in categorical_columns:\n",
" if column in input_df.columns:\n",
" # Transform the input values using the stored encoder\n",
" input_df[column] = label_encoders[column].transform(input_df[column].astype(str))\n",
"\n",
"# Display the input DataFrame after encoding\n",
"print(\"Encoded Input DataFrame:\")\n",
"input_df\n"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "The feature names should match those that were passed during fit.\nFeature names seen at fit time, yet now missing:\n- Other_Cancer\n- Skin_Cancer\n",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[116], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Prepare input for prediction (make sure to drop any non-feature columns if necessary)\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# X_input = input_df[columns]\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# Make the prediction\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m prediction \u001b[38;5;241m=\u001b[39m loaded_model\u001b[38;5;241m.\u001b[39mpredict(input_df)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# Output the prediction\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPredicted Class: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mprediction[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_base.py:351\u001b[0m, in \u001b[0;36mLinearClassifierMixin.predict\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 338\u001b[0m \u001b[39mPredict class labels for samples in X.\u001b[39;00m\n\u001b[0;32m 339\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 348\u001b[0m \u001b[39m Vector containing the class labels for each sample.\u001b[39;00m\n\u001b[0;32m 349\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 350\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(X)\n\u001b[1;32m--> 351\u001b[0m scores \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdecision_function(X)\n\u001b[0;32m 352\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(scores\u001b[39m.\u001b[39mshape) \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m 353\u001b[0m indices \u001b[39m=\u001b[39m xp\u001b[39m.\u001b[39mastype(scores \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m, \u001b[39mint\u001b[39m)\n",
"File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_base.py:332\u001b[0m, in \u001b[0;36mLinearClassifierMixin.decision_function\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 329\u001b[0m check_is_fitted(\u001b[39mself\u001b[39m)\n\u001b[0;32m 330\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(X)\n\u001b[1;32m--> 332\u001b[0m X \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_data(X, accept_sparse\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcsr\u001b[39m\u001b[39m\"\u001b[39m, reset\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[0;32m 333\u001b[0m scores \u001b[39m=\u001b[39m safe_sparse_dot(X, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcoef_\u001b[39m.\u001b[39mT, dense_output\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m) \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mintercept_\n\u001b[0;32m 334\u001b[0m \u001b[39mreturn\u001b[39;00m xp\u001b[39m.\u001b[39mreshape(scores, (\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m,)) \u001b[39mif\u001b[39;00m scores\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m] \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m \u001b[39melse\u001b[39;00m scores\n",
"File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:608\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m 537\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_validate_data\u001b[39m(\n\u001b[0;32m 538\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[0;32m 539\u001b[0m X\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mno_validation\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 544\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_params,\n\u001b[0;32m 545\u001b[0m ):\n\u001b[0;32m 546\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Validate input data and set or check the `n_features_in_` attribute.\u001b[39;00m\n\u001b[0;32m 547\u001b[0m \n\u001b[0;32m 548\u001b[0m \u001b[39m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 606\u001b[0m \u001b[39m validated.\u001b[39;00m\n\u001b[0;32m 607\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 608\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_feature_names(X, reset\u001b[39m=\u001b[39mreset)\n\u001b[0;32m 610\u001b[0m \u001b[39mif\u001b[39;00m y \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_tags()[\u001b[39m\"\u001b[39m\u001b[39mrequires_y\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m 611\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 612\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThis \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m estimator \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 613\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mrequires y to be passed, but the target y is None.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 614\u001b[0m )\n",
"File \u001b[1;32mc:\\ProgramData\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:535\u001b[0m, in \u001b[0;36mBaseEstimator._check_feature_names\u001b[1;34m(self, X, reset)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m missing_names \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m unexpected_names:\n\u001b[0;32m 531\u001b[0m message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\n\u001b[0;32m 532\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFeature names must be in the same order as they were in fit.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 533\u001b[0m )\n\u001b[1;32m--> 535\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(message)\n",
"\u001b[1;31mValueError\u001b[0m: The feature names should match those that were passed during fit.\nFeature names seen at fit time, yet now missing:\n- Other_Cancer\n- Skin_Cancer\n"
]
}
],
"source": [
"# Prepare input for prediction (make sure to drop any non-feature columns if necessary)\n",
"# X_input = input_df[columns]\n",
"\n",
"# Make the prediction\n",
"prediction = loaded_model.predict(input_df)\n",
"\n",
"# Output the prediction\n",
"print(f\"Predicted Class: {prediction[0]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Underweight'], dtype=object)"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prediction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.90208773, 0.01696106, 0.06855007, 0.01240114]])"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_filename = 'lr_dt.pkl'\n",
"\n",
"with open(model_filename, 'rb') as file:\n",
" loaded_model_lr_dt = pickle.load(file)\n",
"y_pred_loaded = loaded_model_lr_dt.predict_proba(input_df)\n",
"y_pred_loaded"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.12.4 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "3c06e3e46abf38078fe4dac36a0085ec2b134ebbd73dd076183d243eeca6918f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}