475 lines (474 with data), 15.7 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Helper functions"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"def load_data(path, test_size=0.35, random_state=42, id_col='id'):\n",
"    \"\"\"Read a CSV and return de-duplicated train/test frames.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    path : str or path-like\n",
"        Location of the CSV file, passed straight to ``pd.read_csv``.\n",
"    test_size : float or int, default 0.35\n",
"        Forwarded to ``train_test_split``.\n",
"    random_state : int, default 42\n",
"        Seed forwarded to ``train_test_split`` so the split is reproducible.\n",
"    id_col : str or None, default 'id'\n",
"        Identifier column removed from both splits. A file without this\n",
"        column is accepted; pass ``None`` to keep every column.\n",
"\n",
"    Returns\n",
"    -------\n",
"    (train_df, test_df) : tuple of DataFrame\n",
"        Each split has ``id_col`` removed, exact duplicate rows dropped and\n",
"        its index reset to 0..n-1.\n",
"    \"\"\"\n",
"    df = pd.read_csv(path)\n",
"    # TODO(arham): revisit whether the original ObesityDataSet.csv from\n",
"    # /kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ should be read here too.\n",
"    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)\n",
"\n",
"    def _tidy(split):\n",
"        # Drop the id before de-duplicating so rows are compared on the remaining columns only.\n",
"        if id_col is not None:\n",
"            split = split.drop(columns=[id_col], errors='ignore')\n",
"        return split.drop_duplicates().reset_index(drop=True)\n",
"\n",
"    # NOTE(review): de-duplication runs per split, so a row that lands in both\n",
"    # train and test is kept in both.\n",
"    return _tidy(train_df), _tidy(test_df)\n",
"\n",
"def encode_target(train):\n",
"    \"\"\"Return a copy of ``train`` with ``NObeyesdad`` labels replaced by integer codes.\n",
"\n",
"    Codes run from 0 (``Insufficient_Weight``) to 6 (``Obesity_Type_III``).\n",
"    The frame passed in is left unmodified. A column that already holds only\n",
"    these codes is returned as-is, so re-running the encoding cell is harmless.\n",
"    Labels that are not in the mapping become NaN.\n",
"    \"\"\"\n",
"    target_key = {\n",
"        'Insufficient_Weight': 0,\n",
"        'Normal_Weight': 1,\n",
"        'Overweight_Level_I': 2,\n",
"        'Overweight_Level_II': 3,\n",
"        'Obesity_Type_I': 4,\n",
"        'Obesity_Type_II': 5,\n",
"        'Obesity_Type_III': 6,\n",
"    }\n",
"    labels = train['NObeyesdad']\n",
"    # Already encoded (e.g. the cell was run twice): mapping again would turn every code into NaN.\n",
"    if labels.isin(list(target_key.values())).all():\n",
"        return train.copy()\n",
"    # assign() builds a new frame, so the caller's frame keeps its string labels.\n",
"    return train.assign(NObeyesdad=labels.map(target_key))\n",
"\n",
"def decode_target(train):\n",
"    \"\"\"Return a copy of ``train`` with ``NObeyesdad`` integer codes turned back into labels.\n",
"\n",
"    Inverse of the 0..6 encoding (0 is ``Insufficient_Weight``, 6 is\n",
"    ``Obesity_Type_III``). The frame passed in is left unmodified. A column\n",
"    that already holds only these labels is returned as-is, so decoding twice\n",
"    is harmless. Codes that are not in the mapping become NaN.\n",
"    \"\"\"\n",
"    target_key = {\n",
"        0: 'Insufficient_Weight',\n",
"        1: 'Normal_Weight',\n",
"        2: 'Overweight_Level_I',\n",
"        3: 'Overweight_Level_II',\n",
"        4: 'Obesity_Type_I',\n",
"        5: 'Obesity_Type_II',\n",
"        6: 'Obesity_Type_III',\n",
"    }\n",
"    codes = train['NObeyesdad']\n",
"    # Already decoded: mapping the label strings again would turn every value into NaN.\n",
"    if codes.isin(list(target_key.values())).all():\n",
"        return train.copy()\n",
"    # assign() builds a new frame, so the caller's frame keeps its integer codes.\n",
"    return train.assign(NObeyesdad=codes.map(target_key))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"\n",
"# Location of the training CSV. Set the OBESITY_TRAIN_CSV environment variable to\n",
"# point at your own copy; the fallback is the original author's local path.\n",
"path = os.environ.get(\n",
"    'OBESITY_TRAIN_CSV',\n",
"    '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv',\n",
")\n",
"train, test = load_data(path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploratory data analysis (EDA)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Female 6789\n",
"Male 6703\n",
"Name: Gender, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Gender: number of rows per category in the training split.\n",
"# (A bare `train` line used to sit above this; only a cell's last expression is\n",
"# displayed, so it had no effect and was removed.)\n",
"train['Gender'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature engineering"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Obesity_Type_II', 'Overweight_Level_II', 'Normal_Weight',\n",
" 'Obesity_Type_III', 'Obesity_Type_I', 'Overweight_Level_I',\n",
" 'Insufficient_Weight'], dtype=object)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct values currently present in the target column.\n",
"train['NObeyesdad'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Height</th>\n",
" <th>Weight</th>\n",
" <th>family_history_with_overweight</th>\n",
" <th>FAVC</th>\n",
" <th>FCVC</th>\n",
" <th>NCP</th>\n",
" <th>CAEC</th>\n",
" <th>SMOKE</th>\n",
" <th>CH2O</th>\n",
" <th>SCC</th>\n",
" <th>FAF</th>\n",
" <th>TUE</th>\n",
" <th>CALC</th>\n",
" <th>MTRANS</th>\n",
" <th>NObeyesdad</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Male</td>\n",
" <td>23.586058</td>\n",
" <td>1.750000</td>\n",
" <td>119.434645</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>1.655684</td>\n",
" <td>3.000000</td>\n",
" <td>Sometimes</td>\n",
" <td>no</td>\n",
" <td>2.000000</td>\n",
" <td>no</td>\n",
" <td>1.097983</td>\n",
" <td>0.738935</td>\n",
" <td>Sometimes</td>\n",
" <td>Public_Transportation</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Male</td>\n",
" <td>24.565628</td>\n",
" <td>1.769328</td>\n",
" <td>85.079589</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>1.979944</td>\n",
" <td>3.566082</td>\n",
" <td>Sometimes</td>\n",
" <td>no</td>\n",
" <td>2.000000</td>\n",
" <td>no</td>\n",
" <td>0.000000</td>\n",
" <td>1.944675</td>\n",
" <td>Sometimes</td>\n",
" <td>Public_Transportation</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Female</td>\n",
" <td>22.000000</td>\n",
" <td>1.650000</td>\n",
" <td>60.000000</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>Frequently</td>\n",
" <td>no</td>\n",
" <td>2.000000</td>\n",
" <td>no</td>\n",
" <td>3.000000</td>\n",
" <td>0.000000</td>\n",
" <td>no</td>\n",
" <td>Walking</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Female</td>\n",
" <td>25.930376</td>\n",
" <td>1.610086</td>\n",
" <td>104.954291</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>Sometimes</td>\n",
" <td>no</td>\n",
" <td>2.411582</td>\n",
" <td>no</td>\n",
" <td>0.001297</td>\n",
" <td>0.656491</td>\n",
" <td>Sometimes</td>\n",
" <td>Public_Transportation</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Male</td>\n",
" <td>33.000000</td>\n",
" <td>1.700000</td>\n",
" <td>97.000000</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" <td>Always</td>\n",
" <td>no</td>\n",
" <td>2.000000</td>\n",
" <td>no</td>\n",
" <td>3.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Frequently</td>\n",
" <td>Automobile</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13487</th>\n",
" <td>Female</td>\n",
" <td>18.000000</td>\n",
" <td>1.722461</td>\n",
" <td>80.442775</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>2.628791</td>\n",
" <td>2.562895</td>\n",
" <td>Sometimes</td>\n",
" <td>no</td>\n",
" <td>1.844645</td>\n",
" <td>no</td>\n",
" <td>0.288032</td>\n",
" <td>0.722276</td>\n",
" <td>no</td>\n",
" <td>Public_Transportation</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13488</th>\n",
" <td>Male</td>\n",
" <td>22.000000</td>\n",
" <td>1.750000</td>\n",
" <td>95.000000</td>\n",
" <td>yes</td>\n",
" <td>no</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>Sometimes</td>\n",
" <td>no</td>\n",
" <td>2.000000</td>\n",
" <td>no</td>\n",
" <td>3.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Sometimes</td>\n",
" <td>Public_Transportation</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13489</th>\n",
" <td>Male</td>\n",
" <td>21.000000</td>\n",
" <td>1.620000</td>\n",
" <td>68.000000</td>\n",
" <td>no</td>\n",
" <td>yes</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" <td>Always</td>\n",
" <td>no</td>\n",
" <td>3.000000</td>\n",
" <td>no</td>\n",
" <td>2.000000</td>\n",
" <td>0.000000</td>\n",
" <td>Sometimes</td>\n",
" <td>Public_Transportation</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13490</th>\n",
" <td>Female</td>\n",
" <td>26.000000</td>\n",
" <td>1.650125</td>\n",
" <td>111.939671</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>Sometimes</td>\n",
" <td>no</td>\n",
" <td>2.770732</td>\n",
" <td>no</td>\n",
" <td>0.000000</td>\n",
" <td>0.237307</td>\n",
" <td>Sometimes</td>\n",
" <td>Public_Transportation</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13491</th>\n",
" <td>Male</td>\n",
" <td>37.997912</td>\n",
" <td>1.774330</td>\n",
" <td>107.998815</td>\n",
" <td>yes</td>\n",
" <td>yes</td>\n",
" <td>2.964419</td>\n",
" <td>2.902766</td>\n",
" <td>Sometimes</td>\n",
" <td>no</td>\n",
" <td>2.745242</td>\n",
" <td>no</td>\n",
" <td>2.545707</td>\n",
" <td>0.000000</td>\n",
" <td>Sometimes</td>\n",
" <td>Automobile</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13492 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" Gender Age Height Weight family_history_with_overweight \\\n",
"0 Male 23.586058 1.750000 119.434645 yes \n",
"1 Male 24.565628 1.769328 85.079589 yes \n",
"2 Female 22.000000 1.650000 60.000000 yes \n",
"3 Female 25.930376 1.610086 104.954291 yes \n",
"4 Male 33.000000 1.700000 97.000000 yes \n",
"... ... ... ... ... ... \n",
"13487 Female 18.000000 1.722461 80.442775 yes \n",
"13488 Male 22.000000 1.750000 95.000000 yes \n",
"13489 Male 21.000000 1.620000 68.000000 no \n",
"13490 Female 26.000000 1.650125 111.939671 yes \n",
"13491 Male 37.997912 1.774330 107.998815 yes \n",
"\n",
" FAVC FCVC NCP CAEC SMOKE CH2O SCC FAF \\\n",
"0 yes 1.655684 3.000000 Sometimes no 2.000000 no 1.097983 \n",
"1 yes 1.979944 3.566082 Sometimes no 2.000000 no 0.000000 \n",
"2 yes 3.000000 3.000000 Frequently no 2.000000 no 3.000000 \n",
"3 yes 3.000000 3.000000 Sometimes no 2.411582 no 0.001297 \n",
"4 yes 2.000000 3.000000 Always no 2.000000 no 3.000000 \n",
"... ... ... ... ... ... ... .. ... \n",
"13487 yes 2.628791 2.562895 Sometimes no 1.844645 no 0.288032 \n",
"13488 no 3.000000 3.000000 Sometimes no 2.000000 no 3.000000 \n",
"13489 yes 2.000000 3.000000 Always no 3.000000 no 2.000000 \n",
"13490 yes 3.000000 3.000000 Sometimes no 2.770732 no 0.000000 \n",
"13491 yes 2.964419 2.902766 Sometimes no 2.745242 no 2.545707 \n",
"\n",
" TUE CALC MTRANS NObeyesdad \n",
"0 0.738935 Sometimes Public_Transportation 5 \n",
"1 1.944675 Sometimes Public_Transportation 3 \n",
"2 0.000000 no Walking 1 \n",
"3 0.656491 Sometimes Public_Transportation 6 \n",
"4 0.000000 Frequently Automobile 4 \n",
"... ... ... ... ... \n",
"13487 0.722276 no Public_Transportation 2 \n",
"13488 0.000000 Sometimes Public_Transportation 3 \n",
"13489 0.000000 Sometimes Public_Transportation 2 \n",
"13490 0.237307 Sometimes Public_Transportation 6 \n",
"13491 0.000000 Sometimes Automobile 4 \n",
"\n",
"[13492 rows x 17 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Swap the NObeyesdad labels for the integer codes defined in encode_target;\n",
"# `train` is rebound to the frame that encode_target returns.\n",
"train = encode_target(train)\n",
"train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "DataScience",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}