[7bf731]: / Dump / dump2.ipynb

Download this file

475 lines (474 with data), 15.7 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def load_data(path):\n",
    "    df = pd.read_csv(path)\n",
    "    # arham check this later\n",
    "    # original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')\n",
    "    # split to train test\n",
    "    train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)\n",
    "    train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    return train_df, test_df\n",
    "\n",
    "def encode_target(train):\n",
    "    target_key = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3, 'Obesity_Type_I': 4,'Obesity_Type_II' : 5, 'Obesity_Type_III': 6}\n",
    "    train['NObeyesdad'] = train['NObeyesdad'].map(target_key)\n",
    "    return train\n",
    "\n",
    "def decode_target(train):\n",
    "    target_key = {0: 'Insufficient_Weight', 1: 'Normal_Weight', 2: 'Overweight_Level_I', 3: 'Overweight_Level_II', 4: 'Obesity_Type_I', 5: 'Obesity_Type_II', 6: 'Obesity_Type_III'}\n",
    "    train['NObeyesdad'] = train['NObeyesdad'].map(target_key)\n",
    "    return train\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
    "train, test = load_data(path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Female    6789\n",
       "Male      6703\n",
       "Name: Gender, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train\n",
    "\n",
    "## gender\n",
    "train['Gender'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Obesity_Type_II', 'Overweight_Level_II', 'Normal_Weight',\n",
       "       'Obesity_Type_III', 'Obesity_Type_I', 'Overweight_Level_I',\n",
       "       'Insufficient_Weight'], dtype=object)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['NObeyesdad'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>Age</th>\n",
       "      <th>Height</th>\n",
       "      <th>Weight</th>\n",
       "      <th>family_history_with_overweight</th>\n",
       "      <th>FAVC</th>\n",
       "      <th>FCVC</th>\n",
       "      <th>NCP</th>\n",
       "      <th>CAEC</th>\n",
       "      <th>SMOKE</th>\n",
       "      <th>CH2O</th>\n",
       "      <th>SCC</th>\n",
       "      <th>FAF</th>\n",
       "      <th>TUE</th>\n",
       "      <th>CALC</th>\n",
       "      <th>MTRANS</th>\n",
       "      <th>NObeyesdad</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Male</td>\n",
       "      <td>23.586058</td>\n",
       "      <td>1.750000</td>\n",
       "      <td>119.434645</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>1.655684</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>no</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>1.097983</td>\n",
       "      <td>0.738935</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>Public_Transportation</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Male</td>\n",
       "      <td>24.565628</td>\n",
       "      <td>1.769328</td>\n",
       "      <td>85.079589</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>1.979944</td>\n",
       "      <td>3.566082</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>no</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.944675</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>Public_Transportation</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Female</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>1.650000</td>\n",
       "      <td>60.000000</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>Frequently</td>\n",
       "      <td>no</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>Walking</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Female</td>\n",
       "      <td>25.930376</td>\n",
       "      <td>1.610086</td>\n",
       "      <td>104.954291</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>no</td>\n",
       "      <td>2.411582</td>\n",
       "      <td>no</td>\n",
       "      <td>0.001297</td>\n",
       "      <td>0.656491</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>Public_Transportation</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Male</td>\n",
       "      <td>33.000000</td>\n",
       "      <td>1.700000</td>\n",
       "      <td>97.000000</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>Always</td>\n",
       "      <td>no</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>Frequently</td>\n",
       "      <td>Automobile</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13487</th>\n",
       "      <td>Female</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>1.722461</td>\n",
       "      <td>80.442775</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>2.628791</td>\n",
       "      <td>2.562895</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>no</td>\n",
       "      <td>1.844645</td>\n",
       "      <td>no</td>\n",
       "      <td>0.288032</td>\n",
       "      <td>0.722276</td>\n",
       "      <td>no</td>\n",
       "      <td>Public_Transportation</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13488</th>\n",
       "      <td>Male</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>1.750000</td>\n",
       "      <td>95.000000</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>no</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>Public_Transportation</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13489</th>\n",
       "      <td>Male</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>1.620000</td>\n",
       "      <td>68.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>yes</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>Always</td>\n",
       "      <td>no</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>no</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>Public_Transportation</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13490</th>\n",
       "      <td>Female</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>1.650125</td>\n",
       "      <td>111.939671</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>no</td>\n",
       "      <td>2.770732</td>\n",
       "      <td>no</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.237307</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>Public_Transportation</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13491</th>\n",
       "      <td>Male</td>\n",
       "      <td>37.997912</td>\n",
       "      <td>1.774330</td>\n",
       "      <td>107.998815</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>2.964419</td>\n",
       "      <td>2.902766</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>no</td>\n",
       "      <td>2.745242</td>\n",
       "      <td>no</td>\n",
       "      <td>2.545707</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>Sometimes</td>\n",
       "      <td>Automobile</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>13492 rows × 17 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Gender        Age    Height      Weight family_history_with_overweight  \\\n",
       "0        Male  23.586058  1.750000  119.434645                            yes   \n",
       "1        Male  24.565628  1.769328   85.079589                            yes   \n",
       "2      Female  22.000000  1.650000   60.000000                            yes   \n",
       "3      Female  25.930376  1.610086  104.954291                            yes   \n",
       "4        Male  33.000000  1.700000   97.000000                            yes   \n",
       "...       ...        ...       ...         ...                            ...   \n",
       "13487  Female  18.000000  1.722461   80.442775                            yes   \n",
       "13488    Male  22.000000  1.750000   95.000000                            yes   \n",
       "13489    Male  21.000000  1.620000   68.000000                             no   \n",
       "13490  Female  26.000000  1.650125  111.939671                            yes   \n",
       "13491    Male  37.997912  1.774330  107.998815                            yes   \n",
       "\n",
       "      FAVC      FCVC       NCP        CAEC SMOKE      CH2O SCC       FAF  \\\n",
       "0      yes  1.655684  3.000000   Sometimes    no  2.000000  no  1.097983   \n",
       "1      yes  1.979944  3.566082   Sometimes    no  2.000000  no  0.000000   \n",
       "2      yes  3.000000  3.000000  Frequently    no  2.000000  no  3.000000   \n",
       "3      yes  3.000000  3.000000   Sometimes    no  2.411582  no  0.001297   \n",
       "4      yes  2.000000  3.000000      Always    no  2.000000  no  3.000000   \n",
       "...    ...       ...       ...         ...   ...       ...  ..       ...   \n",
       "13487  yes  2.628791  2.562895   Sometimes    no  1.844645  no  0.288032   \n",
       "13488   no  3.000000  3.000000   Sometimes    no  2.000000  no  3.000000   \n",
       "13489  yes  2.000000  3.000000      Always    no  3.000000  no  2.000000   \n",
       "13490  yes  3.000000  3.000000   Sometimes    no  2.770732  no  0.000000   \n",
       "13491  yes  2.964419  2.902766   Sometimes    no  2.745242  no  2.545707   \n",
       "\n",
       "            TUE        CALC                 MTRANS  NObeyesdad  \n",
       "0      0.738935   Sometimes  Public_Transportation           5  \n",
       "1      1.944675   Sometimes  Public_Transportation           3  \n",
       "2      0.000000          no                Walking           1  \n",
       "3      0.656491   Sometimes  Public_Transportation           6  \n",
       "4      0.000000  Frequently             Automobile           4  \n",
       "...         ...         ...                    ...         ...  \n",
       "13487  0.722276          no  Public_Transportation           2  \n",
       "13488  0.000000   Sometimes  Public_Transportation           3  \n",
       "13489  0.000000   Sometimes  Public_Transportation           2  \n",
       "13490  0.237307   Sometimes  Public_Transportation           6  \n",
       "13491  0.000000   Sometimes             Automobile           4  \n",
       "\n",
       "[13492 rows x 17 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = encode_target(train)\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DataScience",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}