[b4c0b6]: / 03-Experiments / 05-LightBGM_With_FE.ipynb

Download this file

867 lines (866 with data), 36.6 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Gloabl Experiment Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024/04/25 15:58:03 INFO mlflow.tracking.fluent: Experiment with name 'LightGBM' does not exist. Creating a new experiment.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Experiment: artifact_location='/Users/arham/Downloads/Projects/03-Experiments/mlruns/4', creation_time=1714075083201, experiment_id='4', last_update_time=1714075083201, lifecycle_stage='active', name='LightGBM', tags={}>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import mlflow\n",
    "# Set the MLflow tracking URI to a new SQLite URI\n",
    "mlflow.set_tracking_uri(\"sqlite:///new_mlflow.db\")\n",
    "mlflow.set_experiment(\"LightGBM\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "import lightgbm as lgb\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "def load_data(path):\n",
    "    df = pd.read_csv(path)\n",
    "    train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)\n",
    "    train_df, val_df,  = train_test_split(train_df, test_size=0.20, random_state=42)\n",
    "    train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    val_df = val_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    return train_df, val_df, test_df\n",
    "\n",
    "def encode_target(train):\n",
    "    target_key = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3, 'Obesity_Type_I': 4,'Obesity_Type_II' : 5, 'Obesity_Type_III': 6}\n",
    "    train['NObeyesdad'] = train['NObeyesdad'].map(target_key)\n",
    "    return train\n",
    "\n",
    "def make_gender_binary(train):\n",
    "    train['Gender'] = train['Gender'].map({'Male':0, 'Female':1})\n",
    "\n",
    "def datatypes(train):\n",
    "    train['Weight'] = train['Weight'].astype(float)\n",
    "    train['Age'] = train['Age'].astype(float)\n",
    "    train['Height'] = train['Height'].astype(float)\n",
    "    return train\n",
    "\n",
    "# def age_binning(train_df):\n",
    "#     train_df['Age_Group'] = pd.cut(train_df['Age'], bins=[0, 20, 30, 40, 50, train_df['Age'].max()], labels=['0-20', '21-30', '31-40', '41-50', '50+'])\n",
    "#     return train_df\n",
    "\n",
    "def age_binning(df):\n",
    "    age_groups = []\n",
    "    for age in df['Age']:\n",
    "        if age <= 20:\n",
    "            age_group = 1\n",
    "        elif age <= 30:\n",
    "            age_group = 2\n",
    "        elif age <= 40:\n",
    "            age_group = 3\n",
    "        elif age <= 50:\n",
    "            age_group = 4\n",
    "        else:\n",
    "            age_group = 5\n",
    "        age_groups.append(age_group)\n",
    "    df['Age_Group'] = age_groups\n",
    "    return df\n",
    "\n",
    "def age_scaling_log(train_df):\n",
    "    train_df['Age'] = train_df['Age'].astype(float)\n",
    "    train_df['Log_Age'] = np.log1p(train_df['Age'])\n",
    "    return train_df\n",
    "\n",
    "def age_scaling_minmax(train_df):\n",
    "    train_df['Age'] = train_df['Age'].astype(float)\n",
    "    scaler_age = MinMaxScaler()\n",
    "    train_df['Scaled_Age'] = scaler_age.fit_transform(train_df['Age'].values.reshape(-1, 1))\n",
    "    return train_df, scaler_age\n",
    "\n",
    "def weight_scaling_log(train_df):\n",
    "    train_df['Weight'] = train_df['Weight'].astype(float)\n",
    "    train_df['Log_Weight'] = np.log1p(train_df['Weight'])\n",
    "    return train_df\n",
    "\n",
    "def weight_scaling_minmax(train_df):\n",
    "    train_df['Weight'] = train_df['Weight'].astype(float)\n",
    "    scaler_weight = MinMaxScaler()\n",
    "    train_df['Scaled_Weight'] = scaler_weight.fit_transform(train_df['Weight'].values.reshape(-1, 1))\n",
    "    return train_df, scaler_weight\n",
    "\n",
    "def height_scaling_log(train_df):\n",
    "    train_df['Log_Height'] = np.log1p(train_df['Height'])\n",
    "    return train_df\n",
    "\n",
    "def height_scaling_minmax(train_df):\n",
    "    scaler_height = MinMaxScaler()\n",
    "    train_df['Scaled_Height'] = scaler_height.fit_transform(train_df['Height'].values.reshape(-1, 1))\n",
    "    return train_df, scaler_height\n",
    "\n",
    "def make_gender_binary(train):\n",
    "    train['Gender'] = train['Gender'].map({'Female':1, 'Male':0})\n",
    "    return train\n",
    "\n",
    "def fix_binary_columns(train):\n",
    "    Binary_Cols = ['family_history_with_overweight','FAVC', 'SCC','SMOKE']\n",
    "    # if yes then 1 else 0\n",
    "    for col in Binary_Cols:\n",
    "        train[col] = train[col].map({'yes': 1, 'no': 0})\n",
    "    return train\n",
    "\n",
    "def freq_cat_cols(train):\n",
    "    # One hot encoding\n",
    "    cat_cols = ['CAEC', 'CALC']\n",
    "    for col in cat_cols:\n",
    "        train[col] = train[col].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})\n",
    "    return train\n",
    "\n",
    "def Mtrans(train):\n",
    "    \"\"\"\n",
    "    Public_Transportation    8692\n",
    "    Automobile               1835\n",
    "    Walking                   231\n",
    "    Motorbike                  19\n",
    "    Bike                       16\n",
    "    \"\"\"\n",
    "    # train['MTRANS'] = train['MTRANS'].map({'Public_Transportation': 3, 'Automobile': 5, 'Walking': 1, 'Motorbike': 4, 'Bike': 2})\n",
    "    # dummify column\n",
    "    train = pd.get_dummies(train, columns=['MTRANS'])\n",
    "    return train\n",
    "\n",
    "\n",
    "def other_features(train):\n",
    "    train['BMI'] = train['Weight'] / (train['Height'] ** 2)\n",
    "    # train['Age'*'Gender'] = train['Age'] * train['Gender']\n",
    "    polynomial_features = PolynomialFeatures(degree=2)\n",
    "    X_poly = polynomial_features.fit_transform(train[['Age', 'BMI']])\n",
    "    poly_features_df = pd.DataFrame(X_poly, columns=['Age^2', 'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2'])\n",
    "    train = pd.concat([train, poly_features_df], axis=1)\n",
    "    return train\n",
    "\n",
    "\n",
    "def test_pipeline(test, scaler_age, scaler_weight, scaler_height):\n",
    "    test = datatypes(test)\n",
    "    test = encode_target(test)\n",
    "    test = age_binning(test)\n",
    "    test = age_scaling_log(test)\n",
    "    test['Scaled_Age'] = scaler_age.transform(test['Age'].values.reshape(-1, 1))\n",
    "    test = weight_scaling_log(test)\n",
    "    test['Scaled_Weight'] = scaler_weight.transform(test['Weight'].values.reshape(-1, 1))\n",
    "    test = height_scaling_log(test)\n",
    "    test['Scaled_Height'] = scaler_height.transform(test['Height'].values.reshape(-1, 1))\n",
    "    test = make_gender_binary(test)\n",
    "    test = fix_binary_columns(test)\n",
    "    test = freq_cat_cols(test)\n",
    "    test = Mtrans(test)\n",
    "    test = other_features(test)\n",
    "\n",
    "    return test\n",
    "\n",
    "def train_model(params, X_train, y_train):\n",
    "    lgb_train = lgb.Dataset(X_train, y_train)\n",
    "    model = lgb.train(params, lgb_train, num_boost_round=1000)\n",
    "    return model\n",
    "\n",
    "def evaluate_model(model, X_val, y_val):\n",
    "    y_pred = model.predict(X_val)\n",
    "    y_pred = [np.argmax(y) for y in y_pred]\n",
    "    accuracy = accuracy_score(y_val, y_pred)\n",
    "    return accuracy\n",
    "\n",
    "def objective(trial, X_train, y_train):\n",
    "    params = {\n",
    "        'objective': 'multiclass',\n",
    "        'num_class': 7,\n",
    "        'metric': 'multi_logloss',\n",
    "        'boosting_type': 'gbdt',\n",
    "        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.5),\n",
    "        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),\n",
    "        'max_depth': trial.suggest_int('max_depth', -1, 20),\n",
    "        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 0.95),\n",
    "        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 0.95),\n",
    "        'verbosity': -1\n",
    "    }\n",
    "\n",
    "    n_splits = 5\n",
    "    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)\n",
    "    scores = []\n",
    "\n",
    "    for train_index, val_index in kf.split(X_train, y_train):\n",
    "        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]\n",
    "        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]\n",
    "\n",
    "        model = train_model(params, X_tr, y_tr)\n",
    "        accuracy = evaluate_model(model, X_val, y_val)\n",
    "        scores.append(accuracy)\n",
    "\n",
    "    return np.mean(scores)\n",
    "\n",
    "def optimize_hyperparameters(X_train, y_train, n_trials=2):\n",
    "    study = optuna.create_study(direction='maximize')\n",
    "    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=n_trials)\n",
    "    return study.best_params\n",
    "\n",
    "def New_Test_Instances_Pipeline(test, scaler_age, scaler_weight, scaler_height):\n",
    "    test = datatypes(test)\n",
    "    test = age_binning(test)\n",
    "    test = age_scaling_log(test)\n",
    "    test['Scaled_Age'] = scaler_age.transform(test['Age'].values.reshape(-1, 1))\n",
    "    test = weight_scaling_log(test)\n",
    "    test['Scaled_Weight'] = scaler_weight.transform(test['Weight'].values.reshape(-1, 1))\n",
    "    test = height_scaling_log(test)\n",
    "    test['Scaled_Height'] = scaler_height.transform(test['Height'].values.reshape(-1, 1))\n",
    "    test = make_gender_binary(test)\n",
    "    test = fix_binary_columns(test)\n",
    "    test = freq_cat_cols(test)\n",
    "    test = Mtrans(test)\n",
    "    test = other_features(test)\n",
    "\n",
    "    return test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]\n",
      "[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001163 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 3576\n",
      "[LightGBM] [Info] Number of data points in the train set: 10793, number of used features: 25\n",
      "[LightGBM] [Info] Start training from score -2.103541\n",
      "[LightGBM] [Info] Start training from score -1.893390\n",
      "[LightGBM] [Info] Start training from score -2.159762\n",
      "[LightGBM] [Info] Start training from score -2.113461\n",
      "[LightGBM] [Info] Start training from score -1.974767\n",
      "[LightGBM] [Info] Start training from score -1.867272\n",
      "[LightGBM] [Info] Start training from score -1.619963\n",
      "[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000883 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 3576\n",
      "[LightGBM] [Info] Number of data points in the train set: 8634, number of used features: 25\n",
      "[LightGBM] [Info] Start training from score -2.104065\n",
      "[LightGBM] [Info] Start training from score -1.893344\n",
      "[LightGBM] [Info] Start training from score -2.159716\n",
      "[LightGBM] [Info] Start training from score -2.113607\n",
      "[LightGBM] [Info] Start training from score -1.974220\n",
      "[LightGBM] [Info] Start training from score -1.867526\n",
      "[LightGBM] [Info] Start training from score -1.619799\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001080 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 3573\n",
      "[LightGBM] [Info] Number of data points in the train set: 8634, number of used features: 25\n",
      "[LightGBM] [Info] Start training from score -2.104065\n",
      "[LightGBM] [Info] Start training from score -1.893344\n",
      "[LightGBM] [Info] Start training from score -2.159716\n",
      "[LightGBM] [Info] Start training from score -2.112648\n",
      "[LightGBM] [Info] Start training from score -1.974220\n",
      "[LightGBM] [Info] Start training from score -1.867526\n",
      "[LightGBM] [Info] Start training from score -1.620385\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000459 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 3572\n",
      "[LightGBM] [Info] Number of data points in the train set: 8634, number of used features: 25\n",
      "[LightGBM] [Info] Start training from score -2.103115\n",
      "[LightGBM] [Info] Start training from score -1.893344\n",
      "[LightGBM] [Info] Start training from score -2.159716\n",
      "[LightGBM] [Info] Start training from score -2.113607\n",
      "[LightGBM] [Info] Start training from score -1.975054\n",
      "[LightGBM] [Info] Start training from score -1.867526\n",
      "[LightGBM] [Info] Start training from score -1.619799\n",
      "[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001021 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 3571\n",
      "[LightGBM] [Info] Number of data points in the train set: 8635, number of used features: 25\n",
      "[LightGBM] [Info] Start training from score -2.103231\n",
      "[LightGBM] [Info] Start training from score -1.893459\n",
      "[LightGBM] [Info] Start training from score -2.159832\n",
      "[LightGBM] [Info] Start training from score -2.113723\n",
      "[LightGBM] [Info] Start training from score -1.975170\n",
      "[LightGBM] [Info] Start training from score -1.866892\n",
      "[LightGBM] [Info] Start training from score -1.619915\n",
      "[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000919 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 3575\n",
      "[LightGBM] [Info] Number of data points in the train set: 8635, number of used features: 25\n",
      "[LightGBM] [Info] Start training from score -2.103231\n",
      "[LightGBM] [Info] Start training from score -1.893459\n",
      "[LightGBM] [Info] Start training from score -2.159832\n",
      "[LightGBM] [Info] Start training from score -2.113723\n",
      "[LightGBM] [Info] Start training from score -1.975170\n",
      "[LightGBM] [Info] Start training from score -1.866892\n",
      "[LightGBM] [Info] Start training from score -1.619915\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "\n",
      "Accuracy: 0.904845733345687\n",
      "Precision: 0.9046557231546489\n",
      "Recall: 0.904845733345687\n",
      "F1: 0.9046297258523301\n",
      "[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001173 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 3576\n",
      "[LightGBM] [Info] Number of data points in the train set: 10793, number of used features: 25\n",
      "[LightGBM] [Info] Start training from score -2.103541\n",
      "[LightGBM] [Info] Start training from score -1.893390\n",
      "[LightGBM] [Info] Start training from score -2.159762\n",
      "[LightGBM] [Info] Start training from score -2.113461\n",
      "[LightGBM] [Info] Start training from score -1.974767\n",
      "[LightGBM] [Info] Start training from score -1.867272\n",
      "[LightGBM] [Info] Start training from score -1.619963\n",
      "Recall for class 0: 0.9367088607594937\n",
      "Recall for class 1: 0.9117647058823529\n",
      "Recall for class 2: 0.755223880597015\n",
      "Recall for class 3: 0.8267477203647416\n",
      "Recall for class 4: 0.8669833729216152\n",
      "Recall for class 5: 0.9617224880382775\n",
      "Recall for class 6: 0.9960474308300395\n"
     ]
    }
   ],
   "source": [
    "path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
    "train_df, val_df, test_df = load_data(path)\n",
    "\n",
    "# train test val pipeline\n",
    "train_df = datatypes(train_df)\n",
    "train_df = encode_target(train_df)\n",
    "train_df = age_binning(train_df)\n",
    "train_df, scaler_age = age_scaling_minmax(train_df)\n",
    "train_df = age_scaling_log(train_df)\n",
    "train_df, scaler_weight = weight_scaling_minmax(train_df)\n",
    "train_df = weight_scaling_log(train_df)\n",
    "train_df, scaler_height = height_scaling_minmax(train_df)\n",
    "train_df = height_scaling_log(train_df)\n",
    "train_df = make_gender_binary(train_df)\n",
    "train_df = fix_binary_columns(train_df)\n",
    "train_df = freq_cat_cols(train_df)\n",
    "train_df = Mtrans(train_df)\n",
    "train_df = other_features(train_df)\n",
    "val_df = test_pipeline(val_df, scaler_age, scaler_weight, scaler_height)\n",
    "test_df = test_pipeline(test_df, scaler_age, scaler_weight, scaler_height)\n",
    "\n",
    "\n",
    "# target & predictors\n",
    "Target = 'NObeyesdad'\n",
    "features = ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',\n",
    "       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',\n",
    "       'CALC', 'Age_Group', \n",
    "       'MTRANS_Automobile', 'MTRANS_Bike', 'MTRANS_Motorbike',\n",
    "       'MTRANS_Public_Transportation', 'MTRANS_Walking', 'BMI', 'Age^2',\n",
    "       'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2'] \n",
    "        #'Scaled_Age', 'Log_Age', 'Scaled_Weight', 'Log_Weight', 'Scaled_Height', 'Log_Height',\n",
    "\n",
    "X_train = train_df[features]\n",
    "y_train = train_df[Target]\n",
    "X_val = val_df[features]\n",
    "y_val = val_df[Target]\n",
    "X_test = test_df[features]\n",
    "y_test = test_df[Target]\n",
    "\n",
    "\n",
    "lgb_train = lgb.Dataset(X_train, y_train)\n",
    "params = {\n",
    "    'objective': 'multiclass',\n",
    "    'num_class': 7,\n",
    "    'metric': 'multi_logloss',\n",
    "}\n",
    "\n",
    "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n",
    "import mlflow\n",
    "import lightgbm as lgb\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "\n",
    "mlflow.sklearn.autolog(disable=True)\n",
    "\n",
    "with mlflow.start_run(run_name=\"LGBM_without_FE_v2\"):\n",
    "    class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]\n",
    "    class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]\n",
    "    target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]\n",
    "    print(f\"Target Drift For Each Class {target_drift}\")\n",
    "    mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n",
    "\n",
    "    model = LGBMClassifier(**params)  # Assuming you have your parameters defined somewhere\n",
    "    model.fit(X_train, y_train)  # Fit the model on training data\n",
    "\n",
    "    # CV predictions of LightGBM\n",
    "    cv_predictions = cross_val_predict(model, X_train, y_train, cv=5)\n",
    "    accuracy_lgbm = accuracy_score(y_train, cv_predictions)\n",
    "    \n",
    "    # Compute precision, recall, and F1-score\n",
    "    precision_lgbm, recall_lgbm, f1_lgbm, _ = precision_recall_fscore_support(y_train, cv_predictions, average='weighted')\n",
    "    \n",
    "    print(\"\\nAccuracy:\", accuracy_lgbm)\n",
    "    print(\"Precision:\", precision_lgbm)\n",
    "    print(\"Recall:\", recall_lgbm)\n",
    "    print(\"F1:\", f1_lgbm)\n",
    "    \n",
    "    mlflow.log_metric('accuracy', accuracy_lgbm)\n",
    "    mlflow.log_metric('precision', precision_lgbm)\n",
    "    mlflow.log_metric('recall', recall_lgbm)\n",
    "    mlflow.log_metric('f1', f1_lgbm)\n",
    "\n",
    "    model.fit(X_train, y_train)\n",
    "    y_val_pred_lgbm = model.predict(X_val)\n",
    "    \n",
    "    # Compute precision, recall, and F1-score per class\n",
    "    precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_val_pred_lgbm, average=None)\n",
    "    for i in range(len(recall_per_class)):\n",
    "        print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
    "        mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
    "\n",
    "    mlflow.lightgbm.log_model(model, 'model')\n",
    "    mlflow.set_tag('experiments', 'Arham A.')\n",
    "    mlflow.set_tag('model_name', 'LGBM')\n",
    "    mlflow.set_tag('preprocessing', 'Yes')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df, val_df, test_df = load_data(path)\n",
    "\n",
    "\n",
    "# X_val = val_df[features]\n",
    "# y_val = val_df[Target]\n",
    "# y_pred = model.predict(X_val, num_iteration=model.best_iteration)\n",
    "# # y_pred to a dataframe\n",
    "# y_pred = pd.DataFrame(y_pred, columns=['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'])\n",
    "# # add prefix to columns \"prob_lgbm_\"\n",
    "# y_pred = y_pred.add_prefix('prob_lgbm_')\n",
    "# # add to X_val\n",
    "# X_val = pd.concat([X_val, y_pred], axis=1)\n",
    "# # export as stack_aid_lgbm.csv\n",
    "# X_val.to_csv('stack_aid_lgbm.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>Age</th>\n",
       "      <th>Height</th>\n",
       "      <th>Weight</th>\n",
       "      <th>family_history_with_overweight</th>\n",
       "      <th>FAVC</th>\n",
       "      <th>FCVC</th>\n",
       "      <th>NCP</th>\n",
       "      <th>CAEC</th>\n",
       "      <th>SMOKE</th>\n",
       "      <th>CH2O</th>\n",
       "      <th>SCC</th>\n",
       "      <th>FAF</th>\n",
       "      <th>TUE</th>\n",
       "      <th>CALC</th>\n",
       "      <th>Age_Group</th>\n",
       "      <th>MTRANS_Automobile</th>\n",
       "      <th>MTRANS_Bike</th>\n",
       "      <th>MTRANS_Motorbike</th>\n",
       "      <th>MTRANS_Public_Transportation</th>\n",
       "      <th>MTRANS_Walking</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Age^2</th>\n",
       "      <th>Age^3</th>\n",
       "      <th>BMI^2</th>\n",
       "      <th>Age * BMI</th>\n",
       "      <th>Age * BMI^2</th>\n",
       "      <th>Age^2 * BMI^2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>1.550000</td>\n",
       "      <td>51.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>21-30</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>21.227888</td>\n",
       "      <td>1.0</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>21.227888</td>\n",
       "      <td>441.000000</td>\n",
       "      <td>445.785640</td>\n",
       "      <td>450.623213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>1.700000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0-20</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>27.681661</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>27.681661</td>\n",
       "      <td>400.000000</td>\n",
       "      <td>553.633218</td>\n",
       "      <td>766.274350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>1.600000</td>\n",
       "      <td>60.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0-20</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>23.437500</td>\n",
       "      <td>1.0</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>23.437500</td>\n",
       "      <td>324.000000</td>\n",
       "      <td>421.875000</td>\n",
       "      <td>549.316406</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>1.632983</td>\n",
       "      <td>111.720238</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2.559750</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.396972</td>\n",
       "      <td>1</td>\n",
       "      <td>21-30</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>41.895611</td>\n",
       "      <td>1.0</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>41.895611</td>\n",
       "      <td>676.000000</td>\n",
       "      <td>1089.285877</td>\n",
       "      <td>1755.242193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>21.682636</td>\n",
       "      <td>1.748524</td>\n",
       "      <td>133.845064</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2.843777</td>\n",
       "      <td>0</td>\n",
       "      <td>1.427037</td>\n",
       "      <td>0.849236</td>\n",
       "      <td>1</td>\n",
       "      <td>21-30</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>43.778327</td>\n",
       "      <td>1.0</td>\n",
       "      <td>21.682636</td>\n",
       "      <td>43.778327</td>\n",
       "      <td>470.136704</td>\n",
       "      <td>949.229536</td>\n",
       "      <td>1916.541944</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Gender        Age    Height      Weight  family_history_with_overweight  \\\n",
       "0       1  21.000000  1.550000   51.000000                               0   \n",
       "1       0  20.000000  1.700000   80.000000                               1   \n",
       "2       1  18.000000  1.600000   60.000000                               1   \n",
       "3       1  26.000000  1.632983  111.720238                               1   \n",
       "4       1  21.682636  1.748524  133.845064                               1   \n",
       "\n",
       "   FAVC  FCVC  NCP  CAEC  SMOKE      CH2O  SCC       FAF       TUE  CALC  \\\n",
       "0     1   3.0  1.0     2      0  2.000000    0  0.000000  0.000000     0   \n",
       "1     1   2.0  3.0     1      0  2.000000    0  2.000000  1.000000     1   \n",
       "2     1   2.0  3.0     1      0  2.000000    0  1.000000  1.000000     0   \n",
       "3     1   3.0  3.0     1      0  2.559750    0  0.000000  0.396972     1   \n",
       "4     1   3.0  3.0     1      0  2.843777    0  1.427037  0.849236     1   \n",
       "\n",
       "  Age_Group  MTRANS_Automobile  MTRANS_Bike  MTRANS_Motorbike  \\\n",
       "0     21-30              False        False             False   \n",
       "1      0-20              False        False             False   \n",
       "2      0-20              False        False             False   \n",
       "3     21-30              False        False             False   \n",
       "4     21-30              False        False             False   \n",
       "\n",
       "   MTRANS_Public_Transportation  MTRANS_Walking        BMI  Age^2      Age^3  \\\n",
       "0                          True           False  21.227888    1.0  21.000000   \n",
       "1                          True           False  27.681661    1.0  20.000000   \n",
       "2                         False            True  23.437500    1.0  18.000000   \n",
       "3                          True           False  41.895611    1.0  26.000000   \n",
       "4                          True           False  43.778327    1.0  21.682636   \n",
       "\n",
       "       BMI^2   Age * BMI  Age * BMI^2  Age^2 * BMI^2  \n",
       "0  21.227888  441.000000   445.785640     450.623213  \n",
       "1  27.681661  400.000000   553.633218     766.274350  \n",
       "2  23.437500  324.000000   421.875000     549.316406  \n",
       "3  41.895611  676.000000  1089.285877    1755.242193  \n",
       "4  43.778327  470.136704   949.229536    1916.541944  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show all columns\n",
    "pd.set_option('display.max_columns', None)\n",
    "X_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing Single Instance For Architecture Development"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_data = {\n",
    "    \"id\": 6204,\n",
    "    \"Gender\": \"Female\",\n",
    "    \"Age\": 23.0,\n",
    "    \"Height\": 1.581527,\n",
    "    \"Weight\": 78.089575,\n",
    "    \"family_history_with_overweight\": \"yes\",\n",
    "    \"FAVC\": \"yes\",\n",
    "    \"FCVC\": 2.0,\n",
    "    \"NCP\": 2.070033,\n",
    "    \"CAEC\": \"Sometimes\",\n",
    "    \"SMOKE\": \"no\", \n",
    "    \"CH2O\": 2.953192,\n",
    "    \"SCC\": \"no\",\n",
    "    \"FAF\": 0.118271,\n",
    "    \"TUE\": 0.0,\n",
    "    \"CALC\": \"no\",\n",
    "    \"MTRANS\": \"Public_Transportation\"\n",
    "    \n",
    "}\n",
    "\n",
    "input_df = pd.DataFrame([input_data])\n",
    "input_df = New_Test_Instances_Pipeline(input_df, scaler_age, scaler_weight, scaler_height)\n",
    "\n",
    "# X input to have same columns as features\n",
    "X_input = pd.DataFrame(columns=features)\n",
    "# if input df does not have a column that is in features, add it with 0s at the same position\n",
    "for col in features:\n",
    "    if col not in input_df.columns:\n",
    "        if col in ['MTRANS_Automobile', 'MTRANS_Bike', 'MTRANS_Motorbike', 'MTRANS_Public_Transportation', 'MTRANS_Walking']:\n",
    "            X_input[col] = False\n",
    "        else:\n",
    "            X_input[col] = 0\n",
    "    else:\n",
    "        X_input[col] = input_df[col]\n",
    "    # if MTRANS_Automobile, MTRANS_Bike, MTRANS_Motorbike, MTRANS_Public_Transportation, MTRANS_Walking are zero, make them False\n",
    "    \n",
    "y_pred_proba = model.predict(X_input)\n",
    "y_pred = np.argmax(y_pred_proba)\n",
    "\n",
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DataScience",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}