867 lines (866 with data), 36.6 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Global Experiment Setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024/04/25 15:58:03 INFO mlflow.tracking.fluent: Experiment with name 'LightGBM' does not exist. Creating a new experiment.\n"
]
},
{
"data": {
"text/plain": [
"<Experiment: artifact_location='/Users/arham/Downloads/Projects/03-Experiments/mlruns/4', creation_time=1714075083201, experiment_id='4', last_update_time=1714075083201, lifecycle_stage='active', name='LightGBM', tags={}>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import mlflow\n",
"# Point MLflow at the local SQLite tracking store, then select the experiment.\n",
"# The set_experiment call stays last so the cell displays the Experiment object.\n",
"TRACKING_URI = \"sqlite:///new_mlflow.db\"\n",
"EXPERIMENT_NAME = \"LightGBM\"\n",
"\n",
"mlflow.set_tracking_uri(TRACKING_URI)\n",
"mlflow.set_experiment(EXPERIMENT_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"import lightgbm as lgb\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"def load_data(path, test_size=0.35, val_size=0.20, random_state=42):\n",
"    \"\"\"Read the CSV at ``path`` and split it into train, validation and test frames.\n",
"\n",
"    The file is first split into (train + validation) and test, then the\n",
"    remainder is split again into train and validation. Each resulting frame\n",
"    has its ``id`` column dropped, duplicate rows removed and its index reset.\n",
"\n",
"    Args:\n",
"        path: Location of the CSV file, passed to ``pd.read_csv``.\n",
"        test_size: Share of all rows held out for the test frame.\n",
"        val_size: Share of the remaining rows held out for the validation frame.\n",
"        random_state: Seed used for both splits.\n",
"\n",
"    Returns:\n",
"        Tuple ``(train_df, val_df, test_df)``.\n",
"    \"\"\"\n",
"    def _clean(frame):\n",
"        # ``id`` is removed before de-duplicating, so rows that differ only by\n",
"        # their id are treated as duplicates.\n",
"        return frame.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
"\n",
"    df = pd.read_csv(path)\n",
"    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)\n",
"    train_df, val_df = train_test_split(train_df, test_size=val_size, random_state=random_state)\n",
"    return _clean(train_df), _clean(val_df), _clean(test_df)\n",
"\n",
"def encode_target(train):\n",
" target_key = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3, 'Obesity_Type_I': 4,'Obesity_Type_II' : 5, 'Obesity_Type_III': 6}\n",
" train['NObeyesdad'] = train['NObeyesdad'].map(target_key)\n",
" return train\n",
"\n",
"def make_gender_binary(train):\n",
" train['Gender'] = train['Gender'].map({'Male':0, 'Female':1})\n",
"\n",
"def datatypes(train):\n",
" train['Weight'] = train['Weight'].astype(float)\n",
" train['Age'] = train['Age'].astype(float)\n",
" train['Height'] = train['Height'].astype(float)\n",
" return train\n",
"\n",
"# def age_binning(train_df):\n",
"# train_df['Age_Group'] = pd.cut(train_df['Age'], bins=[0, 20, 30, 40, 50, train_df['Age'].max()], labels=['0-20', '21-30', '31-40', '41-50', '50+'])\n",
"# return train_df\n",
"\n",
"def age_binning(df):\n",
" age_groups = []\n",
" for age in df['Age']:\n",
" if age <= 20:\n",
" age_group = 1\n",
" elif age <= 30:\n",
" age_group = 2\n",
" elif age <= 40:\n",
" age_group = 3\n",
" elif age <= 50:\n",
" age_group = 4\n",
" else:\n",
" age_group = 5\n",
" age_groups.append(age_group)\n",
" df['Age_Group'] = age_groups\n",
" return df\n",
"\n",
"def age_scaling_log(train_df):\n",
" train_df['Age'] = train_df['Age'].astype(float)\n",
" train_df['Log_Age'] = np.log1p(train_df['Age'])\n",
" return train_df\n",
"\n",
"def age_scaling_minmax(train_df):\n",
"    \"\"\"Fit a ``MinMaxScaler`` on ``Age`` and store the result as ``Scaled_Age``.\n",
"\n",
"    ``Age`` is cast to float in place before fitting.\n",
"\n",
"    Returns:\n",
"        Tuple ``(train_df, scaler_age)`` with the mutated frame and the fitted scaler.\n",
"    \"\"\"\n",
"    train_df['Age'] = train_df['Age'].astype(float)\n",
"    # The scaler is fed a single-column 2-D array.\n",
"    age_column = train_df['Age'].values.reshape(-1, 1)\n",
"    scaler_age = MinMaxScaler()\n",
"    train_df['Scaled_Age'] = scaler_age.fit_transform(age_column)\n",
"    return train_df, scaler_age\n",
"\n",
"def weight_scaling_log(train_df):\n",
" train_df['Weight'] = train_df['Weight'].astype(float)\n",
" train_df['Log_Weight'] = np.log1p(train_df['Weight'])\n",
" return train_df\n",
"\n",
"def weight_scaling_minmax(train_df):\n",
"    \"\"\"Fit a ``MinMaxScaler`` on ``Weight`` and store the result as ``Scaled_Weight``.\n",
"\n",
"    ``Weight`` is cast to float in place before fitting.\n",
"\n",
"    Returns:\n",
"        Tuple ``(train_df, scaler_weight)`` with the mutated frame and the fitted scaler.\n",
"    \"\"\"\n",
"    train_df['Weight'] = train_df['Weight'].astype(float)\n",
"    # The scaler is fed a single-column 2-D array.\n",
"    weight_column = train_df['Weight'].values.reshape(-1, 1)\n",
"    scaler_weight = MinMaxScaler()\n",
"    train_df['Scaled_Weight'] = scaler_weight.fit_transform(weight_column)\n",
"    return train_df, scaler_weight\n",
"\n",
"def height_scaling_log(train_df):\n",
" train_df['Log_Height'] = np.log1p(train_df['Height'])\n",
" return train_df\n",
"\n",
"def height_scaling_minmax(train_df):\n",
"    \"\"\"Fit a ``MinMaxScaler`` on ``Height`` and store the result as ``Scaled_Height``.\n",
"\n",
"    Returns:\n",
"        Tuple ``(train_df, scaler_height)`` with the mutated frame and the fitted scaler.\n",
"    \"\"\"\n",
"    # The scaler is fed a single-column 2-D array.\n",
"    height_column = train_df['Height'].values.reshape(-1, 1)\n",
"    scaler_height = MinMaxScaler()\n",
"    train_df['Scaled_Height'] = scaler_height.fit_transform(height_column)\n",
"    return train_df, scaler_height\n",
"\n",
"def make_gender_binary(train):\n",
" train['Gender'] = train['Gender'].map({'Female':1, 'Male':0})\n",
" return train\n",
"\n",
"def fix_binary_columns(train):\n",
" Binary_Cols = ['family_history_with_overweight','FAVC', 'SCC','SMOKE']\n",
" # if yes then 1 else 0\n",
" for col in Binary_Cols:\n",
" train[col] = train[col].map({'yes': 1, 'no': 0})\n",
" return train\n",
"\n",
"def freq_cat_cols(train):\n",
" # One hot encoding\n",
" cat_cols = ['CAEC', 'CALC']\n",
" for col in cat_cols:\n",
" train[col] = train[col].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})\n",
" return train\n",
"\n",
"def Mtrans(train):\n",
" \"\"\"\n",
" Public_Transportation 8692\n",
" Automobile 1835\n",
" Walking 231\n",
" Motorbike 19\n",
" Bike 16\n",
" \"\"\"\n",
" # train['MTRANS'] = train['MTRANS'].map({'Public_Transportation': 3, 'Automobile': 5, 'Walking': 1, 'Motorbike': 4, 'Bike': 2})\n",
" # dummify column\n",
" train = pd.get_dummies(train, columns=['MTRANS'])\n",
" return train\n",
"\n",
"\n",
"def other_features(train):\n",
"    \"\"\"Add ``BMI`` and six degree-2 polynomial columns built from ``Age`` and ``BMI``.\n",
"\n",
"    ``BMI`` is written to the input frame as Weight / Height**2; the polynomial\n",
"    columns are appended to a new frame, which is returned.\n",
"\n",
"    NOTE(review): the column labels below do not describe the values stored\n",
"    under them. ``PolynomialFeatures(degree=2)`` on two inputs emits\n",
"    ``[1, Age, BMI, Age^2, Age*BMI, BMI^2]`` in that order, so for example the\n",
"    column labelled 'Age^2' holds the constant 1 and 'Age * BMI' holds Age^2.\n",
"    The labels are kept unchanged because the feature list used for training\n",
"    refers to them by name.\n",
"\n",
"    Returns:\n",
"        A new frame: the input columns (including ``BMI``) plus the six\n",
"        polynomial columns.\n",
"    \"\"\"\n",
"    train['BMI'] = train['Weight'] / (train['Height'] ** 2)\n",
"    polynomial_features = PolynomialFeatures(degree=2)\n",
"    X_poly = polynomial_features.fit_transform(train[['Age', 'BMI']])\n",
"    poly_features_df = pd.DataFrame(\n",
"        X_poly,\n",
"        columns=['Age^2', 'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2'],\n",
"        # Reuse the caller's index so the column-wise concat lines rows up even\n",
"        # when ``train`` does not carry a default 0..n-1 index.\n",
"        index=train.index,\n",
"    )\n",
"    train = pd.concat([train, poly_features_df], axis=1)\n",
"    return train\n",
"\n",
"\n",
"def test_pipeline(test, scaler_age, scaler_weight, scaler_height):\n",
"    \"\"\"Preprocess a labelled validation/test frame with scalers fitted on training data.\n",
"\n",
"    The target column is encoded first; every remaining step is shared with\n",
"    ``New_Test_Instances_Pipeline``, which is called here instead of repeating\n",
"    the same sequence of steps.\n",
"\n",
"    Args:\n",
"        test: Frame containing the raw feature columns and ``NObeyesdad``.\n",
"        scaler_age: Fitted scaler applied to ``Age``.\n",
"        scaler_weight: Fitted scaler applied to ``Weight``.\n",
"        scaler_height: Fitted scaler applied to ``Height``.\n",
"\n",
"    Returns:\n",
"        The preprocessed frame.\n",
"    \"\"\"\n",
"    test = encode_target(test)\n",
"    return New_Test_Instances_Pipeline(test, scaler_age, scaler_weight, scaler_height)\n",
"\n",
"def train_model(params, X_train, y_train):\n",
"    \"\"\"Train a LightGBM model with ``lgb.train`` for 1000 boosting rounds.\n",
"\n",
"    Args:\n",
"        params: LightGBM parameter dict passed to ``lgb.train``.\n",
"        X_train: Training features.\n",
"        y_train: Training labels.\n",
"\n",
"    Returns:\n",
"        The object returned by ``lgb.train``.\n",
"    \"\"\"\n",
"    training_set = lgb.Dataset(X_train, y_train)\n",
"    return lgb.train(params, training_set, num_boost_round=1000)\n",
"\n",
"def evaluate_model(model, X_val, y_val):\n",
"    \"\"\"Return the accuracy of ``model`` on ``(X_val, y_val)``.\n",
"\n",
"    The predicted class of each sample is the argmax of the corresponding\n",
"    entry returned by ``model.predict``.\n",
"    \"\"\"\n",
"    class_scores = model.predict(X_val)\n",
"    predicted_labels = list(map(np.argmax, class_scores))\n",
"    return accuracy_score(y_val, predicted_labels)\n",
"\n",
"def objective(trial, X_train, y_train):\n",
"    \"\"\"Optuna objective: mean accuracy of a LightGBM multiclass model over 5 stratified folds.\n",
"\n",
"    Args:\n",
"        trial: Optuna trial used to sample the hyperparameters.\n",
"        X_train: Feature frame (indexed positionally with ``.iloc``).\n",
"        y_train: Target series (indexed positionally with ``.iloc``).\n",
"\n",
"    Returns:\n",
"        Mean of the per-fold accuracies.\n",
"    \"\"\"\n",
"    # Imported locally: the notebook's import cell does not bring\n",
"    # StratifiedKFold into scope, so the bare name raised NameError.\n",
"    from sklearn.model_selection import StratifiedKFold\n",
"\n",
"    params = {\n",
"        'objective': 'multiclass',\n",
"        'num_class': 7,\n",
"        'metric': 'multi_logloss',\n",
"        'boosting_type': 'gbdt',\n",
"        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.\n",
"        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),\n",
"        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),\n",
"        'max_depth': trial.suggest_int('max_depth', -1, 20),\n",
"        # NOTE(review): LightGBM documents that bagging_fraction only takes effect\n",
"        # when bagging_freq is non-zero; confirm whether bagging_freq should be set.\n",
"        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 0.95),\n",
"        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 0.95),\n",
"        'verbosity': -1\n",
"    }\n",
"\n",
"    n_splits = 5\n",
"    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)\n",
"    scores = []\n",
"\n",
"    for train_index, val_index in kf.split(X_train, y_train):\n",
"        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]\n",
"        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]\n",
"\n",
"        model = train_model(params, X_tr, y_tr)\n",
"        scores.append(evaluate_model(model, X_val, y_val))\n",
"\n",
"    return np.mean(scores)\n",
"\n",
"def optimize_hyperparameters(X_train, y_train, n_trials=2):\n",
"    \"\"\"Run an Optuna study that maximizes ``objective`` and return the best parameters.\n",
"\n",
"    Args:\n",
"        X_train: Feature frame forwarded to ``objective``.\n",
"        y_train: Target series forwarded to ``objective``.\n",
"        n_trials: Number of trials to run.\n",
"\n",
"    Returns:\n",
"        ``study.best_params`` of the finished study.\n",
"    \"\"\"\n",
"    # Imported locally: optuna is not imported anywhere in the notebook, so the\n",
"    # bare name raised NameError. Keeping the import here also means the cell\n",
"    # still runs when optuna is not installed and tuning is not used.\n",
"    import optuna\n",
"\n",
"    study = optuna.create_study(direction='maximize')\n",
"    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=n_trials)\n",
"    return study.best_params\n",
"\n",
"def New_Test_Instances_Pipeline(test, scaler_age, scaler_weight, scaler_height):\n",
"    \"\"\"Preprocess unlabelled rows with scalers that were fitted on training data.\n",
"\n",
"    Runs the same steps as the labelled pipeline except target encoding.\n",
"\n",
"    Args:\n",
"        test: Frame containing the raw feature columns.\n",
"        scaler_age: Fitted scaler applied to ``Age``.\n",
"        scaler_weight: Fitted scaler applied to ``Weight``.\n",
"        scaler_height: Fitted scaler applied to ``Height``.\n",
"\n",
"    Returns:\n",
"        The preprocessed frame.\n",
"    \"\"\"\n",
"    test = age_binning(datatypes(test))\n",
"\n",
"    # Per numeric column: add the log copy, then the copy scaled with the fitted scaler.\n",
"    numeric_steps = (\n",
"        ('Age', age_scaling_log, scaler_age),\n",
"        ('Weight', weight_scaling_log, scaler_weight),\n",
"        ('Height', height_scaling_log, scaler_height),\n",
"    )\n",
"    for column, add_log_column, fitted_scaler in numeric_steps:\n",
"        test = add_log_column(test)\n",
"        test['Scaled_' + column] = fitted_scaler.transform(test[column].values.reshape(-1, 1))\n",
"\n",
"    # Categorical encodings and derived features, in the original order.\n",
"    for step in (make_gender_binary, fix_binary_columns, freq_cat_cols, Mtrans, other_features):\n",
"        test = step(test)\n",
"\n",
"    return test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Experiment"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]\n",
"[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001163 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 3576\n",
"[LightGBM] [Info] Number of data points in the train set: 10793, number of used features: 25\n",
"[LightGBM] [Info] Start training from score -2.103541\n",
"[LightGBM] [Info] Start training from score -1.893390\n",
"[LightGBM] [Info] Start training from score -2.159762\n",
"[LightGBM] [Info] Start training from score -2.113461\n",
"[LightGBM] [Info] Start training from score -1.974767\n",
"[LightGBM] [Info] Start training from score -1.867272\n",
"[LightGBM] [Info] Start training from score -1.619963\n",
"[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000883 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 3576\n",
"[LightGBM] [Info] Number of data points in the train set: 8634, number of used features: 25\n",
"[LightGBM] [Info] Start training from score -2.104065\n",
"[LightGBM] [Info] Start training from score -1.893344\n",
"[LightGBM] [Info] Start training from score -2.159716\n",
"[LightGBM] [Info] Start training from score -2.113607\n",
"[LightGBM] [Info] Start training from score -1.974220\n",
"[LightGBM] [Info] Start training from score -1.867526\n",
"[LightGBM] [Info] Start training from score -1.619799\n",
"[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
"[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001080 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 3573\n",
"[LightGBM] [Info] Number of data points in the train set: 8634, number of used features: 25\n",
"[LightGBM] [Info] Start training from score -2.104065\n",
"[LightGBM] [Info] Start training from score -1.893344\n",
"[LightGBM] [Info] Start training from score -2.159716\n",
"[LightGBM] [Info] Start training from score -2.112648\n",
"[LightGBM] [Info] Start training from score -1.974220\n",
"[LightGBM] [Info] Start training from score -1.867526\n",
"[LightGBM] [Info] Start training from score -1.620385\n",
"[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
"[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000459 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 3572\n",
"[LightGBM] [Info] Number of data points in the train set: 8634, number of used features: 25\n",
"[LightGBM] [Info] Start training from score -2.103115\n",
"[LightGBM] [Info] Start training from score -1.893344\n",
"[LightGBM] [Info] Start training from score -2.159716\n",
"[LightGBM] [Info] Start training from score -2.113607\n",
"[LightGBM] [Info] Start training from score -1.975054\n",
"[LightGBM] [Info] Start training from score -1.867526\n",
"[LightGBM] [Info] Start training from score -1.619799\n",
"[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001021 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 3571\n",
"[LightGBM] [Info] Number of data points in the train set: 8635, number of used features: 25\n",
"[LightGBM] [Info] Start training from score -2.103231\n",
"[LightGBM] [Info] Start training from score -1.893459\n",
"[LightGBM] [Info] Start training from score -2.159832\n",
"[LightGBM] [Info] Start training from score -2.113723\n",
"[LightGBM] [Info] Start training from score -1.975170\n",
"[LightGBM] [Info] Start training from score -1.866892\n",
"[LightGBM] [Info] Start training from score -1.619915\n",
"[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000919 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 3575\n",
"[LightGBM] [Info] Number of data points in the train set: 8635, number of used features: 25\n",
"[LightGBM] [Info] Start training from score -2.103231\n",
"[LightGBM] [Info] Start training from score -1.893459\n",
"[LightGBM] [Info] Start training from score -2.159832\n",
"[LightGBM] [Info] Start training from score -2.113723\n",
"[LightGBM] [Info] Start training from score -1.975170\n",
"[LightGBM] [Info] Start training from score -1.866892\n",
"[LightGBM] [Info] Start training from score -1.619915\n",
"[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
"\n",
"Accuracy: 0.904845733345687\n",
"Precision: 0.9046557231546489\n",
"Recall: 0.904845733345687\n",
"F1: 0.9046297258523301\n",
"[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001173 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 3576\n",
"[LightGBM] [Info] Number of data points in the train set: 10793, number of used features: 25\n",
"[LightGBM] [Info] Start training from score -2.103541\n",
"[LightGBM] [Info] Start training from score -1.893390\n",
"[LightGBM] [Info] Start training from score -2.159762\n",
"[LightGBM] [Info] Start training from score -2.113461\n",
"[LightGBM] [Info] Start training from score -1.974767\n",
"[LightGBM] [Info] Start training from score -1.867272\n",
"[LightGBM] [Info] Start training from score -1.619963\n",
"Recall for class 0: 0.9367088607594937\n",
"Recall for class 1: 0.9117647058823529\n",
"Recall for class 2: 0.755223880597015\n",
"Recall for class 3: 0.8267477203647416\n",
"Recall for class 4: 0.8669833729216152\n",
"Recall for class 5: 0.9617224880382775\n",
"Recall for class 6: 0.9960474308300395\n"
]
}
],
"source": [
"path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
"train_df, val_df, test_df = load_data(path)\n",
"\n",
"# Training frame: cast dtypes, encode the target, then bin age.\n",
"train_df = age_binning(encode_target(datatypes(train_df)))\n",
"\n",
"# For each numeric column, fit the min-max scaler on the training frame (the\n",
"# fitted scalers are reused for validation/test below), then add the log copy.\n",
"train_df, scaler_age = age_scaling_minmax(train_df)\n",
"train_df = age_scaling_log(train_df)\n",
"train_df, scaler_weight = weight_scaling_minmax(train_df)\n",
"train_df = weight_scaling_log(train_df)\n",
"train_df, scaler_height = height_scaling_minmax(train_df)\n",
"train_df = height_scaling_log(train_df)\n",
"\n",
"# Categorical encodings and derived features, applied in this order.\n",
"for step in (make_gender_binary, fix_binary_columns, freq_cat_cols, Mtrans, other_features):\n",
"    train_df = step(train_df)\n",
"\n",
"# Validation and test go through the shared pipeline with the training scalers.\n",
"val_df = test_pipeline(val_df, scaler_age, scaler_weight, scaler_height)\n",
"test_df = test_pipeline(test_df, scaler_age, scaler_weight, scaler_height)\n",
"\n",
"\n",
"# Target column and the predictor columns fed to the model.\n",
"Target = 'NObeyesdad'\n",
"\n",
"raw_columns = [\n",
"    'Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',\n",
"    'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',\n",
"    'CALC', 'Age_Group',\n",
"]\n",
"mtrans_columns = [\n",
"    'MTRANS_Automobile', 'MTRANS_Bike', 'MTRANS_Motorbike',\n",
"    'MTRANS_Public_Transportation', 'MTRANS_Walking',\n",
"]\n",
"engineered_columns = [\n",
"    'BMI', 'Age^2', 'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2',\n",
"]\n",
"# Not used as predictors: 'Scaled_Age', 'Log_Age', 'Scaled_Weight', 'Log_Weight', 'Scaled_Height', 'Log_Height'\n",
"features = raw_columns + mtrans_columns + engineered_columns\n",
"\n",
"X_train, y_train = train_df[features], train_df[Target]\n",
"X_val, y_val = val_df[features], val_df[Target]\n",
"X_test, y_test = test_df[features], test_df[Target]\n",
"\n",
"lgb_train = lgb.Dataset(X_train, y_train)\n",
"params = dict(objective='multiclass', num_class=7, metric='multi_logloss')\n",
"\n",
"from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n",
"import mlflow\n",
"import lightgbm as lgb\n",
"from lightgbm import LGBMClassifier\n",
"from sklearn.model_selection import cross_val_predict\n",
"\n",
"mlflow.sklearn.autolog(disable=True)\n",
"\n",
"with mlflow.start_run(run_name=\"LGBM_without_FE_v2\"):\n",
"    # Share of each class in train vs. validation; the per-class difference is\n",
"    # printed and logged as the run's target drift.\n",
"    class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]\n",
"    class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]\n",
"    target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]\n",
"    print(f\"Target Drift For Each Class {target_drift}\")\n",
"    mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n",
"\n",
"    model = LGBMClassifier(**params)\n",
"\n",
"    # 5-fold cross-validated predictions on the training data. The estimator is\n",
"    # not fitted beforehand: it is fitted on the full training data further down,\n",
"    # so an extra fit here would only repeat that work.\n",
"    cv_predictions = cross_val_predict(model, X_train, y_train, cv=5)\n",
"    accuracy_lgbm = accuracy_score(y_train, cv_predictions)\n",
"\n",
"    # Weighted-average precision, recall and F1 of the cross-validated predictions\n",
"    precision_lgbm, recall_lgbm, f1_lgbm, _ = precision_recall_fscore_support(y_train, cv_predictions, average='weighted')\n",
"\n",
"    print(\"\\nAccuracy:\", accuracy_lgbm)\n",
"    print(\"Precision:\", precision_lgbm)\n",
"    print(\"Recall:\", recall_lgbm)\n",
"    print(\"F1:\", f1_lgbm)\n",
"\n",
"    mlflow.log_metric('accuracy', accuracy_lgbm)\n",
"    mlflow.log_metric('precision', precision_lgbm)\n",
"    mlflow.log_metric('recall', recall_lgbm)\n",
"    mlflow.log_metric('f1', f1_lgbm)\n",
"\n",
"    # Fit once on the full training data; this fitted model is scored on the\n",
"    # validation frame and logged to MLflow.\n",
"    model.fit(X_train, y_train)\n",
"    y_val_pred_lgbm = model.predict(X_val)\n",
"\n",
"    # Per-class precision, recall, F1 and support on the validation frame\n",
"    precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_val_pred_lgbm, average=None)\n",
"    for i, class_recall in enumerate(recall_per_class):\n",
"        print(f\"Recall for class {i}: {class_recall}\")\n",
"        mlflow.log_metric(f'recall_class_{i}', class_recall)\n",
"\n",
"    mlflow.lightgbm.log_model(model, 'model')\n",
"    mlflow.set_tag('experiments', 'Arham A.')\n",
"    mlflow.set_tag('model_name', 'LGBM')\n",
"    mlflow.set_tag('preprocessing', 'Yes')\n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# train_df, val_df, test_df = load_data(path)\n",
"\n",
"\n",
"# X_val = val_df[features]\n",
"# y_val = val_df[Target]\n",
"# y_pred = model.predict(X_val, num_iteration=model.best_iteration)\n",
"# # y_pred to a dataframe\n",
"# y_pred = pd.DataFrame(y_pred, columns=['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'])\n",
"# # add prefix to columns \"prob_lgbm_\"\n",
"# y_pred = y_pred.add_prefix('prob_lgbm_')\n",
"# # add to X_val\n",
"# X_val = pd.concat([X_val, y_pred], axis=1)\n",
"# # export as stack_aid_lgbm.csv\n",
"# X_val.to_csv('stack_aid_lgbm.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Height</th>\n",
" <th>Weight</th>\n",
" <th>family_history_with_overweight</th>\n",
" <th>FAVC</th>\n",
" <th>FCVC</th>\n",
" <th>NCP</th>\n",
" <th>CAEC</th>\n",
" <th>SMOKE</th>\n",
" <th>CH2O</th>\n",
" <th>SCC</th>\n",
" <th>FAF</th>\n",
" <th>TUE</th>\n",
" <th>CALC</th>\n",
" <th>Age_Group</th>\n",
" <th>MTRANS_Automobile</th>\n",
" <th>MTRANS_Bike</th>\n",
" <th>MTRANS_Motorbike</th>\n",
" <th>MTRANS_Public_Transportation</th>\n",
" <th>MTRANS_Walking</th>\n",
" <th>BMI</th>\n",
" <th>Age^2</th>\n",
" <th>Age^3</th>\n",
" <th>BMI^2</th>\n",
" <th>Age * BMI</th>\n",
" <th>Age * BMI^2</th>\n",
" <th>Age^2 * BMI^2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>21.000000</td>\n",
" <td>1.550000</td>\n",
" <td>51.000000</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2.000000</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0</td>\n",
" <td>21-30</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>21.227888</td>\n",
" <td>1.0</td>\n",
" <td>21.000000</td>\n",
" <td>21.227888</td>\n",
" <td>441.000000</td>\n",
" <td>445.785640</td>\n",
" <td>450.623213</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>20.000000</td>\n",
" <td>1.700000</td>\n",
" <td>80.000000</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2.000000</td>\n",
" <td>0</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1</td>\n",
" <td>0-20</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>27.681661</td>\n",
" <td>1.0</td>\n",
" <td>20.000000</td>\n",
" <td>27.681661</td>\n",
" <td>400.000000</td>\n",
" <td>553.633218</td>\n",
" <td>766.274350</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>18.000000</td>\n",
" <td>1.600000</td>\n",
" <td>60.000000</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2.000000</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0</td>\n",
" <td>0-20</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>23.437500</td>\n",
" <td>1.0</td>\n",
" <td>18.000000</td>\n",
" <td>23.437500</td>\n",
" <td>324.000000</td>\n",
" <td>421.875000</td>\n",
" <td>549.316406</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>26.000000</td>\n",
" <td>1.632983</td>\n",
" <td>111.720238</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2.559750</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.396972</td>\n",
" <td>1</td>\n",
" <td>21-30</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>41.895611</td>\n",
" <td>1.0</td>\n",
" <td>26.000000</td>\n",
" <td>41.895611</td>\n",
" <td>676.000000</td>\n",
" <td>1089.285877</td>\n",
" <td>1755.242193</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>21.682636</td>\n",
" <td>1.748524</td>\n",
" <td>133.845064</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2.843777</td>\n",
" <td>0</td>\n",
" <td>1.427037</td>\n",
" <td>0.849236</td>\n",
" <td>1</td>\n",
" <td>21-30</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>43.778327</td>\n",
" <td>1.0</td>\n",
" <td>21.682636</td>\n",
" <td>43.778327</td>\n",
" <td>470.136704</td>\n",
" <td>949.229536</td>\n",
" <td>1916.541944</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Gender Age Height Weight family_history_with_overweight \\\n",
"0 1 21.000000 1.550000 51.000000 0 \n",
"1 0 20.000000 1.700000 80.000000 1 \n",
"2 1 18.000000 1.600000 60.000000 1 \n",
"3 1 26.000000 1.632983 111.720238 1 \n",
"4 1 21.682636 1.748524 133.845064 1 \n",
"\n",
" FAVC FCVC NCP CAEC SMOKE CH2O SCC FAF TUE CALC \\\n",
"0 1 3.0 1.0 2 0 2.000000 0 0.000000 0.000000 0 \n",
"1 1 2.0 3.0 1 0 2.000000 0 2.000000 1.000000 1 \n",
"2 1 2.0 3.0 1 0 2.000000 0 1.000000 1.000000 0 \n",
"3 1 3.0 3.0 1 0 2.559750 0 0.000000 0.396972 1 \n",
"4 1 3.0 3.0 1 0 2.843777 0 1.427037 0.849236 1 \n",
"\n",
" Age_Group MTRANS_Automobile MTRANS_Bike MTRANS_Motorbike \\\n",
"0 21-30 False False False \n",
"1 0-20 False False False \n",
"2 0-20 False False False \n",
"3 21-30 False False False \n",
"4 21-30 False False False \n",
"\n",
" MTRANS_Public_Transportation MTRANS_Walking BMI Age^2 Age^3 \\\n",
"0 True False 21.227888 1.0 21.000000 \n",
"1 True False 27.681661 1.0 20.000000 \n",
"2 False True 23.437500 1.0 18.000000 \n",
"3 True False 41.895611 1.0 26.000000 \n",
"4 True False 43.778327 1.0 21.682636 \n",
"\n",
" BMI^2 Age * BMI Age * BMI^2 Age^2 * BMI^2 \n",
"0 21.227888 441.000000 445.785640 450.623213 \n",
"1 27.681661 400.000000 553.633218 766.274350 \n",
"2 23.437500 324.000000 421.875000 549.316406 \n",
"3 41.895611 676.000000 1089.285877 1755.242193 \n",
"4 43.778327 470.136704 949.229536 1916.541944 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lift the column display limit so every feature shows up in the preview.\n",
"pd.options.display.max_columns = None\n",
"X_train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Testing Single Instance For Architecture Development"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"input_data = {\n",
"    \"id\": 6204,\n",
"    \"Gender\": \"Female\",\n",
"    \"Age\": 23.0,\n",
"    \"Height\": 1.581527,\n",
"    \"Weight\": 78.089575,\n",
"    \"family_history_with_overweight\": \"yes\",\n",
"    \"FAVC\": \"yes\",\n",
"    \"FCVC\": 2.0,\n",
"    \"NCP\": 2.070033,\n",
"    \"CAEC\": \"Sometimes\",\n",
"    \"SMOKE\": \"no\",\n",
"    \"CH2O\": 2.953192,\n",
"    \"SCC\": \"no\",\n",
"    \"FAF\": 0.118271,\n",
"    \"TUE\": 0.0,\n",
"    \"CALC\": \"no\",\n",
"    \"MTRANS\": \"Public_Transportation\",\n",
"}\n",
"\n",
"input_df = pd.DataFrame([input_data])\n",
"input_df = New_Test_Instances_Pipeline(input_df, scaler_age, scaler_weight, scaler_height)\n",
"\n",
"# Align the single row with the training feature order. reindex keeps the row\n",
"# regardless of which features are present; features missing from the row are\n",
"# then filled with False for the one-hot MTRANS columns and 0 otherwise.\n",
"X_input = input_df.reindex(columns=features)\n",
"for col in features:\n",
"    if col not in input_df.columns:\n",
"        X_input[col] = False if col.startswith('MTRANS_') else 0\n",
"\n",
"# `model` is the LGBMClassifier fitted in the experiment cell. Its predict()\n",
"# returns class labels, so an argmax over that single label is always 0; take\n",
"# the argmax over the per-class probabilities instead.\n",
"y_pred_proba = model.predict_proba(X_input)\n",
"y_pred = int(model.classes_[np.argmax(y_pred_proba, axis=1)[0]])\n",
"\n",
"y_pred"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "DataScience",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}