{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024/04/26 04:39:52 INFO mlflow.tracking.fluent: Experiment with name 'LGB' does not exist. Creating a new experiment.\n"
]
},
{
"data": {
"text/plain": [
"<Experiment: artifact_location='/Users/arham/Downloads/Projects/mlruns/2', creation_time=1714120792214, experiment_id='2', last_update_time=1714120792214, lifecycle_stage='active', name='LGB', tags={}>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import mlflow\n",
"\n",
"\n",
"# Set the MLflow tracking URI to a new SQLite URI\n",
"mlflow.set_tracking_uri(\"sqlite:///new_mlflow.db\")\n",
"mlflow.set_experiment(\"LGB\")\n",
"\n"
]
},
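{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# To browse the runs logged in this notebook, the MLflow UI can be pointed at\n",
"# the same SQLite store. Run this from a terminal rather than the notebook,\n",
"# since the server blocks; it is shown as a comment for that reason:\n",
"#\n",
"#   mlflow ui --backend-store-uri sqlite:///new_mlflow.db"
]
},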
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from scipy.stats import chi2_contingency\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import lightgbm as lgb\n",
"from catboost import CatBoostClassifier, Pool\n",
"from xgboost import XGBClassifier\n",
"from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
"from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score, f1_score, auc,classification_report\n",
"from scipy.stats import ks_2samp\n",
"\n",
"from sklearn.preprocessing import label_binarize,OneHotEncoder, StandardScaler, FunctionTransformer, LabelEncoder\n",
"from itertools import cycle\n",
"\n",
"from sklearn.ensemble import VotingClassifier\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"import shap\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.decomposition import TruncatedSVD, PCA\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"import numpy as np \n",
"import pandas as pd\n",
"\n",
"def load_data(path):\n",
" df = pd.read_csv(path)\n",
" # arham check this later\n",
" # original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')\n",
" # split to train test\n",
" train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)\n",
" train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
" test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
" return train_df, test_df\n",
"\n",
"def corr_heat_map(df,scale=1) :\n",
" # Calculate the correlation matrix\n",
" correlation_matrix = df.corr()\n",
"\n",
" # Create a mask for the upper triangle\n",
" mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
"\n",
" # Set up the matplotlib figure\n",
" plt.figure(figsize=(10//scale, 8//scale))\n",
"\n",
" # Define a custom color palette\n",
" cmap = sns.diverging_palette(220, 20, as_cmap=True)\n",
"\n",
" # Draw the heatmap with the mask and correct aspect ratio\n",
" sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,\n",
" square=True, linewidths=.5, cbar_kws={\"shrink\": 0.7})\n",
"\n",
" plt.title('Correlation Heatmap')\n",
"\n",
"\n",
"path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
"train, test = load_data(path)\n",
"\n",
"target = 'NObeyesdad'\n",
"num_col = []\n",
"cat_col = []\n",
"\n",
"for i in train.columns.drop([target]) : \n",
" \n",
" if train[i].dtype == 'object' : \n",
" cat_col.append(i)\n",
" \n",
" else : \n",
" num_col.append(i)\n",
"\n",
"# print(\"Numerical Columns : \", *num_col,\"\\n\",sep=\"\\n\")\n",
"# print(\"Categorical Columns : \", *cat_col,sep=\"\\n\")\n",
"\n",
"\n",
"train = pd.get_dummies(train,\n",
" columns=cat_col)\n",
"test = pd.get_dummies(test, \n",
" columns=cat_col)\n",
"\n",
"target = 'NObeyesdad'\n",
"\n",
"le = LabelEncoder()\n",
"train['NObeyesdad'] = le.fit_transform(train['NObeyesdad'])\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(train.drop([target],axis=1),train[target],test_size=0.2,random_state=42)\n",
"X_train.shape , y_train.shape, X_val.shape, y_val.shape \n",
"\n",
"import optuna\n",
"ran_optuna = False \n",
"\n",
"def optimization_function(trial) : \n",
" \n",
" lgbParams = {\n",
" 'num_class': 7,\n",
" 'random_state': 42,\n",
" 'metric': 'multi_logloss',\n",
" \"boosting_type\": \"gbdt\",\n",
" 'objective': 'multiclass',\n",
" \n",
" 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),\n",
" 'n_estimators': trial.suggest_int('n_estimators', 400, 600),\n",
" 'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),\n",
" 'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-1, 10.0),\n",
" 'max_depth': trial.suggest_int('max_depth', 6, 20),\n",
" 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),\n",
" 'subsample': trial.suggest_float('subsample', 0.8, 1.0),\n",
" 'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),\n",
" }\n",
" \n",
" lgb_model=lgb.LGBMClassifier(**lgbParams)\n",
" \n",
"# skf = StratifiedKFold(n_splits=5,shuffle=False, random_state=None)\n",
"# accuracy = cross_val_score(lgb_model,X_train,y_train, cv=skf,scoring='accuracy')\n",
"# print(\"=\"*50,'\\nValidation Accuracy:', accuracy.mean())\n",
"\n",
" lgb_model.fit(X_train,y_train)\n",
" \n",
" acc = accuracy_score(y_val,lgb_model.predict(X_val))\n",
"\n",
" mlflow.log_metric('accuracy', accuracy)\n",
" mlflow.log_metric('precision', precision)\n",
" mlflow.log_metric('recall', recall)\n",
" mlflow.log_metric('f1', f1)\n",
"\n",
" precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
" for i in range(len(recall_per_class)):\n",
" print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
" mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
"\n",
" mlflow.lightgbm.log_model(lgb_model_final, 'model')\n",
" mlflow.set_tag('experiments', 'Arham A.')\n",
" mlflow.set_tag('model_name', 'LightGBM')\n",
" mlflow.set_tag('preprocessing', 'Yes')\n",
" \n",
" return acc"
]
},
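{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of how the Optuna study referenced in the next cell could be\n",
"# created and run. It is guarded by ran_optuna (set to False above), and\n",
"# n_trials=100 is an assumption taken from the '100 trials' comment below.\n",
"if ran_optuna:\n",
"    study = optuna.create_study(direction='maximize')\n",
"    study.optimize(optimization_function, n_trials=100)"
]
},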
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9058910707669507"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if ran_optuna : \n",
"\n",
" print('Number of finished trials:', len(study.trials))\n",
"\n",
" print('Best trial:', study.best_trial.params)\n",
"\n",
" optuna.visualization.plot_param_importances(study)\n",
"\n",
" study.trials_dataframe().sort_values('value',ascending=False)\n",
"\n",
" optuna.visualization.plot_slice(study)\n",
"\n",
"# 100 trials \n",
"# {'objective': 'multiclassova', 'learning_rate': 0.04641200998070569, 'n_estimators': 587, 'reg_alpha': 0.0065043557057678746, 'reg_lambda': 4.460933310544669, 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, 'subsample': 0.8193986843950917, 'min_child_samples': 15}\n",
"\n",
"\n",
"if ran_optuna : \n",
" lgbParams = study.best_trial.params\n",
"\n",
"else :\n",
" \n",
"# # 100- traials with PCA seed = None\n",
"# lgbParams = {\n",
"# 'objective': 'multiclassova', \n",
"# 'learning_rate': 0.04641200998070569, \n",
"# 'n_estimators': 587, \n",
"# 'reg_alpha': 0.0065043557057678746, \n",
"# 'reg_lambda': 4.460933310544669, \n",
"# 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, \n",
"# 'subsample': 0.8193986843950917, \n",
"# 'min_child_samples': 15\n",
"# }\n",
" \n",
" \n",
" # Moaz HyperParams\n",
" lgbParams = {\n",
" \"objective\": \"multiclass\", # Objective function for the model\n",
" \"metric\": \"multi_logloss\", # Evaluation metric\n",
" \"verbosity\": -1, # Verbosity level (-1 for silent)\n",
" \"boosting_type\": \"gbdt\", # Gradient boosting type\n",
" \"random_state\": 42, # Random state for reproducibility\n",
" \"num_class\": 7, # Number of classes in the dataset\n",
" 'learning_rate': 0.030962211546832760, # Learning rate for gradient boosting\n",
" 'n_estimators': 500, # Number of boosting iterations\n",
" 'lambda_l1': 0.009667446568254372, # L1 regularization term\n",
" 'lambda_l2': 0.04018641437301800, # L2 regularization term\n",
" 'max_depth': 10, # Maximum depth of the trees\n",
" 'colsample_bytree': 0.40977129346872643, # Fraction of features to consider for each tree\n",
" 'subsample': 0.9535797422450176, # Fraction of samples to consider for each boosting iteration\n",
" 'min_child_samples': 26 # Minimum number of data needed in a leaf\n",
" }\n",
"\n",
"\n",
"\n",
"fixed_params = {\n",
" 'boosting_type': 'gbdt',\n",
" 'num_class': 7,\n",
" 'random_state': 42,\n",
" 'metric': 'multi_logloss',\n",
"}\n",
"\n",
"\n",
"for i in fixed_params.keys() : \n",
"\n",
" lgbParams[i] = fixed_params[i]\n",
"\n",
"\n",
"lgbParams\n",
"\n"
]
},
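{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of a cross-validated sanity check for the chosen lgbParams,\n",
"# mirroring the StratifiedKFold snippet commented out in the objective above.\n",
"# The 5-fold shuffled split is an assumption, and running this is optional.\n",
"skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
"cv_acc = cross_val_score(lgb.LGBMClassifier(**lgbParams), X_train, y_train, cv=skf, scoring='accuracy')\n",
"print(f'CV accuracy: {cv_acc.mean():.4f} +/- {cv_acc.std():.4f}')"
]
},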
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659, -0.0087675011457998, -0.001077949504617301]\n",
"\n",
"Accuracy: 0.9058910707669507\n",
"Precision: 0.9067204051187663\n",
"Recall: 0.9058910707669507\n",
"F1 0.9063055482178468\n",
"Recall for class 0: 0.9208860759493671\n",
"Recall for class 1: 0.9090909090909091\n",
"Recall for class 2: 0.8741092636579573\n",
"Recall for class 3: 0.9736842105263158\n",
"Recall for class 4: 0.9960474308300395\n",
"Recall for class 5: 0.7701492537313432\n",
"Recall for class 6: 0.8419452887537994\n"
]
}
],
"source": [
"\n",
"\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
"import mlflow\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"# import precision_recall_fscore_support\n",
"from sklearn.metrics import precision_recall_fscore_support\n",
"\n",
"mlflow.sklearn.autolog(disable=True)\n",
"\n",
"with mlflow.start_run(run_name=\"LGB_Final\"):\n",
" class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]\n",
" class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]\n",
" target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]\n",
" print(f\"Target Drift For Each Class {target_drift}\")\n",
" mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n",
"\n",
"\n",
"\n",
" lgb_model_final = lgb.LGBMClassifier(**lgbParams)\n",
" lgb_model_final = lgb_model_final.fit(X_train, y_train)\n",
" y_pred = lgb_model_final.predict(X_val)\n",
" accuracy_xgb = accuracy_score(y_val, y_pred) \n",
" precision_xgb = precision_score(y_val, y_pred, average='weighted')\n",
" recall_xgb = recall_score(y_val, y_pred, average='weighted')\n",
" f1_xgb = 2 * (precision_xgb * recall_xgb) / (precision_xgb + recall_xgb)\n",
" print(\"\\nAccuracy:\", accuracy_xgb)\n",
" print(\"Precision:\", precision_xgb)\n",
" print(\"Recall:\", recall_xgb)\n",
" print(\"F1\", f1_xgb)\n",
" mlflow.log_metric('accuracy', accuracy_xgb)\n",
" mlflow.log_metric('precision', precision_xgb)\n",
" mlflow.log_metric('recall', recall_xgb)\n",
" mlflow.log_metric('f1', f1_xgb)\n",
"\n",
" precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
" for i in range(len(recall_per_class)):\n",
" print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
" mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
"\n",
" mlflow.lightgbm.log_model(lgb_model_final, 'model')\n",
" mlflow.set_tag('experiments', 'Arham A.')\n",
" mlflow.set_tag('model_name', 'LightGBM')\n",
" mlflow.set_tag('preprocessing', 'Yes')\n",
"\n"
]
},
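{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of loading the model logged above back out of the tracking\n",
"# store and re-scoring the validation split. mlflow.last_active_run() assumes\n",
"# a recent MLflow version and that the run above has just finished.\n",
"run_id = mlflow.last_active_run().info.run_id\n",
"reloaded = mlflow.lightgbm.load_model(f'runs:/{run_id}/model')\n",
"print('Reloaded model accuracy:', accuracy_score(y_val, reloaded.predict(X_val)))"
]
},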
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "DataScience",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}