--- /dev/null
+++ b/03-Experiments/Untitled-1.ipynb
@@ -0,0 +1,375 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/04/26 04:39:52 INFO mlflow.tracking.fluent: Experiment with name 'LGB' does not exist. Creating a new experiment.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<Experiment: artifact_location='/Users/arham/Downloads/Projects/mlruns/2', creation_time=1714120792214, experiment_id='2', last_update_time=1714120792214, lifecycle_stage='active', name='LGB', tags={}>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import mlflow\n",
+    "\n",
+    "# Point MLflow at a local SQLite tracking store and select the experiment\n",
+    "mlflow.set_tracking_uri(\"sqlite:///new_mlflow.db\")\n",
+    "mlflow.set_experiment(\"LGB\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "from scipy.stats import chi2_contingency, ks_2samp\n",
+    "\n",
+    "import lightgbm as lgb\n",
+    "from catboost import CatBoostClassifier, Pool\n",
+    "from xgboost import XGBClassifier\n",
+    "from sklearn.model_selection import (train_test_split, StratifiedKFold,\n",
+    "                                     cross_val_score, RandomizedSearchCV)\n",
+    "from sklearn.metrics import (roc_auc_score, precision_score, recall_score,\n",
+    "                             roc_curve, accuracy_score, f1_score, auc,\n",
+    "                             classification_report,\n",
+    "                             precision_recall_fscore_support)\n",
+    "from sklearn.preprocessing import (label_binarize, OneHotEncoder,\n",
+    "                                    StandardScaler, FunctionTransformer,\n",
+    "                                    LabelEncoder)\n",
+    "from sklearn.ensemble import VotingClassifier\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.decomposition import TruncatedSVD, PCA\n",
+    "from itertools import cycle\n",
+    "import shap\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "def load_data(path):\n",
+    "    df = pd.read_csv(path)\n",
+    "    # TODO (Arham): compare against the original Kaggle dataset later\n",
+    "    # original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')\n",
+    "    # Split into train and test sets\n",
+    "    train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)\n",
+    "    train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
+    "    test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
+    "    return train_df, test_df\n",
+    "\n",
+    "def corr_heat_map(df, scale=1):\n",
+    "    # Calculate the correlation matrix\n",
+    "    correlation_matrix = df.corr()\n",
+    "\n",
+    "    # Create a mask for the upper triangle\n",
+    "    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
+    "\n",
+    "    # Set up the matplotlib figure\n",
+    "    plt.figure(figsize=(10 // scale, 8 // scale))\n",
+    "\n",
+    "    # Define a custom diverging color palette\n",
+    "    cmap = sns.diverging_palette(220, 20, as_cmap=True)\n",
+    "\n",
+    "    # Draw the heatmap with the mask and correct aspect ratio\n",
+    "    sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,\n",
+    "                square=True, linewidths=.5, cbar_kws={\"shrink\": 0.7})\n",
+    "\n",
+    "    plt.title('Correlation Heatmap')\n",
+    "\n",
+    "\n",
+    "path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
+    "train, test = load_data(path)\n",
+    "\n",
+    "target = 'NObeyesdad'\n",
+    "num_col = []\n",
+    "cat_col = []\n",
+    "\n",
+    "for col in train.columns.drop([target]):\n",
+    "    if train[col].dtype == 'object':\n",
+    "        cat_col.append(col)\n",
+    "    else:\n",
+    "        num_col.append(col)\n",
+    "\n",
+    "# print(\"Numerical Columns : \", *num_col, \"\\n\", sep=\"\\n\")\n",
+    "# print(\"Categorical Columns : \", *cat_col, sep=\"\\n\")\n",
+    "\n",
+    "# One-hot encode the categorical features\n",
+    "train = pd.get_dummies(train, columns=cat_col)\n",
+    "test = pd.get_dummies(test, columns=cat_col)\n",
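+    "\n",
+    "# Added safeguard (an assumption, not in the original notebook): one-hot\n",
+    "# encoding train and test separately can produce mismatched columns when a\n",
+    "# category is missing from one split, so align test to the train schema.\n",
+    "test = test.reindex(columns=train.columns, fill_value=0)\n",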
+    "\n",
+    "# Encode the target labels as integers (train split only; test is unused below)\n",
+    "le = LabelEncoder()\n",
+    "train['NObeyesdad'] = le.fit_transform(train['NObeyesdad'])\n",
+    "\n",
+    "X_train, X_val, y_train, y_val = train_test_split(train.drop([target], axis=1), train[target], test_size=0.2, random_state=42)\n",
+    "X_train.shape, y_train.shape, X_val.shape, y_val.shape\n",
+    "\n",
+    "import optuna\n",
+    "ran_optuna = False\n",
+    "\n",
+    "def optimization_function(trial):\n",
+    "\n",
+    "    lgbParams = {\n",
+    "        'num_class': 7,\n",
+    "        'random_state': 42,\n",
+    "        'metric': 'multi_logloss',\n",
+    "        'boosting_type': 'gbdt',\n",
+    "        'objective': 'multiclass',\n",
+    "\n",
+    "        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),\n",
+    "        'n_estimators': trial.suggest_int('n_estimators', 400, 600),\n",
+    "        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),\n",
+    "        'reg_lambda': trial.suggest_float('reg_lambda', 1e-1, 10.0, log=True),\n",
+    "        'max_depth': trial.suggest_int('max_depth', 6, 20),\n",
+    "        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),\n",
+    "        'subsample': trial.suggest_float('subsample', 0.8, 1.0),\n",
+    "        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),\n",
+    "    }\n",
+    "\n",
+    "    lgb_model = lgb.LGBMClassifier(**lgbParams)\n",
+    "\n",
+    "    # skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)\n",
+    "    # accuracy = cross_val_score(lgb_model, X_train, y_train, cv=skf, scoring='accuracy')\n",
+    "    # print(\"=\"*50, '\\nValidation Accuracy:', accuracy.mean())\n",
+    "\n",
+    "    lgb_model.fit(X_train, y_train)\n",
+    "\n",
+    "    # Evaluate on the hold-out split and log everything to MLflow\n",
+    "    y_pred = lgb_model.predict(X_val)\n",
+    "    acc = accuracy_score(y_val, y_pred)\n",
+    "    precision = precision_score(y_val, y_pred, average='weighted')\n",
+    "    recall = recall_score(y_val, y_pred, average='weighted')\n",
+    "    f1 = f1_score(y_val, y_pred, average='weighted')\n",
+    "\n",
+    "    mlflow.log_metric('accuracy', acc)\n",
+    "    mlflow.log_metric('precision', precision)\n",
+    "    mlflow.log_metric('recall', recall)\n",
+    "    mlflow.log_metric('f1', f1)\n",
+    "\n",
+    "    precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
+    "    for i in range(len(recall_per_class)):\n",
+    "        print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
+    "        mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
+    "\n",
+    "    mlflow.lightgbm.log_model(lgb_model, 'model')\n",
+    "    mlflow.set_tag('experiments', 'Arham A.')\n",
+    "    mlflow.set_tag('model_name', 'LightGBM')\n",
+    "    mlflow.set_tag('preprocessing', 'Yes')\n",
+    "\n",
+    "    return acc"
+   ]
+  },
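+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A minimal sketch of the search step the notebook leaves out: the next cell\n",
+    "# reads `study`, so it was presumably produced like this. The direction and\n",
+    "# the trial count are assumptions (the comments below mention 100 trials),\n",
+    "# not recorded settings from the original run.\n",
+    "if ran_optuna:\n",
+    "    study = optuna.create_study(direction='maximize')\n",
+    "    study.optimize(optimization_function, n_trials=100)\n"
+   ]
+  },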
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9058910707669507"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "if ran_optuna:\n",
+    "\n",
+    "    print('Number of finished trials:', len(study.trials))\n",
+    "    print('Best trial:', study.best_trial.params)\n",
+    "\n",
+    "    optuna.visualization.plot_param_importances(study)\n",
+    "    study.trials_dataframe().sort_values('value', ascending=False)\n",
+    "    optuna.visualization.plot_slice(study)\n",
+    "\n",
+    "# 100 trials\n",
+    "# {'objective': 'multiclassova', 'learning_rate': 0.04641200998070569, 'n_estimators': 587, 'reg_alpha': 0.0065043557057678746, 'reg_lambda': 4.460933310544669, 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, 'subsample': 0.8193986843950917, 'min_child_samples': 15}\n",
+    "\n",
+    "if ran_optuna:\n",
+    "    lgbParams = study.best_trial.params\n",
+    "\n",
+    "else:\n",
+    "    # # 100 trials with PCA, seed = None\n",
+    "    # lgbParams = {\n",
+    "    #     'objective': 'multiclassova',\n",
+    "    #     'learning_rate': 0.04641200998070569,\n",
+    "    #     'n_estimators': 587,\n",
+    "    #     'reg_alpha': 0.0065043557057678746,\n",
+    "    #     'reg_lambda': 4.460933310544669,\n",
+    "    #     'max_depth': 7,\n",
+    "    #     'colsample_bytree': 0.6833315654013498,\n",
+    "    #     'subsample': 0.8193986843950917,\n",
+    "    #     'min_child_samples': 15\n",
+    "    # }\n",
+    "\n",
+    "    # Moaz's hyperparameters\n",
+    "    lgbParams = {\n",
+    "        \"objective\": \"multiclass\",  # objective function for the model\n",
+    "        \"metric\": \"multi_logloss\",  # evaluation metric\n",
+    "        \"verbosity\": -1,  # verbosity level (-1 for silent)\n",
+    "        \"boosting_type\": \"gbdt\",  # gradient boosting type\n",
+    "        \"random_state\": 42,  # random state for reproducibility\n",
+    "        \"num_class\": 7,  # number of classes in the dataset\n",
+    "        'learning_rate': 0.030962211546832760,  # learning rate for gradient boosting\n",
+    "        'n_estimators': 500,  # number of boosting iterations\n",
+    "        'lambda_l1': 0.009667446568254372,  # L1 regularization term\n",
+    "        'lambda_l2': 0.04018641437301800,  # L2 regularization term\n",
+    "        'max_depth': 10,  # maximum depth of the trees\n",
+    "        'colsample_bytree': 0.40977129346872643,  # fraction of features per tree\n",
+    "        'subsample': 0.9535797422450176,  # fraction of samples per boosting iteration\n",
+    "        'min_child_samples': 26  # minimum number of data points in a leaf\n",
+    "    }\n",
+    "\n",
+    "# Pin the non-tunable settings regardless of where lgbParams came from\n",
+    "fixed_params = {\n",
+    "    'boosting_type': 'gbdt',\n",
+    "    'num_class': 7,\n",
+    "    'random_state': 42,\n",
+    "    'metric': 'multi_logloss',\n",
+    "}\n",
+    "\n",
+    "lgbParams.update(fixed_params)\n",
+    "\n",
+    "lgbParams\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659, -0.0087675011457998, -0.001077949504617301]\n",
+      "\n",
+      "Accuracy: 0.9058910707669507\n",
+      "Precision: 0.9067204051187663\n",
+      "Recall: 0.9058910707669507\n",
+      "F1 0.9063055482178468\n",
+      "Recall for class 0: 0.9208860759493671\n",
+      "Recall for class 1: 0.9090909090909091\n",
+      "Recall for class 2: 0.8741092636579573\n",
+      "Recall for class 3: 0.9736842105263158\n",
+      "Recall for class 4: 0.9960474308300395\n",
+      "Recall for class 5: 0.7701492537313432\n",
+      "Recall for class 6: 0.8419452887537994\n"
+     ]
+    }
+   ],
+   "source": [
+    "import mlflow\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
+    "from sklearn.metrics import precision_recall_fscore_support\n",
+    "\n",
+    "mlflow.sklearn.autolog(disable=True)\n",
+    "\n",
+    "with mlflow.start_run(run_name=\"LGB_Final\"):\n",
+    "    # Compare class frequencies between the train and validation splits\n",
+    "    class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]\n",
+    "    class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]\n",
+    "    target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]\n",
+    "    print(f\"Target Drift For Each Class {target_drift}\")\n",
+    "    mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n",
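+    "\n",
+    "    # Added for traceability (an assumption, not part of the original run):\n",
+    "    # record the final hyperparameters next to the metrics.\n",
+    "    mlflow.log_params(lgbParams)\n",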
"mlflow.sklearn.autolog(disable=True)\n", + "\n", + "with mlflow.start_run(run_name=\"LGB_Final\"):\n", + " class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]\n", + " class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]\n", + " target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]\n", + " print(f\"Target Drift For Each Class {target_drift}\")\n", + " mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n", + "\n", + "\n", + "\n", + " lgb_model_final = lgb.LGBMClassifier(**lgbParams)\n", + " lgb_model_final = lgb_model_final.fit(X_train, y_train)\n", + " y_pred = lgb_model_final.predict(X_val)\n", + " accuracy_xgb = accuracy_score(y_val, y_pred) \n", + " precision_xgb = precision_score(y_val, y_pred, average='weighted')\n", + " recall_xgb = recall_score(y_val, y_pred, average='weighted')\n", + " f1_xgb = 2 * (precision_xgb * recall_xgb) / (precision_xgb + recall_xgb)\n", + " print(\"\\nAccuracy:\", accuracy_xgb)\n", + " print(\"Precision:\", precision_xgb)\n", + " print(\"Recall:\", recall_xgb)\n", + " print(\"F1\", f1_xgb)\n", + " mlflow.log_metric('accuracy', accuracy_xgb)\n", + " mlflow.log_metric('precision', precision_xgb)\n", + " mlflow.log_metric('recall', recall_xgb)\n", + " mlflow.log_metric('f1', f1_xgb)\n", + "\n", + " precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n", + " for i in range(len(recall_per_class)):\n", + " print(f\"Recall for class {i}: {recall_per_class[i]}\")\n", + " mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n", + "\n", + " mlflow.lightgbm.log_model(lgb_model_final, 'model')\n", + " mlflow.set_tag('experiments', 'Arham A.')\n", + " mlflow.set_tag('model_name', 'LightGBM')\n", + " mlflow.set_tag('preprocessing', 'Yes')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "DataScience", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}