03-Experiments/Untitled-1.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024/04/26 04:39:52 INFO mlflow.tracking.fluent: Experiment with name 'LGB' does not exist. Creating a new experiment.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Experiment: artifact_location='/Users/arham/Downloads/Projects/mlruns/2', creation_time=1714120792214, experiment_id='2', last_update_time=1714120792214, lifecycle_stage='active', name='LGB', tags={}>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import mlflow\n",
28
    "\n",
29
    "\n",
30
    "# Set the MLflow tracking URI to a new SQLite URI\n",
31
    "mlflow.set_tracking_uri(\"sqlite:///new_mlflow.db\")\n",
32
    "mlflow.set_experiment(\"LGB\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
43
    "import matplotlib.pyplot as plt\n",
44
    "from scipy.stats import chi2_contingency\n",
45
    "from sklearn.model_selection import train_test_split\n",
46
    "\n",
47
    "import lightgbm as lgb\n",
48
    "from catboost import CatBoostClassifier, Pool\n",
49
    "from xgboost import XGBClassifier\n",
50
    "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
51
    "from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score, f1_score, auc,classification_report\n",
52
    "from scipy.stats import ks_2samp\n",
53
    "\n",
54
    "from sklearn.preprocessing import label_binarize,OneHotEncoder, StandardScaler, FunctionTransformer, LabelEncoder\n",
55
    "from itertools import cycle\n",
56
    "\n",
57
    "from sklearn.ensemble import VotingClassifier\n",
58
    "from sklearn.model_selection import RandomizedSearchCV\n",
59
    "import shap\n",
60
    "\n",
61
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
62
    "from sklearn.decomposition import TruncatedSVD, PCA\n",
63
    "\n",
64
    "import warnings\n",
65
    "warnings.filterwarnings(\"ignore\")\n",
66
    "\n",
67
    "import numpy as np \n",
68
    "import pandas as pd\n",
69
    "\n",
70
    "def load_data(path):\n",
71
    "    df = pd.read_csv(path)\n",
72
    "    # arham check this later\n",
73
    "    # original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')\n",
74
    "    # split to train test\n",
75
    "    train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)\n",
76
    "    train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
77
    "    test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
78
    "    return train_df, test_df\n",
79
    "\n",
80
    "def corr_heat_map(df,scale=1) :\n",
81
    "    # Calculate the correlation matrix\n",
82
    "    correlation_matrix = df.corr()\n",
83
    "\n",
84
    "    # Create a mask for the upper triangle\n",
85
    "    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
86
    "\n",
87
    "    # Set up the matplotlib figure\n",
88
    "    plt.figure(figsize=(10//scale, 8//scale))\n",
89
    "\n",
90
    "    # Define a custom color palette\n",
91
    "    cmap = sns.diverging_palette(220, 20, as_cmap=True)\n",
92
    "\n",
93
    "    # Draw the heatmap with the mask and correct aspect ratio\n",
94
    "    sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,\n",
95
    "                square=True, linewidths=.5, cbar_kws={\"shrink\": 0.7})\n",
96
    "\n",
97
    "    plt.title('Correlation Heatmap')\n",
    "\n",
    "path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
    "train, test = load_data(path)\n",
    "\n",
    "target = 'NObeyesdad'\n",
    "num_col = []\n",
    "cat_col = []\n",
    "\n",
    "for i in train.columns.drop([target]):\n",
    "\n",
    "    if train[i].dtype == 'object':\n",
    "        cat_col.append(i)\n",
    "\n",
    "    else:\n",
    "        num_col.append(i)\n",
    "\n",
    "# print(\"Numerical Columns : \", *num_col,\"\\n\",sep=\"\\n\")\n",
    "# print(\"Categorical Columns : \", *cat_col,sep=\"\\n\")\n",
    "\n",
    "train = pd.get_dummies(train,\n",
    "                       columns=cat_col)\n",
    "test = pd.get_dummies(test,\n",
    "                      columns=cat_col)\n",
    "\n",
    "le = LabelEncoder()\n",
    "train['NObeyesdad'] = le.fit_transform(train['NObeyesdad'])\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(train.drop([target], axis=1), train[target], test_size=0.2, random_state=42)\n",
    "X_train.shape, y_train.shape, X_val.shape, y_val.shape\n",
    "\n",
    "import optuna\n",
    "ran_optuna = False\n",
    "\n",
    "def optimization_function(trial):\n",
    "\n",
    "    lgbParams = {\n",
    "        'num_class': 7,\n",
    "        'random_state': 42,\n",
    "        'metric': 'multi_logloss',\n",
    "        'boosting_type': 'gbdt',\n",
    "        'objective': 'multiclass',\n",
    "\n",
    "        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),\n",
    "        'n_estimators': trial.suggest_int('n_estimators', 400, 600),\n",
    "        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),\n",
    "        'reg_lambda': trial.suggest_float('reg_lambda', 1e-1, 10.0, log=True),\n",
    "        'max_depth': trial.suggest_int('max_depth', 6, 20),\n",
    "        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),\n",
    "        'subsample': trial.suggest_float('subsample', 0.8, 1.0),\n",
    "        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),\n",
    "    }\n",
    "\n",
    "    lgb_model = lgb.LGBMClassifier(**lgbParams)\n",
    "\n",
    "#     skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)\n",
    "#     accuracy = cross_val_score(lgb_model, X_train, y_train, cv=skf, scoring='accuracy')\n",
    "#     print(\"=\"*50, '\\nValidation Accuracy:', accuracy.mean())\n",
    "\n",
    "    lgb_model.fit(X_train, y_train)\n",
    "\n",
    "    acc = accuracy_score(y_val,lgb_model.predict(X_val))\n",
163
    "\n",
164
    "        mlflow.log_metric('accuracy', accuracy)\n",
165
    "        mlflow.log_metric('precision', precision)\n",
166
    "        mlflow.log_metric('recall', recall)\n",
167
    "        mlflow.log_metric('f1', f1)\n",
168
    "\n",
169
    "        precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
170
    "        for i in range(len(recall_per_class)):\n",
171
    "            print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
172
    "            mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
173
    "\n",
174
    "        mlflow.lightgbm.log_model(lgb_model_final, 'model')\n",
175
    "        mlflow.set_tag('experiments', 'Arham A.')\n",
176
    "        mlflow.set_tag('model_name', 'LightGBM')\n",
177
    "        mlflow.set_tag('preprocessing', 'Yes')\n",
178
    "    \n",
179
    "    return acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9058910707669507"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "if ran_optuna : \n",
200
    "\n",
201
    "    print('Number of finished trials:', len(study.trials))\n",
202
    "\n",
203
    "    print('Best trial:', study.best_trial.params)\n",
204
    "\n",
205
    "    optuna.visualization.plot_param_importances(study)\n",
206
    "\n",
207
    "    study.trials_dataframe().sort_values('value',ascending=False)\n",
208
    "\n",
209
    "    optuna.visualization.plot_slice(study)\n",
210
    "\n",
211
    "# 100 trials \n",
212
    "# {'objective': 'multiclassova', 'learning_rate': 0.04641200998070569, 'n_estimators': 587, 'reg_alpha': 0.0065043557057678746, 'reg_lambda': 4.460933310544669, 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, 'subsample': 0.8193986843950917, 'min_child_samples': 15}\n",
213
    "\n",
214
    "\n",
215
    "if ran_optuna : \n",
216
    "    lgbParams = study.best_trial.params\n",
217
    "\n",
218
    "else :\n",
219
    "    \n",
220
    "#     # 100- traials with PCA seed = None\n",
221
    "#     lgbParams = {\n",
222
    "#         'objective': 'multiclassova', \n",
223
    "#         'learning_rate': 0.04641200998070569, \n",
224
    "#         'n_estimators': 587, \n",
225
    "#         'reg_alpha': 0.0065043557057678746, \n",
226
    "#         'reg_lambda': 4.460933310544669, \n",
227
    "#         'max_depth': 7, 'colsample_bytree': 0.6833315654013498, \n",
228
    "#         'subsample': 0.8193986843950917, \n",
229
    "#         'min_child_samples': 15\n",
230
    "#     }\n",
231
    "    \n",
232
    "    \n",
233
    "    # Moaz HyperParams\n",
234
    "    lgbParams = {\n",
235
    "        \"objective\": \"multiclass\",          # Objective function for the model\n",
236
    "        \"metric\": \"multi_logloss\",          # Evaluation metric\n",
237
    "        \"verbosity\": -1,                    # Verbosity level (-1 for silent)\n",
238
    "        \"boosting_type\": \"gbdt\",            # Gradient boosting type\n",
239
    "        \"random_state\": 42,       # Random state for reproducibility\n",
240
    "        \"num_class\": 7,                     # Number of classes in the dataset\n",
241
    "        'learning_rate': 0.030962211546832760,  # Learning rate for gradient boosting\n",
242
    "        'n_estimators': 500,                # Number of boosting iterations\n",
243
    "        'lambda_l1': 0.009667446568254372,  # L1 regularization term\n",
244
    "        'lambda_l2': 0.04018641437301800,   # L2 regularization term\n",
245
    "        'max_depth': 10,                    # Maximum depth of the trees\n",
246
    "        'colsample_bytree': 0.40977129346872643,  # Fraction of features to consider for each tree\n",
247
    "        'subsample': 0.9535797422450176,    # Fraction of samples to consider for each boosting iteration\n",
248
    "        'min_child_samples': 26             # Minimum number of data needed in a leaf\n",
249
    "    }\n",
250
    "\n",
251
    "\n",
252
    "\n",
253
    "fixed_params = {\n",
254
    "    'boosting_type': 'gbdt',\n",
255
    "    'num_class': 7,\n",
256
    "    'random_state': 42,\n",
257
    "    'metric': 'multi_logloss',\n",
258
    "}\n",
259
    "\n",
260
    "\n",
261
    "for i in fixed_params.keys() : \n",
262
    "\n",
263
    "    lgbParams[i] = fixed_params[i]\n",
264
    "\n",
265
    "\n",
266
    "lgbParams\n",
267
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659, -0.0087675011457998, -0.001077949504617301]\n",
      "\n",
      "Accuracy: 0.9058910707669507\n",
      "Precision: 0.9067204051187663\n",
      "Recall: 0.9058910707669507\n",
      "F1 0.9063055482178468\n",
      "Recall for class 0: 0.9208860759493671\n",
      "Recall for class 1: 0.9090909090909091\n",
      "Recall for class 2: 0.8741092636579573\n",
      "Recall for class 3: 0.9736842105263158\n",
      "Recall for class 4: 0.9960474308300395\n",
      "Recall for class 5: 0.7701492537313432\n",
      "Recall for class 6: 0.8419452887537994\n"
     ]
    }
   ],
   "source": [
    "\n",
297
    "\n",
298
    "import xgboost as xgb\n",
299
    "from sklearn.model_selection import cross_val_score\n",
300
    "from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
301
    "import mlflow\n",
302
    "import warnings\n",
303
    "warnings.filterwarnings(\"ignore\")\n",
304
    "# import precision_recall_fscore_support\n",
305
    "from sklearn.metrics import precision_recall_fscore_support\n",
306
    "\n",
307
    "mlflow.sklearn.autolog(disable=True)\n",
308
    "\n",
    "with mlflow.start_run(run_name=\"LGB_Final\"):\n",
    "    class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]\n",
    "    class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]\n",
    "    target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]\n",
    "    print(f\"Target Drift For Each Class {target_drift}\")\n",
    "    mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n",
    "\n",
    "    lgb_model_final = lgb.LGBMClassifier(**lgbParams)\n",
    "    lgb_model_final = lgb_model_final.fit(X_train, y_train)\n",
    "    y_pred = lgb_model_final.predict(X_val)\n",
    "    accuracy_lgb = accuracy_score(y_val, y_pred)\n",
    "    precision_lgb = precision_score(y_val, y_pred, average='weighted')\n",
    "    recall_lgb = recall_score(y_val, y_pred, average='weighted')\n",
    "    # Harmonic mean of the weighted precision/recall (not identical to sklearn's weighted F1)\n",
    "    f1_lgb = 2 * (precision_lgb * recall_lgb) / (precision_lgb + recall_lgb)\n",
    "    print(\"\\nAccuracy:\", accuracy_lgb)\n",
    "    print(\"Precision:\", precision_lgb)\n",
    "    print(\"Recall:\", recall_lgb)\n",
    "    print(\"F1\", f1_lgb)\n",
    "    mlflow.log_metric('accuracy', accuracy_lgb)\n",
    "    mlflow.log_metric('precision', precision_lgb)\n",
    "    mlflow.log_metric('recall', recall_lgb)\n",
    "    mlflow.log_metric('f1', f1_lgb)\n",
    "\n",
    "    precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
    "    for i in range(len(recall_per_class)):\n",
    "        print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
    "        mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
    "\n",
    "    mlflow.lightgbm.log_model(lgb_model_final, 'model')\n",
    "    mlflow.set_tag('experiments', 'Arham A.')\n",
    "    mlflow.set_tag('model_name', 'LightGBM')\n",
    "    mlflow.set_tag('preprocessing', 'Yes')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DataScience",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}