[7bf731]: / 03-Experiments / Untitled-1.ipynb

Download this file

376 lines (375 with data), 14.6 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024/04/26 04:39:52 INFO mlflow.tracking.fluent: Experiment with name 'LGB' does not exist. Creating a new experiment.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Experiment: artifact_location='/Users/arham/Downloads/Projects/mlruns/2', creation_time=1714120792214, experiment_id='2', last_update_time=1714120792214, lifecycle_stage='active', name='LGB', tags={}>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import mlflow\n",
    "\n",
    "\n",
    "# Point MLflow at a local SQLite tracking store, then select the experiment\n",
    "# that the runs in this notebook are recorded under.\n",
    "TRACKING_URI = \"sqlite:///new_mlflow.db\"\n",
    "EXPERIMENT_NAME = \"LGB\"\n",
    "\n",
    "mlflow.set_tracking_uri(TRACKING_URI)\n",
    "mlflow.set_experiment(EXPERIMENT_NAME)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import chi2_contingency\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import lightgbm as lgb\n",
    "from catboost import CatBoostClassifier, Pool\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
    "from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score, f1_score, auc,classification_report\n",
    "from scipy.stats import ks_2samp\n",
    "\n",
    "from sklearn.preprocessing import label_binarize,OneHotEncoder, StandardScaler, FunctionTransformer, LabelEncoder\n",
    "from itertools import cycle\n",
    "\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "import shap\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD, PCA\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import numpy as np \n",
    "import pandas as pd\n",
    "\n",
    "def load_data(path, test_size=0.35, random_state=42, id_col='id'):\n",
    "    \"\"\"Read a CSV file and split it into train and test DataFrames.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str or path-like\n",
    "        Location of the CSV file; passed straight to ``pd.read_csv``.\n",
    "    test_size : float, default 0.35\n",
    "        Forwarded to ``train_test_split`` as ``test_size``.\n",
    "    random_state : int, default 42\n",
    "        Forwarded to ``train_test_split`` as ``random_state``.\n",
    "    id_col : str, default 'id'\n",
    "        Column removed from both frames before duplicates are dropped.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of (DataFrame, DataFrame)\n",
    "        ``(train_df, test_df)``, each without ``id_col``, without duplicate\n",
    "        rows and with a freshly reset index.\n",
    "    \"\"\"\n",
    "    df = pd.read_csv(path)\n",
    "    # TODO(arham): check this later -- load of the original dataset, currently disabled:\n",
    "    # original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')\n",
    "    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)\n",
    "\n",
    "    def _clean(frame):\n",
    "        # The id column goes first so rows differing only by id count as duplicates.\n",
    "        return frame.drop([id_col], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "\n",
    "    return _clean(train_df), _clean(test_df)\n",
    "\n",
    "def corr_heat_map(df, scale=1):\n",
    "    \"\"\"Draw a lower-triangle correlation heatmap of the numeric columns of ``df``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Input frame; only numeric and boolean columns enter the correlation.\n",
    "    scale : int or float, default 1\n",
    "        Divisor applied to the base 10x8 figure size; must be positive.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``scale`` is not greater than zero.\n",
    "    \"\"\"\n",
    "    if scale <= 0:\n",
    "        raise ValueError('scale must be a positive number')\n",
    "\n",
    "    # Restrict to numeric/bool columns explicitly: on pandas >= 2.0 a bare\n",
    "    # df.corr() raises when the frame still holds string columns.\n",
    "    correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()\n",
    "\n",
    "    # Hide the upper triangle (diagonal included); it mirrors the lower one.\n",
    "    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
    "\n",
    "    # True division: floor division truncated the size and produced a zero\n",
    "    # height for scale > 8.\n",
    "    plt.figure(figsize=(10 / scale, 8 / scale))\n",
    "\n",
    "    # Diverging palette centred on zero correlation.\n",
    "    cmap = sns.diverging_palette(220, 20, as_cmap=True)\n",
    "\n",
    "    sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,\n",
    "                square=True, linewidths=.5, cbar_kws={\"shrink\": 0.7})\n",
    "\n",
    "    plt.title('Correlation Heatmap')\n",
    "\n",
    "\n",
    "# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.\n",
    "path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
    "train, test = load_data(path)\n",
    "\n",
    "target = 'NObeyesdad'\n",
    "\n",
    "# Split the feature columns by dtype: object columns are one-hot encoded below.\n",
    "feature_cols = train.columns.drop([target])\n",
    "cat_col = [col for col in feature_cols if train[col].dtype == 'object']\n",
    "num_col = [col for col in feature_cols if train[col].dtype != 'object']\n",
    "\n",
    "train = pd.get_dummies(train,\n",
    "                       columns=cat_col)\n",
    "test = pd.get_dummies(test,\n",
    "                      columns=cat_col)\n",
    "\n",
    "# The two frames are encoded independently, so a category that occurs in only\n",
    "# one of them yields different columns. Align the hold-out frame to the\n",
    "# training layout: missing dummies become 0, unseen ones are dropped.\n",
    "test = test.reindex(columns=train.columns, fill_value=0)\n",
    "\n",
    "# Encode the class labels as integers; only the training frame is encoded here.\n",
    "le = LabelEncoder()\n",
    "train[target] = le.fit_transform(train[target])\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    train.drop([target], axis=1),\n",
    "    train[target],\n",
    "    test_size=0.2,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "import optuna\n",
    "# Flag read by the later cells: True uses the Optuna study results, False the stored parameters.\n",
    "ran_optuna = False \n",
    "\n",
    "def optimization_function(trial):\n",
    "    \"\"\"Optuna objective: fit one LightGBM candidate and return its validation accuracy.\n",
    "\n",
    "    Hyper-parameters are sampled from ``trial``. The model is fitted on the\n",
    "    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.\n",
    "    The sampled parameters, the metrics and the fitted model are logged to an\n",
    "    MLflow run.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    trial : optuna.trial.Trial\n",
    "        Trial object used to sample the search space.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Accuracy on the validation split.\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        'num_class': 7,\n",
    "        'random_state': 42,\n",
    "        'metric': 'multi_logloss',\n",
    "        'boosting_type': 'gbdt',\n",
    "        'objective': 'multiclass',\n",
    "\n",
    "        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),\n",
    "        'n_estimators': trial.suggest_int('n_estimators', 400, 600),\n",
    "        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.\n",
    "        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),\n",
    "        'reg_lambda': trial.suggest_float('reg_lambda', 1e-1, 10.0, log=True),\n",
    "        'max_depth': trial.suggest_int('max_depth', 6, 20),\n",
    "        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),\n",
    "        'subsample': trial.suggest_float('subsample', 0.8, 1.0),\n",
    "        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),\n",
    "    }\n",
    "\n",
    "    # Alternative (disabled): 5-fold StratifiedKFold cross_val_score on\n",
    "    # X_train/y_train instead of the single hold-out split used below.\n",
    "    lgb_model = lgb.LGBMClassifier(**params)\n",
    "    lgb_model.fit(X_train, y_train)\n",
    "    y_pred = lgb_model.predict(X_val)\n",
    "\n",
    "    acc = accuracy_score(y_val, y_pred)\n",
    "    precision = precision_score(y_val, y_pred, average='weighted')\n",
    "    recall = recall_score(y_val, y_pred, average='weighted')\n",
    "    f1 = f1_score(y_val, y_pred, average='weighted')\n",
    "    recall_per_class = recall_score(y_val, y_pred, average=None)\n",
    "\n",
    "    # nested=True lets the trial run sit under an already-active run, if any.\n",
    "    with mlflow.start_run(run_name=f'LGB_trial_{trial.number}', nested=True):\n",
    "        mlflow.log_params(trial.params)\n",
    "\n",
    "        mlflow.log_metric('accuracy', acc)\n",
    "        mlflow.log_metric('precision', precision)\n",
    "        mlflow.log_metric('recall', recall)\n",
    "        mlflow.log_metric('f1', f1)\n",
    "        for i, class_recall in enumerate(recall_per_class):\n",
    "            mlflow.log_metric(f'recall_class_{i}', class_recall)\n",
    "\n",
    "        mlflow.lightgbm.log_model(lgb_model, 'model')\n",
    "        mlflow.set_tag('experiments', 'Arham A.')\n",
    "        mlflow.set_tag('model_name', 'LightGBM')\n",
    "        mlflow.set_tag('preprocessing', 'Yes')\n",
    "\n",
    "    return acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9058910707669507"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Best parameters from an earlier 100-trial study (PCA seed = None), kept for reference:\n",
    "# {'objective': 'multiclassova', 'learning_rate': 0.04641200998070569, 'n_estimators': 587, 'reg_alpha': 0.0065043557057678746, 'reg_lambda': 4.460933310544669, 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, 'subsample': 0.8193986843950917, 'min_child_samples': 15}\n",
    "\n",
    "if ran_optuna:\n",
    "    print('Number of finished trials:', len(study.trials))\n",
    "    print('Best trial:', study.best_trial.params)\n",
    "\n",
    "    # Expressions nested inside an `if` block are not auto-displayed by\n",
    "    # Jupyter, so the figures and the table are rendered explicitly.\n",
    "    optuna.visualization.plot_param_importances(study).show()\n",
    "    display(study.trials_dataframe().sort_values('value', ascending=False))\n",
    "    optuna.visualization.plot_slice(study).show()\n",
    "\n",
    "    # Copy so the update below does not write into the study's own trial record.\n",
    "    lgbParams = dict(study.best_trial.params)\n",
    "\n",
    "else:\n",
    "    # Moaz HyperParams\n",
    "    lgbParams = {\n",
    "        \"objective\": \"multiclass\",          # Objective function for the model\n",
    "        \"metric\": \"multi_logloss\",          # Evaluation metric\n",
    "        \"verbosity\": -1,                    # Verbosity level (-1 for silent)\n",
    "        \"boosting_type\": \"gbdt\",            # Gradient boosting type\n",
    "        \"random_state\": 42,                 # Random state for reproducibility\n",
    "        \"num_class\": 7,                     # Number of classes in the dataset\n",
    "        'learning_rate': 0.030962211546832760,  # Learning rate for gradient boosting\n",
    "        'n_estimators': 500,                # Number of boosting iterations\n",
    "        'lambda_l1': 0.009667446568254372,  # L1 regularization term\n",
    "        'lambda_l2': 0.04018641437301800,   # L2 regularization term\n",
    "        'max_depth': 10,                    # Maximum depth of the trees\n",
    "        'colsample_bytree': 0.40977129346872643,  # Fraction of features to consider for each tree\n",
    "        'subsample': 0.9535797422450176,    # Fraction of samples to consider for each boosting iteration\n",
    "        'min_child_samples': 26             # Minimum number of data needed in a leaf\n",
    "    }\n",
    "\n",
    "\n",
    "# Settings forced onto the parameter set regardless of which branch produced it.\n",
    "fixed_params = {\n",
    "    'boosting_type': 'gbdt',\n",
    "    'num_class': 7,\n",
    "    'random_state': 42,\n",
    "    'metric': 'multi_logloss',\n",
    "}\n",
    "\n",
    "lgbParams.update(fixed_params)\n",
    "\n",
    "lgbParams\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659, -0.0087675011457998, -0.001077949504617301]\n",
      "\n",
      "Accuracy: 0.9058910707669507\n",
      "Precision: 0.9067204051187663\n",
      "Recall: 0.9058910707669507\n",
      "F1 0.9063055482178468\n",
      "Recall for class 0: 0.9208860759493671\n",
      "Recall for class 1: 0.9090909090909091\n",
      "Recall for class 2: 0.8741092636579573\n",
      "Recall for class 3: 0.9736842105263158\n",
      "Recall for class 4: 0.9960474308300395\n",
      "Recall for class 5: 0.7701492537313432\n",
      "Recall for class 6: 0.8419452887537994\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
    "import mlflow\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "# import precision_recall_fscore_support\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "\n",
    "# Disable sklearn autologging; everything recorded for this run is logged explicitly below.\n",
    "mlflow.sklearn.autolog(disable=True)\n",
    "\n",
    "with mlflow.start_run(run_name=\"LGB_Final\"):\n",
    "    # Target drift: per-class share of the training labels minus the share of\n",
    "    # the same class in the validation labels.\n",
    "    n_classes = lgbParams['num_class']\n",
    "    class_share_train = [(y_train == i).mean() for i in range(n_classes)]\n",
    "    class_share_val = [(y_val == i).mean() for i in range(n_classes)]\n",
    "    target_drift = [train_share - val_share\n",
    "                    for train_share, val_share in zip(class_share_train, class_share_val)]\n",
    "    print(f\"Target Drift For Each Class {target_drift}\")\n",
    "    mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n",
    "\n",
    "    # Fit the final LightGBM model and predict the validation split.\n",
    "    lgb_model_final = lgb.LGBMClassifier(**lgbParams)\n",
    "    lgb_model_final = lgb_model_final.fit(X_train, y_train)\n",
    "    y_pred = lgb_model_final.predict(X_val)\n",
    "\n",
    "    accuracy_lgb = accuracy_score(y_val, y_pred)\n",
    "    precision_lgb = precision_score(y_val, y_pred, average='weighted')\n",
    "    recall_lgb = recall_score(y_val, y_pred, average='weighted')\n",
    "    # Support-weighted F1. The harmonic mean of the weighted precision and the\n",
    "    # weighted recall is a different quantity and is not the weighted F1.\n",
    "    f1_lgb = f1_score(y_val, y_pred, average='weighted')\n",
    "    print(\"\\nAccuracy:\", accuracy_lgb)\n",
    "    print(\"Precision:\", precision_lgb)\n",
    "    print(\"Recall:\", recall_lgb)\n",
    "    print(\"F1\", f1_lgb)\n",
    "    mlflow.log_metric('accuracy', accuracy_lgb)\n",
    "    mlflow.log_metric('precision', precision_lgb)\n",
    "    mlflow.log_metric('recall', recall_lgb)\n",
    "    mlflow.log_metric('f1', f1_lgb)\n",
    "\n",
    "    precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
    "    for i, class_recall in enumerate(recall_per_class):\n",
    "        print(f\"Recall for class {i}: {class_recall}\")\n",
    "        mlflow.log_metric(f'recall_class_{i}', class_recall)\n",
    "\n",
    "    mlflow.lightgbm.log_model(lgb_model_final, 'model')\n",
    "    mlflow.set_tag('experiments', 'Arham A.')\n",
    "    mlflow.set_tag('model_name', 'LightGBM')\n",
    "    mlflow.set_tag('preprocessing', 'Yes')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DataScience",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}