03-Experiments/Untitled-1.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024/04/26 04:39:52 INFO mlflow.tracking.fluent: Experiment with name 'LGB' does not exist. Creating a new experiment.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Experiment: artifact_location='/Users/arham/Downloads/Projects/mlruns/2', creation_time=1714120792214, experiment_id='2', last_update_time=1714120792214, lifecycle_stage='active', name='LGB', tags={}>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import mlflow\n",
28
    "\n",
29
    "\n",
30
    "# Set the MLflow tracking URI to a new SQLite URI\n",
31
    "mlflow.set_tracking_uri(\"sqlite:///new_mlflow.db\")\n",
32
    "mlflow.set_experiment(\"LGB\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
43
    "import matplotlib.pyplot as plt\n",
44
    "from scipy.stats import chi2_contingency\n",
45
    "from sklearn.model_selection import train_test_split\n",
46
    "\n",
47
    "import lightgbm as lgb\n",
48
    "from catboost import CatBoostClassifier, Pool\n",
49
    "from xgboost import XGBClassifier\n",
50
    "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
51
    "from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score, f1_score, auc,classification_report\n",
52
    "from scipy.stats import ks_2samp\n",
53
    "\n",
54
    "from sklearn.preprocessing import label_binarize,OneHotEncoder, StandardScaler, FunctionTransformer, LabelEncoder\n",
55
    "from itertools import cycle\n",
56
    "\n",
57
    "from sklearn.ensemble import VotingClassifier\n",
58
    "from sklearn.model_selection import RandomizedSearchCV\n",
59
    "import shap\n",
60
    "\n",
61
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
62
    "from sklearn.decomposition import TruncatedSVD, PCA\n",
63
    "\n",
64
    "import warnings\n",
65
    "warnings.filterwarnings(\"ignore\")\n",
66
    "\n",
67
    "import numpy as np \n",
68
    "import pandas as pd\n",
69
    "\n",
70
    "def load_data(path):\n",
71
    "    df = pd.read_csv(path)\n",
72
    "    # arham check this later\n",
73
    "    # original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')\n",
74
    "    # split to train test\n",
75
    "    train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)\n",
76
    "    train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
77
    "    test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
78
    "    return train_df, test_df\n",
79
    "\n",
80
    "def corr_heat_map(df,scale=1) :\n",
81
    "    # Calculate the correlation matrix\n",
82
    "    correlation_matrix = df.corr()\n",
83
    "\n",
84
    "    # Create a mask for the upper triangle\n",
85
    "    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
86
    "\n",
87
    "    # Set up the matplotlib figure\n",
88
    "    plt.figure(figsize=(10//scale, 8//scale))\n",
89
    "\n",
90
    "    # Define a custom color palette\n",
91
    "    cmap = sns.diverging_palette(220, 20, as_cmap=True)\n",
92
    "\n",
93
    "    # Draw the heatmap with the mask and correct aspect ratio\n",
94
    "    sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,\n",
95
    "                square=True, linewidths=.5, cbar_kws={\"shrink\": 0.7})\n",
96
    "\n",
97
    "    plt.title('Correlation Heatmap')\n",
    "\n",
    "path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
    "train, test = load_data(path)\n",
    "\n",
    "target = 'NObeyesdad'\n",
    "num_col = []\n",
    "cat_col = []\n",
    "\n",
    "for i in train.columns.drop([target]):\n",
    "\n",
    "    if train[i].dtype == 'object':\n",
    "        cat_col.append(i)\n",
    "\n",
    "    else:\n",
    "        num_col.append(i)\n",
    "\n",
    "# print(\"Numerical Columns : \", *num_col,\"\\n\",sep=\"\\n\")\n",
    "# print(\"Categorical Columns : \", *cat_col,sep=\"\\n\")\n",
    "\n",
    "train = pd.get_dummies(train,\n",
    "                       columns=cat_col)\n",
    "test = pd.get_dummies(test,\n",
    "                      columns=cat_col)\n",
    "\n",
    "le = LabelEncoder()\n",
    "train['NObeyesdad'] = le.fit_transform(train['NObeyesdad'])\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(train.drop([target], axis=1), train[target], test_size=0.2, random_state=42)\n",
    "X_train.shape, y_train.shape, X_val.shape, y_val.shape\n",
    "\n",
    "import optuna\n",
    "ran_optuna = False\n",
    "\n",
    "def optimization_function(trial):\n",
    "\n",
    "    lgbParams = {\n",
    "        'num_class': 7,\n",
    "        'random_state': 42,\n",
    "        'metric': 'multi_logloss',\n",
    "        'boosting_type': 'gbdt',\n",
    "        'objective': 'multiclass',\n",
    "\n",
    "        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),\n",
    "        'n_estimators': trial.suggest_int('n_estimators', 400, 600),\n",
    "        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),\n",
    "        'reg_lambda': trial.suggest_float('reg_lambda', 1e-1, 10.0, log=True),\n",
    "        'max_depth': trial.suggest_int('max_depth', 6, 20),\n",
    "        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),\n",
    "        'subsample': trial.suggest_float('subsample', 0.8, 1.0),\n",
    "        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),\n",
    "    }\n",
    "\n",
    "    lgb_model = lgb.LGBMClassifier(**lgbParams)\n",
    "\n",
    "#     skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)\n",
    "#     accuracy = cross_val_score(lgb_model, X_train, y_train, cv=skf, scoring='accuracy')\n",
    "#     print(\"=\"*50, '\\nValidation Accuracy:', accuracy.mean())\n",
    "\n",
    "    lgb_model.fit(X_train, y_train)\n",
    "\n",
    "    acc = accuracy_score(y_val,lgb_model.predict(X_val))\n",
163
    "\n",
164
    "        mlflow.log_metric('accuracy', accuracy)\n",
165
    "        mlflow.log_metric('precision', precision)\n",
166
    "        mlflow.log_metric('recall', recall)\n",
167
    "        mlflow.log_metric('f1', f1)\n",
168
    "\n",
169
    "        precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
170
    "        for i in range(len(recall_per_class)):\n",
171
    "            print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
172
    "            mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
173
    "\n",
174
    "        mlflow.lightgbm.log_model(lgb_model_final, 'model')\n",
175
    "        mlflow.set_tag('experiments', 'Arham A.')\n",
176
    "        mlflow.set_tag('model_name', 'LightGBM')\n",
177
    "        mlflow.set_tag('preprocessing', 'Yes')\n",
178
    "    \n",
179
    "    return acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9058910707669507"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "if ran_optuna : \n",
200
    "\n",
201
    "    print('Number of finished trials:', len(study.trials))\n",
202
    "\n",
203
    "    print('Best trial:', study.best_trial.params)\n",
204
    "\n",
205
    "    optuna.visualization.plot_param_importances(study)\n",
206
    "\n",
207
    "    study.trials_dataframe().sort_values('value',ascending=False)\n",
208
    "\n",
209
    "    optuna.visualization.plot_slice(study)\n",
210
    "\n",
211
    "# 100 trials \n",
212
    "# {'objective': 'multiclassova', 'learning_rate': 0.04641200998070569, 'n_estimators': 587, 'reg_alpha': 0.0065043557057678746, 'reg_lambda': 4.460933310544669, 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, 'subsample': 0.8193986843950917, 'min_child_samples': 15}\n",
213
    "\n",
214
    "\n",
215
    "if ran_optuna : \n",
216
    "    lgbParams = study.best_trial.params\n",
217
    "\n",
218
    "else :\n",
219
    "    \n",
220
    "#     # 100- traials with PCA seed = None\n",
221
    "#     lgbParams = {\n",
222
    "#         'objective': 'multiclassova', \n",
223
    "#         'learning_rate': 0.04641200998070569, \n",
224
    "#         'n_estimators': 587, \n",
225
    "#         'reg_alpha': 0.0065043557057678746, \n",
226
    "#         'reg_lambda': 4.460933310544669, \n",
227
    "#         'max_depth': 7, 'colsample_bytree': 0.6833315654013498, \n",
228
    "#         'subsample': 0.8193986843950917, \n",
229
    "#         'min_child_samples': 15\n",
230
    "#     }\n",
231
    "    \n",
232
    "    \n",
233
    "    # Moaz HyperParams\n",
234
    "    lgbParams = {\n",
235
    "        \"objective\": \"multiclass\",          # Objective function for the model\n",
236
    "        \"metric\": \"multi_logloss\",          # Evaluation metric\n",
237
    "        \"verbosity\": -1,                    # Verbosity level (-1 for silent)\n",
238
    "        \"boosting_type\": \"gbdt\",            # Gradient boosting type\n",
239
    "        \"random_state\": 42,       # Random state for reproducibility\n",
240
    "        \"num_class\": 7,                     # Number of classes in the dataset\n",
241
    "        'learning_rate': 0.030962211546832760,  # Learning rate for gradient boosting\n",
242
    "        'n_estimators': 500,                # Number of boosting iterations\n",
243
    "        'lambda_l1': 0.009667446568254372,  # L1 regularization term\n",
244
    "        'lambda_l2': 0.04018641437301800,   # L2 regularization term\n",
245
    "        'max_depth': 10,                    # Maximum depth of the trees\n",
246
    "        'colsample_bytree': 0.40977129346872643,  # Fraction of features to consider for each tree\n",
247
    "        'subsample': 0.9535797422450176,    # Fraction of samples to consider for each boosting iteration\n",
248
    "        'min_child_samples': 26             # Minimum number of data needed in a leaf\n",
249
    "    }\n",
250
    "\n",
251
    "\n",
252
    "\n",
253
    "fixed_params = {\n",
254
    "    'boosting_type': 'gbdt',\n",
255
    "    'num_class': 7,\n",
256
    "    'random_state': 42,\n",
257
    "    'metric': 'multi_logloss',\n",
258
    "}\n",
259
    "\n",
260
    "\n",
261
    "for i in fixed_params.keys() : \n",
262
    "\n",
263
    "    lgbParams[i] = fixed_params[i]\n",
264
    "\n",
265
    "\n",
266
    "lgbParams\n",
267
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659, -0.0087675011457998, -0.001077949504617301]\n",
      "\n",
      "Accuracy: 0.9058910707669507\n",
      "Precision: 0.9067204051187663\n",
      "Recall: 0.9058910707669507\n",
      "F1 0.9063055482178468\n",
      "Recall for class 0: 0.9208860759493671\n",
      "Recall for class 1: 0.9090909090909091\n",
      "Recall for class 2: 0.8741092636579573\n",
      "Recall for class 3: 0.9736842105263158\n",
      "Recall for class 4: 0.9960474308300395\n",
      "Recall for class 5: 0.7701492537313432\n",
      "Recall for class 6: 0.8419452887537994\n"
     ]
    }
   ],
   "source": [
    "\n",
297
    "\n",
298
    "import xgboost as xgb\n",
299
    "from sklearn.model_selection import cross_val_score\n",
300
    "from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
301
    "import mlflow\n",
302
    "import warnings\n",
303
    "warnings.filterwarnings(\"ignore\")\n",
304
    "# import precision_recall_fscore_support\n",
305
    "from sklearn.metrics import precision_recall_fscore_support\n",
306
    "\n",
307
    "mlflow.sklearn.autolog(disable=True)\n",
308
    "\n",
    "with mlflow.start_run(run_name=\"LGB_Final\"):\n",
    "    class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]\n",
    "    class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]\n",
    "    target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]\n",
    "    print(f\"Target Drift For Each Class {target_drift}\")\n",
    "    mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})\n",
    "\n",
    "    lgb_model_final = lgb.LGBMClassifier(**lgbParams)\n",
    "    lgb_model_final = lgb_model_final.fit(X_train, y_train)\n",
    "    y_pred = lgb_model_final.predict(X_val)\n",
    "    accuracy_lgb = accuracy_score(y_val, y_pred)\n",
    "    precision_lgb = precision_score(y_val, y_pred, average='weighted')\n",
    "    recall_lgb = recall_score(y_val, y_pred, average='weighted')\n",
    "    # Harmonic mean of the weighted precision/recall (not identical to sklearn's weighted F1)\n",
    "    f1_lgb = 2 * (precision_lgb * recall_lgb) / (precision_lgb + recall_lgb)\n",
    "    print(\"\\nAccuracy:\", accuracy_lgb)\n",
    "    print(\"Precision:\", precision_lgb)\n",
    "    print(\"Recall:\", recall_lgb)\n",
    "    print(\"F1\", f1_lgb)\n",
    "    mlflow.log_metric('accuracy', accuracy_lgb)\n",
    "    mlflow.log_metric('precision', precision_lgb)\n",
    "    mlflow.log_metric('recall', recall_lgb)\n",
    "    mlflow.log_metric('f1', f1_lgb)\n",
    "\n",
    "    precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(y_val, y_pred, average=None)\n",
    "    for i in range(len(recall_per_class)):\n",
    "        print(f\"Recall for class {i}: {recall_per_class[i]}\")\n",
    "        mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])\n",
    "\n",
    "    mlflow.lightgbm.log_model(lgb_model_final, 'model')\n",
    "    mlflow.set_tag('experiments', 'Arham A.')\n",
    "    mlflow.set_tag('model_name', 'LightGBM')\n",
    "    mlflow.set_tag('preprocessing', 'Yes')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DataScience",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}