a b/03-Experiments/11-Pycaret.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "Experiment Setup"
8
   ]
9
  },
10
  {
11
   "cell_type": "code",
12
   "execution_count": 1,
13
   "metadata": {},
14
   "outputs": [],
15
   "source": [
16
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import optuna\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "import lightgbm as lgb\n",
    "from sklearn.metrics import accuracy_score, recall_score\n",
    "\n",
    "def load_data(path):\n",
    "    # Read the raw CSV and produce (train_df, val_df, test_df): a 65/35\n",
    "    # train/test split followed by an 80/20 train/val split of the train part.\n",
    "    # 'id' is dropped and exact duplicate rows removed independently per split.\n",
    "    df = pd.read_csv(path)\n",
    "    train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)\n",
    "    train_df, val_df = train_test_split(train_df, test_size=0.20, random_state=42)\n",
    "    train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    val_df = val_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)\n",
    "    return train_df, val_df, test_df\n",
    "\n",
    "def encode_target(train):\n",
    "    # Ordinal-encode the 7 target classes from Insufficient_Weight (0)\n",
    "    # through Obesity_Type_III (6).\n",
    "    target_key = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3, 'Obesity_Type_I': 4,'Obesity_Type_II' : 5, 'Obesity_Type_III': 6}\n",
    "    train['NObeyesdad'] = train['NObeyesdad'].map(target_key)\n",
    "    return train\n",
    "\n",
    "def datatypes(train):\n",
    "    # Ensure the numeric anthropometric columns are floats.\n",
    "    train['Weight'] = train['Weight'].astype(float)\n",
    "    train['Age'] = train['Age'].astype(float)\n",
    "    train['Height'] = train['Height'].astype(float)\n",
    "    return train\n",
    "\n",
    "def age_binning(train_df):\n",
    "    # Bucket Age into coarse groups; the last bin is open-ended up to max(Age).\n",
    "    train_df['Age_Group'] = pd.cut(train_df['Age'], bins=[0, 20, 30, 40, 50, train_df['Age'].max()], labels=['0-20', '21-30', '31-40', '41-50', '50+'])\n",
    "    return train_df\n",
    "\n",
    "def age_scaling_log(train_df):\n",
    "    # log1p transform of Age; the raw column is kept alongside.\n",
    "    train_df['Age'] = train_df['Age'].astype(float)\n",
    "    train_df['Log_Age'] = np.log1p(train_df['Age'])\n",
    "    return train_df\n",
    "\n",
    "def age_scaling_minmax(train_df):\n",
    "    # Fit a MinMaxScaler on train Age and return it so the identical scaler\n",
    "    # can be applied to val/test (see test_pipeline) without leakage.\n",
    "    train_df['Age'] = train_df['Age'].astype(float)\n",
    "    scaler_age = MinMaxScaler()\n",
    "    train_df['Scaled_Age'] = scaler_age.fit_transform(train_df['Age'].values.reshape(-1, 1))\n",
    "    return train_df, scaler_age\n",
    "\n",
    "def weight_scaling_log(train_df):\n",
    "    # log1p transform of Weight; the raw column is kept alongside.\n",
    "    train_df['Weight'] = train_df['Weight'].astype(float)\n",
    "    train_df['Log_Weight'] = np.log1p(train_df['Weight'])\n",
    "    return train_df\n",
    "\n",
    "def weight_scaling_minmax(train_df):\n",
    "    # Fit a MinMaxScaler on train Weight; returned for reuse on val/test.\n",
    "    train_df['Weight'] = train_df['Weight'].astype(float)\n",
    "    scaler_weight = MinMaxScaler()\n",
    "    train_df['Scaled_Weight'] = scaler_weight.fit_transform(train_df['Weight'].values.reshape(-1, 1))\n",
    "    return train_df, scaler_weight\n",
    "\n",
    "def height_scaling_log(train_df):\n",
    "    # log1p transform of Height; the raw column is kept alongside.\n",
    "    train_df['Log_Height'] = np.log1p(train_df['Height'])\n",
    "    return train_df\n",
    "\n",
    "def height_scaling_minmax(train_df):\n",
    "    # Fit a MinMaxScaler on train Height; returned for reuse on val/test.\n",
    "    scaler_height = MinMaxScaler()\n",
    "    train_df['Scaled_Height'] = scaler_height.fit_transform(train_df['Height'].values.reshape(-1, 1))\n",
    "    return train_df, scaler_height\n",
    "\n",
    "def make_gender_binary(train):\n",
    "    # Map Gender to 0/1. NOTE(review): this cell previously defined this\n",
    "    # function twice; the earlier copy returned None and would have broken the\n",
    "    # `train = make_gender_binary(train)` call sites had it shadowed this one.\n",
    "    train['Gender'] = train['Gender'].map({'Female': 1, 'Male': 0})\n",
    "    return train\n",
    "\n",
    "def fix_binary_columns(train):\n",
    "    # Map yes/no survey columns to 1/0.\n",
    "    Binary_Cols = ['family_history_with_overweight','FAVC', 'SCC','SMOKE']\n",
    "    for col in Binary_Cols:\n",
    "        train[col] = train[col].map({'yes': 1, 'no': 0})\n",
    "    return train\n",
    "\n",
    "def freq_cat_cols(train):\n",
    "    # Ordinal-encode the consumption-frequency columns (no < Sometimes <\n",
    "    # Frequently < Always); this is ordinal encoding, not one-hot.\n",
    "    cat_cols = ['CAEC', 'CALC']\n",
    "    for col in cat_cols:\n",
    "        train[col] = train[col].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})\n",
    "    return train\n",
    "\n",
    "def Mtrans(train):\n",
    "    \"\"\"One-hot encode the MTRANS transport-mode column.\n",
    "\n",
    "    Observed category counts in the raw data:\n",
    "    Public_Transportation    8692\n",
    "    Automobile               1835\n",
    "    Walking                   231\n",
    "    Motorbike                  19\n",
    "    Bike                       16\n",
    "    \"\"\"\n",
    "    train = pd.get_dummies(train, columns=['MTRANS'])\n",
    "    return train\n",
    "\n",
    "\n",
    "def other_features(train):\n",
    "    # BMI plus a degree-2 polynomial expansion of (Age, BMI).\n",
    "    # NOTE(review): PolynomialFeatures(degree=2) actually emits the terms\n",
    "    # [1, Age, BMI, Age^2, Age*BMI, BMI^2]; the labels below do NOT match\n",
    "    # those terms. Downstream feature lists select by these exact names, so\n",
    "    # they are kept unchanged for compatibility — rename both places together.\n",
    "    train['BMI'] = train['Weight'] / (train['Height'] ** 2)\n",
    "    polynomial_features = PolynomialFeatures(degree=2)\n",
    "    X_poly = polynomial_features.fit_transform(train[['Age', 'BMI']])\n",
    "    poly_features_df = pd.DataFrame(X_poly, columns=['Age^2', 'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2'])\n",
    "    train = pd.concat([train, poly_features_df], axis=1)\n",
    "    return train\n",
    "\n",
    "\n",
    "def test_pipeline(test, scaler_age, scaler_weight, scaler_height):\n",
    "    # Apply the train-time feature pipeline to a val/test split, reusing the\n",
    "    # scalers fitted on train so no statistics leak from val/test.\n",
    "    test = datatypes(test)\n",
    "    test = encode_target(test)\n",
    "    test = age_binning(test)\n",
    "    test = age_scaling_log(test)\n",
    "    test['Scaled_Age'] = scaler_age.transform(test['Age'].values.reshape(-1, 1))\n",
    "    test = weight_scaling_log(test)\n",
    "    test['Scaled_Weight'] = scaler_weight.transform(test['Weight'].values.reshape(-1, 1))\n",
    "    test = height_scaling_log(test)\n",
    "    test['Scaled_Height'] = scaler_height.transform(test['Height'].values.reshape(-1, 1))\n",
    "    test = make_gender_binary(test)\n",
    "    test = fix_binary_columns(test)\n",
    "    test = freq_cat_cols(test)\n",
    "    test = Mtrans(test)\n",
    "    test = other_features(test)\n",
    "    return test\n",
    "\n",
    "def train_model(params, X_train, y_train):\n",
    "    # Train a LightGBM booster with a fixed 1000 boosting rounds.\n",
    "    lgb_train = lgb.Dataset(X_train, y_train)\n",
    "    model = lgb.train(params, lgb_train, num_boost_round=1000)\n",
    "    return model\n",
    "\n",
    "def evaluate_model(model, X_val, y_val):\n",
    "    # Multiclass accuracy: vectorized argmax over the per-row class\n",
    "    # probabilities returned by booster.predict.\n",
    "    y_pred = model.predict(X_val)\n",
    "    y_pred = np.argmax(y_pred, axis=1)\n",
    "    accuracy = accuracy_score(y_val, y_pred)\n",
    "    return accuracy\n",
    "\n",
    "def objective(trial, X_train, y_train):\n",
    "    # Optuna objective: mean 5-fold stratified CV accuracy for a sampled\n",
    "    # LightGBM configuration. Uses suggest_float (with log=True for the\n",
    "    # learning rate) instead of the removed suggest_loguniform /\n",
    "    # suggest_uniform APIs.\n",
    "    params = {\n",
    "        'objective': 'multiclass',\n",
    "        'num_class': 7,\n",
    "        'metric': 'multi_logloss',\n",
    "        'boosting_type': 'gbdt',\n",
    "        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),\n",
    "        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),\n",
    "        'max_depth': trial.suggest_int('max_depth', -1, 20),\n",
    "        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 0.95),\n",
    "        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 0.95),\n",
    "        'verbosity': -1\n",
    "    }\n",
    "\n",
    "    n_splits = 5\n",
    "    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)\n",
    "    scores = []\n",
    "\n",
    "    for train_index, val_index in kf.split(X_train, y_train):\n",
    "        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]\n",
    "        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]\n",
    "\n",
    "        model = train_model(params, X_tr, y_tr)\n",
    "        accuracy = evaluate_model(model, X_val, y_val)\n",
    "        scores.append(accuracy)\n",
    "\n",
    "    return np.mean(scores)\n",
    "\n",
    "def optimize_hyperparameters(X_train, y_train, n_trials=2):\n",
    "    # Maximize CV accuracy over n_trials; returns the best hyperparameter dict.\n",
    "    study = optuna.create_study(direction='maximize')\n",
    "    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=n_trials)\n",
    "    return study.best_params"
187
   ]
188
  },
189
  {
190
   "cell_type": "markdown",
191
   "metadata": {},
192
   "source": [
193
    "Code"
194
   ]
195
  },
196
  {
197
   "cell_type": "code",
198
   "execution_count": 5,
199
   "metadata": {},
200
   "outputs": [],
201
   "source": [
202
    "import pandas as pd\n",
    "import mlflow\n",
    "from pycaret.classification import *\n",
    "# import metrics AFTER the pycaret star import so these names cannot be shadowed\n",
    "from sklearn.metrics import precision_recall_fscore_support, accuracy_score, recall_score\n",
    "\n",
    "# NOTE(review): hardcoded absolute local path — parameterize for portability.\n",
    "path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'\n",
    "train_df, val_df, test_df = load_data(path)\n",
    "\n",
    "# Train-time feature pipeline; the three scalers fitted here are reused for\n",
    "# val/test below so no statistics leak out of the training split.\n",
    "train_df = datatypes(train_df)\n",
    "train_df = encode_target(train_df)\n",
    "train_df = age_binning(train_df)\n",
    "train_df, scaler_age = age_scaling_minmax(train_df)\n",
    "train_df = age_scaling_log(train_df)\n",
    "train_df, scaler_weight = weight_scaling_minmax(train_df)\n",
    "train_df = weight_scaling_log(train_df)\n",
    "train_df, scaler_height = height_scaling_minmax(train_df)\n",
    "train_df = height_scaling_log(train_df)\n",
    "train_df = make_gender_binary(train_df)\n",
    "train_df = fix_binary_columns(train_df)\n",
    "train_df = freq_cat_cols(train_df)\n",
    "train_df = Mtrans(train_df)\n",
    "train_df = other_features(train_df)\n",
    "\n",
    "val_df = test_pipeline(val_df, scaler_age, scaler_weight, scaler_height)\n",
    "test_df = test_pipeline(test_df, scaler_age, scaler_weight, scaler_height)\n",
    "\n",
    "# Select model inputs and target\n",
    "Target = 'NObeyesdad'\n",
    "features = ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',\n",
    "            'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',\n",
    "            'CALC', 'Age_Group', 'MTRANS_Automobile', 'MTRANS_Bike', 'MTRANS_Motorbike',\n",
    "            'MTRANS_Public_Transportation', 'MTRANS_Walking', 'BMI', 'Age^2',\n",
    "            'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2',\n",
    "            'Scaled_Age', 'Log_Age', 'Scaled_Weight', 'Log_Weight', 'Scaled_Height', 'Log_Height']\n",
    "\n",
    "train_data = train_df[features + [Target]]\n",
    "val_data = val_df[features + [Target]]\n",
    "test_data = test_df[features + [Target]]\n",
    "\n",
    "# Initialize PyCaret setup\n",
    "exp1 = setup(data=train_data, target=Target, session_id=123)\n",
    "\n",
    "# Compare candidate models, tune the best one, then refit on the full data\n",
    "best_model = compare_models()\n",
    "tuned_model = tune_model(best_model)\n",
    "final_model = finalize_model(tuned_model)\n",
    "\n",
    "# Persist the finalized pipeline (reload later with load_model('model_name'))\n",
    "save_model(final_model, 'model_name')\n",
    "\n",
    "# Score the held-out validation split; PyCaret 3 puts the predicted class in\n",
    "# the 'prediction_label' column.\n",
    "predictions = predict_model(final_model, data=val_data)\n",
    "\n",
    "precision, recall, f1, support = precision_recall_fscore_support(predictions['NObeyesdad'], predictions['prediction_label'], average='weighted')\n",
    "print(f\"Precision: {precision}, Recall: {recall}, F1 Score: {f1}\")\n",
    "\n",
    "# Log the model and validation metrics to MLflow\n",
    "with mlflow.start_run(run_name=\"PyCaret_With_Extended_Engineering\"):\n",
    "    # Log the fitted sklearn-compatible pipeline. mlflow.pyfunc.log_model\n",
    "    # requires an mlflow PythonModel wrapper, which final_model is not;\n",
    "    # mlflow.sklearn.log_model handles sklearn-style estimators directly.\n",
    "    mlflow.sklearn.log_model(final_model, artifact_path=\"pycaret_model\")\n",
    "\n",
    "    # Use 'prediction_label' — the PyCaret 2.x 'Label' column no longer exists\n",
    "    # and would raise a KeyError here.\n",
    "    mlflow.log_metric('accuracy', accuracy_score(predictions[Target], predictions['prediction_label']))\n",
    "    mlflow.log_metric('precision', precision)\n",
    "    mlflow.log_metric('recall', recall)\n",
    "    mlflow.log_metric('f1', f1)\n",
    "\n",
    "    # Log recall per class\n",
    "    recall_per_class = recall_score(predictions['NObeyesdad'], predictions['prediction_label'], average=None)\n",
    "    for i, recall_class in enumerate(recall_per_class):\n",
    "        print(f\"Recall for class {i}: {recall_class}\")\n",
    "        mlflow.log_metric(f'recall_class_{i}', recall_class)\n",
    "\n",
    "    mlflow.set_tag('experiments', 'Arham A.')\n",
    "    mlflow.set_tag('model_name', 'PyCaret')\n",
    "    mlflow.set_tag('preprocessing', 'Yes')\n"
288
   ]
289
  }
290
 ],
291
 "metadata": {
292
  "kernelspec": {
293
   "display_name": "DataScience",
294
   "language": "python",
295
   "name": "python3"
296
  },
297
  "language_info": {
298
   "codemirror_mode": {
299
    "name": "ipython",
300
    "version": 3
301
   },
302
   "file_extension": ".py",
303
   "mimetype": "text/x-python",
304
   "name": "python",
305
   "nbconvert_exporter": "python",
306
   "pygments_lexer": "ipython3",
307
   "version": "3.10.13"
308
  }
309
 },
310
 "nbformat": 4,
311
 "nbformat_minor": 2
312
}