# Global Experiment Setup

In [1]:
import mlflow


# Set the MLflow tracking URI to a new SQLite URI
mlflow.set_tracking_uri("sqlite:///new_mlflow.db")
mlflow.set_experiment("XGBoost")

<Experiment: artifact_location='/Users/arham/Downloads/Projects/03-Experiments/mlruns/2', creation_time=1713912394972, experiment_id='2', last_update_time=1713912394972, lifecycle_stage='active', name='XGBoost', tags={}>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score


def load_data(path):
    df = pd.read_csv(path)
    train_df, test_df = train_test_split(df, test_size=0.35, random_state=42)
    train_df, val_df,  = train_test_split(train_df, test_size=0.20, random_state=42)
    train_df = train_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)
    test_df = test_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)
    val_df = val_df.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)
    return train_df, val_df, test_df

def encode_target(train):
    target_key = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3, 'Obesity_Type_I': 4,'Obesity_Type_II' : 5, 'Obesity_Type_III': 6}
    train['NObeyesdad'] = train['NObeyesdad'].map(target_key)
    return train

def make_gender_binary(train):
    train['Gender'] = train['Gender'].map({'Male':0, 'Female':1})

def datatypes(train):
    train['Weight'] = train['Weight'].astype(float)
    train['Age'] = train['Age'].astype(float)
    train['Height'] = train['Height'].astype(float)
    return train

# def age_binning(train_df):
#     # train_df['Age_Group'] = pd.cut(train_df['Age'], bins=[0, 20, 30, 40, 50, train_df['Age'].max()], labels=['0-20', '21-30', '31-40', '41-50', '50+'])
#     train_df['Age_Group'] = pd.cut(train_df['Age'], bins=[0, 20, 30, 40, 50, train_df['Age'].max()], labels=[1, 2, 3, 4, 5])
#     train_df['Age_Group'] = train_df['Age_Group'].astype(int)
#     return train_df

def age_binning(df):
    age_groups = []
    for age in df['Age']:
        if age <= 20:
            age_group = 1
        elif age <= 30:
            age_group = 2
        elif age <= 40:
            age_group = 3
        elif age <= 50:
            age_group = 4
        else:
            age_group = 5
        age_groups.append(age_group)
    df['Age_Group'] = age_groups
    return df

def age_scaling_log(train_df):
    train_df['Age'] = train_df['Age'].astype(float)
    train_df['Log_Age'] = np.log1p(train_df['Age'])
    return train_df

def age_scaling_minmax(train_df):
    train_df['Age'] = train_df['Age'].astype(float)
    scaler_age = MinMaxScaler()
    train_df['Scaled_Age'] = scaler_age.fit_transform(train_df['Age'].values.reshape(-1, 1))
    return train_df, scaler_age

def weight_scaling_log(train_df):
    train_df['Weight'] = train_df['Weight'].astype(float)
    train_df['Log_Weight'] = np.log1p(train_df['Weight'])
    return train_df

def weight_scaling_minmax(train_df):
    train_df['Weight'] = train_df['Weight'].astype(float)
    scaler_weight = MinMaxScaler()
    train_df['Scaled_Weight'] = scaler_weight.fit_transform(train_df['Weight'].values.reshape(-1, 1))
    return train_df, scaler_weight

def height_scaling_log(train_df):
    train_df['Log_Height'] = np.log1p(train_df['Height'])
    return train_df

def height_scaling_minmax(train_df):
    scaler_height = MinMaxScaler()
    train_df['Scaled_Height'] = scaler_height.fit_transform(train_df['Height'].values.reshape(-1, 1))
    return train_df, scaler_height

def make_gender_binary(train):
    train['Gender'] = train['Gender'].map({'Female':1, 'Male':0})
    return train

def fix_binary_columns(train):
    Binary_Cols = ['family_history_with_overweight','FAVC', 'SCC','SMOKE']
    # if yes then 1 else 0
    for col in Binary_Cols:
        train[col] = train[col].map({'yes': 1, 'no': 0})
        # column datatype integer
        train[col] = train[col].astype(int)
    return train

def freq_cat_cols(train):
    # One hot encoding
    cat_cols = ['CAEC', 'CALC']
    for col in cat_cols:
        train[col] = train[col].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})
    return train

def Mtrans(train):
    """
    Public_Transportation    8692
    Automobile               1835
    Walking                   231
    Motorbike                  19
    Bike                       16
    """
    # train['MTRANS'] = train['MTRANS'].map({'Public_Transportation': 3, 'Automobile': 5, 'Walking': 1, 'Motorbike': 4, 'Bike': 2})
    # dummify column
    train = pd.get_dummies(train, columns=['MTRANS'])
    # convert these columns to integer
    train['MTRANS_Automobile'] = train['MTRANS_Automobile'].astype(int)
    train['MTRANS_Walking'] = train['MTRANS_Walking'].astype(int)
    train['MTRANS_Motorbike'] = train['MTRANS_Motorbike'].astype(int)
    train['MTRANS_Bike'] = train['MTRANS_Bike'].astype(int)
    train['MTRANS_Public_Transportation'] = train['MTRANS_Public_Transportation'].astype(int)
    return train


def other_features(train):
    train['BMI'] = train['Weight'] / (train['Height'] ** 2)
    # train['Age'*'Gender'] = train['Age'] * train['Gender']
    polynomial_features = PolynomialFeatures(degree=2)
    X_poly = polynomial_features.fit_transform(train[['Age', 'BMI']])
    poly_features_df = pd.DataFrame(X_poly, columns=['Age^2', 'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2'])
    train = pd.concat([train, poly_features_df], axis=1)
    return train


def test_pipeline(test, scaler_age, scaler_weight, scaler_height):
    test = datatypes(test)
    test = encode_target(test)
    test = age_binning(test)
    test = age_scaling_log(test)
    test['Scaled_Age'] = scaler_age.transform(test['Age'].values.reshape(-1, 1))
    test = weight_scaling_log(test)
    test['Scaled_Weight'] = scaler_weight.transform(test['Weight'].values.reshape(-1, 1))
    test = height_scaling_log(test)
    test['Scaled_Height'] = scaler_height.transform(test['Height'].values.reshape(-1, 1))
    test = make_gender_binary(test)
    test = fix_binary_columns(test)
    test = freq_cat_cols(test)
    test = Mtrans(test)
    test = other_features(test)

    return test

def train_model(params, X_train, y_train):
    lgb_train = lgb.Dataset(X_train, y_train)
    model = lgb.train(params, lgb_train, num_boost_round=1000)
    return model

def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    y_pred = [np.argmax(y) for y in y_pred]
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

def objective(trial, X_train, y_train):
    params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.5),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', -1, 20),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 0.95),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 0.95),
        'verbosity': -1
    }

    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []

    for train_index, val_index in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        model = train_model(params, X_tr, y_tr)
        accuracy = evaluate_model(model, X_val, y_val)
        scores.append(accuracy)

    return np.mean(scores)

def optimize_hyperparameters(X_train, y_train, n_trials=2):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=n_trials)
    return study.best_params


### XGB With Optuna

In [3]:

path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'
train_df, val_df, test_df = load_data(path)

train_df = datatypes(train_df)
train_df = encode_target(train_df)
train_df = age_binning(train_df)
train_df, scaler_age = age_scaling_minmax(train_df)
train_df = age_scaling_log(train_df)
train_df, scaler_weight = weight_scaling_minmax(train_df)
train_df = weight_scaling_log(train_df)
train_df, scaler_height = height_scaling_minmax(train_df)
train_df = height_scaling_log(train_df)
train_df = make_gender_binary(train_df)
train_df = fix_binary_columns(train_df)
train_df = freq_cat_cols(train_df)
train_df = Mtrans(train_df)
train_df = other_features(train_df)

val_df = test_pipeline(val_df, scaler_age, scaler_weight, scaler_height)
test_df = test_pipeline(test_df, scaler_age, scaler_weight, scaler_height)

Target = 'NObeyesdad'
# features = train_df.columns.drop(Target)
features = ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'Age_Group', 
       'MTRANS_Automobile', 'MTRANS_Bike', 'MTRANS_Motorbike',
       'MTRANS_Public_Transportation', 'MTRANS_Walking', 'BMI', 'Age^2',
       'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2'] 

       #'Scaled_Age', 'Log_Age', 'Scaled_Weight', 'Log_Weight', 'Scaled_Height', 'Log_Height',


X_train = train_df[features]
y_train = train_df[Target]
X_val = val_df[features]
y_val = val_df[Target]
X_test = test_df[features]
y_test = test_df[Target]

# save X_train, y_train, X_val, X_test, y_test


In [5]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support
import mlflow

mlflow.sklearn.autolog(disable=True)

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
        'random_state': 42,
        'objective': 'multi:softmax',
        'num_class': 7,
        'eval_metric': 'mlogloss'
    }

    with mlflow.start_run(run_name="XGB_with_Feature_Engineering_optuna_" + str(trial.number), nested=True):
        # Log hyperparameters as a single dictionary
        mlflow.log_params(params)

        # Calculate target drift and log
        class_counts_train = [y_train[y_train == i].count() / y_train.count() for i in range(7)]
        class_counts_val = [y_val[y_val == i].count() / y_val.count() for i in range(7)]
        target_drift = [(train_count - val_count) for train_count, val_count in zip(class_counts_train, class_counts_val)]
        mlflow.log_params({'Target_Drift_' + str(i): freq for i, freq in enumerate(target_drift)})
        print(f"Target Drift For Each Class {target_drift}")

        # Train XGBoost model
        model = XGBClassifier(**params)
        cv_scores_xgb = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        print("Cross-validation Scores (XGBoost):", cv_scores_xgb)
        print("Mean CV Accuracy (XGBoost):", cv_scores_xgb.mean())
        

        # Log CV metrics
        mlflow.log_metric('mean_cv_accuracy', cv_scores_xgb.mean())

        # Fit model on full training data
        model.fit(X_train, y_train)
        y_val_pred_xgb = model.predict(X_val)

        # Calculate evaluation metrics
        accuracy_xgb = accuracy_score(y_val, y_val_pred_xgb)
        precision_xgb = precision_score(y_val, y_val_pred_xgb, average='weighted')
        recall_xgb = recall_score(y_val, y_val_pred_xgb, average='weighted')
        f1_xgb = 2 * (precision_xgb * recall_xgb) / (precision_xgb + recall_xgb)
        print("\nAccuracy (XGBoost):", accuracy_xgb)
        print("Precision (XGBoost):", precision_xgb)
        print("Recall (XGBoost):", recall_xgb)
        print("F1 (XGBoost):", f1_xgb)

        # Log evaluation metrics
        mlflow.log_metric('accuracy', accuracy_xgb)
        mlflow.log_metric('precision', precision_xgb)
        mlflow.log_metric('recall', recall_xgb)
        mlflow.log_metric('f1', f1_xgb)

        # Calculate and log precision, recall for each class
        precision_per_class, recall_per_class, _, _ = precision_recall_fscore_support(y_val, y_val_pred_xgb, average=None)
        for i in range(len(recall_per_class)):
            print(f"Recall for class {i}: {recall_per_class[i]}")
            mlflow.log_metric(f'recall_class_{i}', recall_per_class[i])

        # Log XGBoost model
        mlflow.xgboost.log_model(model, 'model')

        # Set tags
        mlflow.set_tag('experiments', 'Arham A.')
        mlflow.set_tag('model_name', 'XGBoost')
        mlflow.set_tag('preprocessing', 'Yes')

    return cv_scores_xgb.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial
print('  Value: {:.5f}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


[I 2024-04-25 15:07:02,465] A new study created in memory with name: no-name-64002971-707f-4151-be7c-8ed470ee8bc1
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.88003705 0.88235294 0.86984715 0.86376274 0.87673772]
Mean CV Accuracy (XGBoost): 0.8745475220438529

Accuracy (XGBoost): 0.8684698036309745
Precision (XGBoost): 0.8684692480457514
Recall (XGBoost): 0.8684698036309745
F1 (XGBoost): 0.8684695258382741
Recall for class 0: 0.9113924050632911
Recall for class 1: 0.8770053475935828
Recall for class 2: 0.7432835820895523
Recall for class 3: 0.7355623100303952
Recall for class 4: 0.8218527315914489
Recall for class 5: 0.9330143540669856
Recall for class 6: 0.9901185770750988


[I 2024-04-25 15:07:19,496] Trial 0 finished with value: 0.8745475220438529 and parameters: {'max_depth': 5, 'learning_rate': 0.013394988506670473, 'n_estimators': 77, 'min_child_weight': 10, 'subsample': 1.0, 'colsample_bytree': 0.7, 'gamma': 6.905297068817435e-07, 'reg_alpha': 0.678891339417352, 'reg_lambda': 2.8230926058910324}. Best is trial 0 with value: 0.8745475220438529.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.90504863 0.90690134 0.90273275 0.8952734  0.90037071]
Mean CV Accuracy (XGBoost): 0.9020653676808635

Accuracy (XGBoost): 0.9018154872174879
Precision (XGBoost): 0.9021161905127771
Recall (XGBoost): 0.9018154872174879
F1 (XGBoost): 0.9019658138025226
Recall for class 0: 0.9272151898734177
Recall for class 1: 0.9010695187165776
Recall for class 2: 0.7761194029850746
Recall for class 3: 0.8237082066869301
Recall for class 4: 0.8646080760095012
Recall for class 5: 0.9688995215311005
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:10:12,141] Trial 1 finished with value: 0.9020653676808635 and parameters: {'max_depth': 10, 'learning_rate': 0.09249377976464467, 'n_estimators': 721, 'min_child_weight': 7, 'subsample': 0.9, 'colsample_bytree': 0.8, 'gamma': 0.00012452373684121342, 'reg_alpha': 0.71016682383988, 'reg_lambda': 1.2412790195572406e-06}. Best is trial 1 with value: 0.9020653676808635.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.90504863 0.91014358 0.90736452 0.89944393 0.90222428]
Mean CV Accuracy (XGBoost): 0.9048449901075782

Accuracy (XGBoost): 0.9040385327899222
Precision (XGBoost): 0.9045766402024644
Recall (XGBoost): 0.9040385327899222
F1 (XGBoost): 0.9043075064461108
Recall for class 0: 0.9335443037974683
Recall for class 1: 0.9037433155080213
Recall for class 2: 0.7611940298507462
Recall for class 3: 0.8328267477203647
Recall for class 4: 0.8741092636579573
Recall for class 5: 0.9712918660287081
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:10:55,359] Trial 2 finished with value: 0.9048449901075782 and parameters: {'max_depth': 8, 'learning_rate': 0.1480724813458801, 'n_estimators': 201, 'min_child_weight': 10, 'subsample': 0.7, 'colsample_bytree': 0.6, 'gamma': 4.241734906017588e-07, 'reg_alpha': 1.1757709861728217e-05, 'reg_lambda': 3.398219063416917}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.89717462 0.8994905  0.89161649 0.88878591 0.89712697]
Mean CV Accuracy (XGBoost): 0.8948388988311532

Accuracy (XGBoost): 0.889218228973694
Precision (XGBoost): 0.8896005944786827
Recall (XGBoost): 0.889218228973694
F1 (XGBoost): 0.8894093706305529
Recall for class 0: 0.9208860759493671
Recall for class 1: 0.8823529411764706
Recall for class 2: 0.7522388059701492
Recall for class 3: 0.8206686930091185
Recall for class 4: 0.838479809976247
Recall for class 5: 0.9593301435406698
Recall for class 6: 0.9940711462450593


[I 2024-04-25 15:11:45,267] Trial 3 finished with value: 0.8948388988311532 and parameters: {'max_depth': 6, 'learning_rate': 0.0915177926130075, 'n_estimators': 339, 'min_child_weight': 5, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.0813995583239973, 'reg_alpha': 20.619635618013948, 'reg_lambda': 3.974196714305311e-07}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.89995368 0.90736452 0.89810097 0.89712697 0.89759036]
Mean CV Accuracy (XGBoost): 0.9000273012812284

Accuracy (XGBoost): 0.8984809188588366
Precision (XGBoost): 0.898791107190973
Recall (XGBoost): 0.8984809188588366
F1 (XGBoost): 0.8986359862574483
Recall for class 0: 0.9272151898734177
Recall for class 1: 0.8903743315508021
Recall for class 2: 0.7522388059701492
Recall for class 3: 0.8358662613981763
Recall for class 4: 0.8669833729216152
Recall for class 5: 0.9641148325358851
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:13:55,859] Trial 4 finished with value: 0.9000273012812284 and parameters: {'max_depth': 9, 'learning_rate': 0.030870513505594194, 'n_estimators': 288, 'min_child_weight': 1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0.9549119955309026, 'reg_alpha': 0.007762320297440327, 'reg_lambda': 2.0978432882529218e-05}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.90829088 0.90736452 0.90504863 0.89759036 0.90315107]
Mean CV Accuracy (XGBoost): 0.9042890913781608

Accuracy (XGBoost): 0.9058910707669507
Precision (XGBoost): 0.9059794693613187
Recall (XGBoost): 0.9058910707669507
F1 (XGBoost): 0.9059352679077136
Recall for class 0: 0.930379746835443
Recall for class 1: 0.9090909090909091
Recall for class 2: 0.7791044776119403
Recall for class 3: 0.8358662613981763
Recall for class 4: 0.8669833729216152
Recall for class 5: 0.9712918660287081
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:15:44,305] Trial 5 finished with value: 0.9042890913781608 and parameters: {'max_depth': 8, 'learning_rate': 0.04624235053164599, 'n_estimators': 385, 'min_child_weight': 3, 'subsample': 0.6, 'colsample_bytree': 0.6, 'gamma': 2.4010178478007346e-07, 'reg_alpha': 1.2972918999055175e-08, 'reg_lambda': 0.010144470159191883}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.89671144 0.90458546 0.89671144 0.89388323 0.89851715]
Mean CV Accuracy (XGBoost): 0.8980817415813537

Accuracy (XGBoost): 0.8969988884772138
Precision (XGBoost): 0.897313793211423
Recall (XGBoost): 0.8969988884772138
F1 (XGBoost): 0.8971563132111772
Recall for class 0: 0.9208860759493671
Recall for class 1: 0.8983957219251337
Recall for class 2: 0.7731343283582089
Recall for class 3: 0.7993920972644377
Recall for class 4: 0.8622327790973872
Recall for class 5: 0.9688995215311005
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:18:36,520] Trial 6 finished with value: 0.8980817415813537 and parameters: {'max_depth': 6, 'learning_rate': 0.21361268583130158, 'n_estimators': 989, 'min_child_weight': 6, 'subsample': 0.6, 'colsample_bytree': 0.7, 'gamma': 0.15715908126940034, 'reg_alpha': 0.002876421704879931, 'reg_lambda': 0.00020071476895386069}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.89532191 0.90273275 0.8994905  0.8952734  0.90176089]
Mean CV Accuracy (XGBoost): 0.8989158901612793

Accuracy (XGBoost): 0.9007039644312709
Precision (XGBoost): 0.9010058877096547
Recall (XGBoost): 0.9007039644312709
F1 (XGBoost): 0.9008549007729195
Recall for class 0: 0.9272151898734177
Recall for class 1: 0.9037433155080213
Recall for class 2: 0.7761194029850746
Recall for class 3: 0.817629179331307
Recall for class 4: 0.8598574821852731
Recall for class 5: 0.9688995215311005
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:19:40,518] Trial 7 finished with value: 0.8989158901612793 and parameters: {'max_depth': 7, 'learning_rate': 0.2701374472461074, 'n_estimators': 220, 'min_child_weight': 4, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 6.952295076389228e-06, 'reg_alpha': 4.7746605073070416e-08, 'reg_lambda': 0.010461996036680242}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.88652154 0.89022696 0.87633164 0.87395737 0.88693234]
Mean CV Accuracy (XGBoost): 0.8827939684773225

Accuracy (XGBoost): 0.8806965542793628
Precision (XGBoost): 0.8807374877985908
Recall (XGBoost): 0.8806965542793628
F1 (XGBoost): 0.880717020563355
Recall for class 0: 0.9335443037974683
Recall for class 1: 0.8663101604278075
Recall for class 2: 0.7432835820895523
Recall for class 3: 0.7993920972644377
Recall for class 4: 0.8218527315914489
Recall for class 5: 0.9545454545454546
Recall for class 6: 0.9901185770750988


[I 2024-04-25 15:20:20,776] Trial 8 finished with value: 0.8827939684773225 and parameters: {'max_depth': 10, 'learning_rate': 0.030824226564898152, 'n_estimators': 161, 'min_child_weight': 10, 'subsample': 1.0, 'colsample_bytree': 0.8, 'gamma': 0.0002761971902093278, 'reg_alpha': 34.12323171902436, 'reg_lambda': 1.1963423659959008e-06}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.90180639 0.90690134 0.90180639 0.89341983 0.90176089]
Mean CV Accuracy (XGBoost): 0.9011389699604345

Accuracy (XGBoost): 0.9021859948128936
Precision (XGBoost): 0.9026219001901948
Recall (XGBoost): 0.9021859948128936
F1 (XGBoost): 0.9024038948606238
Recall for class 0: 0.9367088607594937
Recall for class 1: 0.8983957219251337
Recall for class 2: 0.7582089552238805
Recall for class 3: 0.8237082066869301
Recall for class 4: 0.8812351543942993
Recall for class 5: 0.9641148325358851
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:20:52,333] Trial 9 finished with value: 0.9011389699604345 and parameters: {'max_depth': 6, 'learning_rate': 0.30841942106618764, 'n_estimators': 132, 'min_child_weight': 7, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 5.749686185921016e-07, 'reg_alpha': 1.953546914108059e-08, 'reg_lambda': 0.003027552093067664}. Best is trial 2 with value: 0.9048449901075782.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.91014358 0.91014358 0.90736452 0.90083411 0.90222428]
Mean CV Accuracy (XGBoost): 0.9061420155986472

Accuracy (XGBoost): 0.9029270100037051
Precision (XGBoost): 0.9034148150281903
Recall (XGBoost): 0.9029270100037051
F1 (XGBoost): 0.9031708466497482
Recall for class 0: 0.930379746835443
Recall for class 1: 0.9117647058823529
Recall for class 2: 0.764179104477612
Recall for class 3: 0.8328267477203647
Recall for class 4: 0.8622327790973872
Recall for class 5: 0.9688995215311005
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:22:05,710] Trial 10 finished with value: 0.9061420155986472 and parameters: {'max_depth': 3, 'learning_rate': 0.14385928468253742, 'n_estimators': 574, 'min_child_weight': 8, 'subsample': 0.7, 'colsample_bytree': 0.6, 'gamma': 1.5922499564605758e-08, 'reg_alpha': 7.445790014646167e-06, 'reg_lambda': 95.07956676133519}. Best is trial 10 with value: 0.9061420155986472.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.91014358 0.91060676 0.90736452 0.90500463 0.90315107]
Mean CV Accuracy (XGBoost): 0.9072541135432812

Accuracy (XGBoost): 0.9047795479807336
Precision (XGBoost): 0.9054304429990271
Recall (XGBoost): 0.9047795479807336
F1 (XGBoost): 0.9051048784691129
Recall for class 0: 0.9367088607594937
Recall for class 1: 0.9064171122994652
Recall for class 2: 0.7701492537313432
Recall for class 3: 0.8389057750759878
Recall for class 4: 0.8622327790973872
Recall for class 5: 0.9712918660287081
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:23:19,966] Trial 11 finished with value: 0.9072541135432812 and parameters: {'max_depth': 3, 'learning_rate': 0.14050588188423715, 'n_estimators': 601, 'min_child_weight': 9, 'subsample': 0.7, 'colsample_bytree': 0.6, 'gamma': 1.9866742273835196e-08, 'reg_alpha': 1.1945121332468573e-05, 'reg_lambda': 71.93048807006664}. Best is trial 11 with value: 0.9072541135432812.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.90968041 0.91060676 0.91060676 0.90454124 0.90083411]
Mean CV Accuracy (XGBoost): 0.9072538559840245

Accuracy (XGBoost): 0.9044090403853279
Precision (XGBoost): 0.9053349503998932
Recall (XGBoost): 0.9044090403853279
F1 (XGBoost): 0.9048717585333201
Recall for class 0: 0.9367088607594937
Recall for class 1: 0.9117647058823529
Recall for class 2: 0.764179104477612
Recall for class 3: 0.8358662613981763
Recall for class 4: 0.8646080760095012
Recall for class 5: 0.9688995215311005
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:24:34,779] Trial 12 finished with value: 0.9072538559840245 and parameters: {'max_depth': 3, 'learning_rate': 0.13577039129548693, 'n_estimators': 604, 'min_child_weight': 8, 'subsample': 0.7, 'colsample_bytree': 0.6, 'gamma': 2.3371777592215328e-08, 'reg_alpha': 8.403097374567035e-06, 'reg_lambda': 41.38139839851449}. Best is trial 11 with value: 0.9072541135432812.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.8994905  0.90597499 0.90319592 0.89759036 0.89805375]
Mean CV Accuracy (XGBoost): 0.9008611064488117

Accuracy (XGBoost): 0.9036680251945165
Precision (XGBoost): 0.9043871882647396
Recall (XGBoost): 0.9036680251945165
F1 (XGBoost): 0.9040274637042609
Recall for class 0: 0.9272151898734177
Recall for class 1: 0.9010695187165776
Recall for class 2: 0.7731343283582089
Recall for class 3: 0.8267477203647416
Recall for class 4: 0.8693586698337292
Recall for class 5: 0.9760765550239234
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:26:22,855] Trial 13 finished with value: 0.9008611064488117 and parameters: {'max_depth': 3, 'learning_rate': 0.45330520785367046, 'n_estimators': 709, 'min_child_weight': 8, 'subsample': 0.8, 'colsample_bytree': 0.9, 'gamma': 1.050198550207745e-08, 'reg_alpha': 1.4096829741801802e-05, 'reg_lambda': 54.59517406603631}. Best is trial 11 with value: 0.9072541135432812.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),


Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.0087675011457998, -0.001077949504617301, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659]
Cross-validation Scores (XGBoost): [0.90690134 0.91060676 0.90551181 0.90454124 0.90037071]
Mean CV Accuracy (XGBoost): 0.9055863744284867

Accuracy (XGBoost): 0.9044090403853279
Precision (XGBoost): 0.9048073892271471
Recall (XGBoost): 0.9044090403853279
F1 (XGBoost): 0.9046081709525012
Recall for class 0: 0.9335443037974683
Recall for class 1: 0.9117647058823529
Recall for class 2: 0.7611940298507462
Recall for class 3: 0.8297872340425532
Recall for class 4: 0.8717339667458432
Recall for class 5: 0.9712918660287081
Recall for class 6: 0.9960474308300395


[I 2024-04-25 15:27:41,681] Trial 14 finished with value: 0.9055863744284867 and parameters: {'max_depth': 4, 'learning_rate': 0.05892316102120539, 'n_estimators': 515, 'min_child_weight': 8, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.002339216477344003, 'reg_alpha': 1.5648083825482848e-06, 'reg_lambda': 0.3956617258269376}. Best is trial 11 with value: 0.9072541135432812.


Number of finished trials: 15
Best trial:
  Value: 0.90725
  Params: 
    max_depth: 3
    learning_rate: 0.14050588188423715
    n_estimators: 601
    min_child_weight: 9
    subsample: 0.7
    colsample_bytree: 0.6
    gamma: 1.9866742273835196e-08
    reg_alpha: 1.1945121332468573e-05
    reg_lambda: 71.93048807006664


In [7]:
!mlflow ui --backend-store-uri "sqlite:////Users/arham/Downloads/Projects/03-Experiments/new_mlflow.db"

[2024-04-26 12:25:15 -0400] [60478] [INFO] Starting gunicorn 21.2.0
[2024-04-26 12:25:15 -0400] [60478] [INFO] Listening at: http://127.0.0.1:5000 (60478)
[2024-04-26 12:25:15 -0400] [60478] [INFO] Using worker: sync
[2024-04-26 12:25:15 -0400] [60479] [INFO] Booting worker with pid: 60479
[2024-04-26 12:25:15 -0400] [60480] [INFO] Booting worker with pid: 60480
[2024-04-26 12:25:15 -0400] [60481] [INFO] Booting worker with pid: 60481
[2024-04-26 12:25:16 -0400] [60482] [INFO] Booting worker with pid: 60482
^C
[2024-04-26 14:01:01 -0400] [60478] [INFO] Handling signal: int
[2024-04-26 14:01:01 -0400] [60482] [INFO] Worker exiting (pid: 60482)
[2024-04-26 14:01:01 -0400] [60480] [INFO] Worker exiting (pid: 60480)
[2024-04-26 14:01:01 -0400] [60479] [INFO] Worker exiting (pid: 60479)
[2024-04-26 14:01:01 -0400] [60481] [INFO] Worker exiting (pid: 60481)
