In [4]:
import mlflow


# MLflow configuration: runs are stored in a local SQLite database and grouped
# under a single experiment (MLflow creates the experiment when it is missing).
MLFLOW_TRACKING_URI = "sqlite:///new_mlflow.db"
MLFLOW_EXPERIMENT_NAME = "LGB"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)



2024/04/26 04:39:52 INFO mlflow.tracking.fluent: Experiment with name 'LGB' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/arham/Downloads/Projects/mlruns/2', creation_time=1714120792214, experiment_id='2', last_update_time=1714120792214, lifecycle_stage='active', name='LGB', tags={}>

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score, f1_score, auc,classification_report
from scipy.stats import ks_2samp

from sklearn.preprocessing import label_binarize,OneHotEncoder, StandardScaler, FunctionTransformer, LabelEncoder
from itertools import cycle

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
import shap

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA

import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd

def load_data(path, test_size=0.35, random_state=42, id_col='id'):
    """Read a CSV file and split it into cleaned train and test frames.

    The frame is split first; afterwards the identifier column is dropped,
    duplicate rows are removed and the index is reset, independently for
    each part.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.
    test_size : float, default 0.35
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.
    id_col : str, default 'id'
        Name of the identifier column removed from both parts.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.

    Raises
    ------
    KeyError
        If ``id_col`` is not a column of the file.
    """
    df = pd.read_csv(path)
    # TODO(arham): check this later — the original dataset was once loaded as well:
    # original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    def _clean(frame):
        # Same clean-up for both parts: drop the id, de-duplicate, renumber rows.
        return frame.drop(columns=[id_col]).drop_duplicates().reset_index(drop=True)

    return _clean(train_df), _clean(test_df)

def corr_heat_map(df, scale=1):
    """Draw a lower-triangle heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``df.corr()`` matrix is plotted.
    scale : int or float, default 1
        Divisor applied to the base 10 x 8 inch figure size; larger values
        produce a smaller figure.

    Returns
    -------
    None
        The heatmap is drawn on a newly created matplotlib figure.
    """
    # Calculate the correlation matrix
    correlation_matrix = df.corr()

    # Hide the upper triangle (diagonal included): it mirrors the lower one
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

    # True division keeps the 10:8 aspect ratio for every scale. Floor
    # division truncated the size and collapsed the height to 0 for scale > 8.
    plt.figure(figsize=(10 / scale, 8 / scale))

    # Diverging palette centred on zero correlation
    cmap = sns.diverging_palette(220, 20, as_cmap=True)

    # vmax=.3 caps the colour scale, so correlations above 0.3 share one colour
    sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": 0.7})

    plt.title('Correlation Heatmap')


# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
path = '/Users/arham/Downloads/Projects/01-Dataset/01-Data-for-model-building/train.csv'
train, test = load_data(path)

target = 'NObeyesdad'

# Partition the feature columns by dtype: object columns are one-hot encoded
# below, everything else is treated as numerical.
num_col = []
cat_col = []
for col in train.columns.drop([target]):
    if train[col].dtype == 'object':
        cat_col.append(col)
    else:
        num_col.append(col)

# print("Numerical Columns : ", *num_col,"\n",sep="\n")
# print("Categorical Columns : ", *cat_col,sep="\n")

train = pd.get_dummies(train, columns=cat_col)
test = pd.get_dummies(test, columns=cat_col)

# The two frames are encoded independently, so a category that occurs in only
# one of them would give them different columns. Align `test` to the layout
# of `train`: missing dummies are filled with 0, unseen ones are dropped.
# This is a no-op when both frames already share the same columns.
test = test.reindex(columns=train.columns, fill_value=0)

# Encode the class labels as integers; `le` is kept so predictions can be
# mapped back with `le.inverse_transform`.
# NOTE(review): the target column of `test` is left un-encoded here.
le = LabelEncoder()
train[target] = le.fit_transform(train[target])

X_train, X_val, y_train, y_val = train_test_split(
    train.drop([target], axis=1),
    train[target],
    test_size=0.2,
    random_state=42,
)
# Only rendered when this is the last statement of its cell.
X_train.shape, y_train.shape, X_val.shape, y_val.shape

import optuna
# Switch for the Optuna workflow: when True, later code reads a `study` object
# (trial report and `study.best_trial.params`); when False, the hard-coded
# hyper-parameters are used instead.
ran_optuna = False 

def optimization_function(trial):
    """Optuna objective: fit one LightGBM candidate and return its validation accuracy.

    Hyper-parameters are sampled from ``trial``; the model is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.
    Every trial is also recorded as its own MLflow run (nested when a run is
    already active) holding the sampled parameters, the summary metrics, the
    per-class recall, the fitted model and the descriptive tags.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Accuracy on the validation split (the value Optuna optimizes).
    """
    params = {
        'num_class': 7,
        'random_state': 42,
        'metric': 'multi_logloss',
        "boosting_type": "gbdt",
        'objective': 'multiclass',

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 400, 600),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-1, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 20),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),
        # NOTE(review): LightGBM only applies row subsampling when the bagging
        # frequency (`subsample_freq`) is non-zero — confirm this is intended.
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
    }

    lgb_model = lgb.LGBMClassifier(**params)

    # Cross-validated alternative kept for reference:
    #     skf = StratifiedKFold(n_splits=5,shuffle=False, random_state=None)
    #     accuracy = cross_val_score(lgb_model,X_train,y_train, cv=skf,scoring='accuracy')
    #     print("="*50,'\nValidation Accuracy:', accuracy.mean())

    lgb_model.fit(X_train, y_train)

    y_pred = lgb_model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    # Same F1 definition as the final-run cell (harmonic mean of the weighted
    # precision and recall) so trial and final metrics stay comparable.
    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum else 0.0
    recall_per_class = recall_score(y_val, y_pred, average=None)

    # nested=True lets the trial run live under an already active parent run
    # and still works as a plain run when none is active.
    with mlflow.start_run(nested=True):
        mlflow.log_params(trial.params)

        mlflow.log_metric('accuracy', acc)
        mlflow.log_metric('precision', precision)
        mlflow.log_metric('recall', recall)
        mlflow.log_metric('f1', f1)

        for class_idx, class_recall in enumerate(recall_per_class):
            mlflow.log_metric(f'recall_class_{class_idx}', class_recall)

        # Log the model fitted in THIS trial (not the final model).
        mlflow.lightgbm.log_model(lgb_model, 'model')
        mlflow.set_tag('experiments', 'Arham A.')
        mlflow.set_tag('model_name', 'LightGBM')
        mlflow.set_tag('preprocessing', 'Yes')

    return acc

In [2]:
if ran_optuna:
    # Summary of a finished Optuna study. Expressions nested inside an `if`
    # are not auto-displayed by the notebook, so each result is shown explicitly.
    print('Number of finished trials:', len(study.trials))

    print('Best trial:', study.best_trial.params)

    optuna.visualization.plot_param_importances(study).show()

    # `display` is the rich-output helper provided by the IPython kernel.
    display(study.trials_dataframe().sort_values('value', ascending=False))

    optuna.visualization.plot_slice(study).show()

# 100 trials 
# {'objective': 'multiclassova', 'learning_rate': 0.04641200998070569, 'n_estimators': 587, 'reg_alpha': 0.0065043557057678746, 'reg_lambda': 4.460933310544669, 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, 'subsample': 0.8193986843950917, 'min_child_samples': 15}


# Settings enforced on the final parameter set, whatever its origin.
fixed_params = dict(
    boosting_type='gbdt',
    num_class=7,
    random_state=42,
    metric='multi_logloss',
)

if ran_optuna:
    # Take the winning configuration straight from the finished study.
    lgbParams = study.best_trial.params
else:
    # Earlier candidate (100 trials with PCA, seed = None), kept for reference:
    #     objective='multiclassova', learning_rate=0.04641200998070569,
    #     n_estimators=587, reg_alpha=0.0065043557057678746,
    #     reg_lambda=4.460933310544669, max_depth=7,
    #     colsample_bytree=0.6833315654013498, subsample=0.8193986843950917,
    #     min_child_samples=15

    # Moaz HyperParams
    lgbParams = dict(
        objective="multiclass",                  # objective function for the model
        metric="multi_logloss",                  # evaluation metric
        verbosity=-1,                            # -1 silences LightGBM output
        boosting_type="gbdt",                    # gradient boosting type
        random_state=42,                         # seed for reproducibility
        num_class=7,                             # number of target classes
        learning_rate=0.030962211546832760,      # boosting learning rate
        n_estimators=500,                        # number of boosting iterations
        lambda_l1=0.009667446568254372,          # L1 regularization term
        lambda_l2=0.04018641437301800,           # L2 regularization term
        max_depth=10,                            # maximum tree depth
        colsample_bytree=0.40977129346872643,    # feature fraction per tree
        subsample=0.9535797422450176,            # sample fraction per iteration
        min_child_samples=26,                    # minimum number of samples in a leaf
    )

# Overwrite (or add) the fixed settings in place.
lgbParams.update(fixed_params)

lgbParams



0.9058910707669507

In [6]:


import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
import mlflow
import warnings
warnings.filterwarnings("ignore")
# import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support

# sklearn autologging is switched off; everything below is logged explicitly.
mlflow.sklearn.autolog(disable=True)

with mlflow.start_run(run_name="LGB_Final"):
    # Relative frequency of each encoded class (0..6) in both target vectors,
    # and the per-class difference between them (train minus validation).
    class_ids = range(7)
    class_counts_train = [(y_train == label).sum() / y_train.count() for label in class_ids]
    class_counts_val = [(y_val == label).sum() / y_val.count() for label in class_ids]
    target_drift = [
        train_share - val_share
        for train_share, val_share in zip(class_counts_train, class_counts_val)
    ]
    print(f"Target Drift For Each Class {target_drift}")
    mlflow.log_params(
        {f'Target_Drift_{class_idx}': drift for class_idx, drift in enumerate(target_drift)}
    )

    # Fit the final LightGBM model and score it on the validation split.
    # NOTE(review): the `_xgb` suffixes are historical — the model is LightGBM.
    lgb_model_final = lgb.LGBMClassifier(**lgbParams).fit(X_train, y_train)
    y_pred = lgb_model_final.predict(X_val)

    accuracy_xgb = accuracy_score(y_val, y_pred)
    precision_xgb = precision_score(y_val, y_pred, average='weighted')
    recall_xgb = recall_score(y_val, y_pred, average='weighted')
    # NOTE(review): harmonic mean of the weighted precision and recall, which
    # is not the same quantity as f1_score(average='weighted') — confirm intent.
    f1_xgb = 2 * (precision_xgb * recall_xgb) / (precision_xgb + recall_xgb)

    print("\nAccuracy:", accuracy_xgb)
    print("Precision:", precision_xgb)
    print("Recall:", recall_xgb)
    print("F1", f1_xgb)

    summary_metrics = {
        'accuracy': accuracy_xgb,
        'precision': precision_xgb,
        'recall': recall_xgb,
        'f1': f1_xgb,
    }
    for metric_name, metric_value in summary_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Per-class breakdown; only the recall values are printed and logged.
    precision_per_class, recall_per_class, f1_per_class, support_per_class = (
        precision_recall_fscore_support(y_val, y_pred, average=None)
    )
    for class_idx, class_recall in enumerate(recall_per_class):
        print(f"Recall for class {class_idx}: {class_recall}")
        mlflow.log_metric(f'recall_class_{class_idx}', class_recall)

    # Store the fitted model and descriptive tags with the run.
    mlflow.lightgbm.log_model(lgb_model_final, 'model')
    run_tags = {
        'experiments': 'Arham A.',
        'model_name': 'LightGBM',
        'preprocessing': 'Yes',
    }
    for tag_key, tag_value in run_tags.items():
        mlflow.set_tag(tag_key, tag_value)



Target Drift For Each Class [0.004943133623686147, 0.011990707821925795, -0.017190035106736085, -0.00032756263090533144, 0.01042920694244659, -0.0087675011457998, -0.001077949504617301]

Accuracy: 0.9058910707669507
Precision: 0.9067204051187663
Recall: 0.9058910707669507
F1 0.9063055482178468
Recall for class 0: 0.9208860759493671
Recall for class 1: 0.9090909090909091
Recall for class 2: 0.8741092636579573
Recall for class 3: 0.9736842105263158
Recall for class 4: 0.9960474308300395
Recall for class 5: 0.7701492537313432
Recall for class 6: 0.8419452887537994
