# Load Data

In [2]:
import pandas as pd
from xgboost import XGBClassifier

df = pd.read_excel('dataset.xlsx',engine='openpyxl')
df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)
from sklearn.model_selection import train_test_split
# X = df.drop(['target label / yes no'],axis=1)
# y = df['target label / yes no']
# df_train, df_test= train_test_split(df, test_size=0.2,shuffle=True, random_state=42)

# Grid Search
Search for best paramters and log them in mlflow

In [16]:
from utils import Preprocess, MissingValue

from utils import MLModelSelector


preprocess_param_grid  = {
    'missing_value_per': [0.1, 0.2, 0.3, 0.4, 0.5],
    'variance_threshold': [0.0, 0.01, 0.05, 0.1, 0.2],
    'min_null_per': [0.5]
}

param_grids = {
    # 'SVM': {
    #     'C': [0.1, 1, 10],         
    #     'kernel': ['linear', 'rbf'] 
    # },
    # 'logistic_regression': {
    #     'C': [0.1, 1, 10],         
    #     'solver': ['lbfgs'],        
    #     'max_iter': [500, 1000, 2000] 
    # },
    'random_forest': {
    'n_estimators': [150, 250],  # Number of trees in the forest
    'max_depth': [10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],    # Minimum number of samples required to be at a leaf node
    'bootstrap': [True],       # Whether bootstrap samples are used when building trees
},
    'XGBoost': {
    'n_estimators': [100, 200],      # Number of boosting rounds
    'max_depth': [3, 6],              # Maximum depth of a tree
    'learning_rate': [0.1, 0.2],    # Step size shrinkage used in update to prevent overfitting
    'subsample': [0.6, 1.0],         # Fraction of samples used for training each tree
    'colsample_bytree': [0.8, 1.0],  # Fraction of features used at each split
}
}

best_preprocess_params = None
best_model_params = None
best_overall_score = -1
best_model_name = None
i = 0
for missing_value_per in preprocess_param_grid['missing_value_per']:
    for variance_threshold in preprocess_param_grid['variance_threshold']:
        for min_null_per in preprocess_param_grid['min_null_per']:
            df_train, df_test= train_test_split(df, test_size=0.2,shuffle=True)
            preprocess_params = {
                'missing_value_per': missing_value_per,
                'variance_threshold': variance_threshold,
                'min_null_per': min_null_per
            }
            pre_data = Preprocess(
                dataframe=df_train, 
                missing_value_per=missing_value_per, 
                variance_threshold=variance_threshold, 
                min_null_per=min_null_per
            )
            pre_data.apply()
            df_train = pre_data.dataframe
            missing = MissingValue(original_df=df_train,test_size=0.1)
            df_train = missing.fill_dataframe()
            cols = df_train.columns
            X_train = df_train.drop(['target label / yes no'], axis=1)
            y_train = df_train['target label / yes no']
            
            df_test = df_test[cols]
 
            df_test = pre_data._mapping(df_test)
            missing = MissingValue(original_df=df_test,test_size=0.1)
            df_test = missing.fill_dataframe()
            X_test = df_test.drop(['target label / yes no'], axis=1)
            y_test = df_test['target label / yes no'].values.astype(int)
            
            for model_name, param_grid in param_grids.items():
                model_selector = MLModelSelector()
                best_params, best_score = model_selector.train_model(
                    X_train, y_train,X_test,y_test, 
                    model_name, 
                    param_grid, 
                    preprocess_parameters=preprocess_params  
                )
                
                if best_score > best_overall_score:
                    best_overall_score = best_score
                    best_preprocess_params = preprocess_params
                    best_model_params = best_params
                    best_model_name = model_name
                   
            i = i + 1
            print(i)
            print(best_overall_score)

print(f"Best Preprocess Params: {best_preprocess_params}")
print(f"Best Model Params: {best_model_params}")
print(f"Best Overall Score: {best_overall_score}")


1
0.7454545454545455
2
0.7454545454545455
3
0.7484030554078361
4
0.7546958304853042
5
0.7546958304853042
6
0.7562392881887466
7
0.7562392881887466
8
0.7562392881887466
9
0.7562392881887466
10
0.7562392881887466
11
0.7562392881887466
12
0.7562392881887466
13
0.7562392881887466
14
0.7562392881887466
15
0.7562392881887466
16
0.8621799805212067
17
0.8621799805212067
18
0.8853833897195243
19
0.8853833897195243
20
0.8853833897195243
21
0.8853833897195243
22
0.8853833897195243
23
0.8853833897195243
24
0.8853833897195243
25
0.8853833897195243
Best Preprocess Params: {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}
Best Model Params: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}
Best Overall Score: 0.8853833897195243


# Train Best Model
Use the best parameter and save the model and other important elements

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import Preprocess, MissingValue
import pickle
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

## Load Data

In [58]:
df = pd.read_excel('dataset.xlsx',engine='openpyxl')
df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)

## Load parameters 

In [49]:
best_preprocess_params =  {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}
best_model_params = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}
best_model_name = 'random_forest'

## Train model

### Split Data

In [59]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
# train, test = train_test_split(df, test_size=0.2, shuffle=True)
train = df.copy()

## Preprocess Train data

In [60]:
preprocessor = Preprocess(
    dataframe=train,
    missing_value_per=best_preprocess_params['missing_value_per'],
    variance_threshold=best_preprocess_params['variance_threshold'],
    min_null_per=best_preprocess_params['min_null_per']
)
preprocessor.apply()
preprocessed_train = preprocessor.dataframe

m = MissingValue(preprocessed_train)
preprocessed_train = m.fill_dataframe()

### Preprocess Test data

In [52]:
test = preprocessor._mapping(test)
m = MissingValue(test)
test = m.fill_dataframe()

### Scale Train Data

In [61]:
sc = StandardScaler()
x_train = preprocessed_train.drop(['target label / yes no'], axis=1)
cols = x_train.columns

x_train = sc.fit_transform(x_train)
y_train = preprocessed_train['target label / yes no'].values.astype('int')

### Train

In [62]:
rf = RandomForestClassifier(
    bootstrap=best_model_params['bootstrap'],
    max_depth=best_model_params['max_depth'],
    min_samples_split=best_model_params['min_samples_split'],
    min_samples_leaf=best_model_params['min_samples_leaf'],
    n_estimators=best_model_params['n_estimators']
)
rf.fit(x_train, y_train)

### Test Before Save

In [9]:
test.shape

(385, 225)

In [63]:
x_test = test.drop(['target label / yes no'], axis=1)
x_test = x_test[cols]
# f = FeatureEngineering(x_test)
# x_test = f.kmeans(5).values
x_test = sc.transform(x_test)
y_test = test['target label / yes no'].values.astype('int')

y_pred = rf.predict(x_test)
# y_pred = xg.predict(x_test)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')

ValueError: could not convert string to float: 'ZERO'

In [56]:
print(f1)

0.8371721052057187


### Save Scaler and Cols

In [64]:
with open(f'model/columns.pkl', 'wb') as f:
    pickle.dump(cols.to_list(), f)

with open(f'model/scaler.pkl', 'wb') as f:
    pickle.dump(sc, f)

with open(f'model/model.pkl', 'wb') as f:
    pickle.dump(rf, f)

# Loop

In [43]:
best_preprocess_params =  {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}
best_model_params = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}
best_model_name = 'random_forest'

f1_list = []
for i in range(5):
    print(i)
    i+=1
    train, test = train_test_split(df, test_size=0.2, shuffle=True)
    
    preprocessor = Preprocess(
        dataframe=train,
        missing_value_per=best_preprocess_params['missing_value_per'],
        variance_threshold=best_preprocess_params['variance_threshold'],
        min_null_per=best_preprocess_params['min_null_per']
    )
    preprocessor.apply()
    preprocessed_train = preprocessor.dataframe
    
    m = MissingValue(preprocessed_train)
    preprocessed_train = m.fill_dataframe()
    
    test = preprocessor._mapping(test)
    m = MissingValue(test)
    test = m.fill_dataframe()
    
    sc = StandardScaler()
    x_train = preprocessed_train.drop(['target label / yes no'], axis=1)
    cols = x_train.columns
    
    x_train = sc.fit_transform(x_train)
    y_train = preprocessed_train['target label / yes no'].values.astype('int')
    
    xg = XGBClassifier(
        subsample=1.0,
        colsample_bytree=0.8,
        learning_rate=0.1,
        max_depth=3,
        n_estimators=100
        
    )
    xg.fit(x_train, y_train)
    # rf = RandomForestClassifier(
    #     bootstrap=best_model_params['bootstrap'],
    #     max_depth=best_model_params['max_depth'],
    #     min_samples_split=best_model_params['min_samples_split'],
    #     min_samples_leaf=best_model_params['min_samples_leaf'],
    #     n_estimators=best_model_params['n_estimators']
    # )
    # rf.fit(x_train, y_train)
    
    x_test = test.drop(['target label / yes no'], axis=1)
    x_test = x_test[cols]
    # f = FeatureEngineering(x_test)
    # x_test = f.kmeans(5).values
    x_test = sc.transform(x_test)
    y_test = test['target label / yes no'].values.astype('int')
    
    # y_pred = rf.predict(x_test)
    y_pred = xg.predict(x_test)
    
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f1)
    print('-'*20)
    f1_list.append(f1)

0
0.8465739750445631
--------------------
1
0.8587816632306016
--------------------
2
0.8692866312086448
--------------------
3
0.8277746239161031
--------------------
4
0.8769936338171633
--------------------


In [44]:
sum(f1_list)/len(f1_list)

0.8558821054434151

In [42]:
f1_list

[0.7916205533596838,
 0.843782847316944,
 0.8567467859318522,
 0.8484060247934703,
 0.841208207987869]

In [4]:
import pickle 
with open(f'model/columns.pkl', 'rb') as f:
    cols = pickle.load(f)

with open(f'model/scaler.pkl', 'rb') as f:
    sc = pickle.load(f)

with open(f'model/model.pkl', 'rb') as f:
    model = pickle.load(f)

In [1]:
import pandas as pd 
df = pd.read_excel('dataset.xlsx',engine='openpyxl')
df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)


In [5]:
from utils import Preprocess
pre = Preprocess(df, missing_value_per=0.4, variance_threshold=0.05, min_null_per=0.5)
df = pre._mapping(df)
cols.append('target label / yes no')
df = df[cols]
X = df.drop(['target label / yes no'], axis=1)
X = sc.transform(X)
y = df['target label / yes no'].fillna(0).to_numpy()
y_pre = model.predict(X)

In [6]:
y

array([0, 0, 0, ..., 0, 0, 0], dtype=object)

In [7]:
from sklearn.metrics import f1_score

f1_score(y.astype(int), y_pre.astype(int), average='weighted')

0.9027694811920448