import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeRegressor

from app.core.evaluation import eval_metrics
from app.core.utils import init_random
from app.datasets.base import load_data
from app.datasets.ml import flatten_dataset, numpy_dataset
from app.utils import perflog


def train(x, y, method, cfg, seed=42):
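    """Fit a tree-based regressor selected by `method` on the flattened (x, y) data.

    Supported methods: "xgboost", "gbdt", "random_forest", "decision_tree",
    "catboost". Hyper-parameters are read from `cfg`; `seed` fixes the
    estimator's random state. Returns the fitted model.
    """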
if method == "xgboost":
model = xgb.XGBRegressor(
objective="reg:squarederror",
eval_metric="mae",
verbosity=0,
learning_rate=cfg.learning_rate,
max_depth=cfg.max_depth,
min_child_weight=cfg.min_child_weight,
n_estimators=1000,
random_state=seed,
)
model.fit(x, y)
elif method == "gbdt":
method = GradientBoostingRegressor(
random_state=seed,
learning_rate=cfg.learning_rate,
n_estimators=cfg.n_estimators,
subsample=cfg.subsample,
)
model = method.fit(x, y)
elif method == "random_forest":
method = RandomForestRegressor(
random_state=seed,
max_depth=cfg.max_depth,
min_samples_split=cfg.min_samples_split,
n_estimators=cfg.n_estimators,
)
model = method.fit(x, y)
elif method == "decision_tree":
model = DecisionTreeRegressor(random_state=seed, max_depth=cfg.max_depth)
model.fit(x, y)
elif method == "catboost":
model = CatBoostRegressor(
random_seed=seed,
iterations=cfg.iterations, # performance is better when iterations = 100
learning_rate=cfg.learning_rate,
depth=cfg.depth,
            silent=True,
allow_writing_files=False,
loss_function="MAE",
)
model.fit(x, y)
    else:
        raise ValueError(f"Unsupported model: {method}")
    return model


def validate(x, y, model, los_statistics):
"""val/test"""
y_pred = model.predict(x)
y = reverse_zscore_los(y, los_statistics)
y_pred = reverse_zscore_los(y_pred, los_statistics)
evaluation_scores = eval_metrics.print_metrics_regression(y, y_pred, verbose=0)
return evaluation_scores
def calculate_los_statistics(y):
"""calculate los's mean/std"""
mean, std = y.mean(), y.std()
los_statistics = {"los_mean": mean, "los_std": std}
return los_statistics
def zscore_los(y, los_statistics):
"""zscore scale y"""
y = (y - los_statistics["los_mean"]) / los_statistics["los_std"]
return y
def reverse_zscore_los(y, los_statistics):
"""reverse zscore y"""
y = y * los_statistics["los_std"] + los_statistics["los_mean"]
return y
def start_pipeline(cfg):
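    """Run stratified k-fold training/validation/testing of classic ML regressors for LOS prediction."""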
dataset_type, mode, method, num_folds, train_fold = (
cfg.dataset,
cfg.mode,
cfg.model,
cfg.num_folds,
cfg.train_fold,
)
# Load data
x, y, x_lab_length = load_data(dataset_type)
x, y_outcome, y_los, x_lab_length = numpy_dataset(x, y, x_lab_length)
all_history = {}
test_performance = {
"test_mad": [],
"test_mse": [],
"test_mape": [],
"test_rmse": [],
}
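    # Outer CV: split the test folds stratified by the outcome label so that
    # each fold keeps a similar outcome distribution.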
kfold_test = StratifiedKFold(
n_splits=num_folds, shuffle=True, random_state=cfg.dataset_split_seed
)
skf = kfold_test.split(np.arange(len(x)), y_outcome)
for fold_test in range(train_fold):
train_and_val_idx, test_idx = next(skf)
print("====== Test Fold {} ======".format(fold_test + 1))
sss = StratifiedShuffleSplit(
n_splits=1,
test_size=1 / (num_folds - 1),
random_state=cfg.dataset_split_seed,
)
sub_x = x[train_and_val_idx]
sub_x_lab_length = x_lab_length[train_and_val_idx]
sub_y = y[train_and_val_idx]
sub_y_los = sub_y[:, :, 1]
sub_y_outcome = sub_y[:, 0, 0]
train_idx, val_idx = next(
sss.split(np.arange(len(train_and_val_idx)), sub_y_outcome)
)
x_train, y_train, _ = flatten_dataset(
sub_x, sub_y, train_idx, sub_x_lab_length, case="los"
)
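        # Z-score the LOS targets using statistics from the training split only;
        # the same statistics are reused for val/test to avoid information leakage.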
los_statistics = calculate_los_statistics(y_train)
print(los_statistics)
y_train = zscore_los(y_train, los_statistics)
x_val, y_val, _ = flatten_dataset(
sub_x, sub_y, val_idx, sub_x_lab_length, case="los"
)
y_val = zscore_los(y_val, los_statistics)
x_test, y_test, _ = flatten_dataset(x, y, test_idx, x_lab_length, case="los")
y_test = zscore_los(y_test, los_statistics)
all_history["test_fold_{}".format(fold_test + 1)] = {}
history = {"val_mad": [], "val_mse": [], "val_mape": [], "val_rmse": []}
for seed in cfg.model_init_seed:
init_random(seed)
            if cfg.train:
model = train(x_train, y_train, method, cfg, seed)
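                # Persist the fitted model so it can be reloaded in test mode.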
pd.to_pickle(
model, f"checkpoints/{cfg.name}_{fold_test + 1}_seed{seed}.pth"
)
if mode == "val":
val_evaluation_scores = validate(x_val, y_val, model, los_statistics)
history["val_mad"].append(val_evaluation_scores["mad"])
history["val_mse"].append(val_evaluation_scores["mse"])
history["val_mape"].append(val_evaluation_scores["mape"])
history["val_rmse"].append(val_evaluation_scores["rmse"])
                print(
                    f"Performance on val set {fold_test + 1}: "
                    f"MAE = {val_evaluation_scores['mad']}, "
                    f"MSE = {val_evaluation_scores['mse']}, "
                    f"MAPE = {val_evaluation_scores['mape']}, "
                    f"RMSE = {val_evaluation_scores['rmse']}"
                )
elif mode == "test":
model = pd.read_pickle(
f"checkpoints/{cfg.name}_{fold_test + 1}_seed{seed}.pth"
)
test_evaluation_scores = validate(x_test, y_test, model, los_statistics)
test_performance["test_mad"].append(test_evaluation_scores["mad"])
test_performance["test_mse"].append(test_evaluation_scores["mse"])
test_performance["test_mape"].append(test_evaluation_scores["mape"])
test_performance["test_rmse"].append(test_evaluation_scores["rmse"])
                print(
                    f"Performance on test set {fold_test + 1}: "
                    f"MAE = {test_evaluation_scores['mad']}, "
                    f"MSE = {test_evaluation_scores['mse']}, "
                    f"MAPE = {test_evaluation_scores['mape']}, "
                    f"RMSE = {test_evaluation_scores['rmse']}"
                )
all_history["test_fold_{}".format(fold_test + 1)] = history
if mode == "val":
        # Aggregate validation performance across all evaluated folds and seeds
val_mad_list = []
val_mse_list = []
val_mape_list = []
val_rmse_list = []
for f in range(train_fold):
val_mad_list.extend(all_history[f"test_fold_{f + 1}"]["val_mad"])
val_mse_list.extend(all_history[f"test_fold_{f + 1}"]["val_mse"])
val_mape_list.extend(all_history[f"test_fold_{f + 1}"]["val_mape"])
val_rmse_list.extend(all_history[f"test_fold_{f + 1}"]["val_rmse"])
val_mad_list = np.array(val_mad_list)
val_mse_list = np.array(val_mse_list)
val_mape_list = np.array(val_mape_list)
val_rmse_list = np.array(val_rmse_list)
print("====================== VAL RESULT ======================")
print("MAE: {:.3f} ({:.3f})".format(val_mad_list.mean(), val_mad_list.std()))
print("MSE: {:.3f} ({:.3f})".format(val_mse_list.mean(), val_mse_list.std()))
print("MAPE: {:.3f} ({:.3f})".format(val_mape_list.mean(), val_mape_list.std()))
print("RMSE: {:.3f} ({:.3f})".format(val_rmse_list.mean(), val_rmse_list.std()))
perflog.process_and_upload_performance(
cfg,
mae=val_mad_list,
mse=val_mse_list,
rmse=val_rmse_list,
mape=val_mape_list,
verbose=1,
upload=cfg.db,
)
elif mode == "test":
        # Aggregate test performance across all evaluated folds and seeds
test_mad_list = np.array(test_performance["test_mad"])
test_mse_list = np.array(test_performance["test_mse"])
test_mape_list = np.array(test_performance["test_mape"])
test_rmse_list = np.array(test_performance["test_rmse"])
print("====================== TEST RESULT ======================")
print("MAE: {:.3f} ({:.3f})".format(test_mad_list.mean(), test_mad_list.std()))
print("MSE: {:.3f} ({:.3f})".format(test_mse_list.mean(), test_mse_list.std()))
print(
"MAPE: {:.3f} ({:.3f})".format(test_mape_list.mean(), test_mape_list.std())
)
print(
"RMSE: {:.3f} ({:.3f})".format(test_rmse_list.mean(), test_rmse_list.std())
)
print("=========================================================")
perflog.process_and_upload_performance(
cfg,
mae=test_mad_list,
mse=test_mse_list,
rmse=test_rmse_list,
mape=test_mape_list,
verbose=1,
upload=cfg.db,
)
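

# Usage sketch (illustrative only): the real entry point builds `cfg` elsewhere,
# e.g. from a config file. The attribute names below are simply the ones this
# module reads; the values are hypothetical.
#
#     from types import SimpleNamespace
#
#     cfg = SimpleNamespace(
#         dataset="<dataset-name>", mode="val", model="gbdt",
#         num_folds=10, train_fold=10, dataset_split_seed=42,
#         model_init_seed=[42], train=True, name="gbdt_los", db=False,
#         learning_rate=0.1, n_estimators=100, subsample=1.0,
#     )
#     start_pipeline(cfg)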