# -*- coding: utf-8 -*-
"""
Created on Thu Feb 8 14:49:49 2024
@author: aa36
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score, precision_recall_curve, roc_curve, roc_auc_score
import ast
# import sys
# sys.path.append('../Methods_utils') # Add the path to the custom folder
# print(sys.path)
import Methods_utils.methods_cm_time as custom_cm
import Methods_utils.methods as custom
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
colors = ['#630C3A', '#27C3C1', '#FFC107', '#7E34F9', '#E01889', '#617111','#fe6100', '#7d413c',
'#423568', '#5590b4']
sns.set_palette(sns.color_palette(colors))
## Def plot correct predictions, incorrect predictions vs onset time
def metrics_model (y_test, probabilities, predictions, model):
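    """Compute and print summary metrics for one fitted model on a given split.

    Returns the AUROC, the ROC curve (fpr, tpr), the AUPRC and the
    precision-recall curve for the supplied labels and predicted scores.
    """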
print("probs: ", probabilities)
    # The PR curve is computed from the predicted probabilities (scores), not the
    # hard 0/1 predictions, consistent with average_precision_score below.
    precision, recall, thresh = precision_recall_curve(y_test, probabilities)
fpr, tpr, _ = roc_curve(y_test, probabilities)
auc = roc_auc_score(y_test, probabilities)
auprc = average_precision_score(y_test, probabilities)
print("Precision for ", model, " : ", precision)
print("Recall for ", model, " : ", recall)
print("Threshold for PR for ", model, " : ", thresh)
print("AUC for ", model, " : ", auc)
print("AUPRC for ", model, " : ", auprc)
return auc, fpr, tpr, auprc, precision, recall
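# Illustrative use of metrics_model (mirrors the call in trainModels_andTest below;
# the variable names here are placeholders, not part of the pipeline):
#   probs = fitted_model.predict_proba(X_eval)[:, 1]
#   preds = fitted_model.predict(X_eval)
#   auc, fpr, tpr, auprc, prec, rec = metrics_model(y_eval, probs, preds, fitted_model)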
#%% Top 10 selected features from Heatmap and CV10
def heatmap_featureSelection (data_heatmap):
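    """Return the 'Selected Features' entry of the last row of the heatmap CSV,
    i.e. the top 10 most frequently selected features across the CV folds."""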
heatmap_featSel = data_heatmap['Selected Features'].iloc[-1]
# print("Top 10 most selected features from the CV, as seen in the heatmap:", heatmap_featSel)
return heatmap_featSel
def cv_featureSelection (data_CV):
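    """Choose a feature set from the CV results table.

    Reports the fold with the highest 'Test AUPRC' overall, then returns the
    feature set of the best fold after excluding the LASSO rows; an ['All']
    entry is expanded to the full hard-coded feature list.
    """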
best_auprc_row = data_CV.loc[data_CV['Test AUPRC'].idxmax()]
best_AUPRC = best_auprc_row['Test AUPRC']
best_features = best_auprc_row['Selected Features']
feature_selection_method = best_auprc_row['Current Feature Selection']
model_best_folds = best_auprc_row['Model']
print("\n--------Choosing the best features from CV -------------------------------------------")
print("Best AUPRC among the folds:", best_AUPRC)
print("Corresponding Features:", best_features)
print("Feature Selection Method:", feature_selection_method)
print("Model:", model_best_folds)
# Filter out rows where the feature selection method is LASSO
data_CV_filtered = data_CV[data_CV['Current Feature Selection'] != 'lasso']
# Find the row with the highest AUPRC among the remaining rows
best_auprc_row_noLasso = data_CV_filtered.loc[data_CV_filtered['Test AUPRC'].idxmax()]
    # Retrieve the corresponding features, feature selection method, and model for that row
best_AUPRC_noLasso = best_auprc_row_noLasso['Test AUPRC']
#if best_auprc_row_noLasso['Selected Features'] == ['All']:
# If the best set from CV10 is ['All'], then force the full set of features in here
if 'All' in best_auprc_row_noLasso['Selected Features']:
print(":::::::::::::::::::::::::::::::::::::::: CASE 1")
best_features_noLasso = "['c_gender', 'c_vor_diab', 'c_vor_herz' ,'c_vor_atem' ,'c_vor_alko','c_vor_smok', 'c_vor_kidn' ,'c_vor_canc', 'c_ek', 'c_pct', 'c_mechventil','c_dialyse', 'c_ecmo_pecla', 'c_picco' ,'o_sofa_resp', 'o_sofa_cardio','o_sofa_coag' ,'o_sofa_renal', 'o_sofa_liver','n_alter', 'n_kat', 'n_sapsii','n_bddia' ,'n_bdmit', 'n_bdsys', 'n_herzfr', 'n_temp', 'n_ph', 'n_po2' ,'n_pco2','n_fio2pro' ,'n_sbe', 'n_balance', 'n_laktat', 'n_hb' ,'n_blutz', 'n_calcium','n_kalium' ,'n_leuko' ,'n_thrombo' ,'n_bili', 'n_inr' ,'n_ptt' ,'n_ery', 'n_hct','n_crp', 'n_krea' ,'n_harn' ,'n_sofa_total' ,'n_meanlambda' ,'n_delta', 'n_c']"
else:
print(":::::::::::::::::::::::::::::::::::::::: CASE 2")
best_features_noLasso = best_auprc_row_noLasso['Selected Features']
feature_selection_method_noLasso = best_auprc_row_noLasso['Current Feature Selection']
model_noLasso = best_auprc_row_noLasso['Model']
print("\nBest AUPRC (excluding LASSO):", best_AUPRC_noLasso)
print("Corresponding Features:", best_features_noLasso)
print("Feature Selection Method:", feature_selection_method_noLasso)
print("Model:", model_noLasso)
return best_features_noLasso
def averages_AUROC (data_CV):
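    """Same selection logic as cv_featureSelection, but ranking the folds by
    'Test AUROC' instead of 'Test AUPRC'; returns the best non-LASSO feature set."""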
best_auprc_row = data_CV.loc[data_CV['Test AUROC'].idxmax()]
best_AUPRC = best_auprc_row['Test AUROC']
best_features = best_auprc_row['Selected Features']
feature_selection_method = best_auprc_row['Current Feature Selection']
model_best_folds = best_auprc_row['Model']
print("\n--------Choosing the best features from CV -------------------------------------------")
print("Best AUROC among the folds:", best_AUPRC)
print("Corresponding Features:", best_features)
print("Feature Selection Method:", feature_selection_method)
print("Model:", model_best_folds)
# Filter out rows where the feature selection method is LASSO
data_CV_filtered = data_CV[data_CV['Current Feature Selection'] != 'lasso']
    # Find the row with the highest AUROC among the remaining rows
best_auprc_row_noLasso = data_CV_filtered.loc[data_CV_filtered['Test AUROC'].idxmax()]
best_AUPRC_noLasso = best_auprc_row_noLasso['Test AUROC']
# if best_auprc_row_noLasso['Selected Features'] == ['All']:
# If the best set from CV10 is ['All'], then force the full set of features in here
if 'All' in best_auprc_row_noLasso['Selected Features']:
print(":::::::::::::::::::::::::::::::::::::::: CASE !")
best_features_noLasso = "['c_gender', 'c_vor_diab', 'c_vor_herz' ,'c_vor_atem' ,'c_vor_alko','c_vor_smok', 'c_vor_kidn' ,'c_vor_canc', 'c_ek', 'c_pct', 'c_mechventil','c_dialyse', 'c_ecmo_pecla', 'c_picco' ,'o_sofa_resp', 'o_sofa_cardio','o_sofa_coag' ,'o_sofa_renal', 'o_sofa_liver','n_alter', 'n_kat', 'n_sapsii','n_bddia' ,'n_bdmit', 'n_bdsys', 'n_herzfr', 'n_temp', 'n_ph', 'n_po2' ,'n_pco2','n_fio2pro' ,'n_sbe', 'n_balance', 'n_laktat', 'n_hb' ,'n_blutz', 'n_calcium','n_kalium' ,'n_leuko' ,'n_thrombo' ,'n_bili', 'n_inr' ,'n_ptt' ,'n_ery', 'n_hct','n_crp', 'n_krea' ,'n_harn' ,'n_sofa_total' ,'n_meanlambda' ,'n_delta', 'n_c']"
else:
print(":::::::::::::::::::::::::::::::::::::::: CASE 2")
best_features_noLasso = best_auprc_row_noLasso['Selected Features']
feature_selection_method_noLasso = best_auprc_row_noLasso['Current Feature Selection']
model_noLasso = best_auprc_row_noLasso['Model']
print("\nBest AUROC (excluding LASSO):", best_AUPRC_noLasso)
print("Corresponding Features:", best_features_noLasso)
print("Feature Selection Method:", feature_selection_method_noLasso)
print("Model:", model_noLasso)
return best_features_noLasso
#%% Best model name retrieval and ML model
def cv_bestAverageModel (data_CV):
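    """Return the name of the model with the highest mean 'Test AUPRC' across the CV folds."""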
print("\n--------Choosing the best performing model on average -------------------------------------------")
    # group by model and compute the mean Test AUPRC across the folds
    average_performance_per_model = data_CV.groupby('Model')['Test AUPRC'].mean()
# find the best average performing model
best_avg_model = average_performance_per_model.idxmax()
best_average_auprc = average_performance_per_model.max()
print("Average Performance of each Model accross the folds:")
print(average_performance_per_model)
print("\nBest Average Performing Model:")
print("Model:", best_avg_model)
print("Average Performance:", best_average_auprc)
print("Average best performing model is: ", best_avg_model, best_average_auprc*100)
return best_avg_model
def cv_bestAverageModel_AUROC_Table (data_CV):
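    """AUROC variant of cv_bestAverageModel: return the name of the model with the
    highest mean 'Test AUROC' across the CV folds (used for the AUROC summary table)."""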
print("\n--------Choosing the best performing model on average -------------------------------------------")
    # group by model and compute the mean Test AUROC across the folds
    average_performance_per_model = data_CV.groupby('Model')['Test AUROC'].mean()
# find the best average performing model
best_avg_model = average_performance_per_model.idxmax()
best_average_auprc = average_performance_per_model.max()
print("Average Performance of each Model accross the folds:")
print(average_performance_per_model)
print("\nBest Average Performing Model:")
print("Model:", best_avg_model)
print("Average Performance:", best_average_auprc)
print("Average best performing model AUROC is: ", best_avg_model, best_average_auprc*100)
return best_avg_model
def ml_model_cm (model_name, X_train, y_train, X_test, y_test, iteration, onset_days_arr, plot_number):
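    """Dispatch to the matching classifier in Methods_utils.methods_cm_time and return
    its fitted model, hold-out metrics (AUROC, ROC curve, AUPRC, PR curve) and the
    per-onset-time prediction counts (plot_info) used later for the violin plots."""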
if model_name == 'rf':
model, auroc, fpr, tpr, auprc, precision, recall, plot_info = custom_cm.random_forest(X_train, y_train, X_test, y_test, True, "RF_" + iteration, onset_days_arr, plot_number)
elif model_name == 'svm':
model, auroc, fpr, tpr, auprc, precision, recall, plot_info = custom_cm.svm(X_train, y_train, X_test, y_test, True, "SVM_" + iteration, onset_days_arr, plot_number)
elif model_name == 'xgb':
model, auroc, fpr, tpr, auprc, precision, recall, plot_info = custom_cm.xgboost_clf(X_train, y_train, X_test, y_test, True, "XGB_" + iteration, onset_days_arr, plot_number)
elif model_name == 'ridge':
model, auroc, fpr, tpr, auprc, precision, recall, plot_info = custom_cm.ridge(X_train, y_train, X_test, y_test, True, "Ridge_" + iteration, onset_days_arr, plot_number)
elif model_name == 'logistic':
model, auroc, fpr, tpr, auprc, precision, recall, plot_info = custom_cm.logistic(X_train, y_train, X_test, y_test, True, "Logistic_" + iteration,onset_days_arr, plot_number)
    else:
        # Fail early; reaching the return below with no model would otherwise raise an UnboundLocalError
        raise ValueError("Unknown model name: " + str(model_name))
return model, auroc, fpr, tpr, auprc, precision, recall, plot_info
#%% Plot AUROC train and test, plotViolin, print AUROC and AUPRC and sd
def plot_ROC (df_train_OrTest, title_name, save_name):
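    """Plot one ROC curve per feature-selection method (one row of df_train_OrTest
    each), add the 0.5 chance baseline, and save the figure under save_name."""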
colors = {'HeatmapTop10': '#630C3A', 'cv10_FeatSel': '#27C3C1', 'AllFeatures': '#FFC107', 'Baseline': '#7E34F9'}
plt.figure(figsize=(10, 9))
plt.rcParams['font.family'] = 'Arial'
# 'HeatmapTop10 Features', 'CV10TopAUPRC Features', 'All Features'
for _, row in df_train_OrTest.iterrows():
method = row['Iteration Counter']
label = ''
if method == 'HeatmapTop10':
label = 'HeatmapTop10 Features'
elif method == 'cv10_FeatSel':
label = 'CV10TopAUPRC Features'
elif method == 'AllFeatures':
label = 'All Features'
plt.plot(row['FPR'], row['TPR'], marker='o', linestyle='-', color=colors[method], label=f"{label}: {row['AUROC']:.2f}")
plt.plot([0, 1], [0, 1], linestyle='--', color='black', label='Baseline: 0.5')
plt.xlabel('False Positive Rate', fontsize = 18+4)
plt.ylabel('True Positive Rate', fontsize = 18+4)
plt.title(title_name, fontsize = 20+4)
plt.xticks(fontsize=16+4)
plt.yticks(fontsize=16+4)
# Increase the size of the text in the legend
legend = plt.legend(prop={'size': 18+4}, loc='lower right') # Adjust size and location as needed
# Adjust size as needed
for text in legend.get_texts():
parts = text.get_text().split(':') # Split text at ":"
if len(parts) > 1: # Ensure there is a part after ":"
text.set_text(f"{parts[0]}: $\\mathbf{{{parts[1]}}}$") # Set LaTeX format for bold text
plt.tight_layout()
plt.savefig( save_name + '_' + str(df_train_OrTest['Count'].unique()) + '.png' , dpi=600)
plt.show()
def plotAUROC_trainAndTest(plot_AUROC_df_train_grouped, plot_AUROC_df_test_grouped, results_directory):
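    """For each iteration ('Count' group), plot the training and the testing ROC
    curves via plot_ROC and save them into results_directory."""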
# Plot each group separately
for counter_iter, group_train in plot_AUROC_df_train_grouped:
print("Counter iter is: ", counter_iter)
print("Counter iter is: ", group_train)
group_test = plot_AUROC_df_test_grouped.get_group(counter_iter)
print("Group test AUROC: ", group_test['AUROC'])
title_name_train = f'ROC Curve for Training Data'
save_name_train = results_directory + str(counter_iter) + '_training_holdout'
plot_ROC(group_train, title_name_train, save_name_train)
title_name_test = f'ROC Curve for Testing Data'
save_name_test = results_directory + str(counter_iter) + '_testing_holdout'
plot_ROC(group_test, title_name_test, save_name_test)
##########################################################################
# Violin plot for the variation of the % of correct predictions #
# of the best average performing model with HeatmapTop10 and CV feats #
##########################################################################
def plotViolin (plot_info_df, results_directory):
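    """Violin plots of the percentage of correct sepsis predictions per onset-time
    category, for the HeatmapTop10 and CV10 feature sets; saves both figures and
    the underlying data as CSV files in results_directory."""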
    heatmap_df = plot_info_df[plot_info_df['Feature Selection Method'] == 'HeatmapTop10'].copy()
# Calculate percentage of correct values for each row
heatmap_df['Percentage Correct'] = (heatmap_df['Correct'] / heatmap_df['Total']) * 100
# print("This is heatmap_df: _______________________________", heatmap_df)
unique_values = heatmap_df['Percentage Correct'].unique()
print("Unique values in 'Percentage Correct' column:", unique_values)
heatmap_df['Percentage Correct'] = pd.to_numeric(heatmap_df['Percentage Correct'], errors='coerce')
colors_violin = ['#8c95c5', '#4d004b', '#b6cde2']
sns.set_palette(sns.color_palette(colors_violin))
# Create violin plot
plt.figure(figsize=(10, 6))
    sns.violinplot(data = heatmap_df, x = 'time_categories', y = 'Percentage Correct', palette = colors_violin)
plt.title('Violin Plot - HeatmapTop10 Features')
plt.xlabel('Time Categories')
plt.ylabel('Percentage Correct Sepsis Predictions')
plt.savefig(results_directory + "violin_plot_heatmap10.png", dpi = 600)
plt.show()
heatmap_df.to_csv(results_directory + "heatmap_df_violinData.csv")
##### Violin plot CV10
    cv10_df_violin = plot_info_df[plot_info_df['Feature Selection Method'] == 'cv10_FeatSel'].copy()
# Calculate percentage of correct values for each row
cv10_df_violin['Percentage Correct'] = (cv10_df_violin['Correct'] / cv10_df_violin['Total']) * 100
unique_values_CV10 = cv10_df_violin['Percentage Correct'].unique()
cv10_df_violin['Percentage Correct'] = pd.to_numeric(cv10_df_violin['Percentage Correct'], errors='coerce')
colors_violin = ['#8c95c5', '#4d004b', '#b6cde2']
sns.set_palette(sns.color_palette(colors_violin))
# Create violin plot
plt.figure(figsize=(10, 6))
    sns.violinplot(data = cv10_df_violin, x = 'time_categories', y = 'Percentage Correct', palette = colors_violin)
plt.title('Violin Plot - CV10 Features')
plt.xlabel('Time Categories')
plt.ylabel('Percentage Correct Sepsis Predictions')
plt.savefig(results_directory + "violin_plot_CV10.png", dpi = 600)
plt.show()
cv10_df_violin.to_csv(results_directory + "cv10_df_violinData.csv")
##########################################################################
# AUROC and AUPRC values with their respective standard deviations after #
# taking into accounts all the iterations (here 20 (0, 19)) #
##########################################################################
def print_AUROCandAUPRC_andSTD (results_dict):
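    """Print mean and standard deviation of AUROC and AUPRC, grouped by feature set,
    for the LogisticRegression entries of results_dict (built in trainModels_andTest
    with keys 'Model', 'Features', 'AUROC', 'AUPRC', 'Precision', 'Recall')."""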
logistic_results = {k: [] for k in results_dict.keys()}
for i, model in enumerate(results_dict['Model']):
if 'LogisticRegression' in str(model):
for key, value in results_dict.items():
logistic_results[key].append(value[i])
# Group by features
grouped_results = {}
for model, features, aurocs in zip(logistic_results['Model'], logistic_results['Features'], logistic_results['AUROC']):
features_tuple = tuple(features) # Convert list to tuple
if features_tuple not in grouped_results:
grouped_results[features_tuple] = []
grouped_results[features_tuple].append(aurocs)
# Compute standard deviation for each feature set
mean_stddev_dict = {}
for features, aurocs in grouped_results.items():
mean_stddev_dict[features] = {
'mean': np.mean(aurocs),
'stddev': np.std(aurocs)
}
# Print mean and standard deviation for each feature set
print("Mean and standard deviation of AUROC for models containing 'LogisticRegression' grouped by features:")
for features, values in mean_stddev_dict.items():
print(f"Features: {features}, Mean AUROC: {values['mean']}, Stddev: {values['stddev']}")
### AUPRC mean and stddev
print("")
logistic_results = {k: [] for k in results_dict.keys()}
for i, model in enumerate(results_dict['Model']):
if 'LogisticRegression' in str(model):
for key, value in results_dict.items():
logistic_results[key].append(value[i])
# Group by features
grouped_results = {}
for model, features, aurocs in zip(logistic_results['Model'], logistic_results['Features'], logistic_results['AUPRC']):
features_tuple = tuple(features) # Convert list to tuple
if features_tuple not in grouped_results:
grouped_results[features_tuple] = []
grouped_results[features_tuple].append(aurocs)
# Compute standard deviation for each feature set
mean_stddev_dict = {}
for features, aurocs in grouped_results.items():
mean_stddev_dict[features] = {
'mean': np.mean(aurocs),
'stddev': np.std(aurocs)
}
# Print mean and standard deviation for each feature set
print("Mean and standard deviation of AUPRC for models containing 'LogisticRegression' grouped by features:")
for features, values in mean_stddev_dict.items():
print(f"Features: {features}, Mean AUPRC: {values['mean']}, Stddev: {values['stddev']}")
#%% Retrieve data and make it usable
def getData(data_path, CV_nr):
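    """Load the CV results, the heatmap feature ranking, the main data set and the
    onset times, then return the feature matrix X, the labels, the three candidate
    feature sets (HeatmapTop10, CV10 best, all features), the onset data and the
    name of the best average model from the CV."""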
# read the all CVs to extract the avg perf model ^ best score
cv_resPath = './resultsAllCVs_pipeline_' + str(CV_nr) + '__split_' + str(CV_nr) + '.csv' #'C:/Users/aa36.MEDMA/Desktop/ML_paper/Restructured_withConfusionMatrix_Balanced/resultsAllCVs_pipeline_10__split_10.csv'
data_CV = pd.read_csv(cv_resPath, encoding='latin-1', sep='~')
# read heatmap cv to get the features
heatmap_resPath = './final_stratif.csv'
data_heatmap = pd.read_csv(heatmap_resPath, encoding='latin-1', sep=',')
data = pd.read_csv(data_path, encoding='latin-1', sep='~')
# print(data.columns.values)
onset_time_path = 'C:/Users/aa36.MEDMA/Desktop/ML_paper/fbentriesProgV3.csv'
data_onset = pd.read_csv(onset_time_path, encoding='latin-1', sep='~')
heatmap_featSel = heatmap_featureSelection(data_heatmap)
print("\nTop 10 most selected features from the CV, as seen in the heatmap:", heatmap_featSel)
cv_featSel = cv_featureSelection (data_CV) #"['c_gender', 'c_vor_diab', 'c_vor_herz' ,'c_vor_atem' ,'c_vor_alko','c_vor_smok', 'c_vor_kidn' ,'c_vor_canc', 'c_ek', 'c_pct', 'c_mechventil','c_dialyse', 'c_ecmo_pecla', 'c_picco' ,'o_sofa_resp', 'o_sofa_cardio','o_sofa_coag' ,'o_sofa_renal', 'o_sofa_liver','n_alter', 'n_kat', 'n_sapsii','n_bddia' ,'n_bdmit', 'n_bdsys', 'n_herzfr', 'n_temp', 'n_ph', 'n_po2' ,'n_pco2','n_fio2pro' ,'n_sbe', 'n_balance', 'n_laktat', 'n_hb' ,'n_blutz', 'n_calcium','n_kalium' ,'n_leuko' ,'n_thrombo' ,'n_bili', 'n_inr' ,'n_ptt' ,'n_ery', 'n_hct','n_crp', 'n_krea' ,'n_harn' ,'n_sofa_total' ,'n_meanlambda' ,'n_delta', 'n_c']"
#cv_featureSelection (data_CV)
print("\nTop 10 most selected features from the CV, based on best AUPRC:", cv_featSel)
best_avg_model = cv_bestAverageModel (data_CV)
# aurocs_averages = cv_bestAverageModel_AUROC_Table(data_CV)
# auprcs_averages_manuscriptTable = cv_bestAverageModel (data_CV)
    ## Note: using the feature set from the fold with the best AUPRC does not
    ## guarantee that the best average model will also perform well with it on the
    ## hold-out data; however, the model was trained on those features in that fold,
    ## even though that fold is specific to one particular CV split.
#%% Retrain avg_best_model on the 90% data
# data split
data2 = data[['c_gender', 'c_vor_diab', 'c_vor_herz' ,'c_vor_atem' ,'c_vor_alko',
'c_vor_smok', 'c_vor_kidn' ,'c_vor_canc', 'c_ek', 'c_pct', 'c_mechventil',
'c_dialyse', 'c_ecmo_pecla', 'c_picco' ,'o_sofa_resp', 'o_sofa_cardio',
'o_sofa_coag' ,'o_sofa_renal', 'o_sofa_liver','n_alter', 'n_kat', 'n_sapsii',
'n_bddia' ,'n_bdmit', 'n_bdsys', 'n_herzfr', 'n_temp', 'n_ph', 'n_po2' ,'n_pco2',
'n_fio2pro' ,'n_sbe', 'n_balance', 'n_laktat', 'n_hb' ,'n_blutz', 'n_calcium',
'n_kalium' ,'n_leuko' ,'n_thrombo' ,'n_bili', 'n_inr' ,'n_ptt' ,'n_ery', 'n_hct',
'n_crp', 'n_krea' ,'n_harn' ,'n_sofa_total' ,'n_meanlambda' ,'n_delta', 'n_c']].copy()
y_toSplit = data['event']
X = data2
featureSelection_options_str = [heatmap_featSel, cv_featSel]
featureSelection_options = [ast.literal_eval(s) for s in featureSelection_options_str]
print(featureSelection_options)
featureSelection_options.append(['c_gender', 'c_vor_diab', 'c_vor_herz' ,'c_vor_atem' ,'c_vor_alko',
'c_vor_smok', 'c_vor_kidn' ,'c_vor_canc', 'c_ek', 'c_pct', 'c_mechventil',
'c_dialyse', 'c_ecmo_pecla', 'c_picco' ,'o_sofa_resp', 'o_sofa_cardio',
'o_sofa_coag' ,'o_sofa_renal', 'o_sofa_liver','n_alter', 'n_kat', 'n_sapsii',
'n_bddia' ,'n_bdmit', 'n_bdsys', 'n_herzfr', 'n_temp', 'n_ph', 'n_po2' ,'n_pco2',
'n_fio2pro' ,'n_sbe', 'n_balance', 'n_laktat', 'n_hb' ,'n_blutz', 'n_calcium',
'n_kalium' ,'n_leuko' ,'n_thrombo' ,'n_bili', 'n_inr' ,'n_ptt' ,'n_ery', 'n_hct',
'n_crp', 'n_krea' ,'n_harn' ,'n_sofa_total' ,'n_meanlambda' ,'n_delta', 'n_c'])
return X, y_toSplit, featureSelection_options, data_onset, best_avg_model
#%% Train models and use other functions
def trainModels_andTest(X, y_toSplit, featureSelection_options, data_onset, best_avg_model, number_ofIterations):
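    """Retrain the best average CV model on the 90% training split (undersampled to
    a 1:1 class ratio) for each candidate feature set and each iteration, evaluate it
    on the 10% hold-out split, plot training/testing ROC curves and violin plots,
    add dummy-classifier baselines, and print the aggregated AUROC/AUPRC statistics."""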
results_dir = "./Results_iterationPlots/"
if not os.path.exists(results_dir):
os.makedirs(results_dir)
print("....Created results directory...")
iteration = ['HeatmapTop10', 'cv10_FeatSel', 'AllFeatures']
count = 0
results_dict = {'Model': [], 'Features': [], 'AUROC': [], 'AUPRC': [], 'Precision': [], 'Recall': []}
plot_info_df = pd.DataFrame(columns=['Iteration Counter', 'Feature Selection Method', 'time_categories', 'Total', 'Correct', 'Incorrect'])
plot_AUROC_df_train = pd.DataFrame(columns=['Count', 'Iteration Counter', 'Feature Selection Method', 'AUROC', 'FPR', 'TPR'])
plot_AUROC_df_test = pd.DataFrame(columns=['Count','Iteration Counter', 'Feature Selection Method', 'AUROC', 'FPR', 'TPR'])
undersample = RandomUnderSampler(sampling_strategy=1)
X_train_unscaled_imbalanced, X_test_unscaled, y_train_imbalanced, y_test = train_test_split(X, y_toSplit,
stratify=y_toSplit,
test_size=0.1 ,
random_state = 1)
print("Cases and controls hold-out aka test data: \n", y_test.value_counts())
print("Cases and controls training data: \n", y_train_imbalanced.value_counts())
subjects_index_with_sepsis = y_test[y_test == 1].index
# Filter based on the index values where y_test is equal to 1, because onset can be only for sepsis (thus 1)
onset_days_arr = data_onset.loc[subjects_index_with_sepsis, 'n_onset_days']
onset_array = onset_days_arr
    # reset_index returns a new Series, so the result has to be assigned back
    y_test = y_test.reset_index(drop=True)
for counter_iter in range (0, number_ofIterations):
count = 0
plot_number = results_dir + str(counter_iter)
for features in featureSelection_options:
print("-----------------------> Using the features from ", iteration[count] )
X_toScale = X_train_unscaled_imbalanced[features].copy()
X_test_featsGood = X_test_unscaled[features].copy()
print("Currently working with: ", X_toScale)
X_train_unscaled, y_train = undersample.fit_resample(X_toScale, y_train_imbalanced)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
            # transform (not fit_transform): the test set must be scaled with the
            # statistics learned from the training data
            X_test = scaler.transform(X_test_featsGood)
print("Cases and controls training data balanced: \n", y_train.value_counts())
model, auroc_model, fpr_model, tpr_model, auprc_model, precision_model, recall_model, plot_info = ml_model_cm (best_avg_model, X_train, y_train, X_test, y_test, iteration[count], onset_array, plot_number)
results_dict['Model'].append(model)
results_dict['Features'].append(features)
results_dict['AUROC'].append(auroc_model)
results_dict['AUPRC'].append(auprc_model)
results_dict['Precision'].append(precision_model)
results_dict['Recall'].append(recall_model)
plot_info = plot_info.reset_index()
# print("PLOTTING INFORMATIONNNNNNNNNNN: ", plot_info)
plot_info['Iteration Counter'] = counter_iter
plot_info['Feature Selection Method'] = iteration[count]
# Append the current iteration's plot_info to the main plot_info DataFrame
plot_info_df = pd.concat([plot_info_df, plot_info], ignore_index=True)
            # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat instead
            plot_AUROC_df_test = pd.concat([plot_AUROC_df_test,
                                            pd.DataFrame([{'Count': counter_iter,
                                                           'Iteration Counter': iteration[count],
                                                           'Feature Selection Method': features,
                                                           'AUROC': auroc_model,
                                                           'FPR': fpr_model,
                                                           'TPR': tpr_model}])],
                                           ignore_index=True)
predictions_model_train = model.predict(X_train)
model_Grid_probabilities_train = model.predict_proba(X_train)
model_probabilities_train = model_Grid_probabilities_train[:,1]
auc_train, fpr_train, tpr_train, auprc_train, precision_train, recall_train = metrics_model(y_train, model_probabilities_train, predictions_model_train, model)
            plot_AUROC_df_train = pd.concat([plot_AUROC_df_train,
                                             pd.DataFrame([{'Count': counter_iter,
                                                            'Iteration Counter': iteration[count],
                                                            'Feature Selection Method': features,
                                                            'AUROC': auc_train,
                                                            'FPR': fpr_train,
                                                            'TPR': tpr_train}])],
                                            ignore_index=True)
# Append the current iteration's plot_info to the main plot_info dataframe
# plot_info_df = pd.concat([plot_info_df, iteration_plot_info_df], ignore_index=True)
count = count + 1
title_name_train = 'ROC Curve for Training Data'
title_name_test = 'ROC Curve for Testing Data'
save_name_train = results_dir + str(plot_number) + "training_holdout" + ".png" # plot number is actually the iteration number to be used in saving the plot
save_name_test = results_dir + str(plot_number) + "testing_holdout.png"
plot_AUROC_df_train_grouped = plot_AUROC_df_train.groupby('Count')
plot_AUROC_df_test_grouped = plot_AUROC_df_test.groupby('Count')
# Plot each group separately
plotAUROC_trainAndTest (plot_AUROC_df_train_grouped, plot_AUROC_df_test_grouped, results_dir)
#%% Dummies: majority, minority, stratified
model_dummy_majority, auroc_dummy_majority, fpr_dummy_majority, tpr_dummy_majority, auprc_dummy_majority, precision_dummy_majority, recall_dummy_majority = custom.dummy_clf_majority0 (X_train, y_train, X_test, y_test)#, True, "Dummy_majority_" + iteration[count-1])
results_dict['Model'].append(model_dummy_majority)
results_dict['Features'].append(featureSelection_options[1])
results_dict['AUROC'].append(auroc_dummy_majority)
results_dict['AUPRC'].append(auprc_dummy_majority)
results_dict['Precision'].append(precision_dummy_majority)
results_dict['Recall'].append(recall_dummy_majority)
model_dummy_minority, auroc_dummy_minority, fpr_dummy_minority, tpr_dummy_minority, auprc_dummy_minority, precision_dummy_minority, recall_dummy_minority = custom.dummy_clf_minority1(X_train, y_train, X_test, y_test)#, True, "Dummy_minority_" + iteration[count-1])
results_dict['Model'].append(model_dummy_minority)
results_dict['Features'].append(featureSelection_options[1])
    results_dict['AUROC'].append(auroc_dummy_minority)
results_dict['AUPRC'].append(auprc_dummy_minority)
results_dict['Precision'].append(precision_dummy_minority)
results_dict['Recall'].append(recall_dummy_minority)
model_dummy_stratif, auroc_dummy_stratif, fpr_dummy_stratif, tpr_dummy_stratif, auprc_dummy_stratif, precision_dummy_stratif, recall_dummy_stratif = custom.dummy_clf(X_train, y_train, X_test, y_test)#, True, "Dummy_stratif_" + iteration[count-1])
results_dict['Model'].append(model_dummy_stratif)
results_dict['Features'].append(featureSelection_options[1])
    results_dict['AUROC'].append(auroc_dummy_stratif)
results_dict['AUPRC'].append(auprc_dummy_stratif)
results_dict['Precision'].append(precision_dummy_stratif)
results_dict['Recall'].append(recall_dummy_stratif)
#%% Results
print(results_dict)
plotViolin (plot_info_df, results_dir)
print_AUROCandAUPRC_andSTD (results_dict)
#%% Wrap this .py script
def wrapAdvancedAnalysis (data_path, CV_nr, number_ofIterations):
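    """Entry point for this script: load the data and CV results via getData, then
    run the hold-out retraining and evaluation for number_ofIterations iterations."""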
X, y_toSplit, featureSelection_options, data_onset, best_avg_model = getData(data_path, CV_nr)
trainModels_andTest(X, y_toSplit, featureSelection_options, data_onset, best_avg_model, number_ofIterations)
# Example call (path and argument values are illustrative):
# data_path = 'C:/Users/aa36.MEDMA/Desktop/Franzi/CC_QtJune/New_Bianka/fbentriesProgV2.csv'
# wrapAdvancedAnalysis(data_path, CV_nr=10, number_ofIterations=20)