--- a
+++ b/Methods_utils/methods_heatmap.py
@@ -0,0 +1,274 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jan 29 14:15:21 2024
+
+@author: Asus
+"""
+import pandas as pd
+import numpy as np
+import seaborn as sns
+
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+import time
+import matplotlib.pyplot as plt
+
+from collections import Counter
+
+
+#%% heatmap_oneFeatureSelectionCV(featSel_folds, save_name)
+## save_name should be experim + iteration_max (aka CV) + foldName
+def heatmap_oneFeatureSelectionCV(featSel_folds, save_name):
+    # Flatten the per-fold lists of selected features and get the unique feature names
+    flat_data = [item for sublist in featSel_folds for item in sublist]
+    unique_elements = np.unique(flat_data)
+    print("Entered heatmap_oneFeatureSelectionCV and this is the save name: ", save_name)
+
+    # For each feature, store its count in every fold plus its overall total
+    counts_dict = {element: [list(sublist).count(element) for sublist in featSel_folds] + [flat_data.count(element)]
+                   for element in unique_elements}
+
+    # Create a DataFrame for seaborn: one row per fold plus a 'Total' row
+    df = pd.DataFrame(counts_dict, index=[f"Array {i}" for i in range(1, len(featSel_folds) + 1)] + ['Total'])
+    name_to_save = "Heatmap_" + save_name + ".png"
+
+    # Create a heatmap using seaborn
+    plt.figure(figsize=(10, 7))
+    sns.heatmap(df, cmap='viridis', annot=True, fmt="d", cbar=True)
+    plt.xlabel('Element Name')
+    plt.ylabel('Array Index')
+    plt.title('Heatmap of Element Counts in Arrays')
+    plt.savefig(name_to_save)
+    plt.show()
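+
+#%% Example usage for heatmap_oneFeatureSelectionCV (illustrative sketch only):
+## the fold lists and the save name below are made-up placeholders, not real experiment data.
+# demo_folds = [['age', 'bmi', 'crp'],
+#               ['age', 'crp', 'glucose'],
+#               ['bmi', 'crp', 'sex']]
+# heatmap_oneFeatureSelectionCV(demo_folds, "demo_experim_CV3")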
+
+## we don't include lasso here because it never changes
+def original_heatmap(*args):
+    save_name, shap_folds, rf_folds, xgb_folds, ridge_folds, logistic_folds = args
+    print("Entered original_heatmap and this is the save name: ", save_name)
+
+    # Flatten the per-fold lists of selected features and get the unique feature names per method
+    flat_data_rf = [item for sublist in rf_folds for item in sublist]
+    flat_data_xgb = [item for sublist in xgb_folds for item in sublist]
+    flat_data_ridge = [item for sublist in ridge_folds for item in sublist]
+    flat_data_logistic = [item for sublist in logistic_folds for item in sublist]
+    flat_data_shap = [item for sublist in shap_folds for item in sublist]
+
+    unique_elements_rf = np.unique(flat_data_rf)
+    unique_elements_xgb = np.unique(flat_data_xgb)
+    unique_elements_ridge = np.unique(flat_data_ridge)
+    unique_elements_logistic = np.unique(flat_data_logistic)
+    unique_elements_shap = np.unique(flat_data_shap)
+
+    # Count, per method, how many folds selected each feature
+    counts_dict_rf = {element: flat_data_rf.count(element) for element in unique_elements_rf}
+    counts_dict_xgb = {element: flat_data_xgb.count(element) for element in unique_elements_xgb}
+    counts_dict_ridge = {element: flat_data_ridge.count(element) for element in unique_elements_ridge}
+    counts_dict_logistic = {element: flat_data_logistic.count(element) for element in unique_elements_logistic}
+    counts_dict_shap = {element: flat_data_shap.count(element) for element in unique_elements_shap}
+
+    # Create one DataFrame per method
+    df_rf = pd.DataFrame(list(counts_dict_rf.items()), columns=['Element', 'Total RF'])
+    df_xgb = pd.DataFrame(list(counts_dict_xgb.items()), columns=['Element', 'Total XGB'])
+    df_ridge = pd.DataFrame(list(counts_dict_ridge.items()), columns=['Element', 'Total Ridge'])
+    df_logistic = pd.DataFrame(list(counts_dict_logistic.items()), columns=['Element', 'Total Logistic'])
+    df_shap = pd.DataFrame(list(counts_dict_shap.items()), columns=['Element', 'Total Shap'])
+
+    # Set 'Element' as the index of each DataFrame
+    df_rf.set_index('Element', inplace=True)
+    df_xgb.set_index('Element', inplace=True)
+    df_ridge.set_index('Element', inplace=True)
+    df_logistic.set_index('Element', inplace=True)
+    df_shap.set_index('Element', inplace=True)
+
+    # Combine the DataFrames
+    df_combined = pd.concat([df_shap, df_rf, df_xgb, df_ridge, df_logistic], axis=1)
+
+    # Replace missing counts with 0 and convert the values to integers
+    df_combined = df_combined.fillna(0)
+    df_combined = df_combined.astype(int)
+
+    # Total prevalence of each feature across all methods, sorted descending
+    total_totals = df_combined.sum(axis=1)
+    total_totals = total_totals.sort_values(ascending=False)
+    print("Number of features selected at least once: ", total_totals.size)
+
+    top_10_absolute = total_totals.nlargest(10)
+
+    # Print the absolute top 10 most selected features
+    print("Absolute top 10 most selected features:")
+    print(top_10_absolute)
+    top10_ever = top_10_absolute.index
+    top10_ever_list = top10_ever.tolist()
+
+    plt.figure(figsize=(10, 18))
+    ax = sns.heatmap(pd.DataFrame(total_totals, columns=['Total']), cmap=plt.cm.BuPu,
+                     annot=True, fmt="d", cbar=True, annot_kws={"size": 18})
+
+    plt.xticks(fontsize=18)  # X-axis ticks
+    plt.yticks(fontsize=18)  # Y-axis ticks
+
+    cbar = ax.collections[0].colorbar
+    cbar.ax.tick_params(labelsize=16)
+
+    ax.set_xticklabels([])  # remove the 'Total' tick label so only 'Prevalence' appears under the plot
+
+    plt.ylabel('Feature Name', fontsize=18)
+    plt.xlabel('Prevalence', fontsize=18)
+
+    plt.title('Heatmap of Feature Selection Prevalence', fontsize=20)
+    plt.savefig("Heatmap of features " + save_name + ".png", bbox_inches='tight', dpi=600)
+    plt.show()
+
+    return top10_ever_list
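+
+#%% Example usage for original_heatmap (illustrative sketch only): each list holds the
+## features one selector picked in each CV fold; the names and save name are made-up placeholders.
+# demo_shap = [['age', 'crp'], ['age', 'bmi']]
+# demo_rf = [['age', 'crp'], ['crp', 'sex']]
+# demo_xgb = [['bmi', 'crp'], ['age', 'crp']]
+# demo_ridge = [['age', 'sex'], ['age', 'crp']]
+# demo_logistic = [['crp', 'sex'], ['bmi', 'crp']]
+# top10 = original_heatmap("demo_experim_CV2", demo_shap, demo_rf, demo_xgb,
+#                          demo_ridge, demo_logistic)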
+
+# #%%
+# def heatmaps_allFeatureSelectionsCV (*args):
+#     x = 10
+#     arrays, arra2 = args
+#     nr_models = len(auc_score)
+#     count = 0
+#     while count < nr_models:
+#         auc = auc_score[count]
+#         plt.plot(fpr[count], tpr[count], linestyle = '-', label = model[count] + ' AUROC ' + str(auc))
+#         count = count + 1
+
+#     plt.title("ROC AUC plot")
+#     plt.xlabel("False Positive Rate (FPR)")
+#     plt.ylabel("True Positive Rate (TPR)")
+
+#     plt.legend()
+#     plt.savefig("AUC ROC" + experim + "baseline_allFeats_models.png")
+#     plt.show()
+
+# def plot_auprc_models (*args):
+
+#     recall, precision, auprc_score, model, experim = args
+#     nr_models = len(auprc_score)
+#     count = 0
+#     while count < nr_models:
+#         auprc = auprc_score[count]
+#         plt.plot(recall[count], precision[count], linestyle = '-', label = model[count] + ' AUPRC ' + str(auprc))
+#         count = count + 1
+
+#     plt.title("AUPRC plot")
+#     plt.xlabel("Recall (Sensitivity, TPR)")
+#     plt.ylabel("Precision (PPV)")
+
+#     plt.legend()
+#     plt.savefig("AUPRC" + experim + "baseline_allFeats_models.png")
+#     plt.show()
+
+
+# def feat_imp_xgb(model_xgb, names):
+#     xgb_feat_imp = model_xgb.feature_importances_
+#     # print(xgb_feat_imp)
+
+#     res_xgb = {}
+
+#     for i in range(0, len(xgb_feat_imp)):
+#         res_xgb[names[i]] = xgb_feat_imp[i]
+
+#     print(" ----------------------------------------------------------------------- ")
+#     # All features sorted by their XGBoost importance
+#     sorted_res_xgb = dict(sorted(res_xgb.items(), key=lambda item: item[1], reverse = True))
+
+#     # Select up to 10 XGBoost features with importance >= 0.01
+#     selected_xgb = {}
+#     count_xgb = 1
+#     for key, value in sorted_res_xgb.items():
+#         if value >= 0.01 and count_xgb <= 10:
+#             selected_xgb[key] = value
+#             count_xgb = count_xgb + 1
+#     # print(selected_xgb)
+#     # print(selected_xgb.keys())
+
+#     keys = [k for k, v in selected_xgb.items()]
+
+#     return keys
+
+# def feat_imp_ridge(model_ridge, names):
+#     coefficients = model_ridge.coef_[0]
+
+#     feature_importance_ridge = pd.DataFrame({'Feature': names, 'Importance': np.abs(coefficients)})
+#     feature_importance_ridge = feature_importance_ridge.sort_values('Importance', ascending=False)
+#     # feature_importance_ridge.plot(x='Feature', y='Importance', kind='barh', figsize=(10, 6))
+
+#     # Keep the 10 strongest features with absolute coefficient > 0.1
+#     feature_importance_ridge_arr = feature_importance_ridge.query('Importance > 0.1')['Feature'].values
+#     print(feature_importance_ridge_arr[0:10])
+#     keys = feature_importance_ridge_arr[0:10].tolist()
+
+#     return keys
+
+
+# def feat_imp_logistic(model_logistic, names):
+#     coefficients = model_logistic.coef_[0]
+
+#     feature_importance_logistic = pd.DataFrame({'Feature': names, 'Importance': np.abs(coefficients)})
+#     feature_importance_logistic = feature_importance_logistic.sort_values('Importance', ascending=False)
+#     # feature_importance_logistic.plot(x='Feature', y='Importance', kind='barh', figsize=(10, 6))
+#     # print(feature_importance_logistic)
+
+#     # Keep the 10 strongest features with absolute coefficient > 0.1
+#     feature_importance_logistic_arr = feature_importance_logistic.query('Importance > 0.1')['Feature'].values
+#     keys = feature_importance_logistic_arr[0:10].tolist()
+#     print(keys)
+
+#     return keys
+
+# def feat_imp_shap(model, names, kind, subset):  # takes a generic model because the model type may change every call
+#     # Choose the explainer type based on the parsed 'kind' string (e.g. TreeExplainer, LinearExplainer).
+#     # 'subset' lets us explain either the training or the test data; in practice it is more
+#     # useful to see what the model does on the test data.
+#     if kind == 'rf' or kind == 'random forest' or kind == 'svm':
+#         explainer = shap.KernelExplainer(model.predict, subset)
+#         shap_values = explainer.shap_values(subset, check_additivity=False)
+#         print(shap_values)
+#         # Get top 10 features based on mean absolute SHAP values
+#         vals = np.abs(shap_values).mean(axis=0)
+#         top_10_features_indices = np.argsort(vals)[::-1][:10]
+#         top_10_features = names[top_10_features_indices]
+#         return top_10_features.tolist()
+
+#     elif kind == 'xgb':
+#         explainer = shap.TreeExplainer(model, subset)
+
+#     elif kind == 'linear':
+#         explainer = shap.LinearExplainer(model, subset)
+
+#     # Calculate SHAP values for the given subset
+#     shap_values = explainer.shap_values(subset)
+
+#     # Get top 10 features based on mean absolute SHAP values
+#     vals = np.abs(shap_values).mean(axis=0)
+#     top_10_features_indices = np.argsort(vals)[::-1][:10]
+#     top_10_features = names[top_10_features_indices]
+
+#     # Create a DataFrame with the SHAP values and the top 10 features
+#     shap_df = pd.DataFrame(shap_values, columns=names)
+#     # shap_df['target'] = y_train  # assuming 'target' is the target variable
+#     shap_df['abs_shap_values_mean'] = np.abs(shap_values).mean(axis=1)
+
+#     # Keep only the mean absolute SHAP column and the top 10 features
+#     # shap_df_top_10 = shap_df[['target', 'abs_shap_values_mean'] + top_10_features.tolist()]
+#     shap_df_top_10 = shap_df[['abs_shap_values_mean'] + top_10_features.tolist()]
+
+#     # Display the DataFrame with the top 10 features
+#     print(shap_df_top_10.head())
+
+#     return top_10_features.tolist()
+
+# #%% How to combine the feat imp into a pd df
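+
+## One possible sketch for the TODO above (not part of the current pipeline): combine the
+## per-method top-feature lists returned by the feat_imp_* helpers into a single DataFrame
+## of selection indicators. The function and variable names below are illustrative placeholders.
+# def combine_feat_imp(feat_lists, method_names):
+#     # feat_lists: one list of selected feature names per method
+#     # method_names: labels such as ['shap', 'rf', 'xgb', 'ridge', 'logistic']
+#     all_feats = sorted(set(f for lst in feat_lists for f in lst))
+#     data = {m: [int(f in lst) for f in all_feats] for m, lst in zip(method_names, feat_lists)}
+#     df = pd.DataFrame(data, index=all_feats)
+#     df['Total'] = df.sum(axis=1)
+#     return df.sort_values('Total', ascending=False)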