# src/iterpretability/experiments/experiments_base.py

from pathlib import Path
import os
import time

import catenets.models as cate_models
from catenets.models.diffpo import DiffPOLearner
import numpy as np
import pandas as pd
import wandb
import shap
from PIL import Image

import src.iterpretability.logger as log
from src.plotting import (
    plot_results_datasets_compare,
    plot_expertise_metrics,
    plot_performance_metrics,
    plot_performance_metrics_f_cf,
)
from src.iterpretability.explain import Explainer
from src.iterpretability.datasets.data_loader import load
from src.iterpretability.simulators import (
    SimulatorBase,
    TYSimulator,
    TSimulator,
)
from src.iterpretability.utils import (
    attribution_accuracy,
)

# Metrics and baseline models (also used for contour plotting)
from sklearn.metrics import mean_squared_error, roc_auc_score, f1_score, auc
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, LogisticRegressionCV, LassoCV
from sklearn.model_selection import LeaveOneOut

# Hydra for configuration
from omegaconf import DictConfig, OmegaConf
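
# Usage sketch (assumption, not part of this file): experiments are expected to be
# driven by a Hydra config whose fields match the attribute accesses below
# (cfg.seeds, cfg.n_splits, cfg.simulator.*, cfg.model_names, ...), e.g.
#
#   @hydra.main(config_path="../../configs", config_name="config")
#   def main(cfg: DictConfig) -> None:
#       exp = SomeExperiment(cfg)  # SomeExperiment subclasses ExperimentBase
#       exp.run()                  # hypothetical subclass entry point
#
# "SomeExperiment" and "run" are illustrative names only.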

class ExperimentBase:
    """
    Base class for all experiments.
    """
    def __init__(self, cfg: DictConfig) -> None:
        # Store configuration
        self.cfg = cfg

        # General experiment settings
        self.seeds = cfg.seeds
        self.n_splits = cfg.n_splits
        self.experiment_name = cfg.experiment_name
        self.simulation_type = cfg.simulator.simulation_type
        self.evaluate_inference = cfg.evaluate_inference

        # Load data
        if self.simulation_type == "TY":
            self.X, self.feature_names = load(cfg.dataset,
                                              train_ratio=1,
                                              debug=cfg.debug,
                                              sim_type=self.simulation_type,
                                              directory_path_=cfg.directory_path + '/',
                                              repo_path=cfg.repo_path,
                                              n_samples=cfg.n_samples)
            self.outcomes = None
            # Currently the only supported discrete outcome is binary
            self.discrete_outcome = cfg.simulator.num_binary_outcome == cfg.simulator.dim_Y

        elif self.simulation_type == "T":
            self.X, self.outcomes, self.feature_names = load(cfg.dataset,
                                                             train_ratio=1,
                                                             debug=cfg.debug,
                                                             directory_path_=cfg.directory_path + '/',
                                                             repo_path=cfg.repo_path,
                                                             sim_type=self.simulation_type)

            # Outcomes are treated as discrete when binary outcomes are configured
            self.discrete_outcome = cfg.simulator.num_binary_outcome

            cfg.simulator.num_T = self.outcomes.shape[1]

        else:
            raise ValueError(f"Simulation type {self.cfg.simulator.simulation_type} not supported.")

        # Initialize learners
        self.discrete_treatment = True  # The simulation currently only supports discrete treatments

        # Results path and directories
        self.file_name = (
            f"{self.cfg.results_dictionary_prefix}_{self.cfg.dataset}"
            f"_T{self.cfg.simulator.num_T}_Y{self.cfg.simulator.dim_Y}"
            f"_{self.cfg.simulator.simulation_type}sim"
            f"_numbin{self.cfg.simulator.num_binary_outcome}"
        )
        self.results_path = Path(self.cfg.results_path) / Path(self.cfg.experiment_name) / Path(self.file_name)

        if not self.results_path.exists():
            self.results_path.mkdir(parents=True, exist_ok=True)

        # Variables set by some or all experiments
        self.learners = None
        self.baseline_learners = None
        self.pred_learners = None
        self.prog_learner = None
        self.select_learner = None
        self.explanations = None
        self.all_important_features = None
        self.pred_features = None
        self.prog_features = None
        self.select_features = None
        self.true_num_swaps = None
        self.swap_counter = None
        self.training_times = {}

    def get_learners(self,
                     num_features: int,
                     seed: int = 123) -> dict:
        """
        Get learners for the experiment, based on the configuration settings.
        """
        if self.cfg.simulator.dim_Y > 1 and "Torch_XLearner" in self.cfg.model_names:
            raise ValueError("Torch_XLearner only supports one outcome dimension.")
        elif self.cfg.simulator.dim_Y > 1 and "EconML_SparseLinearDRLearner" in self.cfg.model_names:
            raise ValueError("EconML_SparseLinearDRLearner only supports one outcome dimension.")
        # Torch and DiffPO models only support binary treatments
        if self.cfg.simulator.num_T > 2 and ("Torch" in self.cfg.model_names or "DiffPOLearner" in self.cfg.model_names):
            raise ValueError("Torch and DiffPO models do not support more than two treatments.")

        binary_y = self.discrete_outcome and self.cfg.simulator.num_binary_outcome == 1

        # All EconML estimators share the same constructor signature
        econml_names = [
            "EconML_CausalForestDML",
            "EconML_DML",
            "EconML_DRLearner",
            "EconML_DMLOrthoForest",
            "EconML_DROrthoForest",
            "EconML_ForestDRLearner",
            "EconML_LinearDML",
            "EconML_LinearDRLearner",
            "EconML_SparseLinearDML",
            "EconML_SparseLinearDRLearner",
            "EconML_SLearner_Lasso",
            "EconML_TLearner_Lasso",
            "EconML_XLearner_Lasso",
        ]
        learners = {
            name: cate_models.econml.EconMlEstimator2(self.cfg,
                                                      model_name=name,
                                                      discrete_treatment=self.discrete_treatment,
                                                      discrete_outcome=self.discrete_outcome,
                                                      seed=seed)
            for name in econml_names
        }

        # Torch learners: model class and config section per name.
        # Note: the CFRNet entries reuse TARNet with the (CRFNet-spelled) config sections.
        torch_models = {
            "Torch_SLearner": (cate_models.torch.SLearner, self.cfg.Torch_SLearner),
            "Torch_TLearner": (cate_models.torch.TLearner, self.cfg.Torch_TLearner),
            "Torch_XLearner": (cate_models.torch.XLearner, self.cfg.Torch_XLearner),
            "Torch_DRLearner": (cate_models.torch.DRLearner, self.cfg.Torch_DRLearner),
            "Torch_DragonNet": (cate_models.torch.DragonNet, self.cfg.Torch_DragonNet),
            "Torch_DragonNet_2": (cate_models.torch.DragonNet, self.cfg.Torch_DragonNet_2),
            "Torch_DragonNet_4": (cate_models.torch.DragonNet, self.cfg.Torch_DragonNet_4),
            "Torch_ActionNet": (cate_models.torch.ActionNet, self.cfg.Torch_ActionNet),
            # "Torch_FlexTENet": (cate_models.torch.FlexTENet, self.cfg.Torch_FlexTENet),
            "Torch_PWLearner": (cate_models.torch.PWLearner, self.cfg.Torch_PWLearner),
            "Torch_RALearner": (cate_models.torch.RALearner, self.cfg.Torch_RALearner),
            "Torch_RLearner": (cate_models.torch.RLearner, self.cfg.Torch_RLearner),
            "Torch_TARNet": (cate_models.torch.TARNet, self.cfg.Torch_TARNet),
            "Torch_ULearner": (cate_models.torch.ULearner, self.cfg.Torch_ULearner),
            "Torch_CFRNet_0.01": (cate_models.torch.TARNet, self.cfg.Torch_CRFNet_0_01),
            "Torch_CFRNet_0.001": (cate_models.torch.TARNet, self.cfg.Torch_CRFNet_0_001),
            "Torch_CFRNet_0.0001": (cate_models.torch.TARNet, self.cfg.Torch_CRFNet_0_0001),
        }
        for name, (model_cls, model_cfg) in torch_models.items():
            learners[name] = model_cls(num_features, binary_y=binary_y, **model_cfg)

        learners["DiffPOLearner"] = DiffPOLearner(self.cfg,
                                                  num_features,
                                                  binary_y=binary_y)

        # Deal with the case where the model is not available
        for name in self.cfg.model_names:
            if name not in learners:
                raise Exception(f"Unknown model name {name}.")

        # Only return learners from cfg.model_names
        return {k: v for k, v in learners.items() if k in self.cfg.model_names}
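
    # Usage sketch (hypothetical config values): with
    #   cfg.model_names = ["EconML_DML", "Torch_TARNet"]
    # a call such as
    #   learners = self.get_learners(num_features=self.X.shape[1], seed=self.seeds[0])
    # instantiates every known learner but returns only the two requested ones, so
    # unused entries still pay their construction cost.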

    def get_baseline_learners(self,
                              seed: int = 123) -> list:
        """
        Get baseline learners for the experiment, based on the configuration settings.
        These models will be used as a baseline for retrieving absolute outcomes from CATE predictions.
        """
        # Instantiate one baseline learner per treatment option
        baseline_learners = []

        if self.cfg.debug or self.cfg.dataset == "cytof_normalized_with_fastdrug" or self.cfg.dataset.startswith("melanoma"):
            cv = LeaveOneOut()
        else:
            cv = 5

        for t in range(self.cfg.simulator.num_T):
            if self.discrete_outcome == 1:
                base_model = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=cv, random_state=seed)
                # base_model = RandomForestClassifier(n_estimators=100, random_state=seed)
                model = MultiOutputClassifier(base_model)
                baseline_learners.append(model)
            else:
                base_model = LassoCV(cv=cv, n_alphas=5, max_iter=50, random_state=seed)
                # base_model = RandomForestRegressor(n_estimators=100, random_state=seed)
                model = MultiOutputRegressor(base_model)
                baseline_learners.append(model)

        return baseline_learners
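
    # Minimal sketch of the baseline-learner pattern above (synthetic data, for
    # illustration only):
    #
    #   import numpy as np
    #   from sklearn.linear_model import LassoCV
    #   from sklearn.multioutput import MultiOutputRegressor
    #   X = np.random.randn(20, 5)              # (n, dim_X)
    #   Y = np.random.randn(20, 3)              # (n, dim_Y)
    #   m = MultiOutputRegressor(LassoCV(cv=5)).fit(X, Y)
    #   m.predict(X).shape                      # -> (20, 3)
    #
    # i.e. one independent LassoCV per outcome dimension, and one such wrapper per
    # treatment arm.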

    def get_select_learner(self,
                           seed: int = 123):
        """
        Get the learner for feature selection (fitted on X to predict T in
        train_select_learner below).
        """
        if self.cfg.debug:
            cv = LeaveOneOut()
        else:
            cv = 5
        select_learner = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=cv, random_state=seed)
        # select_learner = RandomForestRegressor(n_estimators=100, max_depth=6)

        return select_learner

    def get_prog_learner(self,
                         seed: int = 123):
        """
        Get the learner for the prognostic part of the outcomes.
        """
        if self.cfg.debug:
            cv = LeaveOneOut()
        else:
            cv = 5
        base_model = LassoCV(cv=cv, n_alphas=5, max_iter=50, random_state=seed)
        # base_model = RandomForestRegressor(n_estimators=100, random_state=seed)
        prog_learner = MultiOutputRegressor(base_model)
        return prog_learner

    def get_pred_learners(self,
                          seed: int = 123) -> list:
        """
        Get learners for the treatment-specific CATEs (one per treatment option).
        """
        # Instantiate one predictive learner per treatment option
        pred_learners = []

        if self.cfg.debug or self.cfg.dataset.startswith("melanoma"):
            cv = LeaveOneOut()
        else:
            cv = 5
        # base_model = RandomForestRegressor(n_estimators=100, random_state=seed)
        for t in range(self.cfg.simulator.num_T):
            base_model = LassoCV(cv=cv, n_alphas=5, max_iter=50, random_state=seed)
            model = MultiOutputRegressor(base_model)
            pred_learners.append(model)

        return pred_learners
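
    # Note (inferred from train_pred_learner below): pred_learners[t] regresses X onto
    # CATE labels expressed relative to reference treatment t, so each treatment arm
    # gets its own "one vs. all" CATE model; see the derivation sketch after
    # train_pred_learner.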

    def train_learners(self,
                       X_train: np.ndarray,
                       Y_train: np.ndarray,
                       T_train: np.ndarray,
                       outcomes_train: np.ndarray) -> None:
        """
        Train all learners.
        """
        self.training_times = {}
        for name in self.learners:
            # Measure training time
            start_time = time.time()

            log.info(f"Fitting {name}.")
            if self.evaluate_inference:
                if not (name.startswith("EconML") or name.startswith("DiffPOLearner")):
                    raise ValueError("Only EconML and DiffPO models support inference.")

            if name == "DiffPOLearner":
                self.learners[name].train(X=X_train, y=Y_train, w=T_train, outcomes=outcomes_train)
            else:
                self.learners[name].train(X=X_train, y=Y_train, w=T_train)

            end_time = time.time()
            self.training_times[name] = end_time - start_time
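
    # Expected shapes (inferred from the training methods in this class; an
    # assumption rather than a documented contract):
    #   X_train:        (n, dim_X)
    #   Y_train:        (n,) or (n, dim_Y) factual outcomes
    #   T_train:        (n,) integer treatment assignments in [0, num_T)
    #   outcomes_train: (n, num_T, dim_Y) all potential outcomes (used by DiffPO only)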

    def train_baseline_learners(self,
                                X_train: np.ndarray,
                                outcomes_train: np.ndarray,
                                T_train: np.ndarray) -> None:
        """
        Train all baseline learners.
        """
        for t in range(self.cfg.simulator.num_T):
            # Get all data points where treatment is t
            mask = T_train == t
            Y_train_t = outcomes_train[mask, t, :]
            X_train_t = X_train[mask, :]

            log.debug(
                f'Check baseline data for treatment {t}:\n'
                f'============================================'
                f'\nX_train: {X_train.shape}'
                f'\n{X_train}'
                f'\noutcomes_train: {outcomes_train.shape}'
                f'\n{outcomes_train}'
                f'\nT_train: {T_train.shape}'
                f'\n{T_train}'
                f'\nX_train_t: {X_train_t.shape}'
                f'\n{X_train_t}'
                f'\nY_train_t: {Y_train_t.shape}'
                f'\n{Y_train_t}'
                f'\n============================================\n\n'
            )
            # Check that there are data points for this treatment
            assert Y_train_t.shape[0] > 0, f"No data points for treatment {t}."

            log.info(f"Fitting baseline learner for treatment {t}.")

            if self.discrete_outcome:
                Y_train_t = Y_train_t.astype(bool)

            self.baseline_learners[t].fit(X_train_t, Y_train_t)
    def train_select_learner(self,
                             X_train: np.ndarray,
                             T_train: np.ndarray) -> None:
        """
        Train the feature selection learner.
        """
        log.info("Fitting feature selection learner.")
        self.select_learner.fit(X_train, T_train)

    def train_prog_learner(self,
                           X_train: np.ndarray,
                           pred_outcomes_train: np.ndarray) -> None:
        """
        Train the model learning the prognostic part of the outcomes.
        Here the prognostic part is the average of all possible outcomes.
        """
        # pred_outcomes_train shape: (n, num_T, dim_Y)
        # X_train shape: (n, dim_X)

        # Compute the average of all possible outcomes
        prog_Y_train = np.mean(pred_outcomes_train, axis=1)

        # Train the model
        self.prog_learner.fit(X_train, prog_Y_train)

        log.debug(
            f'Check prog learner data:\n'
            f'============================================'
            f'\nX_train: {X_train.shape}'
            f'\n{X_train}'
            f'\npred_outcomes_train: {pred_outcomes_train.shape}'
            f'\n{pred_outcomes_train}'
            f'\nprog_Y_train: {prog_Y_train.shape}'
            f'\n{prog_Y_train}'
            f'\n============================================\n\n'
        )
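
    # Rationale (sketch for the binary-treatment case; generalizes by averaging over
    # all num_T arms): with potential-outcome predictions y0_hat(x), y1_hat(x),
    #
    #   prog(x) = (y0_hat(x) + y1_hat(x)) / 2
    #
    # which is exactly np.mean(pred_outcomes_train, axis=1) above.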

    def train_pred_learner(self,
                           X_train: np.ndarray,
                           pred_cates_train: np.ndarray,
                           T_train: np.ndarray) -> None:
        """
        Train the model learning the CATEs.
        """
        for t in range(self.cfg.simulator.num_T):
            # Get treatment mask
            mask = T_train == t

            # For every patient with treatment t, we can simply take the mean CATE.
            # For the others, we additionally adjust by the CATE for t so that the
            # target is expressed relative to treatment t (see the biomarker
            # attribution notes).
            pred_Y_train = pred_cates_train.mean(axis=1)
            pred_Y_train[~mask] -= pred_cates_train[~mask, t, :]

            # Train the model
            self.pred_learners[t].fit(X_train, pred_Y_train)

            log.debug(
                f'Check pred learner data for treatment {t}:\n'
                f'============================================'
                f'\nX_train: {X_train.shape}'
                f'\n{X_train[:10]}'
                f'\npred_cates_train: {pred_cates_train.shape}'
                f'\n{pred_cates_train[:10]}'
                f'\nT_train: {T_train.shape}'
                f'\n{T_train[:10]}'
                f'\npred_Y_train: {pred_Y_train.shape}'
                f'\n{pred_Y_train[:10]}'
                f'\n============================================\n\n'
            )
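
    # Worked reading of the adjustment above (inferred from the code; the cited
    # biomarker attribution notes are not in this repository): with num_T = 2 and
    # pred_cates_train[i] = [tau_0(x_i), tau_1(x_i)], a patient assigned to an arm
    # other than t gets the label
    #
    #   (tau_0 + tau_1) / 2 - tau_t
    #
    # which re-expresses the training target relative to reference arm t, while
    # patients already assigned to t keep the plain mean CATE.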

    def get_learner_explanations(self,
                                 X: np.ndarray,
                                 return_explainer_names: bool = True,
                                 ignore_explainer_limit: bool = False,
                                 type: str = "pred"):  # note: `type` shadows the builtin; kept for API compatibility
        """
        Get explanations for all learners.
        """
        learner_explainers = {}
        learner_explanations = {}
        explainer_names = {}

        for name in self.learners:
            log.info(f"Explaining {name}.")

            if ignore_explainer_limit:
                explainer_limit = X.shape[0]
            else:
                explainer_limit = self.cfg.explainer_limit

            if "EconML" in name:
                if self.cfg.explainer_econml != "shap":
                    raise ValueError("Only shap is supported for EconML models.")

                if type == "pred":
                    cate_est = lambda X: self.learners[name].predict(X)
                    shap_values_avg = shap.Explainer(cate_est, X[:explainer_limit]).shap_values(X[:explainer_limit])
                    # shap_values = self.learners[name].explain(X[:explainer_limit], background_samples=None)

                    # treatment_names = self.learners[name].est.cate_treatment_names()
                    # output_names = self.learners[name].est.cate_output_names()

                    # # average absolute shaps over all treatment names
                    # shap_values_avg = np.zeros_like(shap_values[output_names[0]][treatment_names[0]].values)
                    # for output_name in output_names:
                    #     for treatment_name in treatment_names:
                    #         shap_values_avg += np.abs(shap_values[output_name][treatment_name].values)

                # Does not work yet!
                elif type == "prog":
                    if name == "EconML_SLearner_Lasso":
                        y0_est = lambda X: self.learners[name].est.overall_model.predict(np.hstack([X, np.ones((X.shape[0], 1)), np.zeros((X.shape[0], 1))]))
                        # pred outcomes shape should be: (n, num_T, dim_Y)
                    elif name == "EconML_TLearner_Lasso":
                        y0_est = lambda X: self.learners[name].est.models[0].predict(X)
                    elif name == "EconML_DML":
                        y0_est = lambda X: self.learners[name].predict_outcomes(X, outcomes=None)[:, 0]
                    else:
                        raise ValueError(f"Model {name} not supported for prog learner explanations.")

                    shap_values_avg = shap.Explainer(y0_est, X[:explainer_limit]).shap_values(X[:explainer_limit])
                    # average absolute shaps over all treatment names
                    # shap_values_avg = np.zeros_like(shap_values[0])
                    # for i in range(len(shap_values)):
                    #     shap_values_avg += np.abs(shap_values[i])

                learner_explanations[name] = shap_values_avg
                explainer_names[name] = "shap"

            else:
                prog_capable = ["Torch_SLearner", "Torch_TLearner", "Torch_TARNet",
                                "Torch_CFRNet_0.001", "Torch_CFRNet_0.01", "Torch_CFRNet_0.0001",
                                "Torch_DragonNet", "Torch_DragonNet_2", "Torch_DragonNet_4"]

                # Also doesn't work yet!
                if type == "prog" and name in prog_capable:
                    # model_to_explain = self.learners[name]._po_estimators[0]
                    # X_to_explain = self._repr_estimator(X).squeeze()
                    y0_est = lambda X: self.learners[name].predict(X, return_po=True)[1]
                    learner_explanations[name] = shap.Explainer(y0_est, X[:explainer_limit]).shap_values(X[:explainer_limit])
                    explainer_names[name] = "shap"

                elif type == "prog" and name not in prog_capable:
                    learner_explanations[name] = None
                    explainer_names[name] = None
                    # raise ValueError(f"Model {name} not supported for prog learner explanations.")

                else:
                    cate_est = lambda X: self.learners[name].predict(X)
                    learner_explanations[name] = shap.Explainer(cate_est, X[:explainer_limit]).shap_values(X[:explainer_limit])
                    explainer_names[name] = "shap"
                    # model_to_explain = self.learners[name]
                    # X_to_explain = X

                    # learner_explainers[name] = Explainer(
                    #     model_to_explain,
                    #     feature_names=list(range(X.shape[1])),
                    #     explainer_list=[self.cfg.explainer_torch],
                    # )
                    # learner_explanations[name] = learner_explainers[name].explain(
                    #     X[: explainer_limit]
                    # )
                    # learner_explanations[name] = learner_explanations[name][self.cfg.explainer_torch]
                    # explainer_names[name] = self.cfg.explainer_torch

        # Check dimensions of explanations by looking at the shape of all explanation arrays
        # for name in learner_explanations:
        #     log.debug(
        #         f"\nExplanations for {name} have shape {learner_explanations[name].shape}."
        #     )

        if return_explainer_names:
            return learner_explanations, explainer_names
        else:
            return learner_explanations
    def get_select_learner_explanations(self,
                                        X_reference: np.ndarray,
                                        X_to_explain: np.ndarray) -> np.ndarray:
        """
        Get explanations for the feature selection learner.
        Returns shape: (n, dim_X, num_T).
        """
        explainer = shap.Explainer(self.select_learner, X_reference)
        shap_values = explainer(X_to_explain).values  # shape: (n, dim_X, num_T)

        return shap_values
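
    # Shape sketch (assuming a multiclass LogisticRegressionCV select_learner, as
    # constructed in get_select_learner): shap returns one attribution per class, so
    #   explainer(X_to_explain).values  ->  (n, dim_X, num_T)
    # matching the docstring above. With num_T == 2 some shap versions return
    # (n, dim_X) instead; callers relying on the 3-D shape may need to handle that.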

    def get_prog_learner_explanations(self,
                                      X_reference: np.ndarray,
                                      X_to_explain: np.ndarray) -> np.ndarray:
        """
        Get explanations for the prognostic learner.
        Returns shape: (n, dim_X, dim_Y).
        """
        # Get explanations for every outcome
        shap_values = np.zeros((X_to_explain.shape[0], X_to_explain.shape[1], self.cfg.simulator.dim_Y))
        for i in range(self.cfg.simulator.dim_Y):
            explainer = shap.Explainer(self.prog_learner.estimators_[i], X_reference, check_additivity=False)
            shap_values[:, :, i] = explainer(X_to_explain).values  # check_additivity=False for forest models

        return shap_values

    def get_pred_learner_explanations(self,
                                      X_reference: np.ndarray,
                                      X_to_explain: np.ndarray):
        """
        Get explanations for the treatment-specific CATEs (one vs. all).
        Returns shapes: (num_T, n, dim_X, dim_Y) and (num_T, dim_Y) for the base values.
        """
        # Get explanations for every outcome and every reference treatment
        shap_values = np.zeros((self.cfg.simulator.num_T, X_to_explain.shape[0], X_to_explain.shape[1], self.cfg.simulator.dim_Y))
        shap_base_values = np.zeros((self.cfg.simulator.num_T, self.cfg.simulator.dim_Y))

        for t in range(self.cfg.simulator.num_T):
            for i in range(self.cfg.simulator.dim_Y):
                explainer = shap.Explainer(self.pred_learners[t].estimators_[i], X_reference)
                shap_explanation = explainer(X_to_explain)
                shap_values[t, :, :, i] = shap_explanation.values
                shap_base_values[t, i] = shap_explanation.base_values[0]

        log.debug(
            f"\nExplanations for pred_learner have shape {shap_values.shape}."
            f"\n{shap_values[:, :10, :10, :]}"
        )

        return shap_values, shap_base_values
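
    # Note (shap behavior, stated as an assumption): for a fitted linear model,
    # Explanation.base_values is the expected prediction over the background data
    # X_reference and is identical across rows, so taking base_values[0] per
    # (treatment, outcome) pair is sufficient here.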

    def save_results(self, metrics_df: pd.DataFrame, compare_axis_values=None, plot_only: bool = False, save_df_only: bool = False, compare_axis=None, fig_type="all") -> None:
        """
        Save results of the experiment.
        """
        file_name = self.file_name
        results_path = self.results_path

        if save_df_only:
            log.info(f"Saving intermediate results in {results_path}...")
        else:
            log.info(f"Saving final results in {results_path}...")
        if not results_path.exists():
            results_path.mkdir(parents=True, exist_ok=True)

        if not plot_only:
            # Save metrics csv and a metadata yaml with the configuration
            if save_df_only:
                metrics_df.to_csv(
                    results_path / Path(file_name + "_checkpoint.csv")
                )
            else:
                metrics_df.to_csv(
                    results_path / Path(file_name + ".csv")
                )
            OmegaConf.save(self.cfg, results_path / Path(file_name + ".yaml"))

        # Choose figure size and legend position
        n_rows = len(self.cfg.metrics_to_plot)
        figsize = None  # (10, 100)
        legend_position = 0.95
        log_x_axis = False

        # Save plots
        if self.cfg.plot_results and not save_df_only:
            if self.cfg.experiment_name == 'sample_size_sensitivity':
                compare_axis = "Propensity Scale"
                x_axis = "Sample Portion"
                x_label_name = r'$\omega_{\mathrm{ss}}$'
                x_values_to_plot = self.cfg.sample_sizes

            elif self.cfg.experiment_name == 'important_feature_num_sensitivity':
                compare_axis = "Feature Overlap"
                x_axis = "Num Important Features"
                x_label_name = r'$\omega_{\mathrm{IFs}}$'
                x_values_to_plot = self.cfg.important_feature_nums

            elif self.cfg.experiment_name == 'propensity_scale_sensitivity':
                # compare_axis = "Unbalancedness Exp"
                compare_axis = "Propensity Type"
                x_axis = "Propensity Scale"
                x_label_name = r'${\mathrm{\beta}}$'
                x_values_to_plot = self.cfg.propensity_scales
                log_x_axis = True

            elif self.cfg.experiment_name == 'predictive_scale_sensitivity':
                compare_axis = "Nonlinearity Scale"
                x_axis = "Predictive Scale"
                x_label_name = r'$\omega_{\mathrm{pds}}$'
                x_values_to_plot = self.cfg.predictive_scales

            elif self.cfg.experiment_name == 'treatment_space_sensitivity':
                compare_axis = "Binary Outcome"
                x_axis = "Treatment Options"
                x_label_name = r'$\omega_{\mathrm{Ts}}$'
                x_values_to_plot = self.cfg.num_Ts

            elif self.cfg.experiment_name == 'data_dimensionality_sensitivity':
                # compare_axis is taken from the caller for this experiment
                x_axis = "Data Dimension"
                x_label_name = "Num Features"
                x_values_to_plot = self.cfg.data_dims

            elif self.cfg.experiment_name == 'feature_overlap_sensitivity':
                compare_axis = "Overlap Type"
                x_axis = "Propensity Scale"
                x_label_name = r'$\omega_{\mathrm{pps}}$'
                x_values_to_plot = self.cfg.propensity_scales

            elif self.cfg.experiment_name == 'expertise_sensitivity':
                compare_axis = "Propensity Type"
                x_axis = "Alpha"
                x_label_name = r'$\alpha$'
                x_values_to_plot = self.cfg.alphas

            else:
                raise ValueError(f"Experiment {self.cfg.experiment_name} not supported for plotting.")

            # All figure types share the same plotting signature
            plot_fns = {
                "all": plot_results_datasets_compare,
                "expertise": plot_expertise_metrics,
                "performance": plot_performance_metrics,
                "performance_f_cf": plot_performance_metrics_f_cf,
            }
            fig = plot_fns[fig_type](
                results_df=metrics_df,
                model_names=self.cfg.model_names,
                dataset=self.cfg.dataset,
                compare_axis=compare_axis,
                compare_axis_values=compare_axis_values,
                x_axis=x_axis,
                x_label_name=x_label_name,
                x_values_to_plot=x_values_to_plot,
                metrics_list=self.cfg.metrics_to_plot,
                learners_list=self.cfg.model_names,
                figsize=figsize,
                legend_position=legend_position,
                seeds_list=self.cfg.seeds,
                n_splits=self.cfg.n_splits,
                sharey="row",
                legend_rows=1,
                dim_X=self.X.shape[0],  # note: X.shape[0] is the sample count, not the feature dimension
                log_x_axis=log_x_axis,
            )

            # Save figure
            try:
                fig.savefig(results_path / f"{self.cfg.plot_name_prefix}.png", bbox_inches='tight')
            except Exception:
                fig.savefig(results_path / f"{file_name}.png", bbox_inches='tight')
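
    # Calling convention sketch (inferred from the flags above, not a documented API):
    #   self.save_results(df, save_df_only=True)               # checkpoint CSV only
    #   self.save_results(df, compare_axis_values=[...])       # final CSV + YAML + figure
    #   self.save_results(df, plot_only=True, fig_type="all")  # re-plot existing results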

    def load_and_plot_results(self, compare_axis_values, fig_type: str = "all") -> None:
        """
        Load and plot results of the experiment.
        """
        file_name = self.file_name
        results_path = self.results_path

        log.info(f"Loading results from {results_path}...")
        if not results_path.exists():
            raise FileNotFoundError(f"Results path {results_path} does not exist.")

        # Load metrics
        metrics_df = pd.read_csv(results_path / Path(file_name + "_checkpoint.csv"))

        if self.cfg.experiment_name == "data_dimensionality_sensitivity":
            if self.cfg.compare_axis == "propensity":
                compare_axis = "Propensity Scale"
            elif self.cfg.compare_axis == "num_features":
                compare_axis = "Num Important Features"
            self.save_results(metrics_df, plot_only=True, compare_axis=compare_axis, compare_axis_values=compare_axis_values, fig_type=fig_type)
        else:
            self.save_results(metrics_df, plot_only=True, compare_axis_values=compare_axis_values, fig_type=fig_type)

    def compute_outcome_mse(self,
                            pred_outcomes: np.ndarray,
                            true_outcomes: np.ndarray,
                            T: np.ndarray = None) -> tuple:
        """
        Compute RMSEs for all outcomes, split into factual and counterfactual parts
        and into the T=0 / T=1 subgroups.
        """
        # Split predictions into factual entries (the assigned treatment) and
        # counterfactual entries (all other treatments)
        pred_outcomes_factual = np.zeros((pred_outcomes.shape[0], pred_outcomes.shape[2]))  # (n, dim_Y)
        true_outcomes_factual = np.zeros((true_outcomes.shape[0], true_outcomes.shape[2]))
        pred_outcomes_factual_Y0 = np.zeros(((T == 0).sum(), pred_outcomes.shape[2]))  # (n_T0, dim_Y)
        true_outcomes_factual_Y0 = np.zeros(((T == 0).sum(), pred_outcomes.shape[2]))
        pred_outcomes_factual_Y1 = np.zeros(((T == 1).sum(), pred_outcomes.shape[2]))  # (n_T1, dim_Y)
        true_outcomes_factual_Y1 = np.zeros(((T == 1).sum(), pred_outcomes.shape[2]))

        pred_outcomes_cf = np.zeros((pred_outcomes.shape[0], pred_outcomes.shape[1] - 1, pred_outcomes.shape[2]))  # (n, num_T-1, dim_Y)
        true_outcomes_cf = np.zeros((true_outcomes.shape[0], true_outcomes.shape[1] - 1, true_outcomes.shape[2]))
        pred_outcomes_cf_Y0 = np.zeros(((T == 1).sum(), pred_outcomes.shape[1] - 1, pred_outcomes.shape[2]))
        true_outcomes_cf_Y0 = np.zeros(((T == 1).sum(), pred_outcomes.shape[1] - 1, pred_outcomes.shape[2]))
        pred_outcomes_cf_Y1 = np.zeros(((T == 0).sum(), pred_outcomes.shape[1] - 1, pred_outcomes.shape[2]))
        true_outcomes_cf_Y1 = np.zeros(((T == 0).sum(), pred_outcomes.shape[1] - 1, pred_outcomes.shape[2]))

        counter_Y0 = 0
        counter_Y1 = 0
        for i in range(pred_outcomes.shape[0]):
            mask_factual = np.zeros(pred_outcomes.shape[1], dtype=bool)
            mask_cf = np.ones(pred_outcomes.shape[1], dtype=bool)
            mask_factual[T[i]] = True
            mask_cf[T[i]] = False

            pred_outcomes_factual[i, :] = pred_outcomes[i, mask_factual, :]
            true_outcomes_factual[i, :] = true_outcomes[i, mask_factual, :]
            pred_outcomes_cf[i, :, :] = pred_outcomes[i, mask_cf, :]
            true_outcomes_cf[i, :, :] = true_outcomes[i, mask_cf, :]

            if T[i] == 0:
                pred_outcomes_factual_Y0[counter_Y0, :] = pred_outcomes[i, mask_factual, :]
                true_outcomes_factual_Y0[counter_Y0, :] = true_outcomes[i, mask_factual, :]
                pred_outcomes_cf_Y1[counter_Y0, :, :] = pred_outcomes[i, mask_cf, :]
                true_outcomes_cf_Y1[counter_Y0, :, :] = true_outcomes[i, mask_cf, :]
                counter_Y0 += 1
            else:
                pred_outcomes_factual_Y1[counter_Y1, :] = pred_outcomes[i, mask_factual, :]
                true_outcomes_factual_Y1[counter_Y1, :] = true_outcomes[i, mask_factual, :]
                pred_outcomes_cf_Y0[counter_Y1, :, :] = pred_outcomes[i, mask_cf, :]
                true_outcomes_cf_Y0[counter_Y1, :, :] = true_outcomes[i, mask_cf, :]
                counter_Y1 += 1

        rmse_factual = mean_squared_error(true_outcomes_factual.reshape(-1), pred_outcomes_factual.reshape(-1), squared=False)
        rmse_factual_Y0 = mean_squared_error(true_outcomes_factual_Y0.reshape(-1), pred_outcomes_factual_Y0.reshape(-1), squared=False)
        rmse_factual_Y1 = mean_squared_error(true_outcomes_factual_Y1.reshape(-1), pred_outcomes_factual_Y1.reshape(-1), squared=False)
        rmse_cf = mean_squared_error(true_outcomes_cf.reshape(-1), pred_outcomes_cf.reshape(-1), squared=False)
        rmse_cf_Y0 = mean_squared_error(true_outcomes_cf_Y0.reshape(-1), pred_outcomes_cf_Y0.reshape(-1), squared=False)
        rmse_cf_Y1 = mean_squared_error(true_outcomes_cf_Y1.reshape(-1), pred_outcomes_cf_Y1.reshape(-1), squared=False)

        rmse_Y0 = mean_squared_error(np.concatenate([true_outcomes_factual_Y0.reshape(-1), true_outcomes_cf_Y0.reshape(-1)]), np.concatenate([pred_outcomes_factual_Y0.reshape(-1), pred_outcomes_cf_Y0.reshape(-1)]), squared=False)
        rmse_Y1 = mean_squared_error(np.concatenate([true_outcomes_factual_Y1.reshape(-1), true_outcomes_cf_Y1.reshape(-1)]), np.concatenate([pred_outcomes_factual_Y1.reshape(-1), pred_outcomes_cf_Y1.reshape(-1)]), squared=False)

        # Variance of the true outcomes, factual and counterfactual, used to normalize the RMSEs
        # factual_var = np.std(true_outcomes_factual)
        # cf_var = np.std(true_outcomes_cf)
        factual_var = np.var(true_outcomes_factual)
        cf_var = np.var(true_outcomes_cf)

        log.debug(
            f"\nPred outcomes: {pred_outcomes.shape}: \n{pred_outcomes}"
            f"\n\nT: \n{T}"
            f"\n\nPred outcomes factual: {pred_outcomes_factual.shape}: \n{pred_outcomes_factual}"
        )

        # Returned in order: per-arm overall RMSEs, per-arm factual RMSEs, per-arm
        # counterfactual RMSEs, overall factual/cf RMSEs, variance-normalized
        # factual/cf RMSEs, and means/stds of the true factual and cf outcomes
        return rmse_Y0, rmse_Y1, rmse_factual_Y0, rmse_factual_Y1, rmse_cf_Y0, rmse_cf_Y1, rmse_factual, rmse_cf, rmse_factual / factual_var, rmse_cf / cf_var, np.mean(true_outcomes_factual), np.mean(true_outcomes_cf), np.std(true_outcomes_factual), np.std(true_outcomes_cf)
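
    # Note (sklearn API, version-dependent): mean_squared_error(..., squared=False)
    # returns the RMSE; in newer scikit-learn releases the `squared` argument is
    # deprecated in favor of sklearn.metrics.root_mean_squared_error, so these calls
    # may need updating depending on the pinned version.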

    def compute_outcome_auroc(self,
                              pred_outcomes: np.ndarray,
                              true_outcomes: np.ndarray,
                              T: np.ndarray = None) -> tuple:
        """
        Compute AUROC for all outcomes, split into factual and counterfactual parts.
        """
        # Split into factual and counterfactual outcomes
        pred_outcomes_factual = np.zeros((pred_outcomes.shape[0], pred_outcomes.shape[2]))  # (n, dim_Y)
        true_outcomes_factual = np.zeros((true_outcomes.shape[0], true_outcomes.shape[2]))
        pred_outcomes_cf = np.zeros((pred_outcomes.shape[0], pred_outcomes.shape[1] - 1, pred_outcomes.shape[2]))  # (n, num_T-1, dim_Y)
        true_outcomes_cf = np.zeros((true_outcomes.shape[0], pred_outcomes.shape[1] - 1, true_outcomes.shape[2]))

        for i in range(pred_outcomes.shape[0]):
            mask_factual = np.zeros(pred_outcomes.shape[1], dtype=bool)
            mask_cf = np.ones(pred_outcomes.shape[1], dtype=bool)
            mask_factual[T[i]] = True
            mask_cf[T[i]] = False

            pred_outcomes_factual[i, :] = pred_outcomes[i, mask_factual, :]
            true_outcomes_factual[i, :] = true_outcomes[i, mask_factual, :]
            pred_outcomes_cf[i, :, :] = pred_outcomes[i, mask_cf, :]
            true_outcomes_cf[i, :, :] = true_outcomes[i, mask_cf, :]

        auroc_factual = roc_auc_score(true_outcomes_factual.reshape(-1), pred_outcomes_factual.reshape(-1))
        auroc_cf = roc_auc_score(true_outcomes_cf.reshape(-1), pred_outcomes_cf.reshape(-1))

        log.debug(
            f"\nPred outcomes: {pred_outcomes.shape}: \n{pred_outcomes}"
            f"\n\nT: \n{T}"
            f"\n\nPred outcomes factual: {pred_outcomes_factual.shape}: \n{pred_outcomes_factual}"
        )

        return auroc_factual, auroc_cf

        # Earlier (disabled) variant based on F1/AUROC with clipped predictions:
        # if factual:
        #     # Only keep factual cates
        #     pred_outcomes_factual = np.zeros((pred_outcomes.shape[0], pred_outcomes.shape[2]))  # (n, dim_Y)
        #     true_outcomes_factual = np.zeros((true_outcomes.shape[0], true_outcomes.shape[2]))

        #     for i in range(pred_outcomes.shape[0]):
        #         mask = np.zeros(pred_outcomes.shape[1], dtype=bool)
        #         mask[T[i]] = True
        #         pred_outcomes_factual[i,:] = pred_outcomes[i, mask,:]
        #         true_outcomes_factual[i,:] = true_outcomes[i, mask,:]
        #     # f1 = f1_score(true_outcomes_factual.reshape(-1), pred_outcomes_factual.reshape(-1))
        #     f1 = roc_auc_score(true_outcomes_factual.reshape(-1), pred_outcomes_factual.clip(0,1).reshape(-1))

        # else:
        #     auroc = roc_auc_score(true_outcomes.reshape(-1), pred_outcomes.clip(0,1).reshape(-1))
        #     f1 = auroc
        #     # f1 = f1_score(true_outcomes.reshape(-1), pred_outcomes.reshape(-1))

        # auroc = f1  # TODO: Change name to f1
        # return auroc
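
    # Micro-averaging note (an observation about the computation above, not new
    # behavior): reshape(-1) pools every (sample, treatment, outcome) entry into one
    # long vector before calling roc_auc_score, i.e. this is a single micro-averaged
    # AUROC rather than an average of per-outcome AUROCs.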
1003
1004
1005
    def compute_overall_pehe(self,
                                pred_cates: np.ndarray,
                                true_cates: np.ndarray,
                                T: np.ndarray) -> tuple:
        """
        Compute average PEHE across all treatment options for all outcomes.
        """
        # Clip discrete-outcome CATEs to the valid range
        if self.discrete_outcome:
            pred_cates = pred_cates.clip(-1, 1)

        pehe_sum_total = 0
        pehe_normalized_sum_total = 0
        mean_sum_total = 0
        std_sum_total = 0

        for outcome_idx in range(pred_cates.shape[2]):
            pred_cates_curr = pred_cates[:,:,outcome_idx]
            true_cates_curr = true_cates[:,:,outcome_idx]

            counter = 0
            pehe_sum = 0
            pehe_normalized_sum = 0
            mean_sum = 0
            std_sum = 0

            # Iterate over treatment pairs; CATEs where baseline and assigned
            # treatment coincide evaluate to zero and are skipped as uninformative.
            for i in range(pred_cates.shape[1]):
                for j in range(i):
                    mask_j = T == j
                    mask_i = T == i

                    counter += 1

                    pred_cates_curr_cate_j = pred_cates_curr[mask_j,i]
                    true_cates_curr_cate_j = true_cates_curr[mask_j,i]
                    pred_cates_curr_cate_i = -pred_cates_curr[mask_i,j] # Record the CATE in a consistent direction
                    true_cates_curr_cate_i = -true_cates_curr[mask_i,j] # TODO: Verify this for multiple treatments & outcomes

                    pred_cates_curr_cate = np.concatenate((pred_cates_curr_cate_j, pred_cates_curr_cate_i)).reshape(-1)
                    true_cates_curr_cate = np.concatenate((true_cates_curr_cate_j, true_cates_curr_cate_i)).reshape(-1)

                    # Compute mean CATE
                    true_cates_mean = np.mean(true_cates_curr_cate)
                    mean_sum += true_cates_mean

                    # Compute std of CATE
                    true_cates_std = np.std(true_cates_curr_cate)
                    std_sum += true_cates_std

                    pehe_curr = mean_squared_error(true_cates_curr_cate, pred_cates_curr_cate)
                    pehe_sum += pehe_curr
                    pehe_normalized_sum += pehe_curr / np.var(true_cates_curr_cate)

                    log.debug(
                        f'Check pehe computation for outcome: {outcome_idx}, cate: {i}-{j}'
                        f'============================================'
                        f'\npred_cates: {pred_cates.shape}'
                        f'\n{pred_cates}'
                        f'\n\ntrue_cates: {true_cates.shape}'
                        f'\n{true_cates}'
                        f'\n\nT: {T.shape}'
                        f'\n{T}'
                        f'\n\npred_cates_curr: {pred_cates_curr.shape}'
                        f'\n{pred_cates_curr}'
                        f'\n\ntrue_cates_curr: {true_cates_curr.shape}'
                        f'\n{true_cates_curr}'
                        f'\n\nmask_i: {mask_i.shape}'
                        f'\n{mask_i}'
                        f'\n\nmask_j: {mask_j.shape}'
                        f'\n{mask_j}'
                        f'\n\npred_cates_curr_cate: {pred_cates_curr_cate.shape}'
                        f'\n{pred_cates_curr_cate}'
                        f'\n\ntrue_cates_curr_cate: {true_cates_curr_cate.shape}'
                        f'\n{true_cates_curr_cate}'
                        f'\n\ncounter is currently: {counter}'
                        f'\n\npehe_curr: {pehe_curr}'
                        f'\n\nmean_sum: {mean_sum}'
                        f'\n\nstd_sum: {std_sum}'
                        f'\n============================================\n\n'
                    )

            pehe = pehe_sum / counter
            pehe_normalized = pehe_normalized_sum / counter
            pehe_sum_total += pehe
            pehe_normalized_sum_total += pehe_normalized
            mean_sum_total += mean_sum / counter
            std_sum_total += std_sum / counter

        pehe_total = pehe_sum_total / pred_cates.shape[2]
        pehe_normalized_total = pehe_normalized_sum_total / pred_cates.shape[2]
        true_cates_mean_total = mean_sum_total / pred_cates.shape[2]
        true_cates_std_total = std_sum_total / pred_cates.shape[2]
        return pehe_total, pehe_normalized_total, true_cates_mean_total, true_cates_std_total
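    # The per-pair PEHE computation in compute_overall_pehe, isolated for reference.
    def _example_pehe(self, pred_cate: np.ndarray, true_cate: np.ndarray) -> tuple:
        """
        Illustrative sketch (not called by the pipeline): for one treatment pair
        and outcome, PEHE is the MSE between true and predicted CATEs; the
        normalized variant divides by the variance of the true CATEs.
        """
        pehe = mean_squared_error(true_cate, pred_cate)
        return pehe, pehe / np.var(true_cate)
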
    def get_pred_cates(self, 
                       model_name: str, 
                       X: np.ndarray, 
                       T: np.ndarray,
                       outcomes_test: np.ndarray) -> np.ndarray:
        """
        Get the predicted CATEs for a model.
        """
        # Predict the CATE for every treatment option, using the assigned treatment as baseline.
        # Confidence intervals are computed separately in get_effect_cis.
        T0 = T
        T1 = np.zeros_like(T)
        pred_cates = np.zeros((X.shape[0], self.cfg.simulator.num_T, self.cfg.simulator.dim_Y))

        for i in range(self.cfg.simulator.num_T):
            # Set to current treatment
            T1[:] = i

            if self.cfg.simulator.num_T > 2:
                pred = self.learners[model_name].predict(X, T0=T0, T1=T1) # This predicts y[T1]-y[T0]

            elif model_name == "DiffPOLearner":
                mask = T1 == T0
                pred = self.learners[model_name].predict(X, T0=T0, T1=T1, outcomes=outcomes_test)
                pred[mask] = 0
                if i == 0:
                    pred = -pred

            else:
                # Torch models with only two treatment options predict y[1]-y[0] directly
                mask = T1 == T0
                pred = self.learners[model_name].predict(X) # This predicts y[1]-y[0]
                pred[mask] = 0
                if i == 0:
                    pred = -pred

            pred = pred.reshape(-1, self.cfg.simulator.dim_Y)
            pred_cates[:, i, :] = pred.cpu().detach().numpy()

        log.debug(f"Predicted CATEs for {model_name} have shape {pred_cates.shape}.")

        # Fill nan values with zeros and warn the user
        if np.isnan(pred_cates).any():
            log.warning(f"There are nan values in the predicted CATEs for model: {model_name}. They were filled with 0s.")
        pred_cates = np.nan_to_num(pred_cates)

        return pred_cates
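    # Layout convention (sketch): pred_cates[i, t, :] above holds the predicted
    # effect y(t) - y(T_i) for sample i relative to its assigned treatment T_i,
    # so entries at t == T_i are zero by construction, e.g.:
    #
    #   pred_cates[np.arange(len(T)), T, :]  # -> all zeros
    #
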
    def get_pred_outcomes(self,
                          model_name: str,
                          pred_cates: np.ndarray,
                          X: np.ndarray,
                          T: np.ndarray) -> np.ndarray:
        """
        Get the predicted outcomes for a model.
        """
        # Predicted outcomes for all treatment options
        pred_outcomes_baselines = np.zeros((X.shape[0], self.cfg.simulator.num_T, self.cfg.simulator.dim_Y))

        # Use directly predicted outcomes if the learner provides them
        if model_name in ["DiffPOLearner", "EconML_SLearner_Lasso", "EconML_TLearner_Lasso", "EconML_DML"]:
            pred_outcomes = self.learners[model_name].predict_outcomes(X, T0=T, outcomes=None)
            return pred_outcomes

        elif model_name in ["Torch_ActionNet", "Torch_TARNet", "Torch_TLearner", "Torch_SLearner", "Torch_DragonNet_2", "Torch_DragonNet_4", "Torch_DragonNet", "TorchSNet", "Torch_CFRNet_0.001", "Torch_CFRNet_0.01", "Torch_CFRNet_0.0001"]:
            _, y0_pred, y1_pred = self.learners[model_name].predict(X, return_po=True)
            y0_pred = y0_pred.cpu().detach().numpy()
            y1_pred = y1_pred.cpu().detach().numpy()
            pred_outcomes = np.zeros((X.shape[0], self.cfg.simulator.num_T, self.cfg.simulator.dim_Y))
            pred_outcomes[:, 0, :] = y0_pred.reshape(-1, self.cfg.simulator.dim_Y)
            pred_outcomes[:, 1, :] = y1_pred.reshape(-1, self.cfg.simulator.dim_Y)
            return pred_outcomes

        else:
            for i in range(self.cfg.simulator.num_T):
                # Select the patients assigned to treatment i
                mask = T == i

                # Fill in baseline outcomes according to the selected treatment of each patient
                if self.discrete_outcome:
                    baseline_preds = np.zeros((X[mask, :].shape[0], self.cfg.simulator.dim_Y))
                    baseline_preds_list = self.baseline_learners[i].predict_proba(X[mask, :])
                    for out_dim in range(self.cfg.simulator.dim_Y):
                        baseline_preds_curr = baseline_preds_list[out_dim][:,1]
                        baseline_preds[:,out_dim] = baseline_preds_curr

                else:
                    baseline_preds = self.baseline_learners[i].predict(X[mask, :])

                # Copy the baseline predictions to all treatment options, matching dimensions
                baseline_preds = np.repeat(baseline_preds[:,np.newaxis,:], self.cfg.simulator.num_T, axis=1)
                pred_outcomes_baselines[mask, :, :] = baseline_preds

            # Add predicted CATEs to baseline outcomes
            pred_outcomes = pred_outcomes_baselines + pred_cates

            log.debug(
                f'Check predicted outcomes for model:'
                f'============================================'
                f'X: {X.shape}'
                f'\n{X}'
                f'\nT: {T.shape}'
                f'\n{T}'
                f'\npred_cates: {pred_cates.shape}'
                f'\n{pred_cates}'
                f'\npred_outcomes_baselines: {pred_outcomes_baselines.shape}'
                f'\n{pred_outcomes_baselines}'
                f'\npred_outcomes: {pred_outcomes.shape}'
                f'\n{pred_outcomes}'
                f'\n============================================\n\n'
            )

            # Fill nan values with zeros and warn the user
            if np.isnan(pred_outcomes).any():
                log.warning("There are nan values in the predicted outcomes. They were filled with zeros.")
            pred_outcomes = np.nan_to_num(pred_outcomes)
            return pred_outcomes
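    # How get_pred_outcomes combines baselines and CATEs, isolated for reference.
    def _example_outcome_from_cate(self, baseline_y: np.ndarray, pred_cates_i: np.ndarray) -> np.ndarray:
        """
        Illustrative sketch (not called by the pipeline): broadcast the factual
        baseline prediction of one sample across treatments and add the
        predicted CATEs to reconstruct all potential outcomes.

        baseline_y:   (dim_Y,)       outcome predicted under the assigned treatment
        pred_cates_i: (num_T, dim_Y) predicted effects relative to the assigned treatment
        """
        return baseline_y[np.newaxis, :] + pred_cates_i  # (num_T, dim_Y)
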
    def get_effect_cis(self,
                        model_name: str,
                        X: np.ndarray,
                        T: np.ndarray) -> np.ndarray:
        """
        Get the confidence intervals for the predicted CATEs.
        """
        # Predict the CATE CI for every treatment option, using the assigned treatment as baseline.
        T0 = T
        T1 = np.zeros_like(T)
        pred_cates_conf = np.zeros((2, X.shape[0], self.cfg.simulator.num_T, self.cfg.simulator.dim_Y))

        for i in range(self.cfg.simulator.num_T):
            # Set to current treatment
            T1[:] = i

            effect_cis = self.learners[model_name].infer_effect_ci(X=X, T0=T0) # dim: 2, n, dim_Y (lower and upper bound of the confidence interval)
            pred_cates_conf[:, :, i, :] = effect_cis

        return pred_cates_conf
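    # Sanity check on the CI tensor produced above, shown for reference.
    def _example_ci_widths(self, pred_cates_conf: np.ndarray) -> np.ndarray:
        """
        Illustrative sketch (not called by the pipeline): index 0 stacks the
        lower bounds and index 1 the upper bounds, so interval widths follow by
        subtraction and should never be negative.
        """
        widths = pred_cates_conf[1] - pred_cates_conf[0]  # (n, num_T, dim_Y)
        assert (widths >= 0).all(), "upper bounds should not fall below lower bounds"
        return widths
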
    def get_swap_statistics_single_outcome(self,
                                           T: np.ndarray, 
                                           outcomes: np.ndarray, # n, num_T
                                           pred_cates: np.ndarray, # n, num_T
                                           shap_values_pred: np.ndarray, # num_T, n, dim_X
                                           shap_base_values_pred: np.ndarray, # num_T
                                           k: int = 1, 
                                           threshold: int = 0):
        """
        Evaluates, for a given decision threshold and number of decision variables k,
        how well the personalized predictions indicate whether the treatment should be swapped.
        """
        # shap_values_pred, shap_base_values_pred, and k are kept for the (disabled) top-k SHAP variant.
        tp, fp, tn, fn = 0, 0, 0, 0

        for i in range(T.shape[0]):
            # Check whether the true outcomes speak for swapping the treatment
            outcome = outcomes[i,T[i]]
            outcome_mean = np.mean(outcomes[i])
            swap_true = outcome < outcome_mean # swap if below mean

            # Check whether the average predicted CATE speaks for swapping the treatment
            swap_pred = np.mean(pred_cates[i,:]) > threshold

            if swap_true and swap_pred:
                tp += 1
            elif swap_true and not swap_pred:
                fn += 1
            elif not swap_true and swap_pred:
                fp += 1
            else:
                tn += 1

        # Compute the FPR, TPR, precision, and recall
        fpr = fp / (fp + tn) if fp + tn > 0 else np.nan
        tpr = tp / (tp + fn) if tp + fn > 0 else np.nan
        precision = tp / (tp + fp) if tp + fp > 0 else np.nan
        recall = tp / (tp + fn) if tp + fn > 0 else np.nan

        return fpr, tpr, precision, recall
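    # Example (sketch): a single threshold with counts tp=8, fp=2, tn=18, fn=12
    # yields TPR = 8/(8+12) = 0.4 and FPR = 2/(2+18) = 0.1, i.e. one point on
    # the ROC curve traced out by compute_swap_metrics_single_outcome below.
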
    def compute_swap_metrics_single_outcome(self,
                                            T: np.ndarray,
                                            outcomes: np.ndarray, # n, num_T
                                            pred_cates: np.ndarray, # n, num_T
                                            shap_values_pred: np.ndarray, # num_T, n, dim_X
                                            shap_base_values_pred: np.ndarray, # num_T
                                            k: int = 1) -> tuple:
        """
        Computes the swap metrics for a single outcome.
        """
        # Use the predicted CATEs as decision thresholds for the AUROC and AUPRC computation
        thresholds = list(pred_cates.reshape(-1))

        # Only use unique thresholds
        thresholds = np.unique(thresholds)

        if thresholds.shape[0] == 1:
            thresholds = np.array([-1, thresholds[0], 1])

        # Subsample to at most ~200 thresholds to accelerate the computation
        if len(thresholds) >= 200:
            step = len(thresholds)//200
        else:
            step = 1
        thresholds = thresholds[::step]

        # Iterate through all thresholds and compute statistics
        fprs, tprs, precisions, recalls = [], [], [], []
        for threshold in thresholds:
            fpr, tpr, precision, recall = self.get_swap_statistics_single_outcome(T, 
                                                                                  outcomes, 
                                                                                  pred_cates,
                                                                                  shap_values_pred, 
                                                                                  shap_base_values_pred,
                                                                                  k, threshold)

            fprs.append(fpr)
            tprs.append(tpr)

            if not np.isnan(precision):
                precisions.append(precision)

            if not np.isnan(recall):
                recalls.append(recall)

        # Compute the AUROC: sort the (FPR, TPR) points by FPR
        fprs_sort, tprs_sort = zip(*sorted(zip(fprs, tprs)))
        fprs_sort, tprs_sort = np.array(fprs_sort), np.array(tprs_sort)
        roc_auc = auc(fprs_sort, tprs_sort)

        # Compute the AUC for the precision-recall curve
        if len(recalls) > 1 and len(precisions) > 1:
            # Sort the (recall, precision) points by recall
            recalls_sort, precisions_sort = zip(*sorted(zip(recalls, precisions)))
            recalls_sort, precisions_sort = np.array(recalls_sort), np.array(precisions_sort)
            pr_auc = auc(recalls_sort, precisions_sort)
        else:
            pr_auc = -1

        return roc_auc, pr_auc
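    # How the threshold sweep turns into a single score, shown for reference.
    def _example_swap_auc(self, fprs: list, tprs: list) -> float:
        """
        Illustrative sketch (not called by the pipeline): sort the (FPR, TPR)
        points by FPR and integrate with sklearn's trapezoidal auc helper,
        as in compute_swap_metrics_single_outcome.
        """
        fprs_sort, tprs_sort = zip(*sorted(zip(fprs, tprs)))
        return float(auc(np.array(fprs_sort), np.array(tprs_sort)))
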
    def compute_swap_metrics(self,
                            T: np.ndarray,
                            true_outcomes: np.ndarray,
                            pred_cates: np.ndarray,
                            shap_values_pred: np.ndarray,
                            shap_base_values_pred: np.ndarray,
                            k: int = 1) -> tuple:
        """
        Compute swap metrics for all outcomes.
        """
        roc_aucs, pr_aucs = [], []

        for i in range(self.cfg.simulator.dim_Y):
            roc_auc, pr_auc = self.compute_swap_metrics_single_outcome(T, 
                                                                      true_outcomes[:,:,i], 
                                                                      pred_cates[:,:,i],
                                                                      shap_values_pred[:,:,:,i], 
                                                                      shap_base_values_pred[:,i],
                                                                      k)
            roc_aucs.append(roc_auc)
            pr_aucs.append(pr_auc)

        roc_auc_total = np.mean(roc_aucs)
        pr_auc_total = np.mean(pr_aucs)

        # Compute the swap percentage at threshold 0
        counter = 0
        swap_counter = 0
        for j in range(self.cfg.simulator.dim_Y):
            for i in range(T.shape[0]):
                # Check whether the true outcomes speak for swapping the treatment
                true_outcome = true_outcomes[i,T[i],j]
                true_outcome_mean = np.mean(true_outcomes[i,:,j])
                swap_true = true_outcome < true_outcome_mean # swap if below mean
                counter += 1.0
                swap_counter += swap_true

        return roc_auc_total, pr_auc_total, swap_counter/counter
    def compute_ci_coverage(self,
                            pred_effect_cis: np.ndarray, # dim: 2, n, num_T, dim_Y
                            true_cates: np.ndarray, # n, num_T, dim_Y
                            T: np.ndarray) -> float:
        """
        Compute the coverage of the confidence intervals.
        """
        ci_coverage = 0
        counter = 0
        for i in range(pred_effect_cis.shape[2]):
            for j in range(pred_effect_cis.shape[3]):
                lb = pred_effect_cis[0,:,i,j]
                ub = pred_effect_cis[1,:,i,j]
                true_cate = true_cates[:,i,j]

                # Only consider the cates where the baseline and assigned treatment differ
                mask = T != i
                lb = lb[mask]
                ub = ub[mask]
                true_cate = true_cate[mask]

                ci_coverage += np.sum((lb <= true_cate) & (true_cate <= ub))
                counter += len(true_cate)

                log.debug(
                    f'Check ci coverage computation for outcome: {j}, cate: {i}'
                    f'============================================'
                    f'\ntrue_cates: {true_cate.shape}'
                    f'\n{true_cate}'
                    f'\n\nT: {T.shape}'
                    f'\n{T}'
                    f'\n\nlb: {lb.shape}'
                    f'\n{lb}'
                    f'\n\nub: {ub.shape}'
                    f'\n{ub}'
                    f'\n\ncoverage is currently: {ci_coverage}'
                    f'\n============================================\n\n'
                )

        ci_coverage /= counter

        return ci_coverage
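    # Worked example of the coverage formula used above, shown for reference.
    def _example_ci_coverage(self, lb: np.ndarray, ub: np.ndarray, true_cate: np.ndarray) -> float:
        """
        Illustrative sketch (not called by the pipeline): coverage is the share
        of true CATEs inside their interval, e.g. lb=[0, 0], ub=[1, 1],
        true_cate=[0.5, 2.0] gives 0.5.
        """
        return float(np.mean((lb <= true_cate) & (true_cate <= ub)))
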
    def compute_expertise_metrics(self,
                                    T: np.ndarray,
                                    outcomes: np.ndarray,
                                    type: str = "prognostic") -> float:
        """
        Compute the predictive, prognostic, treatment, or total expertise.
        """
        if self.cfg.simulator.num_T != 2:
            raise ValueError("Expertise metrics can only be computed for binary treatments.")

        # Get potential outcomes
        y0 = outcomes[:, 0, 0]
        y1 = outcomes[:, 1, 0]

        if type in ("predictive", "prognostic", "treatment"):
            if type == "predictive":
                _, gt_bins = np.histogram(y1 - y0, bins="auto")
                gt_uncond_result = y1 - y0

            elif type == "prognostic":
                _, gt_bins = np.histogram(y0, bins="auto")
                gt_uncond_result = y0

            else: # treatment
                _, gt_bins = np.histogram(y1, bins="auto")
                gt_uncond_result = y1

            # Action entropy
            actions, _ = np.histogram(T, bins='auto')
            actions = actions / T.shape[0]
            actentropy = -np.sum(actions * np.ma.log(actions))

            num_ones = np.sum(T) / T.shape[0]

            # Unconditional entropy of the target quantity
            gt_uncond_hist, _ = np.histogram(gt_uncond_result, bins=gt_bins)
            gt_uncond_hist = gt_uncond_hist / gt_uncond_result.shape[0]
            gt_uncond_hist = gt_uncond_hist[gt_uncond_hist != 0]
            gt_uncond_entropy = -np.sum(gt_uncond_hist * np.log(gt_uncond_hist))

            # Entropy conditioned on T == 1
            gt_cond_one = gt_uncond_result * T
            gt_cond_one = gt_cond_one[gt_cond_one != 0]
            gt_one_hist, _ = np.histogram(gt_cond_one, bins=gt_bins)
            gt_one_hist = gt_one_hist / gt_cond_one.shape[0]
            gt_one_hist = gt_one_hist[gt_one_hist != 0]
            gt_one_entropy = -np.sum(gt_one_hist * np.log(gt_one_hist))

            # Entropy conditioned on T == 0
            gt_cond_zero = gt_uncond_result * (1 - T)
            gt_cond_zero = gt_cond_zero[gt_cond_zero != 0]
            gt_zero_hist, _ = np.histogram(gt_cond_zero, bins=gt_bins)
            gt_zero_hist = gt_zero_hist / gt_cond_zero.shape[0]
            gt_zero_hist = gt_zero_hist[gt_zero_hist != 0]
            gt_zero_entropy = -np.sum(gt_zero_hist * np.log(gt_zero_hist))

            gt_expertise = gt_uncond_entropy - num_ones * gt_one_entropy - (1 - num_ones) * gt_zero_entropy
            gt_expertise = gt_expertise / actentropy

        elif type == "total":
            _, gt1_bins, gt0_bins = np.histogram2d(y1, y0)

            # Action entropy
            actions, _ = np.histogram(T, bins='auto')
            actions = actions / T.shape[0]
            actentropy = -np.sum(actions * np.ma.log(actions))

            num_ones = np.sum(T) / T.shape[0]

            # Unconditional joint entropy of (y1, y0)
            gt_uncond_hist, _, _ = np.histogram2d(y1, y0, bins=[gt1_bins, gt0_bins])
            gt_uncond_hist = gt_uncond_hist / y1.shape[0]
            gt_uncond_hist = gt_uncond_hist[gt_uncond_hist != 0]
            gt_uncond_entropy = -np.sum(gt_uncond_hist * np.log(gt_uncond_hist))

            # Joint entropy conditioned on T == 1
            gt_cond_one1 = y1 * T
            gt_cond_one1 = gt_cond_one1[gt_cond_one1 != 0]
            gt_cond_one0 = y0 * T
            gt_cond_one0 = gt_cond_one0[gt_cond_one0 != 0]
            gt_one_hist, _, _ = np.histogram2d(gt_cond_one1, gt_cond_one0, bins=[gt1_bins, gt0_bins])
            gt_one_hist = gt_one_hist / gt_cond_one1.shape[0]
            gt_one_hist = gt_one_hist[gt_one_hist != 0]
            gt_one_entropy = -np.sum(gt_one_hist * np.log(gt_one_hist))

            # Joint entropy conditioned on T == 0
            gt_cond_zero1 = y1 * (1 - T)
            gt_cond_zero1 = gt_cond_zero1[gt_cond_zero1 != 0]
            gt_cond_zero0 = y0 * (1 - T)
            gt_cond_zero0 = gt_cond_zero0[gt_cond_zero0 != 0]
            gt_zero_hist, _, _ = np.histogram2d(gt_cond_zero1, gt_cond_zero0, bins=[gt1_bins, gt0_bins])
            gt_zero_hist = gt_zero_hist / gt_cond_zero1.shape[0]
            gt_zero_hist = gt_zero_hist[gt_zero_hist != 0]
            gt_zero_entropy = -np.sum(gt_zero_hist * np.log(gt_zero_hist))

            gt_expertise = gt_uncond_entropy - num_ones * gt_one_entropy - (1 - num_ones) * gt_zero_entropy
            gt_expertise = gt_expertise / actentropy

        else:
            raise ValueError("Invalid expertise type. Choose between 'predictive', 'prognostic', 'treatment' or 'total'.")

        return gt_expertise
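    # The histogram-entropy estimate underlying the expertise metrics, isolated
    # for reference.
    def _example_hist_entropy(self, values: np.ndarray, bins) -> float:
        """
        Illustrative sketch (not called by the pipeline): estimate the entropy
        of `values` from a normalized histogram, dropping empty bins so the
        logarithm is defined, as done repeatedly in compute_expertise_metrics.
        """
        hist, _ = np.histogram(values, bins=bins)
        hist = hist / values.shape[0]
        hist = hist[hist != 0]
        return float(-np.sum(hist * np.log(hist)))
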
    def compute_incontext_variability(self,
                                      T: np.ndarray,
                                      propensities: np.ndarray) -> float:
        """
        Compute the in-context variability.
        """
        # Sum over the contribution of all patients
        props = propensities.reshape(-1)
        props = props[props != 0]
        cond_entropy = -2 * np.mean(props * np.log(props))

        # Get the action entropy
        actions, _ = np.histogram(T, bins='auto')
        actions = actions / T.shape[0]
        actentropy = -np.sum(actions * np.ma.log(actions))

        return cond_entropy / actentropy
    def get_pred_assignment_precision(self, pred_outcomes, true_outcomes):
        """
        Compute the fraction of samples for which the predicted outcomes rank
        the (binary) treatments in the same order as the true outcomes.
        """
        counter = 0
        correct_counter = 0
        for j in range(self.cfg.simulator.dim_Y):
            for i in range(pred_outcomes.shape[0]):
                # Compare the true and predicted treatment rankings
                true_y0 = true_outcomes[i,0,j]
                true_y1 = true_outcomes[i,1,j]
                pred_y0 = pred_outcomes[i,0,j]
                pred_y1 = pred_outcomes[i,1,j]

                true_ranking = true_y0 < true_y1
                pred_ranking = pred_y0 < pred_y1

                if true_ranking == pred_ranking:
                    correct_counter += 1

                counter += 1.0
        return correct_counter / counter
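    # Example (sketch): with true outcomes (y0, y1) = (0.2, 0.7) and predicted
    # (0.3, 0.5), both rankings place y1 above y0, so the sample counts as a
    # correct assignment in get_pred_assignment_precision.
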
    def compute_metrics(self,
                        results_data: list,
                        sim: SimulatorBase,
                        X_train: np.ndarray,
                        Y_train: np.ndarray,
                        T_train: np.ndarray,
                        X_test: np.ndarray,
                        Y_test: np.ndarray,
                        T_test: np.ndarray,
                        outcomes_train: np.ndarray,
                        outcomes_test: np.ndarray,
                        propensities_train: np.ndarray,
                        propensities_test: np.ndarray,
                        x_axis_value: float,
                        x_axis_name: str,
                        compare_value: float,
                        compare_name: str,
                        seed: int,
                        split_id: int) -> pd.DataFrame:
        """
        Compute metrics for a given experiment.
        """
        # Get learners
        self.baseline_learners = self.get_baseline_learners(seed=seed)
        self.learners = self.get_learners(num_features=X_train.shape[1], seed=seed)

        # Train learners
        if self.cfg.train_baseline_learner:
            self.train_baseline_learners(X_train, outcomes_train, T_train)
        self.train_learners(X_train, Y_train, T_train, outcomes_train)

        # Get the treatment distributions for train and test
        train_treatment_distribution = np.bincount(T_train) / len(T_train)
        test_treatment_distribution = np.bincount(T_test) / len(T_test)

        if self.cfg.simulator.num_T == 2: # Keep a scalar so it can be plotted
            train_treatment_distribution = train_treatment_distribution[0]
            test_treatment_distribution = test_treatment_distribution[0]

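        # Example (sketch): np.bincount(np.array([0, 1, 1, 0, 1])) / 5 gives
        # array([0.4, 0.6]); for binary treatments only the first entry (the
        # share of treatment 0) is kept so the distribution can be plotted.
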
        # Get learner explanations
        (learner_explanations, _) = self.get_learner_explanations(X=X_test, type="pred") if self.cfg.evaluate_explanations else (None, None)
        (learner_prog_explanations, _) = self.get_learner_explanations(X=X_test, type="prog") if self.cfg.evaluate_prog_explanations else (None, None)

        # Get and train the select learner (treatment selection model)
        if self.cfg.evaluate_in_context_variability:
            self.select_learner = self.get_select_learner(seed=seed)
            self.train_select_learner(X_train, T_train)
            propensities_pred_test = self.select_learner.predict_proba(X_test)
            propensities_pred_train = self.select_learner.predict_proba(X_train)

        # Compute metrics
        baseline_metrics = {} # The first model in the list is used as baseline

        for i, model_name in enumerate(self.cfg.model_names):
            # Compute attribution accuracies
            if self.cfg.evaluate_explanations and learner_explanations[model_name] is not None:
                attribution_est = np.abs(learner_explanations[model_name])
                pred_acc_scores_all_features = attribution_accuracy(self.all_important_features, attribution_est)
                pred_acc_scores_predictive_features = attribution_accuracy(self.pred_features, attribution_est)
                pred_acc_scores_prog_features = attribution_accuracy(self.prog_features, attribution_est)
                pred_acc_scores_selective_features = attribution_accuracy(self.select_features, attribution_est)

            else:
                pred_acc_scores_all_features = -1
                pred_acc_scores_predictive_features = -1
                pred_acc_scores_prog_features = -1
                pred_acc_scores_selective_features = -1

            if self.cfg.evaluate_prog_explanations and learner_prog_explanations[model_name] is not None:
                prog_attribution_est = np.abs(learner_prog_explanations[model_name])
                prog_acc_scores_all_features = attribution_accuracy(self.all_important_features, prog_attribution_est)
                prog_acc_scores_predictive_features = attribution_accuracy(self.pred_features, prog_attribution_est)
                prog_acc_scores_prog_features = attribution_accuracy(self.prog_features, prog_attribution_est)
                prog_acc_scores_selective_features = attribution_accuracy(self.select_features, prog_attribution_est)

            else:
                prog_acc_scores_all_features = -1
                prog_acc_scores_predictive_features = -1
                prog_acc_scores_prog_features = -1
                prog_acc_scores_selective_features = -1

            # Compute predicted cates and outcomes
            pred_cates = self.get_pred_cates(model_name, X_test, T_test, outcomes_test)
            pred_cates_train = self.get_pred_cates(model_name, X_train, T_train, outcomes_train)

            pred_outcomes = self.get_pred_outcomes(model_name, pred_cates, X_test, T_test)
            pred_outcomes_train = self.get_pred_outcomes(model_name, pred_cates_train, X_train, T_train)

            pred_effect_cis = self.get_effect_cis(model_name, X_test, T_test) if self.evaluate_inference else None

            # Train models for treatment-specific explanations; SHAP-based
            # explanations are currently disabled, so zero placeholders are used below.
            temp = self.discrete_outcome
            self.discrete_outcome = 0
            self.pred_learners = self.get_pred_learners(seed=seed) # One for each treatment as reference treatment
            self.prog_learner = self.get_prog_learner(seed=seed) # One for the average over all treatments
            self.discrete_outcome = temp

            ## COMPUTE METRICS ##
            # Compute PEHE
            true_cates = sim.get_true_cates(X_test, T_test, outcomes_test)

            # print(f"Pred cates: {pred_cates[:5]}")
1790
            # print(f"Pred outcomes: {pred_outcomes[:5]}")
1791
            # print(f"True cates: {true_cates[:5]}")
1792
            # print(f"True outcomes: {outcomes_test[:5]}")
1793
            # quit()
1794
            (
1795
                pehe, 
1796
                pehe_normalized, 
1797
                true_cates_mean, 
1798
                true_cates_std
1799
            ) = self.compute_overall_pehe(pred_cates, true_cates, T_test)
1800
1801
            (
1802
                rmse_Y0, rmse_Y1, rmse_factual_Y0, rmse_factual_Y1, rmse_cf_Y0, rmse_cf_Y1, 
1803
                factual_outcomes_rmse, cf_outcomes_rmse,
1804
                factual_outcomes_rmse_normalized, cf_outcomes_rmse_normalized,
1805
                factual_outcomes_mean, cf_outcomes_mean,
1806
                factual_outcomes_std, cf_outcomes_std
1807
            )= self.compute_outcome_mse(pred_outcomes, outcomes_test, T_test)
1808
            f_cf_diff = np.abs(factual_outcomes_rmse - cf_outcomes_rmse)
1809
            f_cf_diff_norm = np.abs(factual_outcomes_rmse_normalized - cf_outcomes_rmse_normalized)
1810
1811
1812
            # Get aurocs in case of discrete outcomes
1813
            cf_outcomes_auroc = -1
1814
            factual_outcomes_auroc = -1
1815
            if self.discrete_outcome:
1816
                factual_outcomes_auroc, cf_outcomes_auroc = self.compute_outcome_auroc(pred_outcomes, outcomes_test, T_test)
1817
1818
            # Compute personalized explanations metrics
1819
            #k_tre = self.cfg.simulator.num_pred_features * self.cfg.simulator.num_T 
1820
            k_all = X_test.shape[1]
1821
1822
            # Compute confidence intervall coverage
1823
            ci_coverage = self.compute_ci_coverage(pred_effect_cis, true_cates, T_test) if self.evaluate_inference else -1
1824
1825
            # Compute expertise metrics
1826
            # Get learned policy
1827
            T_pred = pred_outcomes[:, :, 0].argmax(axis=1)
1828
1829
            try:
                gt_pred_expertise = self.compute_expertise_metrics(T_train, outcomes_train, type="predictive")
                gt_prog_expertise = self.compute_expertise_metrics(T_train, outcomes_train, type="prognostic")
                gt_tre_expertise = self.compute_expertise_metrics(T_train, outcomes_train, type="treatment")

                updated_gt_pred_expertise = self.compute_expertise_metrics(T_pred, outcomes_test, type="predictive")
                updated_gt_prog_expertise = self.compute_expertise_metrics(T_pred, outcomes_test, type="prognostic")
                updated_gt_tre_expertise = self.compute_expertise_metrics(T_pred, outcomes_test, type="treatment")

                gt_total_expertise = self.compute_expertise_metrics(T_train, outcomes_train, type="total") if self.discrete_outcome == 0 else -1
                es_pred_expertise = self.compute_expertise_metrics(T_train, pred_outcomes_train, type="predictive")
                es_prog_expertise = self.compute_expertise_metrics(T_train, pred_outcomes_train, type="prognostic")
                es_tre_expertise = self.compute_expertise_metrics(T_train, pred_outcomes_train, type="treatment")
                es_total_expertise = self.compute_expertise_metrics(T_train, pred_outcomes_train, type="total") if self.discrete_outcome == 0 else -1

            except Exception:
                log.warning("Error while computing expertise metrics. Defaulting to -1.")
                gt_pred_expertise = -1
                gt_prog_expertise = -1
                gt_tre_expertise = -1
                gt_total_expertise = -1
                updated_gt_pred_expertise = -1
                updated_gt_prog_expertise = -1
                updated_gt_tre_expertise = -1
                es_pred_expertise = -1
                es_prog_expertise = -1
                es_total_expertise = -1
                es_tre_expertise = -1

            # Compute in-context variability
            try:
                gt_incontext_variability = self.compute_incontext_variability(T_train, propensities_train)
                es_incontext_variability = self.compute_incontext_variability(T_train, propensities_pred_train) if self.cfg.evaluate_in_context_variability else -1
            except Exception:
                gt_incontext_variability = -1
                es_incontext_variability = -1

            # SHAP-based swap metrics are disabled; pass zero placeholders instead
            pred_learner_explanations = np.zeros((self.cfg.simulator.num_T, X_test.shape[0], X_test.shape[1], self.cfg.simulator.dim_Y))
            pred_learner_base_values = np.zeros((self.cfg.simulator.num_T, self.cfg.simulator.dim_Y))
            (
                swap_auroc_all, 
                swap_auprc_all, 
                swap_perc 
            ) = self.compute_swap_metrics(T=T_test,
                                        true_outcomes=outcomes_test,
                                        pred_cates=pred_cates,
                                        shap_values_pred=pred_learner_explanations,
                                        shap_base_values_pred=pred_learner_base_values,
                                        k=k_all)

            pred_correct_assignment_precision = self.get_pred_assignment_precision(pred_outcomes, outcomes_test)
            policy_correct_assignment_precision = 1 - swap_perc

            # Get scores relative to the baseline (first model in the list)
            if i == 0:
                baseline_metrics["pehe"] = pehe
                baseline_metrics["pehe_normalized"] = pehe_normalized
                baseline_metrics["factual_outcomes_rmse"] = factual_outcomes_rmse
                baseline_metrics["factual_outcomes_rmse_normalized"] = factual_outcomes_rmse_normalized
                baseline_metrics["cf_outcomes_rmse"] = cf_outcomes_rmse
                baseline_metrics["cf_outcomes_rmse_normalized"] = cf_outcomes_rmse_normalized
                baseline_metrics["swap_auroc_all"] = swap_auroc_all
                baseline_metrics["swap_auprc_all"] = swap_auprc_all

            fc_pehe = pehe_normalized - baseline_metrics["pehe_normalized"]
            fc_factual_outcomes = factual_outcomes_rmse_normalized - baseline_metrics["factual_outcomes_rmse_normalized"]
            fc_cf_outcomes = cf_outcomes_rmse_normalized - baseline_metrics["cf_outcomes_rmse_normalized"]
            fc_swap_auroc_all = swap_auroc_all - baseline_metrics["swap_auroc_all"]
            fc_swap_auprc_all = swap_auprc_all - baseline_metrics["swap_auprc_all"]

            results_data.append(
                [
                    seed,
                    split_id,
                    x_axis_value,
                    compare_value,
                    model_name,
                    true_cates_mean,
                    true_cates_std,
                    pehe,
                    pehe_normalized,
                    factual_outcomes_auroc,
                    cf_outcomes_auroc,
                    rmse_Y0,
                    rmse_Y1,
                    rmse_factual_Y0,
                    rmse_factual_Y1,
                    rmse_cf_Y0,
                    rmse_cf_Y1,
                    factual_outcomes_rmse,
                    cf_outcomes_rmse,
                    factual_outcomes_rmse_normalized,
                    cf_outcomes_rmse_normalized,
                    factual_outcomes_mean,
                    cf_outcomes_mean,
                    factual_outcomes_std,
                    cf_outcomes_std,
                    f_cf_diff,
                    f_cf_diff_norm,
                    fc_pehe,
                    fc_factual_outcomes,
                    fc_cf_outcomes,
                    fc_swap_auroc_all,
                    fc_swap_auprc_all,
                    swap_auroc_all,
                    swap_auprc_all,
                    swap_perc,
                    pred_correct_assignment_precision,
                    policy_correct_assignment_precision,
                    ci_coverage,
                    pred_acc_scores_all_features,
                    pred_acc_scores_predictive_features,
                    pred_acc_scores_prog_features,
                    pred_acc_scores_selective_features,
                    prog_acc_scores_all_features,
                    prog_acc_scores_predictive_features,
                    prog_acc_scores_prog_features,
                    prog_acc_scores_selective_features,
                    train_treatment_distribution,
                    test_treatment_distribution,
                    gt_pred_expertise,
                    gt_prog_expertise,
                    gt_tre_expertise,
                    gt_pred_expertise/(gt_pred_expertise + gt_prog_expertise),
                    gt_total_expertise,
                    updated_gt_pred_expertise,
                    updated_gt_prog_expertise,
                    updated_gt_tre_expertise,
                    es_pred_expertise,
                    es_prog_expertise,
                    es_tre_expertise,
                    es_total_expertise,
                    np.abs(gt_pred_expertise - es_pred_expertise),
                    np.abs(gt_prog_expertise - es_prog_expertise),
                    np.abs(gt_total_expertise - es_total_expertise),
                    1 - gt_incontext_variability,
                    1 - es_incontext_variability,
                    self.training_times[model_name],
                ]
            )

            metrics_df = pd.DataFrame(
                results_data,
                columns=[
                    "Seed",
                    "Split ID",
                    x_axis_name,
                    compare_name,
                    "Learner",
                    "CATE true mean",
                    "CATE true std",
                    "PEHE",
                    "Normalized PEHE",
                    "Factual AUROC",
                    "CF AUROC",
                    "RMSE Y0",
                    "RMSE Y1",
                    "Factual RMSE Y0",
                    "Factual RMSE Y1",
                    "CF RMSE Y0",
                    "CF RMSE Y1",
                    "Factual RMSE",
                    "CF RMSE",
                    "Normalized F-RMSE",
                    "Normalized CF-RMSE",
                    "F-Outcome true mean",
                    "CF-Outcome true mean",
                    "F-Outcome true std",
                    "CF-Outcome true std",
                    "F-CF Outcome Diff",
                    "Normalized F-CF Diff",
                    "FC PEHE",
                    "FC F-RMSE",
                    "FC CF-RMSE",
                    "FC Swap AUROC",
                    "FC Swap AUPRC",
                    "Swap AUROC@all",
                    "Swap AUPRC@all",
                    "True Swap Perc",
                    "Pred Precision",
                    "Policy Precision",
                    "CI Coverage",
                    "Pred: All features ACC",
                    "Pred: Pred features ACC",
                    "Pred: Prog features ACC",
                    "Pred: Select features ACC",
                    "Prog: All features ACC",
                    "Prog: Pred features ACC",
                    "Prog: Prog features ACC",
                    "Prog: Select features ACC",
                    "T Distribution: Train",
                    "T Distribution: Test",
                    "GT Pred Expertise",
                    "GT Prog Expertise",
                    "GT Tre Expertise",
                    "GT Expertise Ratio",
                    "GT Total Expertise",
                    "Upd. GT Pred Expertise",
                    "Upd. GT Prog Expertise",
                    "Upd. GT Tre Expertise",
                    "ES Pred Expertise",
                    "ES Prog Expertise",
                    "ES Tre Expertise",
                    "ES Total Expertise",
                    "GT-ES Pred Expertise Diff",
                    "GT-ES Prog Expertise Diff",
                    "GT-ES Total Expertise Diff",
                    "GT In-context Var",
                    "ES In-context Var",
                    "Training Duration",
                ],
            )

            # Save intermediate results
            self.save_results(metrics_df, save_df_only=True, compare_axis_values=None)

        return metrics_df