Diff of /evaluator.py [000000] .. [bb7f56]


--- a
+++ b/evaluator.py
@@ -0,0 +1,544 @@
+#!/usr/bin/env python
+# Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import time
+from multiprocessing import Pool
+
+import numpy as np
+import pandas as pd
+import torch
+from sklearn.metrics import roc_auc_score, average_precision_score
+from sklearn.metrics import roc_curve, precision_recall_curve
+
+import utils.exp_utils as utils
+import utils.model_utils as mutils
+import utils.eval_util as eutils
+import plotting
+
+
+
+class Evaluator():
+
+    def __init__(self, cf, logger, mode='test'):
+        """
+        :param mode: either 'train', 'val_sampling', 'val_patient' or 'test'. Handles prediction lists of different forms.
+        """
+        self.cf = cf
+        self.logger = logger
+        self.mode = mode
+
+        self.plot_dir = self.cf.test_dir if self.mode == "test" else self.cf.plot_dir
+        if self.cf.plot_prediction_histograms:
+            self.hist_dir = os.path.join(self.plot_dir, 'histograms')
+            os.makedirs(self.hist_dir, exist_ok=True)
+        if self.cf.plot_stat_curves:
+            self.curves_dir = os.path.join(self.plot_dir, 'stat_curves')
+            os.makedirs(self.curves_dir, exist_ok=True)
+
+
+    def eval_losses(self, batch_res_dicts):
+        if hasattr(self.cf, "losses_to_monitor"):
+            loss_names = self.cf.losses_to_monitor
+        else:
+            loss_names = {name for b_res_dict in batch_res_dicts for name in b_res_dict if 'loss' in name}
+        self.epoch_losses = {l_name: torch.tensor([b_res_dict[l_name] for b_res_dict in batch_res_dicts if l_name
+                                                   in b_res_dict.keys()]).mean().item() for l_name in loss_names}
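+        # e.g. (hypothetical values): batch_res_dicts = [{'loss': 0.7, ...}, {'loss': 0.5, ...}]
+        # -> self.epoch_losses = {'loss': 0.6}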
+
+    def eval_boxes(self, batch_res_dicts, pid_list):
+        """ """
+
+        df_list_preds = []
+        df_list_labels = []
+        df_list_class_preds = []
+        df_list_pids = []
+        df_list_type = []
+        df_list_match_iou = []
+
+
+        if self.mode == 'train' or self.mode == 'val_sampling':
+            # one pid per batch element
+            # batch_size > 1, with varying patients across batch:
+            # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
+            # -> [results_0, results_1, ..]
+            batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]  # len: nr of batches in epoch
+            batch_inst_boxes = [[b_inst_boxes] for whole_batch_boxes in batch_inst_boxes for b_inst_boxes in
+                                whole_batch_boxes]
+        else:
+            # patient processing, one element per batch = one patient.
+            # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..]
+            batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]
+
+        assert len(batch_inst_boxes) == len(pid_list)
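+        # illustrative flattening for 'val_sampling' with batch_size 2 (shapes hypothetical):
+        # [[el_00_boxes, el_01_boxes], [el_10_boxes, el_11_boxes]] -> [[el_00_boxes], [el_01_boxes], ...],
+        # so that batch_inst_boxes[pix] holds exactly the box lists belonging to pid_list[pix].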
+
+        for match_iou in self.cf.ap_match_ious:
+            self.logger.info('evaluating with match_iou: {}'.format(match_iou))
+            for cl in list(self.cf.class_dict.keys()):
+                for pix, pid in enumerate(pid_list):
+
+                    len_df_list_before_patient = len(df_list_pids)
+
+                    # input of each batch element is a list of boxes, where each box is a dictionary.
+                    for bix, b_boxes_list in enumerate(batch_inst_boxes[pix]):
+
+                        b_tar_boxes = np.array([box['box_coords'] for box in b_boxes_list if
+                                                (box['box_type'] == 'gt' and box['box_label'] == cl)])
+                        b_cand_boxes = np.array([box['box_coords'] for box in b_boxes_list if
+                                                 (box['box_type'] == 'det' and
+                                                  box['box_pred_class_id'] == cl)])
+                        b_cand_scores = np.array([box['box_score'] for box in b_boxes_list if
+                                                  (box['box_type'] == 'det' and
+                                                   box['box_pred_class_id'] == cl)])
+
+                        # check if predictions and ground truth boxes exist and match them according to match_iou.
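+                        # toy example of the matching below (values hypothetical): with two candidates and one
+                        # gt box, overlaps could be [[0.6], [0.05]]; for match_iou 0.1, candidate 0 becomes a
+                        # match (match_cand_ixs [0], match_gt_ixs [0]), candidate 1 a non-match (non_match_cand_ixs [1]).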
+                        if 0 not in b_cand_boxes.shape and 0 not in b_tar_boxes.shape:
+                            overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes)
+                            match_cand_ixs = np.argwhere(np.max(overlaps, 1) > match_iou)[:, 0]
+                            non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0]
+                            match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :],
+                                                     1) if 0 not in match_cand_ixs.shape else np.array([])
+                            non_match_gt_ixs = np.array(
+                                [ii for ii in np.arange(b_tar_boxes.shape[0]) if ii not in match_gt_ixs])
+                            unique, counts = np.unique(match_gt_ixs, return_counts=True)
+
+                            # check for double assignments, i.e. two predictions having been assigned to the same gt.
+                            # according to the COCO metrics, only one prediction counts as a true positive; the rest
+                            # count as false positives. This case is supposed to be avoided by the model itself,
+                            # e.g. by using a low enough NMS threshold.
+                            if np.any(counts > 1):
+                                double_match_gt_ixs = unique[np.argwhere(counts > 1)[:, 0]]
+                                keep_max = []
+                                double_match_list = []
+                                for dg in double_match_gt_ixs:
+                                    double_match_cand_ixs = match_cand_ixs[np.argwhere(match_gt_ixs == dg)]
+                                    keep_max.append(double_match_cand_ixs[np.argmax(b_cand_scores[double_match_cand_ixs])])
+                                    double_match_list += [ii for ii in double_match_cand_ixs]
+
+                                fp_ixs = np.array([ii for ii in match_cand_ixs if
+                                                     (ii in double_match_list and ii not in keep_max)])
+
+                                match_cand_ixs = np.array([ii for ii in match_cand_ixs if ii not in fp_ixs])
+
+                                df_list_preds += [ii for ii in b_cand_scores[fp_ixs]]
+                                df_list_labels += [0] * fp_ixs.shape[0]
+                                df_list_class_preds += [cl] * fp_ixs.shape[0]
+                                df_list_pids += [pid] * fp_ixs.shape[0]
+                                df_list_type += ['det_fp'] * fp_ixs.shape[0]
+
+                            # matched:
+                            if 0 not in match_cand_ixs.shape:
+                                df_list_preds += [ii for ii in b_cand_scores[match_cand_ixs]]
+                                df_list_labels += [1] * match_cand_ixs.shape[0]
+                                df_list_class_preds += [cl] * match_cand_ixs.shape[0]
+                                df_list_pids += [pid] * match_cand_ixs.shape[0]
+                                df_list_type += ['det_tp'] * match_cand_ixs.shape[0]
+                            # rest fp:
+                            if 0 not in non_match_cand_ixs.shape:
+                                df_list_preds += [ii for ii in b_cand_scores[non_match_cand_ixs]]
+                                df_list_labels += [0] * non_match_cand_ixs.shape[0]
+                                df_list_class_preds += [cl] * non_match_cand_ixs.shape[0]
+                                df_list_pids += [pid] * non_match_cand_ixs.shape[0]
+                                df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0]
+                            # rest fn:
+                            if 0 not in non_match_gt_ixs.shape:
+                                df_list_preds += [0] * non_match_gt_ixs.shape[0]
+                                df_list_labels += [1] * non_match_gt_ixs.shape[0]
+                                df_list_class_preds += [cl] * non_match_gt_ixs.shape[0]
+                                df_list_pids += [pid] * non_match_gt_ixs.shape[0]
+                                df_list_type += ['det_fn'] * non_match_gt_ixs.shape[0]
+                        # only fp:
+                        if 0 not in b_cand_boxes.shape and 0 in b_tar_boxes.shape:
+                            df_list_preds += [ii for ii in b_cand_scores]
+                            df_list_labels += [0] * b_cand_scores.shape[0]
+                            df_list_class_preds += [cl] * b_cand_scores.shape[0]
+                            df_list_pids += [pid] * b_cand_scores.shape[0]
+                            df_list_type += ['det_fp'] * b_cand_scores.shape[0]
+                        # only fn:
+                        if 0 in b_cand_boxes.shape and 0 not in b_tar_boxes.shape:
+                            df_list_preds += [0] * b_tar_boxes.shape[0]
+                            df_list_labels += [1] * b_tar_boxes.shape[0]
+                            df_list_class_preds += [cl] * b_tar_boxes.shape[0]
+                            df_list_pids += [pid] * b_tar_boxes.shape[0]
+                            df_list_type += ['det_fn'] * b_tar_boxes.shape[0]
+
+                    # a patient with 0 detections and 0 ground-truth boxes needs a dummy score in order not to
+                    # disappear from the stats. these entries are filtered out for roi-level evaluation later.
+                    # During training (and val_sampling), tn are assigned per sample, independently of associated patients.
+                    if len(df_list_pids) == len_df_list_before_patient:
+                        df_list_preds.append(0)
+                        df_list_labels.append(0)
+                        df_list_class_preds.append(cl)
+                        df_list_pids.append(pid)
+                        df_list_type.append('patient_tn')  # true negative: no ground-truth boxes, no detections.
+
+            df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou))
+
+        self.test_df = pd.DataFrame()
+        self.test_df['pred_score'] = df_list_preds
+        self.test_df['class_label'] = df_list_labels
+        self.test_df['pred_class'] = df_list_class_preds
+        self.test_df['pid'] = df_list_pids
+        self.test_df['det_type'] = df_list_type
+        self.test_df['fold'] = self.cf.fold
+        self.test_df['match_iou'] = df_list_match_iou
+
+
+    def evaluate_predictions(self, results_list, monitor_metrics=None):
+        """
+        Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and foreground classes.
+        Resulting info of each prediction is stored as one line in an internal dataframe, with the keys:
+        det_type: 'det_tp' (true positive), 'det_fp' (false positive), 'det_fn' (false negative),
+        'patient_tn' (patient-level true negative).
+        pred_class: foreground class predicted for the object.
+        pid: corresponding patient-id.
+        pred_score: confidence score [0, 1]
+        fold: corresponding fold of CV.
+        match_iou: utilized IoU for matching.
+        :param results_list: list of model predictions. Either from train/val_sampling (patch processing) for monitoring with form:
+        [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
+        Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...])
+        :param monitor_metrics (optional): dict of dicts with all metrics of previous epochs.
+        :return monitor_metrics: if provided (during training), return monitor_metrics now including results of current epoch.
+        """
+
+        self.logger.info('evaluating in mode {}'.format(self.mode))
+
+        batch_res_dicts = [batch[0] for batch in results_list]  # len: nr of batches in epoch
+        if self.mode == 'train' or self.mode == 'val_sampling':
+            # one pid per batch element
+            # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
+            # -> [pid_0, pid_1, ...]
+            # additional list wrapping to conform with the per-patient batches below, where one pid is linked to more than one batch instance
+            pid_list = [batch_instance_pid for batch in results_list for batch_instance_pid in batch[1]]
+        elif self.mode == "val_patient" or self.mode == "test":
+            # [[results_0, pid_0], [results_1, pid_1], ...] -> [pid_0, pid_1, ...]
+            # in the PatientBatchIterator there is only one pid per batch
+            pid_list = [np.unique(batch[1]) for batch in results_list]
+            assert np.all([len(pid) == 1 for pid in pid_list]), \
+                "pid list in patient-eval mode should only contain a single scalar per patient: {}".format(pid_list)
+            pid_list = [pid[0] for pid in pid_list]
+            # todo remove assert
+            pid_list_orig = [item[1] for item in results_list]
+            assert np.all(pid_list == pid_list_orig)
+        else:
+            raise Exception("undefined run mode encountered")
+
+        self.eval_losses(batch_res_dicts)
+        self.eval_boxes(batch_res_dicts, pid_list)
+
+        if monitor_metrics is not None:
+            # return all_stats, updated monitor_metrics
+            return self.return_metrics(monitor_metrics)
+
+
+    def return_metrics(self, monitor_metrics=None):
+        """
+        Calculates AP/AUC scores for the internal dataframe. Called directly from evaluate_predictions during training
+        for monitoring, or from score_test_df during inference (for single folds or the aggregated test set). Loops over
+        foreground classes and score_levels (typically 'rois' and 'patient'), gets scores and stores them. Optionally
+        creates plots of prediction histograms and roc/prc curves.
+        :param monitor_metrics: dict of dicts with all metrics of previous epochs.
+        this function adds metrics for current epoch and returns the same object.
+        :return: all_stats: list. Contains dicts with resulting scores for each combination of foreground class and
+        score_level.
+        :return: monitor_metrics
+        """
+
+        # -------------- monitoring independent of class, score level ------------
+        if monitor_metrics is not None:
+            for l_name in self.epoch_losses:
+                monitor_metrics[l_name] = [self.epoch_losses[l_name]]
+
+
+        df = self.test_df
+
+        all_stats = []
+        for cl in list(self.cf.class_dict.keys()):
+            cl_df = df[df.pred_class == cl]
+
+            for score_level in self.cf.report_score_level:
+                stats_dict = {}
+                stats_dict['name'] = 'fold_{} {} cl_{}'.format(self.cf.fold, score_level, cl)
+
+                if score_level == 'rois':
+                    # kick out dummy entries for true negative patients. not needed on roi-level.
+                    spec_df = cl_df[cl_df.det_type != 'patient_tn']
+                    stats_dict['ap'] = get_roi_ap_from_df([spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap])
+                    # AUC not sensible on roi-level, since true negative box predictions do not exist. Would reward
+                    # higher amounts of low confidence false positives.
+                    stats_dict['auc'] = np.nan
+                    stats_dict['roc'] = np.nan
+                    stats_dict['prc'] = np.nan
+
+                    # for the aggregated test set case, additionally get the scores for averaging over fold results.
+                    if len(df.fold.unique()) > 1:
+                        aps = []
+                        for fold in df.fold.unique():
+                            fold_df = spec_df[spec_df.fold == fold]
+                            aps.append(get_roi_ap_from_df([fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap]))
+                        stats_dict['mean_ap'] = np.mean(aps)
+                        stats_dict['mean_auc'] = 0
+
+                # on patient level, aggregate predictions per patient (pid): the patient's predicted score is its highest
+                # confidence prediction for this class. The patient class label is 1 if a roi of this class exists in the patient, else 0.
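+                # e.g. (values hypothetical): a patient with rois of this class scored [0.2, 0.8] and at least
+                # one matching gt roi collapses to a single row with pred_score 0.8 and class_label 1.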
+                if score_level == 'patient':
+                    # spec_df = cl_df.groupby(['pid'], as_index=False).agg({'class_label': 'max', 'pred_score': 'max', 'fold': 'first'})
+                    spec_df = cl_df.groupby(["pid"], as_index=False).apply(eutils.patient_based_filter)
+
+                    if len(spec_df.class_label.unique()) > 1:
+                        stats_dict['auc'] = roc_auc_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
+                        stats_dict['roc'] = roc_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
+                    else:
+                        stats_dict['auc'] = np.nan
+                        stats_dict['roc'] = np.nan
+
+                    if (spec_df.class_label == 1).any():
+                        stats_dict['ap'] = average_precision_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
+                        stats_dict['prc'] = precision_recall_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
+                    else:
+                        stats_dict['ap'] = np.nan
+                        stats_dict['prc'] = np.nan
+
+                    # for the aggregated test set case, additionally get the scores for averaging over fold results.
+                    if len(df.fold.unique()) > 1:
+                        aucs = []
+                        aps = []
+                        for fold in df.fold.unique():
+                            fold_df = spec_df[spec_df.fold == fold]
+                            if len(fold_df.class_label.unique()) > 1:
+                                aucs.append(roc_auc_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist()))
+                            if (fold_df.class_label == 1).any():
+                                aps.append(average_precision_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist()))
+                        stats_dict['mean_auc'] = np.mean(aucs)
+                        stats_dict['mean_ap'] = np.mean(aps)
+
+                # fill new results into monitor_metrics dict. for simplicity, only one class (of interest) is monitored on patient level.
+                if monitor_metrics is not None and not (score_level == 'patient' and cl != self.cf.patient_class_of_interest):
+                    score_level_name = 'patient' if score_level == 'patient' else self.cf.class_dict[cl]
+                    monitor_metrics[score_level_name + '_ap'].append(stats_dict['ap'] if stats_dict['ap'] > 0 else np.nan)
+                    if score_level == 'patient':
+                        monitor_metrics[score_level_name + '_auc'].append(
+                            stats_dict['auc'] if stats_dict['auc'] > 0 else np.nan)
+
+                if self.cf.plot_prediction_histograms:
+                    out_filename = os.path.join(self.hist_dir, 'pred_hist_{}_{}_{}_cl{}'.format(
+                        self.cf.fold, 'val' if 'val' in self.mode else self.mode, score_level, cl))
+                    # type_list = None if score_level == 'patient' else spec_df.det_type.tolist()
+                    type_list = spec_df.det_type.tolist()
+                    utils.split_off_process(plotting.plot_prediction_hist, spec_df.class_label.tolist(),
+                                            spec_df.pred_score.tolist(), type_list, out_filename)
+
+                all_stats.append(stats_dict)
+
+                # analysis of the hyper-parameter cf.min_det_thresh, for optimization on the validation set.
+                if self.cf.scan_det_thresh:
+                    conf_threshs = list(np.arange(0.9, 1, 0.01))
+                    pool = Pool(processes=8)
+                    mp_inputs = [[spec_df, ii, self.cf.per_patient_ap] for ii in conf_threshs]
+                    aps = pool.map(get_roi_ap_from_df, mp_inputs, chunksize=1)
+                    pool.close()
+                    pool.join()
+                    self.logger.info('results from scanning over det_threshs: {}'.format(
+                        [[i, j] for i, j in zip(conf_threshs, aps)]))
+
+        if self.cf.plot_stat_curves:
+            out_filename = os.path.join(self.curves_dir, '{}_{}_stat_curves'.format(self.cf.fold, self.mode))
+            utils.split_off_process(plotting.plot_stat_curves, all_stats, out_filename)
+
+        # get average stats over foreground classes on roi level.
+        avg_ap = np.mean([d['ap'] for d in all_stats if 'rois' in d['name']])
+        all_stats.append({'name': 'average_foreground_roi', 'auc': 0, 'ap': avg_ap})
+        if len(df.fold.unique()) > 1:
+            avg_mean_ap = np.mean([d['mean_ap'] for d in all_stats if 'rois' in d['name']])
+            all_stats[-1]['mean_ap'] = avg_mean_ap
+            all_stats[-1]['mean_auc'] = 0
+
+        # in small data sets, values of model_selection_criterion can be identical across epochs, which breaks the
+        # ranking of model_selector. Thus, perturb identical values by a negligible random term.
+        for sc in self.cf.model_selection_criteria:
+            if 'val' in self.mode and monitor_metrics[sc].count(monitor_metrics[sc][-1]) > 1 and monitor_metrics[sc][-1] is not None:
+                monitor_metrics[sc][-1] += 1e-6 * np.random.rand()
+
+        return all_stats, monitor_metrics
+
+
+    def write_to_results_table(self, stats, metrics_to_score, out_path):
+        """Write overall results to a common inter-experiment table.
+        :param metrics_to_score:
+        :return:
+        """
+
+        with open(out_path, 'a') as handle:
+            # ---column headers---
+            handle.write('\n{},'.format("Experiment Name"))
+            handle.write('{},'.format("Time Stamp"))
+            handle.write('{},'.format("Samples Seen"))
+            handle.write('{},'.format("Spatial Dim"))
+            handle.write('{},'.format("Patch Size"))
+            handle.write('{},'.format("CV Folds"))
+            handle.write('{},'.format("WBC IoU"))
+            handle.write('{},'.format("Merge-2D-to-3D IoU"))
+            for s in stats:
+                #if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "average" in s["name"]:
+                for metric in metrics_to_score:
+                    if metric in s.keys() and not np.isnan(s[metric]):
+                        if metric == 'ap':
+                            handle.write('{} : {}_{},'.format(s['name'], metric.upper(),
+                                                              "_".join((np.array(self.cf.ap_match_ious) * 100)
+                                                                       .astype("int").astype("str"))))
+                        else:
+                            handle.write('{} : {},'.format(s['name'], metric.upper()))
+                    else:
+                        print("WARNING: skipped metric {} since not avail".format(metric))
+            handle.write('\n')
+
+            # --- columns content---
+            handle.write('{},'.format(self.cf.exp_dir.split(os.sep)[-1]))
+            handle.write('{},'.format(time.strftime("%d%b%y %H:%M:%S")))
+            handle.write('{},'.format(self.cf.num_epochs * self.cf.num_train_batches * self.cf.batch_size))
+            handle.write('{}D,'.format(self.cf.dim))
+            handle.write('{},'.format("x".join([str(self.cf.patch_size[i]) for i in range(self.cf.dim)])))
+            handle.write('{},'.format(str(self.test_df.fold.unique().tolist()).replace(",", "")))
+            handle.write('{},'.format(self.cf.wcs_iou))
+            handle.write('{},'.format(self.cf.merge_3D_iou if self.cf.merge_2D_to_3D_preds else str("N/A")))
+            for s in stats:
+                #if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "mean" in s["name"]:
+                for metric in metrics_to_score:
+                    if metric in s.keys() and not np.isnan(s[metric]):
+                        handle.write('{:0.3f}, '.format(s[metric]))
+            handle.write('\n')
+
+    def score_test_df(self, internal_df=True):
+        """
+        Writes out resulting scores to text files: first scores the class-internal dataframe (typically the current
+        fold), writes the results to a text file, and pickles the dataframe. Also checks if dataframe pickles of
+        all folds of the cross-validation exist in exp_dir. If so, loads all dataframes, aggregates test sets over folds,
+        and calculates and writes out overall metrics.
+        """
+        if internal_df:
+
+            self.test_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_df.pickle'.format(self.cf.fold)))
+            stats, _ = self.return_metrics()
+
+            with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle:
+                handle.write('\n****************************\n')
+                handle.write('\nresults for fold {} \n'.format(self.cf.fold))
+                handle.write('\n****************************\n')
+                handle.write('\nfold df shape {}\n  \n'.format(self.test_df.shape))
+                for s in stats:
+                    handle.write('AUC {:0.4f}  AP {:0.4f} {} \n'.format(s['auc'], s['ap'], s['name']))
+
+            fold_df_paths = [ii for ii in os.listdir(self.cf.test_dir) if ('test_df.pickle' in ii and 'overall' not in ii)]
+            if len(fold_df_paths) == self.cf.n_cv_splits:
+                results_table_path = os.path.join((os.sep).join(self.cf.exp_dir.split(os.sep)[:-1]), 'results_table.csv')
+
+                if not self.cf.hold_out_test_set or not self.cf.ensemble_folds:
+                    with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle:
+                        self.cf.fold = 'overall'
+                        dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_df_paths]
+                        for ix, df in enumerate(dfs_list):
+                            df['fold'] = ix
+                        self.test_df = pd.concat(dfs_list)
+                        stats, _ = self.return_metrics()
+                        handle.write('\n****************************\n')
+                        handle.write('\nOVERALL RESULTS \n')
+                        handle.write('\n****************************\n')
+                        handle.write('\ndf shape {}\n  \n'.format(self.test_df.shape))
+                        for s in stats:
+                            handle.write('\nAUC {:0.4f} (mu {:0.4f})  AP {:0.4f} (mu {:0.4f})  {}\n '
+                                         .format(s['auc'], s['mean_auc'], s['ap'], s['mean_ap'], s['name']))
+                    metrics_to_score = ["auc", "mean_auc", "ap", "mean_ap"]
+                    self.write_to_results_table(stats, metrics_to_score, out_path=results_table_path)
+                else:
+                    metrics_to_score = ["auc", "ap"]
+                    self.write_to_results_table(stats, metrics_to_score, out_path=results_table_path)
+
+
+def get_roi_ap_from_df(inputs):
+    '''
+    :param inputs: list [df, det_thresh, per_patient_ap], packed into one argument for use with Pool.map.
+        df: dataframe of evaluated box predictions.
+        det_thresh: min threshold for filtering out low-confidence predictions.
+        per_patient_ap: boolean flag. evaluate average precision per patient and average over patients,
+        instead of computing one ap over the whole data set.
+    :return: average_precision (float)
+    '''
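+    # hypothetical usage, mirroring the Pool.map call in return_metrics (arguments are packed
+    # into a single list because Pool.map passes exactly one argument per call):
+    #   ap = get_roi_ap_from_df([test_df, 0.1, False])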
+    df, det_thresh, per_patient_ap = inputs
+
+    if per_patient_ap:
+        pids_list = df.pid.unique()
+        aps = []
+        for match_iou in df.match_iou.unique():
+            iou_df = df[df.match_iou == match_iou]
+            for pid in pids_list:
+                pid_df = iou_df[iou_df.pid == pid]
+                all_p = len(pid_df[pid_df.class_label == 1])
+                pid_df = pid_df[(pid_df.det_type == 'det_fp') | (pid_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False)
+                pid_df = pid_df[pid_df.pred_score > det_thresh]
+                if len(pid_df) == 0 and all_p == 0:
+                    pass
+                elif len(pid_df) > 0 and all_p == 0:
+                    aps.append(0)
+                else:
+                    aps.append(compute_roi_ap(pid_df, all_p))
+        return np.mean(aps)
+
+    else:
+        aps = []
+        for match_iou in df.match_iou.unique():
+            iou_df = df[df.match_iou == match_iou]
+            all_p = len(iou_df[iou_df.class_label == 1])
+            iou_df = iou_df[(iou_df.det_type == 'det_fp') | (iou_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False)
+            iou_df = iou_df[iou_df.pred_score > det_thresh]
+            if all_p > 0:
+                aps.append(compute_roi_ap(iou_df, all_p))
+        return np.mean(aps)
+
+
+
+def compute_roi_ap(df, all_p):
+    """
+    adapted from: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py
+    :param df: dataframe containing class labels of predictions sorted in descending manner by their prediction score.
+    :param all_p: number of all ground truth objects. (for denominator of recall.)
+    :return:
+    """
+    tp = df.class_label.values
+    fp = (tp == 0) * 1
+    # recall thresholds at which precision will be measured.
+    R = np.linspace(.0, 1, 101, endpoint=True)
+    tp_sum = np.cumsum(tp)
+    fp_sum = np.cumsum(fp)
+    nd = len(tp)
+    rc = tp_sum / all_p
+    pr = tp_sum / (fp_sum + tp_sum)
+    # initialize precision array over recall steps.
+    q = np.zeros((len(R),))
+
+    # numpy is slow at element-wise access without cython optimization;
+    # using python lists instead gives a significant speed improvement.
+    pr = pr.tolist()
+    q = q.tolist()
+    for i in range(nd - 1, 0, -1):
+        if pr[i] > pr[i - 1]:
+            pr[i - 1] = pr[i]
+
+    # discretize empirical recall steps into the given bins.
+    inds = np.searchsorted(rc, R, side='left')
+    try:
+        for ri, pi in enumerate(inds):
+            q[ri] = pr[pi]
+    except IndexError:
+        # recall never reaches the remaining thresholds (pi == len(pr) there),
+        # so the corresponding precisions stay 0.
+        pass
+
+    return np.mean(q)