"""
Evaluating past experiments, for instance when the evaluation metric has changed
The expected submission structure and required files:
* `registration-results.csv` - cover file with experimental results
* `computer-performances.json` - computer performance evaluation
* landmarks in CSV files with relative path described
in `registration-results.csv` in column 'Warped source landmarks'
The required files in the reference (ground truth):
* `dataset.csv` - cover file with planned registrations
* `computer-performances.json` - reference performance evaluation
* `lnds_provided/` provided landmarks in CSV files with relative path described
in `dataset.csv` in column 'Source landmarks'
* `lnds_reference/` reference (ground truth) landmarks in CSV files with relative
path described in `dataset.csv` in both columns 'Target landmarks'
and 'Source landmarks'
Sample usage::
python evaluate_submission.py \
-e ./results/BmUnwarpJ \
-t ./data-images/pairs-imgs-lnds_histol.csv \
-d ./data-images \
-r ./data-images \
-p ./bm_experiments/computer-performances_cmpgrid-71.json \
-o ./output \
--min_landmarks 0.20
DOCKER
------
Running in grand-challenge.org environment::
python evaluate_submission.py \
-e /input \
-t /opt/evaluation/dataset.csv \
-d /opt/evaluation/lnds_provided \
-r /opt/evaluation/lnds_reference \
-p /opt/evaluation/computer-performances.json \
-o /output \
--min_landmarks 0.20
or run locally::
python bm_ANHIR/evaluate_submission.py \
-e bm_ANHIR/submission \
-t bm_ANHIR/dataset_ANHIR/dataset_medium.csv \
-d bm_ANHIR/dataset_ANHIR/landmarks_user_phase2 \
-r bm_ANHIR/dataset_ANHIR/landmarks_all \
-p bm_ANHIR/dataset_ANHIR/computer-performances_cmpgrid-71.json \
-o bm_ANHIR/output \
--min_landmarks 1.0
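The computer-performance JSON (both reference and submission) is expected to contain
at least the two timing keys used for the time normalization, for instance
(illustrative values only)::
    {
        "registration @1-thread": 60.5,
        "registration @n-thread": 15.2
    }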
References:
* https://grand-challengeorg.readthedocs.io/en/latest/evaluation.html
Copyright (C) 2018-2019 Jiri Borovec <jiri.borovec@fel.cvut.cz>
"""
import argparse
import json
import logging
import os
import re
import sys
import time
from functools import partial
import numpy as np
import pandas as pd
sys.path += [os.path.abspath('.'), os.path.abspath('..')] # Add path to root
from birl.benchmark import _df_drop_unnamed, COL_PAIRED_LANDMARKS, filter_paired_landmarks, ImRegBenchmark
from birl.utilities.data_io import create_folder, load_landmarks, save_landmarks, update_path
from birl.utilities.dataset import parse_path_scale
from birl.utilities.experiments import FORMAT_DATE_TIME, get_nb_workers, iterate_mproc_map, parse_arg_params
NB_WORKERS = get_nb_workers(0.9)
NAME_CSV_RESULTS = 'registration-results.csv'
NAME_JSON_COMPUTER = 'computer-performances.json'
NAME_JSON_RESULTS = 'metrics.json'
COL_NORM_TIME = 'Norm. execution time [minutes]'
COL_TISSUE = 'Tissue kind'
# FOLDER_FILTER_DATASET = 'filtered dataset'
CMP_THREADS = ('1', 'n')
#: Requiring the initial overlap to match the warped one is tricky since some image
#  pairs do not have the same number of points, so keeping this False is recommended
REQUIRE_OVERLAP_INIT_TARGET = False
def create_parser():
""" parse the input parameters
:return dict: parameters
"""
# SEE: https://docs.python.org/3/library/argparse.html
parser = argparse.ArgumentParser()
parser.add_argument(
'-e', '--path_experiment', type=str, required=True, help='path to the experiments', default='/input/'
)
parser.add_argument(
'-t',
'--path_table',
type=str,
required=True,
help='path to cover table (csv file)',
default='/opt/evaluation/dataset.csv'
)
parser.add_argument(
'-d',
'--path_dataset',
type=str,
required=True,
help='path to dataset with provided landmarks',
default='/opt/evaluation/provided'
)
parser.add_argument(
'-r', '--path_reference', type=str, required=False, help='path to complete ground truth landmarks'
)
parser.add_argument(
'-p', '--path_comp_bm', type=str, required=False, help='path to reference computer performance JSON'
)
parser.add_argument(
'-o', '--path_output', type=str, required=True, help='path to output results', default='/output/'
)
# required ratio of submitted landmarks, matching values in COL_PAIRED_LANDMARKS
parser.add_argument(
'--min_landmarks', type=float, required=False, default=0.5, help='ratio of required landmarks in submission'
)
# parser.add_argument('--nb_workers', type=int, required=False, default=NB_WORKERS,
# help='number of processes in parallel')
parser.add_argument(
'--details', action='store_true', required=False, default=False, help='export details for each case'
)
return parser
def filter_export_landmarks(idx_row, path_output, path_dataset, path_reference):
""" filter all relevant landmarks which were used and copy them to experiment
The case is that in certain challenge stage users had provided just a subset
of all image landmarks which could be laos shuffled. The idea is to filter identify
all user used (provided in dataset) landmarks and filter them from temporary
reference dataset.
:param tuple(idx,dict|Series) idx_row: experiment DataFrame
:param str path_output: path to output folder
:param str path_dataset: path to provided landmarks
:param str path_reference: path to the complete landmark collection
:return tuple(idx,float): record index and match ratio
"""
idx, row = idx_row
ratio_matches, lnds_filter_ref, lnds_filter_move = \
filter_paired_landmarks(row, path_dataset, path_reference,
ImRegBenchmark.COL_POINTS_MOVE,
ImRegBenchmark.COL_POINTS_REF)
# moving and reference landmarks
for col, lnds_flt in [(ImRegBenchmark.COL_POINTS_REF, lnds_filter_ref),
(ImRegBenchmark.COL_POINTS_MOVE, lnds_filter_move)]:
path_out = update_path(row[col], pre_path=path_output)
create_folder(os.path.dirname(path_out), ok_existing=True)
if os.path.isfile(path_out):
if not np.array_equal(load_landmarks(path_out), lnds_flt):
raise ValueError('overwrite different set of landmarks')
save_landmarks(path_out, lnds_flt)
return idx, ratio_matches
def normalize_exec_time(df_experiments, path_experiments, path_comp_bm=None):
""" normalize execution times if reference and experiment computer is given
:param DF df_experiments: experiment DataFrame
:param str path_experiments: path to experiment folder
:param str path_comp_bm: path to reference comp. benchmark
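The normalized time is the submitted execution time multiplied by the coefficient
``time_reference / time_experiment``, where each time is the registration timing
averaged over the single- and multi-thread benchmark runs.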
"""
path_comp_bm_expt = os.path.join(path_experiments, NAME_JSON_COMPUTER)
if ImRegBenchmark.COL_TIME not in df_experiments.columns:
logging.warning('Missing %s among result columns.', ImRegBenchmark.COL_TIME)
return
if not path_comp_bm:
logging.warning('Reference comp. perform. not specified.')
return
if not all(os.path.isfile(p) for p in [path_comp_bm, path_comp_bm_expt]):
logging.warning(
'Missing one of the JSON files: \n %s (%s)\n %s (%s)', path_comp_bm, os.path.isfile(path_comp_bm),
path_comp_bm_expt, os.path.isfile(path_comp_bm_expt)
)
return
logging.info('Normalizing the Execution time.')
with open(path_comp_bm, 'r') as fp:
comp_ref = json.load(fp)
with open(path_comp_bm_expt, 'r') as fp:
comp_exp = json.load(fp)
time_ref = np.mean([comp_ref['registration @%s-thread' % i] for i in CMP_THREADS])
time_exp = np.mean([comp_exp['registration @%s-thread' % i] for i in CMP_THREADS])
coef = time_ref / time_exp
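# e.g. a reference machine timed at 30 min vs an experiment machine at 60 min gives coef = 0.5, halving the reported times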
df_experiments[COL_NORM_TIME] = df_experiments[ImRegBenchmark.COL_TIME] * coef
def parse_landmarks(idx_row):
""" parse the warped landmarks and reference and save them as cases
:param tuple(int,series) idx_row: individual row
:return {str: float|[]}: parsed registration pair
"""
idx, row = idx_row
row = dict(row)
# lnds_ref = load_landmarks(update_path_(row[COL_POINTS_REF], path_experiments))
# lnds_warp = load_landmarks(update_path_(row[COL_POINTS_MOVE_WARP], path_experiments))
# if isinstance(row[COL_POINTS_MOVE_WARP], str)else np.array([[]])
path_dir = os.path.dirname(row[ImRegBenchmark.COL_POINTS_MOVE])
match_lnds = np.nan_to_num(row[COL_PAIRED_LANDMARKS]) if COL_PAIRED_LANDMARKS in row else 0.
item = {
'name-tissue': os.path.basename(os.path.dirname(path_dir)),
'scale-tissue': parse_path_scale(os.path.basename(path_dir)),
'type-tissue': row.get(COL_TISSUE),
'name-reference': os.path.splitext(os.path.basename(row[ImRegBenchmark.COL_POINTS_REF]))[0],
'name-source': os.path.splitext(os.path.basename(row[ImRegBenchmark.COL_POINTS_MOVE]))[0],
# 'reference landmarks': np.round(lnds_ref, 1).tolist(),
# 'warped landmarks': np.round(lnds_warp, 1).tolist(),
'matched-landmarks': match_lnds,
'Robustness': np.round(row.get(ImRegBenchmark.COL_ROBUSTNESS, 0), 3),
'Norm-Time_minutes': np.round(row.get(COL_NORM_TIME, np.nan), 5),
'Status': row.get(ImRegBenchmark.COL_STATUS),
}
def _round_val(row, col):
dec = 5 if col.startswith('rTRE') else 2
return np.round(row[col], dec)
# copy all columns with Affine statistic
item.update({col.replace(' ', '-'): _round_val(row, col) for col in row if 'affine' in col.lower()})
# copy all columns with rTRE, TRE and Overlap
# item.update({col.replace(' (final)', '').replace(' ', '-'): row[col]
# for col in row if '(final)' in col})
item.update({
col.replace(' (elastic)', '_elastic').replace(' ', '-'): _round_val(row, col)
for col in row if 'TRE' in col
})
# JSON keys have to be plain str, so cast the index
return str(idx), item
def compute_scores(df_experiments, min_landmarks=1.):
""" compute all main metrics
.. seealso:: https://anhir.grand-challenge.org/Evaluation/
:param DF df_experiments: complete experiments
:param float min_landmarks: required ratio of submitted landmarks in range (0, 1),
matching values in COL_PAIRED_LANDMARKS
:return dict: results
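The aggregated keys follow patterns such as (illustrative, not exhaustive)::
    'Average-Robustness', 'Median-Max-rTRE-Robust',
    'Average-Median-rTRE_<status>', 'Average-Median-rTRE__tissue_<tissue>__<status>'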
"""
# if the initial overlap and the submitted overlap do not match, drop the results
if 'overlap points (target)' not in df_experiments.columns:
raise ValueError('Missing `overlap points (target)` column, most likely due to missing warped landmarks.')
unpaired = df_experiments[COL_PAIRED_LANDMARKS] < min_landmarks
hold_overlap = df_experiments['overlap points (init)'] == df_experiments['overlap points (target)']
mask_incomplete = unpaired.copy()
if REQUIRE_OVERLAP_INIT_TARGET:
mask_incomplete |= ~hold_overlap
# rewrite incomplete cases by initial stat
if sum(mask_incomplete) > 0:
for col_f, col_i in zip(*_filter_tre_measure_columns(df_experiments)):
df_experiments.loc[mask_incomplete, col_f] = df_experiments.loc[mask_incomplete, col_i]
df_experiments.loc[mask_incomplete, ImRegBenchmark.COL_ROBUSTNESS] = 0.
logging.warning(
'There are %i cases with incomplete landmarks - %i unpaired & %i with missed overlap.',
sum(mask_incomplete),
sum(unpaired),
sum(~hold_overlap),
)
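# the 'robust' subset keeps cases with robustness above 0.5, i.e. registration improved the majority of landmarks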
df_expt_robust = df_experiments[df_experiments[ImRegBenchmark.COL_ROBUSTNESS] > 0.5]
pd.set_option('expand_frame_repr', False)
# pre-compute some optional metrics
score_used_lnds = np.mean(df_expt_robust[COL_PAIRED_LANDMARKS]) \
if COL_PAIRED_LANDMARKS in df_experiments.columns else 0
# parse specific metrics
scores = {'Average-used-landmarks': score_used_lnds}
scores.update(_compute_scores_general(df_experiments, df_expt_robust))
scores.update(_compute_scores_state_tissue(df_experiments))
return scores
def _compute_scores_general(df_experiments, df_expt_robust):
# parse specific metrics
scores = {
'Average-Robustness': np.mean(df_experiments[ImRegBenchmark.COL_ROBUSTNESS]),
'STD-Robustness': np.std(df_experiments[ImRegBenchmark.COL_ROBUSTNESS]),
'Median-Robustness': np.median(df_experiments[ImRegBenchmark.COL_ROBUSTNESS]),
'Average-Rank-Median-rTRE': np.nan,
'Average-Rank-Max-rTRE': np.nan,
}
# parse Mean & median specific measures
for name, col in [('Median-rTRE', 'rTRE Median'), ('Max-rTRE', 'rTRE Max'), ('Average-rTRE', 'rTRE Mean'),
('Norm-Time', COL_NORM_TIME)]:
for df, suffix in [(df_experiments, ''), (df_expt_robust, '-Robust')]:
scores['Average-' + name + suffix] = np.nanmean(df[col])
scores['STD-' + name + suffix] = np.nanstd(df[col])
scores['Median-' + name + suffix] = np.nanmedian(df[col])  # nan-aware to match the mean/std above
return scores
def _compute_scores_state_tissue(df_experiments):
scores = {}
if ImRegBenchmark.COL_STATUS not in df_experiments.columns:
logging.warning('experiments (table) is missing "%s" column', ImRegBenchmark.COL_STATUS)
df_experiments[ImRegBenchmark.COL_STATUS] = 'any'
# filter all statuses in the experiments
statuses = df_experiments[ImRegBenchmark.COL_STATUS].unique()
# parse metrics according to TEST and TRAIN case
for name, col in [('Average-rTRE', 'rTRE Mean'), ('Median-rTRE', 'rTRE Median'), ('Max-rTRE', 'rTRE Max'),
('Robustness', 'Robustness')]:
# iterate over common measures
for stat_name, stat_func in [('Average', np.mean), ('Median', np.median)]:
_sname = '%s-%s' % (stat_name, name)
for status in statuses:
df_expt_ = df_experiments[df_experiments[ImRegBenchmark.COL_STATUS] == status]
scores['%s_%s' % (_sname, status)] = stat_func(df_expt_[col])
# parse according to Tissue
for tissue, dfg_tissue in df_experiments.groupby(COL_TISSUE):
scores['%s__tissue_%s__All' % (_sname, tissue)] = stat_func(dfg_tissue[col])
# also per state in tissue
for status in statuses:
df_tiss_st_ = dfg_tissue[dfg_tissue[ImRegBenchmark.COL_STATUS] == status]
stat = stat_func(df_tiss_st_[col]) if not df_tiss_st_.empty else np.nan
scores['%s__tissue_%s__%s' % (_sname, tissue, status)] = stat
return scores
def _filter_tre_measure_columns(df_experiments):
""" get columns related to TRE measures
:param DF df_experiments: experiment table
:return tuple(list(str),list(str)): final (TRE) and initial (IRE) column names
"""
# select the initial-error (IRE) columns and derive their final-error (TRE) counterparts
cols_init = [col for col in df_experiments.columns if re.match(r'(r)?IRE', col)]
cols_final = [col.replace('IRE', 'TRE') for col in cols_init]
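# e.g. an initial-error column such as 'rIRE Median' maps to the final column 'rTRE Median' (illustrative names)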
if len(cols_final) != len(cols_init):
raise ValueError('columns do not match for future zip')
return cols_final, cols_init
def export_summary_json(df_experiments, path_experiments, path_output, min_landmarks=1., details=True):
""" summarize results in particular JSON format
:param DF df_experiments: experiment DataFrame
:param str path_experiments: path to experiment folder
:param str path_output: path to generated results
:param float min_landmarks: required ratio of submitted landmarks in range (0, 1),
matching values in COL_PAIRED_LANDMARKS
:param bool details: exporting case details
:return str: path to exported results
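The exported `metrics.json` has the top-level structure::
    {"aggregates": {...}, "cases": {...} or "not exported", "computer": {...},
     "submission-time": "...", "required-landmarks": 0.5}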
"""
if COL_NORM_TIME not in df_experiments.columns:
df_experiments[COL_NORM_TIME] = np.nan
# note, we expect that the path starts with tissue and Unix sep "/" is used
def _get_tissue(cell):
tissue = cell.split('/')[0]
return tissue[:tissue.index('_')] if '_' in tissue else tissue
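# e.g. a reference-landmark path like 'lung-lesion_1/scale-25pc/...' yields tissue 'lung-lesion' (illustrative path)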
df_experiments[COL_TISSUE] = df_experiments[ImRegBenchmark.COL_POINTS_REF].apply(_get_tissue)
# export partial results
cases = list(iterate_mproc_map(parse_landmarks, df_experiments.iterrows(), desc='Parsing landmarks', nb_workers=1))
# copy the initial to final for missing
for col, col2 in zip(*_filter_tre_measure_columns(df_experiments)):
mask = df_experiments[col].isnull()
df_experiments.loc[mask, col] = df_experiments.loc[mask, col2]
# parse final metrics
scores = compute_scores(df_experiments, min_landmarks)
path_comp_bm_expt = os.path.join(path_experiments, NAME_JSON_COMPUTER)
if os.path.isfile(path_comp_bm_expt):
with open(path_comp_bm_expt, 'r') as fp:
comp_exp = json.load(fp)
else:
comp_exp = None
results = {
'aggregates': scores,
'cases': dict(cases) if details else 'not exported',
'computer': comp_exp,
'submission-time': time.strftime(FORMAT_DATE_TIME, time.gmtime()),
'required-landmarks': min_landmarks,
}
path_json = os.path.join(path_output, NAME_JSON_RESULTS)
logging.info('exporting JSON results: %s', path_json)
with open(path_json, 'w') as fp:
json.dump(results, fp)
return path_json
def replicate_missing_warped_landmarks(df_experiments, path_dataset, path_experiment):
""" if some warped landmarks are missing replace the path by initial landmarks
:param DF df_experiments: experiment table
:param str path_dataset: path to dataset folder
:param str path_experiment: path to user experiment folder
:return DF: experiment table
"""
# find empty warped landmarks paths
missing_mask = df_experiments[ImRegBenchmark.COL_POINTS_MOVE_WARP].isnull()
if ImRegBenchmark.COL_POINTS_REF_WARP in df_experiments.columns:
# if there are also target warped landmarks, allow using them
missing_mask &= df_experiments[ImRegBenchmark.COL_POINTS_REF_WARP].isnull()
# for the missing cases, substitute the initial landmarks
df_experiments.loc[missing_mask, ImRegBenchmark.COL_POINTS_MOVE_WARP] = \
df_experiments.loc[missing_mask, ImRegBenchmark.COL_POINTS_MOVE]
# for the missing cases, charge the maximal execution time
df_experiments.loc[missing_mask, ImRegBenchmark.COL_TIME] = \
df_experiments[ImRegBenchmark.COL_TIME].max()
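# e.g. a case with no submitted warped landmarks keeps its initial landmark positions (an identity warp) and is charged the slowest submitted runtime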
count = 0
# iterate over whole table and check if the path is valid
for idx, row in df_experiments.iterrows():
# select reference/moving warped landmarks
use_move_warp = isinstance(row.get(ImRegBenchmark.COL_POINTS_MOVE_WARP), str)
col_lnds_warp = ImRegBenchmark.COL_POINTS_MOVE_WARP \
if use_move_warp else ImRegBenchmark.COL_POINTS_REF_WARP
# extract the CSV path
path_csv = update_path(row[col_lnds_warp], pre_path=path_experiment)
if not os.path.isfile(path_csv):
# if the path is invalid, fall back to the initial landmarks from the dataset
path_csv = update_path(row[ImRegBenchmark.COL_POINTS_MOVE], pre_path=path_dataset)
df_experiments.loc[idx, ImRegBenchmark.COL_POINTS_MOVE_WARP] = path_csv
count += 1
logging.info('Missing warped landmarks: %i', count)
return df_experiments
def swap_inverse_experiment(table, allow_inverse):
""" optional swap of registration results from using warped moving to warped reference
:param DF table: experiment table
:param bool allow_inverse: allow swapping from using warped moving to warped reference landmarks
:return DF: updated experiment table
"""
if not allow_inverse:
return table
if ImRegBenchmark.COL_POINTS_MOVE_WARP in table.columns:
filled = table[ImRegBenchmark.COL_POINTS_MOVE_WARP].dropna()
if len(filled) > 0:
# everything seems to be fine...
return table
logging.warning('Missing target column "%s"', ImRegBenchmark.COL_POINTS_MOVE_WARP)
if ImRegBenchmark.COL_POINTS_REF_WARP not in table.columns:
raise ValueError('Missing target column "%s" to swap to' % ImRegBenchmark.COL_POINTS_REF_WARP)
logging.info('Swapping columns of Moving and Reference landmarks for both - source and warped.')
col_ref = table[ImRegBenchmark.COL_POINTS_REF].values.tolist()
col_move = table[ImRegBenchmark.COL_POINTS_MOVE].values.tolist()
table[ImRegBenchmark.COL_POINTS_REF] = col_move
table[ImRegBenchmark.COL_POINTS_MOVE] = col_ref
table[ImRegBenchmark.COL_POINTS_MOVE_WARP] = table[ImRegBenchmark.COL_POINTS_REF_WARP]
return table
def main(
path_experiment,
path_table,
path_dataset,
path_output,
path_reference=None,
path_comp_bm=None,
min_landmarks=1.,
details=True,
allow_inverse=False,
):
""" main entry point
:param str path_experiment: path to experiment folder
:param str path_table: path to assignment file (requested registration pairs)
:param str path_dataset: path to provided landmarks
:param str path_output: path to generated results
:param str|None path_reference: path to the complete landmark collection,
if None use dataset folder
:param str|None path_comp_bm: path to reference comp. benchmark
:param float min_landmarks: required ratio of submitted landmarks in range (0, 1),
matching values in COL_PAIRED_LANDMARKS
:param bool details: exporting case details
:param bool allow_inverse: allow evaluating also the inverse transformation,
i.e. warped landmarks from reference to moving image
"""
path_results = os.path.join(path_experiment, ImRegBenchmark.NAME_CSV_REGISTRATION_PAIRS)
if not os.path.isfile(path_results):
raise AttributeError('Missing experiments results: %s' % path_results)
path_reference = path_dataset if not path_reference else path_reference
# drop time column from Cover which should be empty
df_overview = pd.read_csv(path_table).drop([ImRegBenchmark.COL_TIME], axis=1, errors='ignore')
df_overview = _df_drop_unnamed(df_overview)
# drop the Warped* columns from the cover table, which should be empty
df_overview = df_overview.drop(
[col for col in df_overview.columns if 'warped' in col.lower()],
axis=1,
errors='ignore',
)
df_results = pd.read_csv(path_results)
df_results = _df_drop_unnamed(df_results)
# df_results.drop(filter(lambda c: 'Unnamed' in c, df_results.columns), axis=1, inplace=True)
cols_ = list(ImRegBenchmark.COVER_COLUMNS_WRAP) + [ImRegBenchmark.COL_TIME]
df_results = df_results[[col for col in cols_ if col in df_results.columns]]
df_experiments = pd.merge(df_overview, df_results, how='left', on=ImRegBenchmark.COVER_COLUMNS)
df_experiments = swap_inverse_experiment(df_experiments, allow_inverse)
# df_experiments.drop([ImRegBenchmark.COL_IMAGE_REF_WARP, ImRegBenchmark.COL_POINTS_REF_WARP],
# axis=1, errors='ignore', inplace=True)
df_experiments.drop([col for col in df_experiments.columns if 'Unnamed' in col], axis=1, inplace=True)
df_experiments = replicate_missing_warped_landmarks(df_experiments, path_dataset, path_experiment)
normalize_exec_time(df_experiments, path_experiment, path_comp_bm)
# logging.info('Filter used landmarks.')
# path_filtered = os.path.join(path_output, FOLDER_FILTER_DATASET)
# create_folder(path_filtered, ok_existing=True)
# _filter_lnds = partial(filter_export_landmarks, path_output=path_filtered,
# path_dataset=path_dataset, path_reference=path_reference)
# for idx, ratio in iterate_mproc_map(_filter_lnds, df_experiments.iterrows(),
# desc='Filtering', nb_workers=nb_workers):
# df_experiments.loc[idx, COL_PAIRED_LANDMARKS] = np.round(ratio, 2)
logging.info('Compute landmarks statistic.')
_compute_lnds_stat = partial(
ImRegBenchmark.compute_registration_statistic,
df_experiments=df_experiments,
path_dataset=path_dataset,
path_experiment=path_experiment,
path_reference=path_reference,
)
# NOTE: this has to run in a SINGLE thread so there is a SINGLE instance of the table
list(iterate_mproc_map(_compute_lnds_stat, df_experiments.iterrows(), desc='Statistic', nb_workers=1))
name_results, _ = os.path.splitext(os.path.basename(path_results))
path_results = os.path.join(path_output, name_results + '_NEW.csv')
logging.debug('exporting CSV results: %s', path_results)
df_experiments.to_csv(path_results)
path_json = export_summary_json(df_experiments, path_experiment, path_output, min_landmarks, details)
return path_json
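# Example of calling the evaluation programmatically (illustrative paths,
# mirroring the sample usage in the module docstring):
# main(path_experiment='./results/BmUnwarpJ', path_table='./data-images/pairs-imgs-lnds_histol.csv',
#      path_dataset='./data-images', path_output='./output', min_landmarks=0.2)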
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
arg_params = parse_arg_params(create_parser())
logging.info('running...')
main(**arg_params)
logging.info('DONE')