--- a +++ b/metrics.py @@ -0,0 +1,76 @@ +import numpy as np +import pandas as pd +import warnings + + +def cindex(y_true_times, predicted_times, tol=1e-8): + """ + Author: Romuald Menuet & Rémy Dubois + + Evaluate concordance index from Pandas DataFrame, taking ties into account. + + Args: + y_true_times: pd.DataFrame + pd DataFrame with three columns: `PatientID`, `Event` and `SurvivalTime` the float-valued column of true survival times. + predicted_times: pd.DataFrame + pd DataFrame with three columns: `PatientID`, `SurvivalTime` the float-valued column of predicted survival times, + and one `Event`column, whose value does not matter. It must be appended so that target and predictions have the same format. + tol: float + small float value for numerical stability. + Returns: + Concordance index, as described here: + https://square.github.io/pysurvival/metrics/c_index.html + """ + + assert isinstance(y_true_times, pd.DataFrame), 'Y true times should be pd dataframe with `PatientID` as index, and `Event` and `SurvivalTime` as columns' + assert isinstance(predicted_times, pd.DataFrame), 'Predicted times should be pd dataframe with patient `PatientID` as index, and `Event` and `SurvivalTime` as columns' + assert len(y_true_times.shape) == 2, 'Y true times should be pd dataframe with `PatientID` as index, and `Event` and `SurvivalTime` as columns' + assert len(predicted_times.shape) == 2, 'Predicted times should be pd dataframe with `PatientID` as index, and `Event` and `SurvivalTime` as columns' + assert set(y_true_times.columns) == {'Event', 'SurvivalTime'}, 'Y true times should be pd dataframe with `PatientID` as index, and `Event` and `SurvivalTime` as columns' + assert set(predicted_times.columns) == {'Event', 'SurvivalTime'}, 'Predicted times should be pd dataframe with `PatientID` as index, and `Event` and `SurvivalTime` as columns' + np.testing.assert_equal(y_true_times.shape, predicted_times.shape, err_msg="Not same amount of predicted versus true samples") + assert set(y_true_times.index) == set(predicted_times.index), 'Not same patients in prediction versus ground truth' + assert np.all(predicted_times['SurvivalTime'] > 0), 'Predicted times should all be positive' + + events = y_true_times.Event + y_true_times = y_true_times.SurvivalTime + predicted_times = predicted_times.SurvivalTime + + # Just ordering the right way + predicted_times = predicted_times.loc[y_true_times.index] + events = events.loc[y_true_times.index] + + events = events.values.astype(int) + y_true_times = y_true_times.values.astype(float) + predicted_times = predicted_times.values.astype(float) + # events = events.values.astype(bool) + + np.testing.assert_array_less(1., + predicted_times.astype(float), + err_msg="Predicted y_true_times all below 1.\ + It should be in days. Make sure that you are not predicting risk instead of time.") + + return _cindex_np(y_true_times, predicted_times, events) + + +def _cindex_np(times, predicted_times, events, tol=1.e-8): + """ + Raw CI computation from np arrray. Should not be used as is. + """ + assert times.ndim == predicted_times.ndim == events.ndim == 1, "wrong input, should be vectors only" + assert times.shape[0] == predicted_times.shape[0] == events.shape[0], "wrong input, should be vectors of the same len" + + risks = - predicted_times + + risks_i = risks.reshape((-1, 1)) + risks_j = risks.reshape((1, -1)) + times_i = times.reshape((-1, 1)) + times_j = times.reshape((1, -1)) + events_i = events.reshape((-1, 1)) + + eligible_pairs = (times_i < times_j) * events_i + + well_ordered = np.sum(eligible_pairs * (risks_i > risks_j)) + ties = + np.sum(eligible_pairs * 0.5 * (risks_i == risks_j)) + + return (well_ordered + ties) / (eligible_pairs.sum() + tol)