[d9566e]: sybil/utils/metrics.py

from collections import OrderedDict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    precision_recall_curve,
    auc,
    average_precision_score,
)
import numpy as np
import warnings

EPSILON = 1e-6
BINARY_CLASSIF_THRESHOLD = 0.5


def get_classification_metrics(logging_dict, args):
    stats_dict = OrderedDict()

    golds = np.array(logging_dict["golds"]).reshape(-1)
    probs = np.array(logging_dict["probs"])
    preds = (probs[:, -1] > 0.5).reshape(-1)
    probs = probs.reshape((-1, probs.shape[-1]))

    stats_dict["accuracy"] = accuracy_score(y_true=golds, y_pred=preds)

    if (args.num_classes == 2) and not (
        np.unique(golds)[-1] > 1 or np.unique(preds)[-1] > 1
    ):
        stats_dict["precision"] = precision_score(y_true=golds, y_pred=preds)
        stats_dict["recall"] = recall_score(y_true=golds, y_pred=preds)
        stats_dict["f1"] = f1_score(y_true=golds, y_pred=preds)
        num_pos = golds.sum()
        if num_pos > 0 and num_pos < len(golds):
            stats_dict["auc"] = roc_auc_score(golds, probs[:, -1], average="samples")
            stats_dict["ap_score"] = average_precision_score(
                golds, probs[:, -1], average="samples"
            )
            precision, recall, _ = precision_recall_curve(golds, probs[:, -1])
            stats_dict["prauc"] = auc(recall, precision)

    return stats_dict
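
# Illustrative sketch (assumed shapes, not part of the original file): `logging_dict`
# is expected to carry a flat list/array of labels under "golds" and per-sample class
# probabilities under "probs", e.g.
#
#     logging_dict = {"golds": [0, 1, 1, 0],
#                     "probs": np.array([[0.9, 0.1],
#                                        [0.2, 0.8],
#                                        [0.4, 0.6],
#                                        [0.7, 0.3]])}
#     stats = get_classification_metrics(logging_dict, args)  # assumes args.num_classes == 2
#
# The positive-class score is taken from the last column of "probs" and thresholded at
# 0.5 to form hard predictions; AUC, AP, and PR-AUC are only added when both classes
# are present in `golds`.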


def get_survival_metrics(logging_dict, args):
    stats_dict = {}

    censor_times, probs, golds = (
        logging_dict["censors"],
        logging_dict["probs"],
        logging_dict["golds"],
    )
    for followup in range(args.max_followup):
        min_followup_if_neg = followup + 1
        roc_auc, ap_score, pr_auc = compute_auc_at_followup(
            probs, censor_times, golds, followup
        )
        stats_dict["{}_year_auc".format(min_followup_if_neg)] = roc_auc
        stats_dict["{}_year_apscore".format(min_followup_if_neg)] = ap_score
        stats_dict["{}_year_prauc".format(min_followup_if_neg)] = pr_auc

    if np.array(golds).sum() > 0:
        stats_dict["c_index"] = concordance_index(
            logging_dict["censors"], probs, golds, args.censoring_distribution
        )
    else:
        stats_dict["c_index"] = -1.0

    return stats_dict
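
# Illustrative sketch (assumed shapes, not part of the original file): for survival
# evaluation, "probs" is expected to hold one score per follow-up year with shape
# (num_exams, args.max_followup) (or a 3-D batch that `compute_auc_at_followup`
# flattens), "censors" the time to event or censoring in years, and "golds" the event
# indicator. With, say, args.max_followup == 6 the returned dict contains "1_year_auc"
# through "6_year_auc" (plus the matching apscore/prauc keys) and, when at least one
# event is present, a "c_index" weighted by `args.censoring_distribution`:
#
#     metrics = get_survival_metrics(logging_dict, args)
#     metrics["3_year_auc"], metrics["c_index"]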


def get_alignment_metrics(logging_dict, args):
    stats_dict = OrderedDict()

    golds = np.array(logging_dict["discrim_golds"]).reshape(-1)
    probs = np.array(logging_dict["discrim_probs"])
    preds = probs.argmax(axis=-1).reshape(-1)
    probs = probs.reshape((-1, probs.shape[-1]))

    stats_dict["discrim_accuracy"] = accuracy_score(y_true=golds, y_pred=preds)
    stats_dict["discrim_precision"] = precision_score(y_true=golds, y_pred=preds)
    stats_dict["discrim_recall"] = recall_score(y_true=golds, y_pred=preds)
    stats_dict["discrim_f1"] = f1_score(y_true=golds, y_pred=preds)

    num_pos = golds.sum()
    if num_pos > 0 and num_pos < len(golds):
        try:
            stats_dict["discrim_auc"] = roc_auc_score(
                golds, probs[:, -1], average="samples"
            )
            stats_dict["discrim_ap_score"] = average_precision_score(
                golds, probs[:, -1], average="samples"
            )
            precision, recall, _ = precision_recall_curve(golds, probs[:, -1])
            stats_dict["discrim_prauc"] = auc(recall, precision)
        except Exception as e:
            print(e)

    return stats_dict


def get_risk_metrics(logging_dict, args):
    stats_dict = {}

    censor_times, probs, golds = (
        logging_dict["censors"],
        logging_dict["probs"],
        logging_dict["golds"],
    )
    for followup in range(args.max_followup):
        min_followup_if_neg = followup + 1
        roc_auc, ap_score, pr_auc = compute_auc_at_followup(
            probs, censor_times, golds, followup, fup_lower_bound=0
        )
        stats_dict["{}_year_risk_auc".format(min_followup_if_neg)] = roc_auc
        stats_dict["{}_year_risk_apscore".format(min_followup_if_neg)] = ap_score
        stats_dict["{}_year_risk_prauc".format(min_followup_if_neg)] = pr_auc

    return stats_dict
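
# Note (added for clarity): `get_risk_metrics` mirrors `get_survival_metrics` but calls
# `compute_auc_at_followup` with fup_lower_bound=0 instead of the default -1, so exams
# whose event occurs at time 0 are no longer counted as positives at any horizon, and
# the keys are prefixed "{k}_year_risk_*" rather than "{k}_year_*". No c-index is
# computed here.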


def compute_auc_at_followup(probs, censor_times, golds, followup, fup_lower_bound=-1):
    golds, censor_times = golds.ravel(), censor_times.ravel()
    if len(probs.shape) == 3:
        probs = probs.reshape(probs.shape[0] * probs.shape[1], probs.shape[2])

    def include_exam_and_determine_label(prob_arr, censor_time, gold):
        valid_pos = gold and censor_time <= followup and censor_time > fup_lower_bound
        valid_neg = censor_time >= followup
        included, label = (valid_pos or valid_neg), valid_pos
        return included, label

    probs_for_eval, golds_for_eval = [], []
    for prob_arr, censor_time, gold in zip(probs, censor_times, golds):
        include, label = include_exam_and_determine_label(prob_arr, censor_time, gold)
        if include:
            probs_for_eval.append(prob_arr[followup])
            golds_for_eval.append(label)

    try:
        roc_auc = roc_auc_score(golds_for_eval, probs_for_eval, average="samples")
        ap_score = average_precision_score(
            golds_for_eval, probs_for_eval, average="samples"
        )
        precision, recall, _ = precision_recall_curve(golds_for_eval, probs_for_eval)
        pr_auc = auc(recall, precision)
    except Exception as e:
        warnings.warn("Failed to calculate AUC because {}".format(e))
        roc_auc = -1.0
        ap_score = -1.0
        pr_auc = -1.0
    return roc_auc, ap_score, pr_auc
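
# Worked example of the inclusion rule above (illustrative, default fup_lower_bound=-1):
# evaluating followup = 2 (i.e. the 3-year metrics),
#
#   gold = 1, censor_time = 1  -> valid_pos                 -> included, label 1
#   gold = 0, censor_time = 5  -> valid_neg (5 >= 2)        -> included, label 0
#   gold = 0, censor_time = 1  -> censored before the horizon -> excluded
#   gold = 1, censor_time = 4  -> event after the horizon   -> included, label 0 (4 >= 2)
#
# and each included exam contributes its score at index `followup`, i.e. prob_arr[2].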


def get_censoring_dist(train_dataset):
    from lifelines import KaplanMeierFitter

    _dataset = train_dataset.dataset
    times, event_observed = (
        [d["time_at_event"] for d in _dataset],
        [d["y"] for d in _dataset],
    )
    all_observed_times = set(times)

    kmf = KaplanMeierFitter()
    kmf.fit(times, event_observed)

    censoring_dist = {str(time): kmf.predict(time) for time in all_observed_times}
    return censoring_dist
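
# Illustrative usage (assumed wiring, not part of the original file): the dict returned
# here is keyed by str(time) and is what `concordance_index` expects as `censoring_dist`;
# in the survival metrics above it is read from `args.censoring_distribution`, so a
# plausible setup is
#
#     args.censoring_distribution = get_censoring_dist(train_dataset)
#     stats = get_survival_metrics(logging_dict, args)
#
# where `train_dataset` wraps a `.dataset` of dicts containing "time_at_event" and "y".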


def concordance_index(
    event_times, predicted_scores, event_observed=None, censoring_dist=None
):
    """
    ## Adapted from: https://raw.githubusercontent.com/CamDavidsonPilon/lifelines/master/lifelines/utils/concordance.py
    ## Modified to weight pairs by the inverse probability of censoring weight (IPCW), giving Uno's C-index
    ## Modified to use a time-dependent score

    Calculates the concordance index (C-index) between two series
    of event times. The first is the real survival times from
    the experimental data, and the other is the predicted survival
    times from a model of some kind.

    The c-index is the average of how often a model says X is greater than Y when, in the observed
    data, X is indeed greater than Y. The c-index also specifies how to handle censored values
    (obviously, if Y is censored, it's hard to know if X is truly greater than Y).

    The concordance index is a value between 0 and 1 where:

    - 0.5 is the expected result from random predictions,
    - 1.0 is perfect concordance and,
    - 0.0 is perfect anti-concordance (multiply predictions with -1 to get 1.0)

    Parameters
    ----------
    event_times: iterable
        a length-n iterable of observed survival times.
    predicted_scores: iterable
        a length-n iterable of predicted scores - these could be survival times, or hazards, etc. See https://stats.stackexchange.com/questions/352183/use-median-survival-time-to-calculate-cph-c-statistic/352435#352435
    event_observed: iterable, optional
        a length-n iterable of censorship flags, 1 if observed, 0 if not. Default None assumes all observed.
    censoring_dist: dict, optional
        a mapping from str(time) to a Kaplan-Meier survival estimate at that time
        (see `get_censoring_dist`), used to compute the IPCW pair weights.

    Returns
    -------
    c-index: float
        a value between 0 and 1.

    References
    -----------
    Harrell FE, Lee KL, Mark DB. Multivariable prognostic models: issues in
    developing models, evaluating assumptions and adequacy, and measuring and
    reducing errors. Statistics in Medicine 1996;15(4):361-87.

    Examples
    --------

    >>> from lifelines.utils import concordance_index
    >>> cph = CoxPHFitter().fit(df, 'T', 'E')
    >>> concordance_index(df['T'], -cph.predict_partial_hazard(df), df['E'])

    """
    event_times = np.array(event_times).ravel()
    predicted_scores = 1 - np.asarray(predicted_scores, dtype=float)
    if len(predicted_scores.shape) == 3:
        predicted_scores = predicted_scores.reshape(
            [
                predicted_scores.shape[0] * predicted_scores.shape[1],
                predicted_scores.shape[2],
            ]
        )

    if event_observed is None:
        event_observed = np.ones(event_times.shape[0], dtype=float)
    else:
        event_observed = np.asarray(event_observed, dtype=float).ravel()
        if event_observed.shape != event_times.shape:
            raise ValueError(
                "Observed events must be 1-dimensional of same length as event times"
            )

    num_correct, num_tied, num_pairs = _concordance_summary_statistics(
        event_times, predicted_scores, event_observed, censoring_dist
    )

    return _concordance_ratio(num_correct, num_tied, num_pairs)


def _concordance_ratio(num_correct, num_tied, num_pairs):
    if num_pairs == 0:
        raise ZeroDivisionError("No admissible pairs in the dataset.")
    return (num_correct + num_tied / 2) / num_pairs


def _concordance_summary_statistics(
    event_times, predicted_event_times, event_observed, censoring_dist
):  # pylint: disable=too-many-locals
    """Find the concordance index in n * log(n) time.

    Assumes the data has been verified by lifelines.utils.concordance_index first.
    """
    # Here's how this works.
    #
    # It would be pretty easy to do if we had no censored data and no ties. There, the basic idea
    # would be to iterate over the cases in order of their true event time (from least to greatest),
    # while keeping track of a pool of *predicted* event times for all cases previously seen (= all
    # cases that we know should be ranked lower than the case we're looking at currently).
    #
    # If the pool has O(log n) insert and O(log n) RANK (i.e., "how many things in the pool have
    # value less than x"), then the following algorithm is n log n:
    #
    # Sort the times and predictions by time, increasing
    # n_pairs, n_correct := 0
    # pool := {}
    # for each prediction p:
    #     n_pairs += len(pool)
    #     n_correct += rank(pool, p)
    #     add p to pool
    #
    # There are three complications: tied ground truth values, tied predictions, and censored
    # observations.
    #
    # - To handle tied true event times, we modify the inner loop to work in *batches* of observations
    #   p_1, ..., p_n whose true event times are tied, and then add them all to the pool
    #   simultaneously at the end.
    #
    # - To handle tied predictions, which should each count for 0.5, we switch to
    #     n_correct += min_rank(pool, p)
    #     n_tied += count(pool, p)
    #
    # - To handle censored observations, we handle each batch of tied, censored observations just
    #   after the batch of observations that died at the same time (since those censored observations
    #   are comparable to all the observations that died at the same time or previously). However, we do
    #   NOT add them to the pool at the end, because they are NOT comparable with any observations
    #   that leave the study afterward--whether or not those observations get censored.
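    #
    # A small worked example (illustrative, not from the original comment): take three
    # observed events with true times [1, 2, 3] and transformed scores [0.2, 0.5, 0.8]
    # (after the `1 - predicted_scores` step in `concordance_index`, higher means a
    # longer predicted survival). Visiting cases in order of true time:
    #
    #     time 1: pool = {}           -> 0 new pairs
    #     time 2: pool = {0.2}        -> 1 pair,  rank(pool, 0.5) = 1 correct
    #     time 3: pool = {0.2, 0.5}   -> 2 pairs, rank(pool, 0.8) = 2 correct
    #
    # giving (3 correct + 0 tied / 2) / 3 pairs = 1.0, i.e. perfect concordance. In the
    # implementation below the pool is a `_BTree` per observed time, so the score used
    # for a comparison at time t is the time-dependent column `pred[int(t)]`.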
    if np.logical_not(event_observed).all():
        return (0, 0, 0)

    observed_times = set(event_times)

    died_mask = event_observed.astype(bool)
    # TODO: is event_times already sorted? That would be nice...
    died_truth = event_times[died_mask]
    ix = np.argsort(died_truth)
    died_truth = died_truth[ix]
    died_pred = predicted_event_times[died_mask][ix]

    censored_truth = event_times[~died_mask]
    ix = np.argsort(censored_truth)
    censored_truth = censored_truth[ix]
    censored_pred = predicted_event_times[~died_mask][ix]

    from lifelines.utils.btree import _BTree

    censored_ix = 0
    died_ix = 0
    times_to_compare = {}
    for time in observed_times:
        times_to_compare[time] = _BTree(np.unique(died_pred[:, int(time)]))
    num_pairs = np.int64(0)
    num_correct = np.int64(0)
    num_tied = np.int64(0)

    # we iterate through cases sorted by exit time:
    # - First, all cases that died at time t0. We add these to the sortedlist of died times.
    # - Then, all cases that were censored at time t0. We DON'T add these since they are NOT
    #   comparable to subsequent elements.
    while True:
        has_more_censored = censored_ix < len(censored_truth)
        has_more_died = died_ix < len(died_truth)
        # Should we look at some censored indices next, or died indices?
        if has_more_censored and (
            not has_more_died or died_truth[died_ix] > censored_truth[censored_ix]
        ):
            pairs, correct, tied, next_ix, weight = _handle_pairs(
                censored_truth,
                censored_pred,
                censored_ix,
                times_to_compare,
                censoring_dist,
            )
            censored_ix = next_ix
        elif has_more_died and (
            not has_more_censored or died_truth[died_ix] <= censored_truth[censored_ix]
        ):
            pairs, correct, tied, next_ix, weight = _handle_pairs(
                died_truth, died_pred, died_ix, times_to_compare, censoring_dist
            )
            for pred in died_pred[died_ix:next_ix]:
                for time in observed_times:
                    times_to_compare[time].insert(pred[int(time)])
            died_ix = next_ix
        else:
            assert not (has_more_died or has_more_censored)
            break

        num_pairs += pairs * weight
        num_correct += correct * weight
        num_tied += tied * weight

    return (num_correct, num_tied, num_pairs)


def _handle_pairs(truth, pred, first_ix, times_to_compare, censoring_dist):
    """
    Handle all pairs that exited at the same time as truth[first_ix].

    Returns
    -------
    (pairs, correct, tied, next_ix, weight)

    pairs: The number of new comparisons performed
    correct: The number of comparisons correctly predicted
    tied: The number of comparisons that were ties
    next_ix: The next index that needs to be handled
    weight: The inverse-probability-of-censoring weight applied to this batch of pairs
    """
    next_ix = first_ix
    truth_time = truth[first_ix]
    weight = 1.0 / (censoring_dist[str(int(truth_time))] ** 2)
    while next_ix < len(truth) and truth[next_ix] == truth[first_ix]:
        next_ix += 1
    pairs = len(times_to_compare[truth_time]) * (next_ix - first_ix)
    correct = np.int64(0)
    tied = np.int64(0)
    for i in range(first_ix, next_ix):
        rank, count = times_to_compare[truth_time].rank(pred[i][int(truth_time)])
        correct += rank
        tied += count
    return (pairs, correct, tied, next_ix, weight)
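

# A brief note on the weighting above (added for clarity, not part of the adapted
# lifelines code): `_handle_pairs` scales every pair contributed by a batch that exits
# at time t by 1 / G(t)^2, where G(t) is the value stored in `censoring_dist` for that
# time (see `get_censoring_dist`). This is the inverse probability of censoring
# weighting used for Uno's C-index, so batches whose follow-up times are unlikely to
# remain uncensored (small G(t)) count for more. For example, with G(t) = 0.8 the
# batch's pairs, correct, and tied counts are all multiplied by 1 / 0.64 = 1.5625
# before being accumulated in `_concordance_summary_statistics`.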