[3f1788]: /utils/metrics.py

#!/usr/bin/env python
import numpy as np
import pandas as pd
from typing import List
from sklearn.metrics import confusion_matrix, roc_auc_score
from scipy import stats

pd.set_option('display.max_colwidth', None)


def _mean_confidence_interval(data, conf=0.95, decimal=3):
    assert 0 < conf < 1, f"Confidence level must be within (0, 1). It is {conf}"
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + conf) / 2., n - 1)
    return np.round(m, decimal), np.round(m - h, decimal), np.round(m + h, decimal)


class BinaryAvgMetrics:
    """Aggregates binary-classification metrics over multiple runs and reports averages with optional confidence intervals."""

    def __init__(self, targets: List[List[int]], predictions: List[List[int]], probs: List[List[float]], decimal=3) -> None:
        assert len(targets) == len(predictions) == len(probs), (
            f"Target list (length = {len(targets)}), predictions list (length = {len(predictions)}) "
            f"and probabilities list (length = {len(probs)}) must all be of the same length!"
        )
        self.targs = targets
        self.n_runs = len(self.targs)
        self.preds = predictions
        self.probs = probs
        self.decimal = decimal
        # One 2x2 confusion matrix per run, laid out as [[tn, fp], [fn, tp]]
        self.cms = np.zeros((self.n_runs, 2, 2), dtype=np.int64)
        for i, (targ, pred) in enumerate(zip(self.targs, self.preds)):
            self.cms[i] = confusion_matrix(targ, pred)
    @property
    def tns(self):
        return self.cms[:, 0, 0]

    @property
    def fps(self):
        return self.cms[:, 0, 1]

    @property
    def fns(self):
        return self.cms[:, 1, 0]

    @property
    def tps(self):
        return self.cms[:, 1, 1]

    @property
    def cm_avg(self):
        return np.ceil(np.array([[self.tns.mean(), self.fps.mean()], [self.fns.mean(), self.tps.mean()]])).astype(np.int64)

    @property
    def prevalence_avg(self):
        return np.round(((self.fns + self.tps) / (self.tns + self.fps + self.fns + self.tps)).mean(), self.decimal)
    def sensitivities(self):
        return self.tps / (self.tps + self.fns)

    def sensitivity_avg(self, conf=None):
        se = self.sensitivities()
        if conf is not None:
            return _mean_confidence_interval(se, conf, self.decimal)
        return np.round(se.mean(), self.decimal)

    def specificities(self):
        return self.tns / (self.tns + self.fps)

    def specificity_avg(self, conf=None):
        sp = self.specificities()
        if conf is not None:
            return _mean_confidence_interval(sp, conf, self.decimal)
        return np.round(sp.mean(), self.decimal)

    def ppvs(self):
        return self.tps / (self.tps + self.fps)

    def ppv_avg(self, conf=None):
        ppv = self.ppvs()
        if conf is not None:
            return _mean_confidence_interval(ppv, conf, self.decimal)
        return np.round(ppv.mean(), self.decimal)

    def npvs(self):
        return self.tns / (self.tns + self.fns)

    def npv_avg(self, conf=None):
        npv = self.npvs()
        if conf is not None:
            return _mean_confidence_interval(npv, conf, self.decimal)
        return np.round(npv.mean(), self.decimal)

    def f1s(self):
        return (2 * self.sensitivities() * self.ppvs()) / (self.sensitivities() + self.ppvs())

    def f1_avg(self, conf=None):
        f1 = self.f1s()
        if conf is not None:
            return _mean_confidence_interval(f1, conf, self.decimal)
        return np.round(f1.mean(), self.decimal)

    def aurocs(self):
        return np.array([roc_auc_score(targ, prob) for targ, prob in zip(self.targs, self.probs)])

    def auroc_avg(self, conf=None):
        auroc = self.aurocs()
        if conf is not None:
            return _mean_confidence_interval(auroc, conf, self.decimal)
        return np.round(auroc.mean(), self.decimal)
    def get_avg_metrics(self, conf=None, defn=False):
        definitions = {
            'sensitivity': "When it's ACTUALLY YES, how often does it PREDICT YES?",
            'specificity': "When it's ACTUALLY NO, how often does it PREDICT NO?",
            'ppv': "When it PREDICTS YES, how often is it correct?",
            'auroc': "Indicates how well the model is capable of distinguishing between classes",
            'npv': "When it PREDICTS NO, how often is it correct?",
            'f1': "Harmonic mean of sensitivity and ppv",
        }

        if conf is None:
            metrics = {
                'sensitivity': [self.sensitivity_avg() * 100],
                'specificity': [self.specificity_avg() * 100],
                'ppv': [self.ppv_avg() * 100],
                'auroc': [self.auroc_avg() * 100],
                'npv': [self.npv_avg() * 100],
                'f1': [self.f1_avg() * 100],
            }
            columns = ['Value']
        else:
            metrics = {
                'sensitivity': [value * 100 for value in self.sensitivity_avg(conf)],
                'specificity': [value * 100 for value in self.specificity_avg(conf)],
                'ppv': [value * 100 for value in self.ppv_avg(conf)],
                'auroc': [value * 100 for value in self.auroc_avg(conf)],
                'npv': [value * 100 for value in self.npv_avg(conf)],
                'f1': [value * 100 for value in self.f1_avg(conf)],
            }
            columns = ['Mean', 'Lower', 'Upper']

        if defn:
            for metric, value in metrics.items():
                value.append(definitions[metric])
            columns = columns + ['Definition']

        return pd.DataFrame(list(metrics.values()), index=metrics.keys(), columns=columns)

    def __repr__(self):
        return f"Number of Runs: {self.n_runs}\n"

    def __len__(self):
        return len(self.targs)


def get_best_model(bam: BinaryAvgMetrics, fnames: List[str]):
    """Return, for each metric, the best single-run value and the model file that produced it."""
    best_se, best_se_model = 0, None
    best_sp, best_sp_model = 0, None
    best_ppv, best_ppv_model = 0, None
    best_auroc, best_auroc_model = 0, None
    best_npv, best_npv_model = 0, None
    best_f1, best_f1_model = 0, None

    for i in range(bam.n_runs):
        se = bam.tps[i] / (bam.tps[i] + bam.fns[i])
        sp = bam.tns[i] / (bam.tns[i] + bam.fps[i])
        ppv = bam.tps[i] / (bam.tps[i] + bam.fps[i])
        npv = bam.tns[i] / (bam.tns[i] + bam.fns[i])
        f1 = (2 * se * ppv) / (se + ppv)

        if best_se < se:
            best_se = se
            best_se_model = fnames[i]
        if best_sp < sp:
            best_sp = sp
            best_sp_model = fnames[i]
        if best_ppv < ppv:
            best_ppv = ppv
            best_ppv_model = fnames[i]
        if best_npv < npv:
            best_npv = npv
            best_npv_model = fnames[i]
        if best_f1 < f1:
            best_f1 = f1
            best_f1_model = fnames[i]

    for i, (targ, prob) in enumerate(zip(bam.targs, bam.probs)):
        auroc = roc_auc_score(targ, prob)
        if best_auroc < auroc:
            best_auroc = auroc
            best_auroc_model = fnames[i]

    d = {
        'sensitivity': [best_se, best_se_model],
        'specificity': [best_sp, best_sp_model],
        'ppv': [best_ppv, best_ppv_model],
        'auroc': [best_auroc, best_auroc_model],
        'npv': [best_npv, best_npv_model],
        'f1': [best_f1, best_f1_model],
    }
    return pd.DataFrame(list(d.values()), index=d.keys(), columns=['Value', 'Model File'])
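
A minimal usage sketch (not part of the module above): it assumes three hypothetical runs with made-up targets, hard predictions, and probabilities, and the file names run_0.pt, run_1.pt, run_2.pt are placeholders for whatever checkpoints produced each run.

    # Illustrative data only: three runs of a binary classifier
    targets = [[0, 1, 1, 0], [1, 0, 1, 1], [0, 0, 1, 1]]
    preds   = [[0, 1, 0, 0], [1, 0, 1, 0], [0, 1, 1, 1]]
    probs   = [[0.2, 0.9, 0.4, 0.1], [0.8, 0.3, 0.7, 0.4], [0.1, 0.6, 0.9, 0.8]]

    bam = BinaryAvgMetrics(targets, preds, probs)
    print(bam.get_avg_metrics(conf=0.95, defn=True))                  # mean, lower, upper CI per metric
    print(get_best_model(bam, ['run_0.pt', 'run_1.pt', 'run_2.pt']))  # best single run per metric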