b/utils/metrics.py
#!/usr/bin/env python

import numpy as np
import pandas as pd

# Show full (untruncated) column contents when printing DataFrames.
# (None means "no limit"; the old -1 sentinel is rejected by pandas >= 1.0.)
pd.set_option('display.max_colwidth', None)

from functools import partial
from typing import List
from sklearn.metrics import confusion_matrix, roc_auc_score
from scipy import stats

def _mean_confidence_interval(data, conf=0.95, decimal=3):
    """Return (mean, lower, upper) for a two-sided `conf` t-interval, each rounded to `decimal` places."""
    assert 0 < conf < 1, f"Confidence level must be within (0, 1). It is {conf}"
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + conf) / 2., n - 1)
    return np.round(m, decimal), np.round(m - h, decimal), np.round(m + h, decimal)
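
# Example (illustrative): for three runs with metric values [0.8, 0.85, 0.9],
# _mean_confidence_interval([0.8, 0.85, 0.9], conf=0.95) returns a mean of
# 0.85 with a 95% interval of roughly (0.726, 0.974).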


class BinaryAvgMetrics(object):
    """Aggregates binary classification metrics over multiple runs.

    Each element of `targets`, `predictions` and `probs` holds the values for one run.
    """

    def __init__(self, targets: List[List[int]], predictions: List[List[int]], probs: List[List[float]], decimal=3) -> None:
        assert len(targets) == len(predictions) == len(probs), (
            f"Target list (length = {len(targets)}), predictions list (length = {len(predictions)}) "
            f"and probabilities list (length = {len(probs)}) must all be of the same length!"
        )
        self.targs = targets
        self.n_runs = len(self.targs)
        self.preds = predictions
        self.probs = probs
        self.decimal = decimal

        # One 2x2 confusion matrix per run.
        self.cms = np.zeros((len(self.targs), 2, 2), dtype=np.int64)

        for i, (targ, pred) in enumerate(zip(self.targs, self.preds)):
            # labels=[0, 1] keeps the matrix 2x2 even if a run contains only one class.
            self.cms[i] = confusion_matrix(targ, pred, labels=[0, 1])

    # Per-run confusion-matrix entries; each property is an array of length `n_runs`.
    @property
    def tns(self):
        return self.cms[:, 0, 0]

    @property
    def fps(self):
        return self.cms[:, 0, 1]

    @property
    def fns(self):
        return self.cms[:, 1, 0]

    @property
    def tps(self):
        return self.cms[:, 1, 1]

    @property
    def cm_avg(self):
        """Element-wise mean confusion matrix across runs, rounded up to whole counts."""
        return np.ceil(np.array([[self.tns.mean(), self.fps.mean()],
                                 [self.fns.mean(), self.tps.mean()]])).astype(np.int64)

    @property
    def prevalence_avg(self):
        """Mean fraction of positive cases across runs."""
        return np.round(((self.fns + self.tps) / (self.tns + self.fps + self.fns + self.tps)).mean(), self.decimal)

    def sensitivities(self):
        return self.tps / (self.tps + self.fns)

    def sensitivity_avg(self, conf=None):
        se = self.sensitivities()
        if conf is not None:
            return _mean_confidence_interval(se, conf, self.decimal)

        return np.round(se.mean(), self.decimal)

    def specificities(self):
        return self.tns / (self.tns + self.fps)

    def specificity_avg(self, conf=None):
        sp = self.specificities()
        if conf is not None:
            return _mean_confidence_interval(sp, conf, self.decimal)

        return np.round(sp.mean(), self.decimal)

    def ppvs(self):
        return self.tps / (self.tps + self.fps)

    def ppv_avg(self, conf=None):
        ppv = self.ppvs()
        if conf is not None:
            return _mean_confidence_interval(ppv, conf, self.decimal)

        return np.round(ppv.mean(), self.decimal)

    def npvs(self):
        return self.tns / (self.tns + self.fns)

    def npv_avg(self, conf=None):
        npv = self.npvs()
        if conf is not None:
            return _mean_confidence_interval(npv, conf, self.decimal)

        return np.round(npv.mean(), self.decimal)

    def f1s(self):
        return (2 * self.sensitivities() * self.ppvs()) / (self.sensitivities() + self.ppvs())

    def f1_avg(self, conf=None):
        f1 = self.f1s()
        if conf is not None:
            return _mean_confidence_interval(f1, conf, self.decimal)

        return np.round(f1.mean(), self.decimal)

    def aurocs(self):
        return np.array([roc_auc_score(targ, prob) for targ, prob in zip(self.targs, self.probs)])

    def auroc_avg(self, conf=None):
        auroc = self.aurocs()
        if conf is not None:
            return _mean_confidence_interval(auroc, conf, self.decimal)

        return np.round(auroc.mean(), self.decimal)
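
    # Illustrative behaviour of the *_avg helpers (numbers hypothetical):
    #
    #   bam.sensitivity_avg()      ->  0.85                  (rounded mean)
    #   bam.sensitivity_avg(0.95)  ->  (0.85, 0.726, 0.974)  (mean, lower, upper)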

    def get_avg_metrics(self, conf=None, defn=False):
        """Return a DataFrame of averaged metrics (in percent), optionally with confidence bounds and definitions."""
        definitions = {
            'sensitivity': "When it's ACTUALLY YES, how often does it PREDICT YES?",
            'specificity': "When it's ACTUALLY NO, how often does it PREDICT NO?",
            'ppv': "When it PREDICTS YES, how often is it correct?",
            'auroc': "Indicates how well the model is capable of distinguishing between classes",
            'npv': "When it PREDICTS NO, how often is it correct?",
            'f1': "Harmonic mean of sensitivity and ppv",
        }
        if conf is None:
            metrics = {
                'sensitivity': [self.sensitivity_avg() * 100],
                'specificity': [self.specificity_avg() * 100],
                'ppv': [self.ppv_avg() * 100],
                'auroc': [self.auroc_avg() * 100],
                'npv': [self.npv_avg() * 100],
                'f1': [self.f1_avg() * 100],
            }

            if defn:
                for metric, value in metrics.items():
                    value.append(definitions[metric])
                d = pd.DataFrame(metrics.values(), index=metrics.keys(), columns=['Value', 'Definition'])
            else:
                d = pd.DataFrame(metrics.values(), index=metrics.keys(), columns=['Value'])

            return d

        else:
            metrics = {
                'sensitivity': [value * 100 for value in self.sensitivity_avg(conf)],
                'specificity': [value * 100 for value in self.specificity_avg(conf)],
                'ppv': [value * 100 for value in self.ppv_avg(conf)],
                'auroc': [value * 100 for value in self.auroc_avg(conf)],
                'npv': [value * 100 for value in self.npv_avg(conf)],
                'f1': [value * 100 for value in self.f1_avg(conf)],
            }

            if defn:
                for metric, value in metrics.items():
                    value.append(definitions[metric])
                d = pd.DataFrame(metrics.values(), index=metrics.keys(), columns=['Mean', 'Lower', 'Upper', 'Definition'])
            else:
                d = pd.DataFrame(metrics.values(), index=metrics.keys(), columns=['Mean', 'Lower', 'Upper'])

            return d
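
    # Illustrative output of get_avg_metrics(conf=0.95) (numbers hypothetical):
    #
    #                  Mean  Lower  Upper
    #   sensitivity    85.0   72.6   97.4
    #   specificity    91.2   88.0   94.4
    #   ...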

    def __repr__(self):
        return f"Number of Runs: {self.n_runs}\n"

    def __len__(self):
        return len(self.targs)


def get_best_model(bam: BinaryAvgMetrics, fnames: List[str]):
    """Return, for each metric, its best value across runs and the model file that achieved it."""
    best_se, best_se_model = 0, None
    best_sp, best_sp_model = 0, None
    best_ppv, best_ppv_model = 0, None
    best_auroc, best_auroc_model = 0, None
    best_npv, best_npv_model = 0, None
    best_f1, best_f1_model = 0, None

    for i in range(bam.n_runs):
        se = bam.tps[i] / (bam.tps[i] + bam.fns[i])
        sp = bam.tns[i] / (bam.tns[i] + bam.fps[i])
        ppv = bam.tps[i] / (bam.tps[i] + bam.fps[i])
        npv = bam.tns[i] / (bam.tns[i] + bam.fns[i])
        f1 = (2 * se * ppv) / (se + ppv)

        if best_se < se:
            best_se = se
            best_se_model = fnames[i]
        if best_sp < sp:
            best_sp = sp
            best_sp_model = fnames[i]
        if best_ppv < ppv:
            best_ppv = ppv
            best_ppv_model = fnames[i]
        if best_npv < npv:
            best_npv = npv
            best_npv_model = fnames[i]
        if best_f1 < f1:
            best_f1 = f1
            best_f1_model = fnames[i]

    # AUROC needs the raw probabilities rather than the confusion matrices.
    for i, (targ, prob) in enumerate(zip(bam.targs, bam.probs)):
        auroc = roc_auc_score(targ, prob)
        if best_auroc < auroc:
            best_auroc = auroc
            best_auroc_model = fnames[i]

    d = {
        'sensitivity': [best_se, best_se_model],
        'specificity': [best_sp, best_sp_model],
        'ppv': [best_ppv, best_ppv_model],
        'auroc': [best_auroc, best_auroc_model],
        'npv': [best_npv, best_npv_model],
        'f1': [best_f1, best_f1_model],
    }

    return pd.DataFrame(d.values(), index=d.keys(), columns=['Value', 'Model File'])
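

# Minimal smoke test with tiny, hypothetical runs; real callers would pass the
# per-run targets, hard predictions and probabilities collected elsewhere
# (e.g. from cross-validation), plus the corresponding model file names.
if __name__ == '__main__':
    run_targets = [[0, 1, 1, 0, 1], [1, 0, 1, 1, 0]]
    run_preds = [[0, 1, 0, 0, 1], [1, 0, 1, 0, 0]]
    run_probs = [[0.2, 0.9, 0.4, 0.1, 0.8], [0.7, 0.3, 0.6, 0.45, 0.2]]

    bam = BinaryAvgMetrics(run_targets, run_preds, run_probs)
    print(bam.get_avg_metrics(conf=0.95, defn=True))
    print(get_best_model(bam, ['model_0.pth', 'model_1.pth']))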