|
a |
|
b/mura.py |
|
|
1 |
from __future__ import absolute_import, division, print_function |
|
|
2 |
|
|
|
3 |
import re |
|
|
4 |
|
|
|
5 |
import numpy as np |
|
|
6 |
import pandas as pd |
|
|
7 |
from sklearn.metrics import (accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score) |
|
|
8 |
|
|
|
9 |
# Console display defaults: cap printed DataFrame rows and show 4 decimals.
pd.set_option('display.max_rows', 20)
# Use the fully qualified option key: the bare 'precision' pattern is
# ambiguous on pandas versions that also define 'styler.format.precision'.
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)
|
|
12 |
|
|
|
13 |
|
|
|
14 |
class Mura(object):
    """`MURA <https://stanfordmlgroup.github.io/projects/mura/>`_ Dataset :
    Towards Radiologist-Level Abnormality Detection in Musculoskeletal Radiographs.

    Holds image file names with their true labels (and optionally predicted
    probabilities), parses patient / study / image / study-type identifiers
    out of each file name, and collects everything in ``self.data``.

    Every file name must contain ``patient<digits>``, ``study<digits>``,
    ``image<digits>`` and ``_<study type>_patient``; a name that lacks one of
    these raises ``AttributeError`` during construction, because the regex
    search returns ``None``.

    Args:
        image_file_names: sequence of image file names / paths.
        y_true: ground-truth label per image, aligned with
            ``image_file_names``.
        y_pred: optional predicted probability per image (array-like of any
            shape; it is flattened). Required by :meth:`metrics` and
            :meth:`metrics_by_encounter`.
    """
    url = "https://cs.stanford.edu/group/mlgroup/mura-v1.0.zip"
    filename = "mura-v1.0.zip"
    md5_checksum = '4c36feddb7f5698c8bf291b912c438b1'
    _patient_re = re.compile(r'patient(\d+)')
    _study_re = re.compile(r'study(\d+)')
    _image_re = re.compile(r'image(\d+)')
    _study_type_re = re.compile(r'_(\w+)_patient')

    # Shared report layout; the first field is the aggregation level
    # ("image" or "encounter").
    _metrics_template = (
        "per {} metrics:\n\taccuracy : {:.2f}\tf1 : {:.2f}\tprecision : {:.2f}"
        "\trecall : {:.2f}\tcohen_kappa : {:.2f}"
    )

    def __init__(self, image_file_names, y_true, y_pred=None):
        self.imgs = image_file_names
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_pred_probability = None

        self.patient = []
        self.study = []
        self.study_type = []
        self.image_num = []
        self.encounter = []
        for img in image_file_names:
            patient = self._parse_patient(img)
            study = self._parse_study(img)
            study_type = self._parse_study_type(img)
            self.patient.append(patient)
            self.study.append(study)
            self.image_num.append(self._parse_image(img))
            self.study_type.append(study_type)
            # An encounter is one study of one patient for one study type.
            self.encounter.append(
                "{}_{}_{}".format(study_type, patient, study))

        # unique classes present in the ground truth
        self.classes = np.unique(self.y_true)

        # One column per field; each column name appears exactly once so that
        # self.data['patient'] is a Series rather than a two-column frame.
        columns = [
            pd.Series(np.array(image_file_names), name='img'),
            pd.Series(np.array(self.encounter), name='encounter'),
            pd.Series(np.array(y_true), name='y_true'),
            pd.Series(np.array(self.patient), name='patient'),
            pd.Series(np.array(self.study), name='study'),
            pd.Series(np.array(self.image_num), name='image_num'),
            pd.Series(np.array(self.study_type), name='study_type'),
        ]
        self.data = pd.concat(columns, axis=1)

        if self.y_pred is not None:
            # Accept any array-like (list, column vector, ...), not only
            # numpy arrays.
            self.y_pred_probability = np.asarray(self.y_pred).flatten()
            # Hard labels are the probabilities rounded to the nearest int.
            self.y_pred = self.y_pred_probability.round().astype(int)
            self.data = pd.concat(
                (
                    self.data,
                    pd.Series(self.y_pred, name='y_pred'),
                    pd.Series(self.y_pred_probability, name='y_pred_probs'),
                ),
                axis=1)

    def __len__(self):
        """Return the number of images."""
        return len(self.imgs)

    def _parse_patient(self, img_filename):
        """Return the integer following ``patient`` in ``img_filename``."""
        return int(self._patient_re.search(img_filename).group(1))

    def _parse_study(self, img_filename):
        """Return the integer following ``study`` in ``img_filename``."""
        return int(self._study_re.search(img_filename).group(1))

    def _parse_image(self, img_filename):
        """Return the integer following ``image`` in ``img_filename``."""
        return int(self._image_re.search(img_filename).group(1))

    def _parse_study_type(self, img_filename):
        """Return the text between ``_`` and ``_patient`` in ``img_filename``."""
        return self._study_type_re.search(img_filename).group(1)

    def _format_metrics(self, level, y_true, y_pred):
        """Render accuracy, f1, precision, recall and kappa for one level."""
        return self._metrics_template.format(
            level,
            accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            cohen_kappa_score(y_true, y_pred),
        )

    def metrics(self):
        """Return a per-image metrics report string.

        Requires the instance to have been built with ``y_pred``.
        """
        return self._format_metrics("image", self.y_true, self.y_pred)

    def metrics_by_encounter(self):
        """Return a per-encounter metrics report string.

        Predicted probabilities and true labels are averaged over the images
        of each encounter and rounded. All five metrics, including
        cohen_kappa, are computed on those per-encounter values. Requires the
        instance to have been built with ``y_pred``.
        """
        by_encounter = self.data.groupby(['encounter'])
        y_pred = by_encounter['y_pred_probs'].mean().round()
        y_true = by_encounter['y_true'].mean().round()
        return self._format_metrics("encounter", y_true, y_pred)

    # NOTE: disabled in the original source; kept for reference only.
    # def metrics_by_study_type(self):
    #     y_pred = self.data.groupby(['study_type', 'encounter'])['y_pred_probs'].mean().round()
    #     y_true = self.data.groupby(['study_type', 'encounter'])['y_true'].mean().round()
    #     return "per study_type metrics:\n\taccuracy : {:.2f}\tf1 : {:.2f}\tprecision : {:.2f}\trecall : {:.2f}".format(
    #         accuracy_score(y_true, y_pred),
    #         f1_score(y_true, y_pred),
    #         precision_score(y_true, y_pred),
    #         recall_score(y_true, y_pred), )