|
a |
|
b/mura.py |
|
|
1 |
from __future__ import absolute_import, division, print_function |
|
|
2 |
|
|
|
3 |
import re |
|
|
4 |
|
|
|
5 |
import numpy as np |
|
|
6 |
import pandas as pd |
|
|
7 |
from sklearn.metrics import (accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score) |
|
|
8 |
|
|
|
9 |
# Console display configuration for pandas / numpy output.
pd.set_option('display.max_rows', 20)
# Use the fully qualified option name: the bare pattern 'precision' is
# ambiguous on pandas versions that also define 'styler.format.precision'
# and makes set_option raise OptionError.
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)
|
|
12 |
|
|
|
13 |
|
|
|
14 |
class Mura(object):
    """`MURA <https://stanfordmlgroup.github.io/projects/mura/>`_ Dataset :
    Towards Radiologist-Level Abnormality Detection in Musculoskeletal Radiographs.

    Holds per-image ground truth plus the outputs of up to five models and
    exposes metric reports computed per image, per encounter and per patient.

    The constructor parses every image file name with the class-level regular
    expressions and assembles ``self.data``, a ``pandas.DataFrame`` with the
    columns ``img``, ``encounter``, ``y_true``, ``patient``, ``study``,
    ``image_num``, ``study_type`` followed by ``y_predN`` / ``y_predN_probs``
    for every model ``N`` whose predictions were supplied.
    """
    url = "https://cs.stanford.edu/group/mlgroup/mura-v1.0.zip"
    filename = "mura-v1.0.zip"
    md5_checksum = '4c36feddb7f5698c8bf291b912c438b1'
    _patient_re = re.compile(r'patient(\d+)')
    _study_re = re.compile(r'study(\d+)')
    _image_re = re.compile(r'image(\d+)')
    _study_type_re = re.compile(r'_(\w+)_patient')

    # Number of prediction slots accepted by ``__init__`` (y_pred1 .. y_pred5).
    _n_models = 5
    # Shared tail of every metrics report; the report title is prepended.
    _metrics_template = ("\n\taccuracy : {:.3f}\tf1 : {:.3f}\tprecision : {:.3f}"
                         "\trecall : {:.3f}\tcohen_kappa : {:.3f}")

    def __init__(self, image_file_names, y_true, y_pred1=None, y_pred2=None,
                 y_pred3=None, y_pred4=None, y_pred5=None, output_path=None):
        """Parse the file names and build the per-image data frame.

        Args:
            image_file_names: sequence of image file name strings; each must
                match all four class-level regular expressions.
            y_true: per-image ground-truth labels, same order as the names.
            y_pred1 .. y_pred5: optional model outputs, one value per image.
                Each is flattened to 1-D; the flattened values are kept as
                ``self.y_predN_probability`` and their rounded integer form
                replaces ``self.y_predN``.
            output_path: destination passed to ``to_csv`` by
                ``metrics_by_encounter``.

        Raises:
            AttributeError: if a file name does not match one of the regular
                expressions (``re.search`` returns ``None``).
        """
        self.imgs = image_file_names
        self.y_true = y_true
        self.output_path = output_path

        self.patient = []
        self.study = []
        self.study_type = []
        self.image_num = []
        self.encounter = []
        self.valid = []
        for img in image_file_names:
            # Parse each component once and reuse it for the encounter key.
            patient = self._parse_patient(img)
            study = self._parse_study(img)
            image_num = self._parse_image(img)
            study_type = self._parse_study_type(img)
            split = self._parse_valid(img)

            self.patient.append(patient)
            self.study.append(study)
            self.image_num.append(image_num)
            self.study_type.append(study_type)
            self.valid.append(split)
            # NOTE(review): patient/study are ints here, so any zero padding
            # present in the file name is not reproduced in the encounter key.
            self.encounter.append("MURA-v1.1/{}/XR_{}/patient{}/study{}_{}".format(
                split, study_type, patient, study, self._parse_normal(img)))

        # unique class labels present in the ground truth
        self.classes = np.unique(self.y_true)

        columns = [
            pd.Series(np.array(image_file_names), name='img'),
            pd.Series(np.array(self.encounter), name='encounter'),
            pd.Series(np.array(y_true), name='y_true'),
            pd.Series(np.array(self.patient), name='patient'),
            pd.Series(np.array(self.study), name='study'),
            pd.Series(np.array(self.image_num), name='image_num'),
            pd.Series(np.array(self.study_type), name='study_type'),
        ]

        # One loop instead of five copy-pasted stanzas: the y_pred4 stanza
        # used to append the y_pred3 series by mistake.
        raw_predictions = (y_pred1, y_pred2, y_pred3, y_pred4, y_pred5)
        for slot, raw in enumerate(raw_predictions, start=1):
            label = 'y_pred{}'.format(slot)
            if raw is None:
                setattr(self, label, None)
                continue
            probabilities = np.asarray(raw).flatten()
            hard_labels = probabilities.round().astype(int)
            setattr(self, label + '_probability', probabilities)
            setattr(self, label, hard_labels)
            columns.append(pd.Series(hard_labels, name=label))
            columns.append(pd.Series(probabilities, name=label + '_probs'))

        self.data = pd.concat(columns, axis=1)

    def __len__(self):
        """Return the number of image file names given to the constructor."""
        return len(self.imgs)

    def _parse_normal(self, img_filename):
        """Return "positive" if the name contains "abnormal", else "negative"."""
        return "positive" if "abnormal" in img_filename else "negative"

    def _parse_valid(self, img_filename):
        """Return "valid" if the name contains "valid", else "test"."""
        return "valid" if "valid" in img_filename else "test"

    def _parse_patient(self, img_filename):
        """Return the integer following the first "patient" in the name."""
        return int(self._patient_re.search(img_filename).group(1))

    def _parse_study(self, img_filename):
        """Return the integer following the first "study" in the name."""
        return int(self._study_re.search(img_filename).group(1))

    def _parse_image(self, img_filename):
        """Return the integer following the first "image" in the name."""
        return int(self._image_re.search(img_filename).group(1))

    def _parse_study_type(self, img_filename):
        """Return the word captured between "_" and "_patient" in the name."""
        return self._study_type_re.search(img_filename).group(1)

    def _format_metrics(self, title, y_true, y_pred):
        """Return ``title`` followed by the five formatted metric scores."""
        return (title + self._metrics_template).format(
            accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            cohen_kappa_score(y_true, y_pred),
        )

    def _ensemble_prediction(self, key):
        """Average the per-group mean probability of every available model.

        Args:
            key: name of the ``self.data`` column to group by.

        Returns:
            A Series indexed by group holding the rounded average of the
            per-group means of each ``y_predN_probs`` column that exists.

        Raises:
            KeyError: if no prediction column is present.
        """
        candidates = ['y_pred{}_probs'.format(slot)
                      for slot in range(1, self._n_models + 1)]
        available = [name for name in candidates if name in self.data.columns]
        if not available:
            raise KeyError('no y_predN_probs column available; '
                           'supply at least one y_pred to Mura()')

        grouped = self.data.groupby(key)
        total = None
        # Sum in slot order so the result is bit-identical to the explicit
        # y_pred1 + ... + y_pred5 expression when all five are present.
        for name in available:
            group_mean = grouped[name].mean()
            total = group_mean if total is None else total + group_mean
        ensemble = (total / len(available)).round()
        ensemble.name = None
        return ensemble

    def metrics(self):
        """Return the per-image metrics report as a string.

        NOTE(review): only ``self.y_pred2`` is scored here, not an ensemble;
        confirm this is intended.
        """
        return self._format_metrics("per image metrics:", self.y_true, self.y_pred2)

    def metrics_by_encounter(self):
        """Score the ensemble per encounter and export the predictions.

        Side effects: writes the per-encounter labels to ``self.output_path``
        (skipped when it is ``None``) and appends ``self.data`` to
        ``data.csv`` in the current working directory.

        Returns:
            The per-encounter metrics report as a string.
        """
        y_pred = self._ensemble_prediction('encounter')

        # NOTE(review): the exported labels are the inverse (0 <-> 1) of the
        # labels used for the metrics below - confirm the expected encoding.
        inverted = (y_pred + 1) % 2
        # Index taken from y_pred itself so labels and keys cannot misalign.
        df_pred = pd.Series(np.array(inverted, np.int32), index=y_pred.index.tolist())
        if self.output_path is not None:
            df_pred.to_csv(self.output_path)
        # NOTE(review): append mode re-writes the header on every call.
        self.data.to_csv("data.csv", mode="a", header=True)

        y_true = self.data.groupby('encounter')['y_true'].mean().round()
        return self._format_metrics("per encounter metrics:", y_true, y_pred)

    def metrics_by_study_type(self):
        """Score the ensemble per patient and export the grouped data.

        NOTE(review): despite the name and the report title, rows are grouped
        by the ``patient`` column, not ``study_type`` - confirm intent.

        Side effects: appends ``self.data`` to ``data.csv`` and
        ``self.group_data`` to ``group_data.csv`` in the working directory.

        Returns:
            The metrics report as a string.
        """
        # All available models are averaged; previously models 2 and 4 were
        # computed but left out of the sum while 3 and 5 were counted twice.
        y_pred = self._ensemble_prediction('patient')
        y_true = self.data.groupby('patient')['y_true'].mean().round()

        self.data.to_csv("data.csv", mode="a", header=True)
        # NOTE(review): self.data is indexed by row position while y_pred and
        # y_true are indexed by patient id; concat aligns on the index union.
        self.group_data = pd.concat([self.data, y_pred, y_true], axis=1)
        self.group_data.to_csv("group_data.csv", mode="a", header=True)

        return self._format_metrics("per study_type metrics:", y_true, y_pred)
|
|
48 |
self.study_type.append(self._parse_study_type(img)) |
|
|
49 |
self.valid.append(self._parse_valid(img)) |
|
|
50 |
self.encounter.append("MURA-v1.1/{}/XR_{}/patient{}/study{}_{}".format( |
|
|
51 |
self._parse_valid(img), |
|
|
52 |
self._parse_study_type(img), |
|
|
53 |
self._parse_patient(img), |
|
|
54 |
self._parse_study(img), |
|
|
55 |
self._parse_normal(img))) |
|
|
56 |
|
|
|
57 |
self.classes = np.unique(self.y_true) |
|
|
58 |
df_patient = pd.Series(np.array(self.patient), name='patient') |
|
|
59 |
df_study = pd.Series(np.array(self.study), name='study') |
|
|
60 |
df_image_num = pd.Series(np.array(self.image_num), name='image_num') |
|
|
61 |
df_study_type = pd.Series(np.array(self.study_type), name='study_type') |
|
|
62 |
df_encounter = pd.Series(np.array(self.encounter), name='encounter') |
|
|
63 |
|
|
|
64 |
self.data = pd.concat( |
|
|
65 |
[ |
|
|
66 |
df_img, |
|
|
67 |
df_encounter, |
|
|
68 |
df_true, |
|
|
69 |
df_patient, |
|
|
70 |
# df_patient, |
|
|
71 |
df_study, |
|
|
72 |
df_image_num, |
|
|
73 |
df_study_type, |
|
|
74 |
], axis=1) |
|
|
75 |
|
|
|
76 |
# print(self.data) |
|
|
77 |
|
|
|
78 |
if self.y_pred1 is not None: |
|
|
79 |
self.y_pred1_probability = self.y_pred1.flatten() |
|
|
80 |
self.y_pred1 = self.y_pred1_probability.round().astype(int) |
|
|
81 |
df_y_pred1 = pd.Series(self.y_pred1, name='y_pred1') |
|
|
82 |
df_y_pred1_probability = pd.Series(self.y_pred1_probability, name='y_pred1_probs') |
|
|
83 |
self.data = pd.concat((self.data, df_y_pred1, df_y_pred1_probability), axis=1) |
|
|
84 |
|
|
|
85 |
if self.y_pred2 is not None: |
|
|
86 |
self.y_pred2_probability = self.y_pred2.flatten() |
|
|
87 |
self.y_pred2 = self.y_pred2_probability.round().astype(int) |
|
|
88 |
df_y_pred2 = pd.Series(self.y_pred2, name='y_pred2') |
|
|
89 |
df_y_pred2_probability = pd.Series(self.y_pred2_probability, name='y_pred2_probs') |
|
|
90 |
self.data = pd.concat((self.data, df_y_pred2, df_y_pred2_probability), axis=1) |
|
|
91 |
|
|
|
92 |
if self.y_pred3 is not None: |
|
|
93 |
self.y_pred3_probability = self.y_pred3.flatten() |
|
|
94 |
self.y_pred3 = self.y_pred3_probability.round().astype(int) |
|
|
95 |
df_y_pred3 = pd.Series(self.y_pred3, name='y_pred3') |
|
|
96 |
df_y_pred3_probability = pd.Series(self.y_pred3_probability, name='y_pred3_probs') |
|
|
97 |
self.data = pd.concat((self.data, df_y_pred3, df_y_pred3_probability), axis=1) |
|
|
98 |
|
|
|
99 |
if self.y_pred4 is not None: |
|
|
100 |
self.y_pred4_probability = self.y_pred4.flatten() |
|
|
101 |
self.y_pred4 = self.y_pred4_probability.round().astype(int) |
|
|
102 |
df_y_pred4 = pd.Series(self.y_pred4, name='y_pred4') |
|
|
103 |
df_y_pred4_probability = pd.Series(self.y_pred4_probability, name='y_pred4_probs') |
|
|
104 |
self.data = pd.concat((self.data, df_y_pred3, df_y_pred4_probability), axis=1) |
|
|
105 |
|
|
|
106 |
if self.y_pred5 is not None: |
|
|
107 |
self.y_pred5_probability = self.y_pred5.flatten() |
|
|
108 |
self.y_pred5 = self.y_pred5_probability.round().astype(int) |
|
|
109 |
df_y_pred5 = pd.Series(self.y_pred5, name='y_pred5') |
|
|
110 |
df_y_pred5_probability = pd.Series(self.y_pred5_probability, name='y_pred5_probs') |
|
|
111 |
self.data = pd.concat((self.data, df_y_pred5, df_y_pred5_probability), axis=1) |
|
|
112 |
|
|
|
113 |
def __len__(self): |
|
|
114 |
return len(self.imgs) |
|
|
115 |
|
|
|
116 |
def _parse_normal(self, img_filename): |
|
|
117 |
return "positive" if ("abnormal" in img_filename ) else "negative" |
|
|
118 |
|
|
|
119 |
def _parse_valid(self, img_filename): |
|
|
120 |
return "valid" if ("valid" in img_filename ) else "test" |
|
|
121 |
|
|
|
122 |
def _parse_patient(self, img_filename): |
|
|
123 |
return int(self._patient_re.search(img_filename).group(1)) |
|
|
124 |
|
|
|
125 |
def _parse_study(self, img_filename): |
|
|
126 |
return int(self._study_re.search(img_filename).group(1)) |
|
|
127 |
|
|
|
128 |
def _parse_image(self, img_filename): |
|
|
129 |
return int(self._image_re.search(img_filename).group(1)) |
|
|
130 |
|
|
|
131 |
def _parse_study_type(self, img_filename): |
|
|
132 |
return self._study_type_re.search(img_filename).group(1) |
|
|
133 |
|
|
|
134 |
def metrics(self): |
|
|
135 |
return "per image metrics:\n\taccuracy : {:.3f}\tf1 : {:.3f}\tprecision : {:.3f}\trecall : {:.3f}\tcohen_kappa : {:.3f}".format( |
|
|
136 |
accuracy_score(self.y_true, self.y_pred2), |
|
|
137 |
f1_score(self.y_true, self.y_pred2), |
|
|
138 |
precision_score(self.y_true, self.y_pred2), |
|
|
139 |
recall_score(self.y_true, self.y_pred2), |
|
|
140 |
cohen_kappa_score(self.y_true, self.y_pred2), ) |
|
|
141 |
|
|
|
142 |
def metrics_by_encounter(self): |
|
|
143 |
y_pred1 = self.data.groupby(['encounter'])['y_pred1_probs'].mean() |
|
|
144 |
y_pred2 = self.data.groupby(['encounter'])['y_pred2_probs'].mean() |
|
|
145 |
y_pred3 = self.data.groupby(['encounter'])['y_pred3_probs'].mean() |
|
|
146 |
y_pred4 = self.data.groupby(['encounter'])['y_pred4_probs'].mean() |
|
|
147 |
y_pred5 = self.data.groupby(['encounter'])['y_pred5_probs'].mean() |
|
|
148 |
week_group = (list( self.data.groupby(['encounter']).groups.keys())) |
|
|
149 |
|
|
|
150 |
y_pred = ((y_pred1 + y_pred2 + y_pred3 + y_pred4 + y_pred5)/5).round() |
|
|
151 |
y_pred_ = (y_pred + 1) % 2 |
|
|
152 |
#y_pred = y_pred.round() |
|
|
153 |
df_pred = pd.Series(np.array(y_pred_, np.int32), index=week_group) |
|
|
154 |
|
|
|
155 |
df_pred.to_csv(self.output_path) |
|
|
156 |
self.data.to_csv("data.csv", mode="a", header=True) |
|
|
157 |
|
|
|
158 |
# print(df_pred) |
|
|
159 |
#df_filename = pd.Series(np.array(week_group)) |
|
|
160 |
# self.group_data = pd.concat([df_pred]) |
|
|
161 |
|
|
|
162 |
# self.group_data.to_csv(self.output_path) |
|
|
163 |
|
|
|
164 |
y_true = self.data.groupby(['encounter'])['y_true'].mean().round() |
|
|
165 |
return "per encounter metrics:\n\taccuracy : {:.3f}\tf1 : {:.3f}\tprecision : {:.3f}\trecall : {:.3f}\tcohen_kappa : {:.3f}".format( |
|
|
166 |
accuracy_score(y_true, y_pred), |
|
|
167 |
f1_score(y_true, y_pred), |
|
|
168 |
precision_score(y_true, y_pred), |
|
|
169 |
recall_score(y_true, y_pred), |
|
|
170 |
cohen_kappa_score(y_true, y_pred), ) |
|
|
171 |
|
|
|
172 |
def metrics_by_study_type(self): |
|
|
173 |
y_pred1 = self.data.groupby(['patient'])['y_pred1_probs'].mean() |
|
|
174 |
y_pred2 = self.data.groupby(['patient'])['y_pred2_probs'].mean() |
|
|
175 |
y_pred3 = self.data.groupby(['patient'])['y_pred3_probs'].mean() |
|
|
176 |
y_pred4 = self.data.groupby(['patient'])['y_pred4_probs'].mean() |
|
|
177 |
y_pred5 = self.data.groupby(['patient'])['y_pred5_probs'].mean() |
|
|
178 |
|
|
|
179 |
y_pred = ((y_pred1 + y_pred5 + y_pred3 + y_pred3 + y_pred5)/5).round() |
|
|
180 |
# y_pred = y_pred1 |
|
|
181 |
y_true = self.data.groupby(['patient'])['y_true'].mean().round() |
|
|
182 |
|
|
|
183 |
self.data.to_csv("data.csv",mode="a",header=True) |
|
|
184 |
self.group_data = pd.concat([self.data, y_pred, y_true,], axis=1) |
|
|
185 |
self.group_data.to_csv("group_data.csv", mode="a", header=True) |
|
|
186 |
|
|
|
187 |
return "per study_type metrics:\n\taccuracy : {:.3f}\tf1 : {:.3f}\tprecision : {:.3f}\trecall : {:.3f}\tcohen_kappa : {:.3f}".format( |
|
|
188 |
accuracy_score(y_true, y_pred), |
|
|
189 |
f1_score(y_true, y_pred), |
|
|
190 |
precision_score(y_true, y_pred), |
|
|
191 |
recall_score(y_true, y_pred), |
|
|
192 |
cohen_kappa_score(y_true, y_pred), ) |