Diff of /mura.py [000000] .. [38391a]

Switch to unified view

a b/mura.py
1
from __future__ import absolute_import, division, print_function
2
3
import re
4
5
import numpy as np
6
import pandas as pd
7
from sklearn.metrics import (accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score)
8
9
# Console display settings for pandas and numpy output.
# Option keys are spelled out in full: the abbreviated key 'precision' is
# not accepted by current pandas releases, while 'display.precision' is the
# fully-qualified name of the same option.
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)
12
13
14
class Mura(object):
    """`MURA <https://stanfordmlgroup.github.io/projects/mura/>`_ Dataset :
    Towards Radiologist-Level Abnormality Detection in Musculoskeletal Radiographs.

    Gathers image file names, ground-truth labels and up to five sets of
    model predictions into one ``pandas.DataFrame`` (``self.data``) and
    reports classification metrics per image or per group of images.

    Patient, study, image number and study type are extracted from each file
    name with the class-level regular expressions below; a file name that
    lacks one of these patterns makes the constructor raise
    ``AttributeError`` (``re.search`` returns ``None``).
    """
    url = "https://cs.stanford.edu/group/mlgroup/mura-v1.0.zip"
    filename = "mura-v1.0.zip"
    md5_checksum = '4c36feddb7f5698c8bf291b912c438b1'
    _patient_re = re.compile(r'patient(\d+)')
    _study_re = re.compile(r'study(\d+)')
    _image_re = re.compile(r'image(\d+)')
    _study_type_re = re.compile(r'_(\w+)_patient')

    # Number of prediction slots (y_pred1 .. y_pred5) accepted by __init__.
    _n_models = 5

    def __init__(self, image_file_names, y_true, y_pred1=None, y_pred2=None, y_pred3=None, y_pred4=None, y_pred5=None, output_path=None):
        """Build the per-image table from file names, labels and predictions.

        Args:
            image_file_names: sequence of image file names to parse.
            y_true: ground-truth labels, one per image.
            y_pred1..y_pred5: optional arrays of predicted probabilities.
                Each supplied array is flattened; the flattened values are
                kept as ``self.y_predN_probability`` and their rounded
                integer form replaces ``self.y_predN``.
            output_path: destination passed to ``to_csv`` by
                ``metrics_by_encounter``.
        """
        self.imgs = image_file_names
        df_img = pd.Series(np.array(image_file_names), name='img')
        self.y_true = y_true
        df_true = pd.Series(np.array(y_true), name='y_true')
        self.y_pred1 = y_pred1
        self.y_pred2 = y_pred2
        self.y_pred3 = y_pred3
        self.y_pred4 = y_pred4
        self.y_pred5 = y_pred5
        self.output_path = output_path

        self.patient = []
        self.study = []
        self.study_type = []
        self.image_num = []
        self.encounter = []
        self.valid = []
        for img in image_file_names:
            # Parse every field once and reuse it for the encounter key.
            patient = self._parse_patient(img)
            study = self._parse_study(img)
            image_num = self._parse_image(img)
            study_type = self._parse_study_type(img)
            split = self._parse_valid(img)

            self.patient.append(patient)
            self.study.append(study)
            self.image_num.append(image_num)
            self.study_type.append(study_type)
            self.valid.append(split)
            self.encounter.append("MURA-v1.1/{}/XR_{}/patient{}/study{}_{}".format(
                split,
                study_type,
                patient,
                study,
                self._parse_normal(img)))

        # unique class labels present in the ground truth
        self.classes = np.unique(self.y_true)

        self.data = pd.concat(
            [
                df_img,
                pd.Series(np.array(self.encounter), name='encounter'),
                df_true,
                pd.Series(np.array(self.patient), name='patient'),
                pd.Series(np.array(self.study), name='study'),
                pd.Series(np.array(self.image_num), name='image_num'),
                pd.Series(np.array(self.study_type), name='study_type'),
            ], axis=1)

        # One shared code path for all slots; the previous copy-pasted
        # branches attached the y_pred3 column in the y_pred4 branch.
        for slot in range(1, self._n_models + 1):
            self._attach_predictions(slot)

    def _attach_predictions(self, slot):
        """Add columns ``y_pred<slot>`` and ``y_pred<slot>_probs`` to ``self.data``.

        Does nothing when the corresponding ``self.y_pred<slot>`` is None.
        """
        attr = 'y_pred{}'.format(slot)
        raw = getattr(self, attr)
        if raw is None:
            return
        probabilities = np.asarray(raw).flatten()
        labels = probabilities.round().astype(int)
        setattr(self, attr + '_probability', probabilities)
        setattr(self, attr, labels)
        self.data = pd.concat(
            (
                self.data,
                pd.Series(labels, name=attr),
                pd.Series(probabilities, name=attr + '_probs'),
            ), axis=1)

    def __len__(self):
        """Return the number of image file names given to the constructor."""
        return len(self.imgs)

    def _parse_normal(self, img_filename):
        """Return "positive" if the name contains "abnormal", else "negative"."""
        return "positive" if ("abnormal" in img_filename) else "negative"

    def _parse_valid(self, img_filename):
        """Return "valid" if the name contains "valid", else "test"."""
        return "valid" if ("valid" in img_filename) else "test"

    def _parse_patient(self, img_filename):
        """Return the integer following "patient" in the file name."""
        return int(self._patient_re.search(img_filename).group(1))

    def _parse_study(self, img_filename):
        """Return the integer following "study" in the file name."""
        return int(self._study_re.search(img_filename).group(1))

    def _parse_image(self, img_filename):
        """Return the integer following "image" in the file name."""
        return int(self._image_re.search(img_filename).group(1))

    def _parse_study_type(self, img_filename):
        """Return the text matched between an underscore and "_patient"."""
        return self._study_type_re.search(img_filename).group(1)

    def _ensemble_probability(self, group_key):
        """Return the mean over all prediction slots of the per-group mean probability.

        ``self.data`` is grouped by ``group_key``; every ``y_predN_probs``
        column must exist, otherwise pandas raises ``KeyError``.
        """
        grouped = self.data.groupby([group_key])
        total = grouped['y_pred1_probs'].mean()
        for slot in range(2, self._n_models + 1):
            total = total + grouped['y_pred{}_probs'.format(slot)].mean()
        return total / self._n_models

    def _format_metrics(self, title, y_true, y_pred):
        """Return the one-line metric report shared by the ``metrics*`` methods."""
        return "{}:\n\taccuracy : {:.3f}\tf1 : {:.3f}\tprecision : {:.3f}\trecall : {:.3f}\tcohen_kappa : {:.3f}".format(
            title,
            accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            cohen_kappa_score(y_true, y_pred), )

    def metrics(self):
        """Return a per-image metric report comparing ``y_true`` with ``y_pred2``.

        NOTE(review): only the second prediction slot is evaluated here;
        confirm whether that is intended or whether another slot or the
        ensemble should be reported.
        """
        return self._format_metrics("per image metrics", self.y_true, self.y_pred2)

    def metrics_by_encounter(self):
        """Return a metric report with images aggregated by the ``encounter`` column.

        Side effects: writes one label per encounter to ``self.output_path``
        and appends ``self.data`` to ``data.csv`` in the working directory.
        All five prediction slots must have been supplied.
        """
        y_pred = self._ensemble_probability('encounter').round()

        # The exported labels are the inverse (0 <-> 1) of the rounded
        # ensemble prediction, while the metrics below use the non-inverted
        # values. NOTE(review): confirm the consumer of the CSV expects this.
        y_pred_ = (y_pred + 1) % 2
        # Label rows with the index of the aggregated series itself so the
        # encounter names are guaranteed to line up with the values.
        df_pred = pd.Series(np.array(y_pred_, np.int32), index=list(y_pred.index))

        df_pred.to_csv(self.output_path)
        self.data.to_csv("data.csv", mode="a", header=True)

        y_true = self.data.groupby(['encounter'])['y_true'].mean().round()
        return self._format_metrics("per encounter metrics", y_true, y_pred)

    def metrics_by_study_type(self):
        """Return a metric report with images aggregated by the ``patient`` column.

        Despite the method name and report title, grouping is done on
        ``patient``. NOTE(review): confirm whether ``study_type`` was meant.

        Side effects: appends ``self.data`` to ``data.csv`` and the combined
        table (stored as ``self.group_data``) to ``group_data.csv``.
        All five prediction slots must have been supplied.
        """
        # Average all five models; the earlier expression counted models 3
        # and 5 twice and left out models 2 and 4.
        y_pred = self._ensemble_probability('patient').round()
        y_true = self.data.groupby(['patient'])['y_true'].mean().round()

        self.data.to_csv("data.csv", mode="a", header=True)
        self.group_data = pd.concat([self.data, y_pred, y_true, ], axis=1)
        self.group_data.to_csv("group_data.csv", mode="a", header=True)

        return self._format_metrics("per study_type metrics", y_true, y_pred)