a b/mura.py
1
from __future__ import absolute_import, division, print_function
2
3
import re
4
5
import numpy as np
6
import pandas as pd
7
from sklearn.metrics import (accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score)
8
9
# Module-wide display settings: show at most 20 DataFrame rows and print
# floats with 4 digits of precision in both pandas and NumPy output.
pd.set_option('display.max_rows', 20)
# Use the fully-qualified key: the bare 'precision' pattern is ambiguous in
# pandas versions that also define 'styler.format.precision' and raises
# OptionError there.
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)
12
13
14
class Mura(object):
    """`MURA <https://stanfordmlgroup.github.io/projects/mura/>`_ Dataset :
    Towards Radiologist-Level Abnormality Detection in Musculoskeletal Radiographs.

    Collects image file names, ground-truth labels and (optionally) predicted
    scores into ``self.data`` (a ``pandas.DataFrame`` with one row per image)
    and formats classification metrics per image or per encounter.

    Every file name must contain ``patient<digits>``, ``study<digits>`` and
    ``image<digits>`` as well as a ``_<study type>_patient`` fragment; the
    ``_parse_*`` helpers raise ``AttributeError`` when a pattern is absent.

    Args:
        image_file_names: sequence of image file names / paths.
        y_true: ground-truth labels, one per image.
        y_pred: optional array-like of predicted scores, one per image.
            It is flattened, and the scores are rounded (``numpy`` rounding)
            to obtain the integer labels stored in ``self.y_pred``; the raw
            scores are kept in ``self.y_pred_probability``.
    """
    url = "https://cs.stanford.edu/group/mlgroup/mura-v1.0.zip"
    filename = "mura-v1.0.zip"
    md5_checksum = '4c36feddb7f5698c8bf291b912c438b1'
    _patient_re = re.compile(r'patient(\d+)')
    _study_re = re.compile(r'study(\d+)')
    _image_re = re.compile(r'image(\d+)')
    _study_type_re = re.compile(r'_(\w+)_patient')

    def __init__(self, image_file_names, y_true, y_pred=None):
        self.imgs = image_file_names
        self.y_true = y_true
        self.y_pred = y_pred
        self.patient = []
        self.study = []
        self.study_type = []
        self.image_num = []
        self.encounter = []
        for img in image_file_names:
            # Parse each component once and reuse it for the encounter key.
            patient = self._parse_patient(img)
            study = self._parse_study(img)
            image_num = self._parse_image(img)
            study_type = self._parse_study_type(img)
            self.patient.append(patient)
            self.study.append(study)
            self.image_num.append(image_num)
            self.study_type.append(study_type)
            # An encounter is one study of one patient for one study type.
            self.encounter.append(
                "{}_{}_{}".format(study_type, patient, study))

        # unique ground-truth classes
        self.classes = np.unique(self.y_true)

        # One column per attribute; each column name appears exactly once so
        # that ``self.data[<name>]`` always yields a Series.
        self.data = pd.concat(
            [
                pd.Series(np.array(image_file_names), name='img'),
                pd.Series(np.array(self.encounter), name='encounter'),
                pd.Series(np.array(y_true), name='y_true'),
                pd.Series(np.array(self.patient), name='patient'),
                pd.Series(np.array(self.study), name='study'),
                pd.Series(np.array(self.image_num), name='image_num'),
                pd.Series(np.array(self.study_type), name='study_type'),
            ], axis=1)

        if y_pred is not None:
            # np.asarray lets callers pass plain lists as well as ndarrays.
            self.y_pred_probability = np.asarray(y_pred).flatten()
            self.y_pred = self.y_pred_probability.round().astype(int)
            self.data = pd.concat(
                (
                    self.data,
                    pd.Series(self.y_pred, name='y_pred'),
                    pd.Series(self.y_pred_probability, name='y_pred_probs'),
                ), axis=1)

    def __len__(self):
        """Return the number of images."""
        return len(self.imgs)

    def _parse_patient(self, img_filename):
        """Return the integer following ``patient`` in *img_filename*."""
        return int(self._patient_re.search(img_filename).group(1))

    def _parse_study(self, img_filename):
        """Return the integer following ``study`` in *img_filename*."""
        return int(self._study_re.search(img_filename).group(1))

    def _parse_image(self, img_filename):
        """Return the integer following ``image`` in *img_filename*."""
        return int(self._image_re.search(img_filename).group(1))

    def _parse_study_type(self, img_filename):
        """Return the text captured between ``_`` and ``_patient``."""
        return self._study_type_re.search(img_filename).group(1)

    def _require_predictions(self):
        """Raise ``ValueError`` if the instance was built without ``y_pred``."""
        if self.y_pred is None:
            raise ValueError(
                "metrics require predictions; pass y_pred when creating Mura")

    @staticmethod
    def _format_metrics(title, y_true, y_pred):
        """Format accuracy, f1, precision, recall and Cohen's kappa.

        All five scores are computed from the same ``y_true`` / ``y_pred``
        pair so the report is internally consistent.
        """
        return (
            "{}:\n\taccuracy : {:.2f}\tf1 : {:.2f}\tprecision : {:.2f}"
            "\trecall : {:.2f}\tcohen_kappa : {:.2f}"
        ).format(
            title,
            accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            cohen_kappa_score(y_true, y_pred), )

    def metrics(self):
        """Return a formatted report of the per-image metrics.

        Raises:
            ValueError: if no predictions were supplied.
        """
        self._require_predictions()
        return self._format_metrics(
            "per image metrics", self.y_true, self.y_pred)

    def metrics_by_encounter(self):
        """Return a formatted report of the per-encounter metrics.

        Scores and labels are averaged over the images of each encounter and
        then rounded, giving one prediction and one label per encounter.

        Raises:
            ValueError: if no predictions were supplied.
        """
        self._require_predictions()
        by_encounter = self.data.groupby('encounter')
        y_pred = by_encounter['y_pred_probs'].mean().round()
        y_true = by_encounter['y_true'].mean().round()
        # Kappa is computed on the aggregated labels too, not the per-image
        # ones, so it matches the other four scores.
        return self._format_metrics("per encounter metrics", y_true, y_pred)
107
    # def metrics_by_study_type(self):
108
    #     y_pred = self.data.groupby(['study_type', 'encounter'])['y_pred_probs'].mean().round()
109
    #     y_true = self.data.groupby(['study_type', 'encounter'])['y_true'].mean().round()
110
    #     return "per study_type metrics:\n\taccuracy : {:.2f}\tf1 : {:.2f}\tprecision : {:.2f}\trecall : {:.2f}".format(
111
    #         accuracy_score(y_true, y_pred),
112
    #         f1_score(y_true, y_pred),
113
    #         precision_score(y_true, y_pred),
114
    #         recall_score(y_true, y_pred), )