[38391a]: / download_and_convert_mura.py

Download this file

119 lines (92 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import re
import os
from os import getcwd
from os.path import exists, isdir, isfile, join
import shutil
import numpy as np
import pandas as pd
class ImageString(object):
    """Parsed view of an image path taken from the dataset CSV files.

    The path must contain the components ``XR_<type>``, ``patient<N>``,
    ``study<N>`` and ``image<N>``; for example
    ``.../XR_SHOULDER/patient00001/study1_positive/image1.png``.

    Attributes:
        img_filename: the path exactly as passed in.
        patient: patient number as an int (leading zeros are dropped).
        study: study number as an int.
        image: image number as an int.
        image_num: alias of ``image``, kept for backward compatibility.
        study_type: the word characters following ``XR_`` (e.g. ``SHOULDER``).
        normal: ``"normal"`` when the path contains ``"negative"``,
            otherwise ``"abnormal"``.

    Raises:
        ValueError: if any of the required components is missing from the
            path.
    """

    _patient_re = re.compile(r'patient(\d+)')
    _study_re = re.compile(r'study(\d+)')
    _image_re = re.compile(r'image(\d+)')
    _study_type_re = re.compile(r'XR_(\w+)')

    def __init__(self, img_filename):
        self.img_filename = img_filename
        self.patient = self._parse_patient()
        self.study = self._parse_study()
        # Parse the image number once; both attribute names are kept because
        # either may be used by existing callers.
        self.image_num = self._parse_image()
        self.study_type = self._parse_study_type()
        self.image = self.image_num
        self.normal = self._parse_normal()

    def flat_file_name(self):
        """Return a flat ``.png`` file name encoding label, type and ids."""
        return "{}_{}_patient{}_study{}_image{}.png".format(
            self.normal, self.study_type, self.patient, self.study, self.image)

    def _search(self, pattern, what):
        """Return group 1 of ``pattern`` in the path or raise ``ValueError``."""
        match = pattern.search(self.img_filename)
        if match is None:
            raise ValueError(
                "cannot find {} in image path {!r}".format(what, self.img_filename))
        return match.group(1)

    def _parse_patient(self):
        return int(self._search(self._patient_re, "patient number"))

    def _parse_study(self):
        return int(self._search(self._study_re, "study number"))

    def _parse_image(self):
        return int(self._search(self._image_re, "image number"))

    def _parse_study_type(self):
        return self._search(self._study_type_re, "study type (XR_*)")

    def _parse_normal(self):
        return "normal" if ("negative" in self.img_filename) else "abnormal"
# processed
# data
# ├── train
# │   ├── abnormal
# │   └── normal
# └── val
#     ├── abnormal
#     └── normal
# Destination tree for the flattened copy, rooted in the working directory.
proc_data_dir = join(os.getcwd(), 'data')
proc_train_dir = join(proc_data_dir, 'train')
proc_val_dir = join(proc_data_dir, 'val')

# Data loading code
# Original dataset layout, expected under ./MURA-v1.0 in the working directory.
orig_data_dir = join(os.getcwd(), 'MURA-v1.0')
train_dir = join(orig_data_dir, 'train')
train_csv = join(orig_data_dir, 'train.csv')
val_dir = join(orig_data_dir, 'valid')
val_csv = join(orig_data_dir, 'valid.csv')
test_dir = join(orig_data_dir, 'test')

# Fail early and name the missing path. Explicit raises are used instead of
# `assert` so the checks still run under `python -O`.
for _required_dir in (orig_data_dir, train_dir, val_dir, test_dir):
    if not isdir(_required_dir):
        raise IOError("required directory not found: {}".format(_required_dir))
for _required_csv in (train_csv, val_csv):
    # isfile() already implies existence, so no separate exists() check.
    if not isfile(_required_csv):
        raise IOError("required CSV file not found: {}".format(_required_csv))
# Training split: copy every image listed in train.csv into
# <proc_train_dir>/<normal|abnormal>/ under a flat, unique file name.
df = pd.read_csv(train_csv, names=['img', 'label'], header=None)
# following datasets/folder.py's weird convention here...
samples = [tuple(x) for x in df.values]
for img, label in samples:
    # The path and the CSV label must agree ("negative" <=> label 0).
    # Compare by value: `label is 0` only works through CPython's small-int
    # cache and is always False for non-Python-int labels (e.g. numpy ints).
    if ("negative" in img) != (label == 0):
        raise ValueError(
            "label {!r} disagrees with image path {!r}".format(label, img))
    enc = ImageString(img)
    cat_dir = join(proc_train_dir, enc.normal)
    study_dir = join(proc_data_dir, enc.study_type)
    study_dir2 = join(study_dir, enc.normal)
    # makedirs also creates missing parents (the data root, the train dir and
    # the per-study-type dir), which os.mkdir would fail on.
    for _target_dir in (cat_dir, study_dir2):
        if not os.path.exists(_target_dir):
            os.makedirs(_target_dir)
    shutil.copy2(enc.img_filename, join(cat_dir, enc.flat_file_name()))
    # Per-study-type copy is disabled; only the directories are created.
    # shutil.copy2(enc.img_filename, join(study_dir2, enc.flat_file_name()))
# Validation split: every row of valid.csv is checked and parsed, but nothing
# is written -- the directory creation and copy steps below are disabled.
# NOTE(review): the layout comment earlier in this file lists val/normal and
# val/abnormal; confirm whether these steps should be re-enabled.
df = pd.read_csv(val_csv, names=['img', 'label'], header=None)
samples = [tuple(x) for x in df.values]
for img, label in samples:
    # Compare by value: `label is 0` only works through CPython's small-int
    # cache and is always False for non-Python-int labels (e.g. numpy ints).
    if ("negative" in img) != (label == 0):
        raise ValueError(
            "label {!r} disagrees with image path {!r}".format(label, img))
    enc = ImageString(img)
    cat_dir = join(proc_val_dir, enc.normal)
    study_dir = join(proc_data_dir, enc.study_type)
    study_dir2 = join(study_dir, enc.normal)
    # if not os.path.exists(cat_dir):
    #     os.mkdir(cat_dir)
    # if not os.path.exists(study_dir):
    #     os.mkdir(study_dir)
    # if not os.path.exists(study_dir2):
    #     os.mkdir(study_dir2)
    # shutil.copy2(enc.img_filename, join(cat_dir, enc.flat_file_name()))
    # shutil.copy2(enc.img_filename, join(study_dir2, enc.flat_file_name()))