[853718]: / bm_ANHIR / generate_regist_pairs.py

Download this file

229 lines (202 with data), 8.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Creating cover file for configuring registration image pairs for ANHIR dataset.
The paths and all other constants are set to run on CMP grid.
Copyright (C) 2016-2019 Jiri Borovec <jiri.borovec@fel.cvut.cz>
"""
import glob
import logging
import os
import sys
from functools import partial
import pandas as pd
import tqdm
sys.path += [os.path.abspath('.'), os.path.abspath('..')] # Add path to root
from birl.benchmark import ImRegBenchmark
from birl.utilities.data_io import image_sizes, update_path
from birl.utilities.dataset import generate_pairing, IMAGE_EXTENSIONS
# default locations of the ANHIR dataset on the CMP grid
DATASET_IMAGES = '/datagrid/Medical/dataset_ANHIR/images_private'
DATASET_LANDMARKS = '/datagrid/Medical/dataset_ANHIR/landmarks_all'
DATASET_COVERS = '/datagrid/Medical/dataset_ANHIR/images'
# template for the exported cover-table name, filled with the scale name
NAME_COVER_FILE = 'dataset_%s.csv'
GENERATED_SCALES = (5, 10, 15, 20, 25, 50, 100)
# template for the per-scale sub-folder name (percent of the full size)
NAME_DIR_SCALE = 'scale-%ipc'
# define datasets scale size names and the shift in GENERATED_SCALES
SCALE_NAMES = (
    'small',  # just thumbnail about 2500 image edge
    'medium',  # the image edge should have abound 10k
)
# tissues where every sample has its landmarks available (fully public)
DATASET_TISSUE_SCALE_COMPLETE = {
    'lung-lesion_[1,3]': {
        'small': 5,
        'medium': 50,
    },
    'lung-lesion_2': {
        'small': 5,
        'medium': 25,
    },
    'lung-lobes_*': {
        'small': 5,
        'medium': 100,
    },
    'mammary-gland_*': {
        'small': 5,
        'medium': 25,
    },
}
# tissues where some samples are withheld as hidden test cases
DATASET_TISSUE_SCALE_PARTIAL = {
    'mice-kidney_*': {
        'small': 5,
        'medium': 25,
    },
    'COAD_*': {
        'small': 5,
        'medium': 25,
    },
    'gastric_*': {
        'small': 2,
        'medium': 15,
    },
    'breast_*': {
        'small': 2,
        'medium': 20,
    },
    'kidney_*': {
        'small': 5,
        'medium': 25,
    },
}
# the whole dataset = complete tissues plus the partially-hidden ones
DATASET_TISSUE_SCALE = DATASET_TISSUE_SCALE_COMPLETE.copy()
DATASET_TISSUE_SCALE.update(DATASET_TISSUE_SCALE_PARTIAL)
# each N-th sample of a partial tissue is considered a test case
HIDE_TEST_TISSUE_STEP = 3
# columns that must exist (empty) in the cover table for later filling
COLUMNS_EMPTY = (ImRegBenchmark.COL_POINTS_REF_WARP, ImRegBenchmark.COL_POINTS_MOVE_WARP, ImRegBenchmark.COL_TIME)
# labels marking a registration pair as train / test
VAL_STATUS_TRAIN = 'training'
VAL_STATUS_TEST = 'evaluation'
def get_relative_paths(paths, path_base):
    """ transform paths to relative according to the given base path

    :param list(str) paths: collection of paths
    :param str path_base: base folder that is stripped from the input paths
    :return list(str): sorted paths relative to `path_base`
    """
    # NOTE: os.path.relpath strips only the leading base folder;
    # the previous str.replace() approach also removed any later occurrence
    # of the base string inside a path (e.g. '/data/b/data.csv' with
    # base '/data' became 'b.csv' instead of 'b/data.csv')
    paths_r = [os.path.relpath(p, path_base) for p in sorted(paths)]
    return paths_r
def list_landmarks_images(path_tissue, sc, path_landmarks, path_images):
    """ list corresponding landmark and image paths for one tissue/scale

    :param str path_tissue: path to a tissue - image set
    :param int sc: used scale
    :param str path_landmarks: root folder with landmarks
    :param str path_images: root folder with images
    :return tuple(list(str),list(str)): relative landmark and image paths
    """
    pattern_csv = os.path.join(path_tissue, NAME_DIR_SCALE % sc, '*.csv')
    rp_lnds = get_relative_paths(glob.glob(pattern_csv), path_landmarks)
    if not rp_lnds:
        logging.debug('found no landmarks for: %s', pattern_csv)
        return [], []
    rp_lnds_filter = []
    paths_imgs = []
    for rp_lnd in rp_lnds:
        # look up any image sharing the landmark's base name
        name_pattern = os.path.splitext(rp_lnd)[0] + '.*'
        candidates = glob.glob(os.path.join(path_images, name_pattern))
        candidates = [p for p in candidates
                      if os.path.splitext(p)[-1] in IMAGE_EXTENSIONS]
        if not candidates:
            logging.warning('missing image for "%s"', rp_lnd)
            continue
        rp_lnds_filter.append(rp_lnd)
        # take the alphabetically first match if several extensions exist
        paths_imgs.append(sorted(candidates)[0])
    rp_imgs = get_relative_paths(paths_imgs, path_images)
    return rp_lnds_filter, rp_imgs
def generate_reg_pairs(rp_imgs, rp_lnds, pairs, public, path_images=DATASET_IMAGES):
    """ format registration pairs as dictionaries/rows for the cover table

    :param list(str) rp_imgs: relative paths to images
    :param rp_lnds: relative paths to related landmarks
    :param list(tuple(int,int)) pairs: pairing among images/landmarks
    :param list(bool) public: marks whether the particular pair is training or evaluation
    :param str path_images: path to the dataset folder
    :return list(dict): registration pairs
    """
    rows = []
    for idx, (i_ref, i_move) in enumerate(pairs):
        # the size/diagonal recorded for the pair comes from the reference image
        size, diag = image_sizes(update_path(rp_imgs[i_ref], pre_path=path_images))
        status = VAL_STATUS_TRAIN if public[idx] else VAL_STATUS_TEST
        row = {
            ImRegBenchmark.COL_IMAGE_REF: rp_imgs[i_ref],
            ImRegBenchmark.COL_IMAGE_MOVE: rp_imgs[i_move],
            ImRegBenchmark.COL_POINTS_REF: rp_lnds[i_ref],
            ImRegBenchmark.COL_POINTS_MOVE: rp_lnds[i_move],
            ImRegBenchmark.COL_STATUS: status,
            ImRegBenchmark.COL_IMAGE_SIZE: size,
            ImRegBenchmark.COL_IMAGE_DIAGONAL: diag,
        }
        rows.append(row)
    return rows
def create_dataset_cover(name, dataset, path_images, path_landmarks, path_out, step_hide_landmarks, tissue_partial):
    """ generate cover CSV file for particular dataset size/scale

    :param str name: name of selected scale
    :param dict({scale_name: float}) dataset: definition of dataset
        first level key is name of the tissue,
        next dictionary is composed from scale name and used scale in percents
    :param str path_images: path to folder with images
    :param str path_landmarks: path to folder with landmarks
    :param str path_out: path to output directory
    :param int step_hide_landmarks: take each N-th image/landmark out as a test case
    :param list(str) tissue_partial: tissue patterns with partially hidden cases
    :raises ValueError: if the landmark and image lists are inconsistent
    """
    # expand each tissue glob pattern into its existing directories
    tissues = [
        (tissue, p) for tissue in sorted(dataset)
        for p in glob.glob(os.path.join(path_landmarks, tissue))
        if os.path.isdir(p)
    ]
    logging.debug('found: %r', sorted({os.path.basename(tp[1]) for tp in tissues}))
    reg_pairs = []
    for tissue, p_tissue in tqdm.tqdm(sorted(tissues)):
        sc = dataset[tissue][name]
        rp_lnds, rp_imgs = list_landmarks_images(p_tissue, sc, path_landmarks, path_images)
        if len(rp_lnds) != len(rp_imgs):
            raise ValueError('the list of landmarks and images does not match')
        # only tissues marked as "partial" hide each N-th case as a test
        step_hide_lnds = step_hide_landmarks if tissue in tissue_partial else None
        pairs, pub = generate_pairing(len(rp_lnds), step_hide_lnds)
        # FIX: forward the configured image folder; previously the call relied
        # on generate_reg_pairs' default DATASET_IMAGES, silently ignoring
        # the `path_images` argument of this function
        reg_pairs += generate_reg_pairs(rp_imgs, rp_lnds, pairs, pub, path_images=path_images)
    df_overview = pd.DataFrame(reg_pairs)
    # pre-create the columns later filled by the benchmark run
    for col in COLUMNS_EMPTY:
        df_overview[col] = None
    path_csv = os.path.join(path_out, NAME_COVER_FILE % name)
    logging.info('exporting CSV: %s', path_csv)
    df_overview.to_csv(path_csv)
def main(path_images, path_landmarks, path_out, step_lnds, dataset, tissue_partial, scale_names):
    """ the main entry point: export one cover table per requested scale

    :param str path_images: path to folder with images
    :param str path_landmarks: path to folder with landmarks
    :param str path_out: path to output directory
    :param int step_lnds: take each N-th image/landmark out as a test case
    :param dict({scale_name: float}) dataset: definition of dataset
        first level key is name of the tissue,
        next dictionary is composed from scale name and used scale in percents
    :param list(str) tissue_partial: names of tissues which will have partially hidden cases
        also consider a testing tissues
    :param list(str) scale_names: name of chosen scales
    """
    # call the cover generation directly with keywords instead of
    # pre-binding them via functools.partial
    for sc_name in scale_names:
        create_dataset_cover(
            sc_name,
            dataset=dataset,
            path_images=path_images,
            path_landmarks=path_landmarks,
            path_out=path_out,
            step_hide_landmarks=step_lnds,
            tissue_partial=tissue_partial,
        )
if __name__ == '__main__':
    # verbose logging so per-tissue progress is visible on the grid
    logging.basicConfig(level=logging.DEBUG)
    logging.info('running...')
    main(
        path_images=DATASET_IMAGES,
        path_landmarks=DATASET_LANDMARKS,
        path_out=DATASET_COVERS,
        step_lnds=HIDE_TEST_TISSUE_STEP,
        dataset=DATASET_TISSUE_SCALE,
        scale_names=SCALE_NAMES,
        tissue_partial=DATASET_TISSUE_SCALE_PARTIAL.keys(),
    )
    logging.info('Done :]')