# Copyright 2022 Cristóbal Alcázar
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NIH Chest X-ray Dataset"""


import os

import datasets
from pandas import read_csv
# NOTE: `get` is no longer used in this module (downloads go through the
# download manager); the import is kept so the name stays importable.
from requests import get  # noqa: F401

logger = datasets.logging.get_logger(__name__)

_CITATION = """\
@inproceedings{Wang_2017,
	doi = {10.1109/cvpr.2017.369},
	url = {https://doi.org/10.1109%2Fcvpr.2017.369},
	year = 2017,
	month = {jul},
	publisher = {{IEEE}
},
	author = {Xiaosong Wang and Yifan Peng and Le Lu and Zhiyong Lu and Mohammadhadi Bagheri and Ronald M. Summers},
	title = {{ChestX}-Ray8: Hospital-Scale Chest X-Ray Database and Benchmarks on Weakly-Supervised Classification and Localization of Common Thorax Diseases},
	booktitle = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})}
}
"""


_DESCRIPTION = """\
The NIH Chest X-ray dataset consists of 100,000 de-identified images of chest x-rays. The images are in PNG format.

The data is provided by the NIH Clinical Center and is available through the NIH download site: https://nihcc.app.box.com/v/ChestXray-NIHCC
"""


_HOMEPAGE = "https://nihcc.app.box.com/v/chestxray-nihcc"


_REPO = "https://huggingface.co/datasets/alkzar90/NIH-Chest-X-ray-dataset/resolve/main/data"


# The images are shipped as twelve zip archives: images_001.zip .. images_012.zip.
_IMAGE_URLS = [f"{_REPO}/images/images_{index:03d}.zip" for index in range(1, 13)]


_URLS = {
    "train_val_list": f"{_REPO}/train_val_list.txt",
    "test_list": f"{_REPO}/test_list.txt",
    "labels": f"{_REPO}/Data_Entry_2017_v2020.csv",
    "BBox": f"{_REPO}/BBox_List_2017.csv",
    "image_urls": _IMAGE_URLS,
}


_LABEL2IDX = {
    "No Finding": 0,
    "Atelectasis": 1,
    "Cardiomegaly": 2,
    "Effusion": 3,
    "Infiltration": 4,
    "Mass": 5,
    "Nodule": 6,
    "Pneumonia": 7,
    "Pneumothorax": 8,
    "Consolidation": 9,
    "Edema": 10,
    "Emphysema": 11,
    "Fibrosis": 12,
    "Pleural_Thickening": 13,
    "Hernia": 14,
}


# Class names in index order (dicts preserve insertion order).
_NAMES = list(_LABEL2IDX)


def _load_finding_labels(csv_path):
    """Build a lookup from image file name to its list of finding labels.

    Args:
        csv_path: Path or URL of the labels CSV. Only the "Image Index" and
            "Finding Labels" columns are read.

    Returns:
        A dict mapping each "Image Index" value to the "Finding Labels" value
        split on "|". If an image name occurs on several rows, the first row
        is used.
    """
    table = read_csv(csv_path, usecols=["Image Index", "Finding Labels"])
    table = table.drop_duplicates(subset="Image Index", keep="first")
    return {
        image_name: findings.split("|")
        for image_name, findings in zip(table["Image Index"], table["Finding Labels"])
    }


class ChestXray14Config(datasets.BuilderConfig):
    """NIH Image Chest X-ray14 configuration."""

    def __init__(self, name, **kwargs):
        """Create a configuration with a fixed version and description.

        Args:
            name: Configuration name.
            **kwargs: Forwarded unchanged to `datasets.BuilderConfig`.
        """
        super().__init__(
            version=datasets.Version("1.0.0"),
            name=name,
            description="NIH ChestX-ray14",
            **kwargs,
        )


class ChestXray14(datasets.GeneratorBasedBuilder):
    """NIH Image Chest X-ray14 dataset."""

    BUILDER_CONFIGS = [
        ChestXray14Config("image-classification"),
        ChestXray14Config("object-detection"),
    ]

    def _info(self):
        """Describe the features and supervised keys of the selected config.

        Raises:
            ValueError: If the configuration name is not one of the names
                declared in `BUILDER_CONFIGS`.
        """
        if self.config.name == "image-classification":
            features = datasets.Features(
                {
                    "image": datasets.Image(),
                    "labels": datasets.features.Sequence(
                        datasets.features.ClassLabel(
                            num_classes=len(_NAMES),
                            names=_NAMES,
                        )
                    ),
                }
            )
            keys = ("image", "labels")
        elif self.config.name == "object-detection":
            object_dict = {
                "image_id": datasets.Value("string"),
                "area": datasets.Value("int64"),
                "bbox": datasets.Sequence(datasets.Value("float32"), length=4),
            }
            features = datasets.Features(
                {
                    "image_id": datasets.Value("string"),
                    "patient_id": datasets.Value("int32"),
                    "image": datasets.Image(),
                    "width": datasets.Value("int32"),
                    "height": datasets.Value("int32"),
                    "objects": [object_dict],
                }
            )
            keys = ("image", "objects")
        else:
            # Without this branch `features` would be unbound below.
            raise ValueError(f"Unknown configuration name: {self.config.name!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=keys,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the data and assign every extracted file to a split.

        A file goes to the train split when its base name is listed in the
        train/val list; every other file goes to the test split (the
        "test_list" URL is not consulted).

        Args:
            dl_manager: Download manager supplied by `datasets`.

        Returns:
            A list with the TRAIN and TEST `datasets.SplitGenerator`s.
        """
        # Fetch the small metadata files through the download manager rather
        # than an ad-hoc HTTP request, so failures surface as download errors.
        logger.info("Downloading the train_val_list image names")
        metadata = dl_manager.download(
            {
                "train_val_list": _URLS["train_val_list"],
                "labels": _URLS["labels"],
            }
        )
        with open(metadata["train_val_list"], encoding="utf-8") as list_file:
            train_val_names = {line.strip() for line in list_file if line.strip()}
        # Log the size only; the full set is far too long for a log line.
        logger.info("Check train_val_list: %d image names", len(train_val_names))

        # Download and extract the image archives.
        archive_dirs = dl_manager.download_and_extract(_URLS["image_urls"])

        # Iterate through the extracted folders and check whether each file
        # belongs to the train set or the test set.
        train_files = []
        test_files = []
        for archive_dir in archive_dirs:
            logger.info("Batch for data_files: %s", archive_dir)
            for image_path in dl_manager.iter_files(archive_dir):
                if os.path.basename(image_path) in train_val_names:
                    train_files.append(image_path)
                else:
                    test_files.append(image_path)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "files": train_files,
                    "labels_path": metadata["labels"],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "files": test_files,
                    "labels_path": metadata["labels"],
                },
            ),
        ]

    def _generate_examples(self, files, labels_path=None):
        """Yield `(key, example)` pairs for the given image files.

        Only the "image-classification" configuration produces examples; any
        other configuration yields nothing.

        Args:
            files: Paths of the image files belonging to the split.
            labels_path: Path of the labels CSV. When omitted, the CSV is
                read from `_URLS["labels"]`.

        Yields:
            Tuples `(index, {"image": path, "labels": [label, ...]})`, where
            `index` is the position of the file in `files`. Files that do not
            end in ".png", or that have no row in the labels CSV, are skipped.
        """
        if self.config.name != "image-classification":
            # TODO: example generation for the other configurations is not
            # implemented; the split is left empty.
            logger.warning(
                "No examples are generated for configuration %r", self.config.name
            )
            return

        # Read the CSV once and index it by image name, instead of scanning
        # the whole table for every image.
        source = labels_path if labels_path is not None else _URLS["labels"]
        findings_by_image = _load_finding_labels(source)

        for index, path in enumerate(files):
            file_name = os.path.basename(path)
            # Check the extension before the lookup so stray non-image files
            # in the archives are ignored rather than looked up.
            if not file_name.endswith(".png"):
                continue
            image_labels = findings_by_image.get(file_name)
            if image_labels is None:
                logger.warning("No labels found for %s; skipping it", file_name)
                continue
            yield index, {
                "image": path,
                "labels": image_labels,
            }