NIHChestXRay / Git / [f391e7] /NIH-Chest-X-ray-dataset.py

Datasets:
GeorgeSullivan/
NIHChestXRay
Downloads: 1
[f391e7]: / NIH-Chest-X-ray-dataset.py
History
Download this file
225 lines (179 with data), 6.6 kB

# Copyright 2022 Cristóbal Alcázar
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NIH Chest X-ray Dataset"""


import os
import datasets

from requests import get
from pandas import read_csv

# Module-level logger obtained from the `datasets` logging helpers; used by the
# builder below for progress messages.
logger = datasets.logging.get_logger(__name__)

# BibTeX entry for the dataset; passed as `citation` to `datasets.DatasetInfo`.
# The text is emitted verbatim, so its whitespace is part of the value.
_CITATION = """\
@inproceedings{Wang_2017,
	doi = {10.1109/cvpr.2017.369},
	url = {https://doi.org/10.1109%2Fcvpr.2017.369},
	year = 2017,
	month = {jul},
	publisher = {{IEEE}
},
	author = {Xiaosong Wang and Yifan Peng and Le Lu and Zhiyong Lu and Mohammadhadi Bagheri and Ronald M. Summers},
	title = {{ChestX}-Ray8: Hospital-Scale Chest X-Ray Database and Benchmarks on Weakly-Supervised Classification and Localization of Common Thorax Diseases},
	booktitle = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})}
}
"""


# Human-readable summary; passed as `description` to `datasets.DatasetInfo`.
# NOTE(review): the text states 100,000 images -- confirm this figure against
# the published dataset size before relying on it.
_DESCRIPTION = """\
The NIH Chest X-ray dataset consists of 100,000 de-identified images of chest x-rays. The images are in PNG format.

The data is provided by the NIH Clinical Center and is available through the NIH download site: https://nihcc.app.box.com/v/ChestXray-NIHCC
"""


# Dataset homepage; passed as `homepage` to `datasets.DatasetInfo`.
# NOTE(review): the path casing differs from the URL quoted in _DESCRIPTION
# (chestxray-nihcc vs ChestXray-NIHCC) -- presumably both resolve; verify.
_HOMEPAGE = "https://nihcc.app.box.com/v/chestxray-nihcc"


# Base URL under which every data file referenced by this script is resolved.
_REPO = "https://huggingface.co/datasets/alkzar90/NIH-Chest-X-ray-dataset/resolve/main/data"


# The images are published as twelve zip archives named
# images_001.zip ... images_012.zip, listed here in ascending order.
_IMAGE_URLS = [
    f"{_REPO}/images/images_{batch_number:03d}.zip"
    for batch_number in range(1, 13)
]
# Commented-out alternative entries retained from the original list:
#'https://huggingface.co/datasets/alkzar90/NIH-Chest-X-ray-dataset/resolve/main/dummy/0.0.0/images_001.tar.gz',
#'https://huggingface.co/datasets/alkzar90/NIH-Chest-X-ray-dataset/resolve/main/dummy/0.0.0/images_002.tar.gz'


# Every remote resource the builder can fetch, keyed by a short name.
# The first four entries are single files that live directly under _REPO.
_URLS = {
    key: f"{_REPO}/{file_name}"
    for key, file_name in (
        ("train_val_list", "train_val_list.txt"),
        ("test_list", "test_list.txt"),
        ("labels", "Data_Entry_2017_v2020.csv"),
        ("BBox", "BBox_List_2017.csv"),
    )
}
# The image archives are a list of URLs rather than a single file.
_URLS["image_urls"] = _IMAGE_URLS


# Maps each finding label to its integer class index. The index is simply the
# label's position in the ordered sequence below ("No Finding" is class 0).
_LABEL2IDX = {
    label: index
    for index, label in enumerate(
        (
            "No Finding",
            "Atelectasis",
            "Cardiomegaly",
            "Effusion",
            "Infiltration",
            "Mass",
            "Nodule",
            "Pneumonia",
            "Pneumothorax",
            "Consolidation",
            "Edema",
            "Emphysema",
            "Fibrosis",
            "Pleural_Thickening",
            "Hernia",
        )
    )
}


# Class names in index order, as consumed by the ClassLabel feature.
_NAMES = list(_LABEL2IDX)


class ChestXray14Config(datasets.BuilderConfig):
    """Builder configuration used by the NIH ChestX-ray14 dataset variants."""

    def __init__(self, name, **kwargs):
        """Create a configuration called ``name``.

        The version ("1.0.0") and the description are fixed by this class;
        any further keyword arguments are forwarded unchanged to
        ``datasets.BuilderConfig``.
        """
        fixed_fields = {
            "version": datasets.Version("1.0.0"),
            "description": "NIH ChestX-ray14",
        }
        super().__init__(name=name, **fixed_fields, **kwargs)



class ChestXray14(datasets.GeneratorBasedBuilder):
    """NIH Image Chest X-ray14 dataset.

    Two configurations are declared:

    * ``image-classification`` -- every example is an image together with
      the list of its finding labels.
    * ``object-detection`` -- the feature schema is declared in ``_info``,
      but ``_generate_examples`` does not produce any example for it.
    """

    BUILDER_CONFIGS = [
        ChestXray14Config("image-classification"),
        ChestXray14Config("object-detection"),
    ]

    def _info(self):
        """Build the ``DatasetInfo`` for the selected configuration.

        Returns:
            A ``datasets.DatasetInfo`` whose features and supervised keys
            depend on ``self.config.name``.

        Raises:
            ValueError: if the configuration name is not one of the names
                declared in ``BUILDER_CONFIGS``.
        """
        config_name = self.config.name

        if config_name == "image-classification":
            features = datasets.Features(
                {
                    "image": datasets.Image(),
                    # An image can carry several findings, hence a sequence.
                    "labels": datasets.features.Sequence(
                        datasets.features.ClassLabel(
                            num_classes=len(_NAMES),
                            names=_NAMES,
                        )
                    ),
                }
            )
            keys = ("image", "labels")
        elif config_name == "object-detection":
            features = datasets.Features(
                {
                    "image_id": datasets.Value("string"),
                    "patient_id": datasets.Value("int32"),
                    "image": datasets.Image(),
                    "width": datasets.Value("int32"),
                    "height": datasets.Value("int32"),
                }
            )
            object_dict = {
                "image_id": datasets.Value("string"),
                "area": datasets.Value("int64"),
                "bbox": datasets.Sequence(datasets.Value("float32"), length=4),
            }
            # A one-element list declares "a list of such objects".
            features["objects"] = [object_dict]
            keys = ("image", "objects")
        else:
            # Previously an unknown name fell through both branches and
            # surfaced as an UnboundLocalError on `features`.
            raise ValueError(f"Unsupported configuration name: {config_name!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=keys,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the data and assign every extracted file to a split.

        A file goes to the train split when its base name is listed in the
        train/val list; every other file goes to the test split.

        Args:
            dl_manager: the download manager supplied by ``datasets``.

        Returns:
            A list with the TRAIN and TEST ``datasets.SplitGenerator``s.
        """
        # Fetch the list through the download manager (instead of a raw HTTP
        # request) so it benefits from the same download handling as the
        # rest of the data.
        logger.info("Downloading the train_val_list image names")
        train_val_path = dl_manager.download(_URLS["train_val_list"])
        with open(train_val_path, encoding="utf-8") as list_file:
            train_val_list = {
                line.strip() for line in list_file if line.strip()
            }
        # Log only the size: the set holds one entry per image file name.
        logger.info("Loaded %d train_val_list image names", len(train_val_list))

        # Download the labels CSV once here so that _generate_examples does
        # not have to read it from the remote URL for every split.
        labels_path = dl_manager.download(_URLS["labels"])

        train_files = []
        test_files = []

        # Download and extract every image archive.
        data_files = dl_manager.download_and_extract(_URLS["image_urls"])

        # Walk through each extracted archive and route every file to the
        # train or test split.
        for batch in data_files:
            logger.info("Batch for data_files: %s", batch)
            for img in dl_manager.iter_files(batch):
                if os.path.basename(img) in train_val_list:
                    train_files.append(img)
                else:
                    test_files.append(img)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"files": train_files, "labels_path": labels_path},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"files": test_files, "labels_path": labels_path},
            ),
        ]

    def _generate_examples(self, files, labels_path=None):
        """Yield ``(key, example)`` pairs for the given image files.

        Only the ``image-classification`` configuration produces examples;
        for any other configuration a warning is logged and nothing is
        yielded.

        Args:
            files: paths of the image files that belong to the split.
            labels_path: location of the labels CSV. When omitted, the CSV
                is read directly from ``_URLS["labels"]``.

        Yields:
            Tuples ``(index, {"image": path, "labels": [label, ...]})`` where
            ``index`` is the position of the file in ``files``.
        """
        if self.config.name != "image-classification":
            logger.warning(
                "Example generation is not implemented for configuration %r; "
                "no examples will be produced",
                self.config.name,
            )
            return

        label_csv = read_csv(
            labels_path if labels_path is not None else _URLS["labels"]
        )

        # Build the image -> labels lookup in a single pass so that each image
        # costs one dict access instead of a scan of the whole table.
        # setdefault keeps the first row for a given image, matching the
        # previous `.values[0]` selection.
        labels_by_image = {}
        for image_id, findings in zip(
            label_csv["Image Index"], label_csv["Finding Labels"]
        ):
            labels_by_image.setdefault(image_id, findings)

        for i, path in enumerate(files):
            file_name = os.path.basename(path)

            # Check the extension before the lookup: non-image files have no
            # row in the CSV and used to trigger an IndexError.
            if not file_name.endswith(".png"):
                continue

            findings = labels_by_image.get(file_name)
            if findings is None:
                logger.warning(
                    "No labels found for image %s; skipping it", file_name
                )
                continue

            yield i, {
                "image": path,
                # Multiple findings are stored as a "|"-separated string.
                "labels": findings.split("|"),
            }