|
a |
|
b/scripts/download_data.py |
|
|
1 |
#==============================================================================# |
|
|
2 |
# Author: Dominik Müller # |
|
|
3 |
# Copyright: 2020 IT-Infrastructure for Translational Medical Research, # |
|
|
4 |
# University of Augsburg # |
|
|
5 |
# # |
|
|
6 |
# This program is free software: you can redistribute it and/or modify # |
|
|
7 |
# it under the terms of the GNU General Public License as published by # |
|
|
8 |
# the Free Software Foundation, either version 3 of the License, or # |
|
|
9 |
# (at your option) any later version. # |
|
|
10 |
# # |
|
|
11 |
# This program is distributed in the hope that it will be useful, # |
|
|
12 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of # |
|
|
13 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # |
|
|
14 |
# GNU General Public License for more details. # |
|
|
15 |
# # |
|
|
16 |
# You should have received a copy of the GNU General Public License # |
|
|
17 |
# along with this program. If not, see <http://www.gnu.org/licenses/>. # |
|
|
18 |
#==============================================================================# |
|
|
19 |
#-----------------------------------------------------# |
|
|
20 |
# Library imports # |
|
|
21 |
#-----------------------------------------------------# |
|
|
22 |
import requests |
|
|
23 |
from tqdm import tqdm |
|
|
24 |
import os |
|
|
25 |
import zipfile |
|
|
26 |
|
|
|
27 |
#-----------------------------------------------------# |
|
|
28 |
# Configurations # |
|
|
29 |
#-----------------------------------------------------# |
|
|
30 |
# Directory in which the data set is stored
path_data = "data"
# Zenodo download links: CT volumes archive and lung/infection mask archive
url_vol = ("https://zenodo.org/record/3757476/files/"
           "COVID-19-CT-Seg_20cases.zip?download=1")
url_seg = ("https://zenodo.org/record/3757476/files/"
           "Lung_and_Infection_Mask.zip?download=1")
|
|
35 |
|
|
|
36 |
#-----------------------------------------------------# |
|
|
37 |
# Download Function # |
|
|
38 |
#-----------------------------------------------------# |
|
|
39 |
# Author: Shenghan Gao (wy193777) |
|
|
40 |
# Modifications: MCrazy |
|
|
41 |
# Source: https://gist.github.com/wy193777/0e2a4932e81afc6aa4c8f7a2984f34e2 |
|
|
42 |
def download_from_url(url, dst):
    """
    Download a file to disk with a progress bar. If dst already holds a
    partial download, only the missing bytes are requested and appended.

    @param: url to download file
    @param: dst place to put the file
    @return: size of the remote file in bytes (taken from Content-Length)
    @raise: requests.HTTPError if the server answers with an error status
    @raise: KeyError if the server does not report a Content-Length
    """
    # requests.head() does not follow redirects by default; follow them so the
    # Content-Length describes the actual file and not a redirect response
    head = requests.head(url, allow_redirects=True)
    head.raise_for_status()
    file_size = int(head.headers["Content-Length"])
    # Number of bytes already on disk from an earlier (possibly aborted) run
    first_byte = os.path.getsize(dst) if os.path.exists(dst) else 0
    if first_byte >= file_size:
        print("WARNING: Skipping download due to files are already there.")
        return file_size
    # Byte ranges are inclusive, so the last valid index is file_size - 1;
    # leaving the end open simply requests everything that is still missing
    header = {"Range": "bytes=%s-" % first_byte}
    with requests.get(url, headers=header, stream=True) as req:
        # Never write an HTTP error page into the destination file
        req.raise_for_status()
        # Only a 206 answer honours the Range header. Any other success
        # status carries the complete file, so start over instead of
        # appending the full content to the partial data
        if req.status_code != 206:
            first_byte = 0
        mode = 'ab' if first_byte else 'wb'
        with open(dst, mode) as f, tqdm(
                total=file_size, initial=first_byte,
                unit='B', unit_scale=True, desc=url.split('/')[-1]) as pbar:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    # Count the bytes actually received; a chunk can be
                    # shorter than chunk_size (at least the final one)
                    pbar.update(len(chunk))
    return file_size
|
|
67 |
|
|
|
68 |
#-----------------------------------------------------# |
|
|
69 |
# Runner Code # |
|
|
70 |
#-----------------------------------------------------# |
|
|
71 |
# Create data structure
# exist_ok avoids the race-prone "check first, then create" pattern
os.makedirs(path_data, exist_ok=True)

# Download CT volumes and save them into the data directory
path_vol_zip = os.path.join(path_data, "volumes.zip")
print("INFO:", "Downloading Volumes")
download_from_url(url_vol, path_vol_zip)
# Download segmentations and save them into the data directory
path_seg_zip = os.path.join(path_data, "segmentations.zip")
print("INFO:", "Downloading Segmentations")
download_from_url(url_seg, path_seg_zip)
|
|
82 |
|
|
|
83 |
# Extract sample list from the ZIP file
print("INFO:", "Obtain sample list from the volumes ZIP file")
# Open both archives once and keep them open for the whole extraction,
# instead of re-opening (and re-parsing) them for every single sample
with zipfile.ZipFile(path_vol_zip, "r") as zip_vol, \
        zipfile.ZipFile(path_seg_zip, "r") as zip_seg:
    sample_list = zip_vol.namelist()

    # Iterate over the sample list and extract each sample from the ZIP files
    print("INFO:", "Extracting data from ZIP files")
    for sample in tqdm(sample_list):
        # Skip if file does not end with nii.gz
        if not sample.endswith(".nii.gz"):
            continue
        # Create sample directory (named like the file without its extension)
        path_sample = os.path.join(path_data, sample[:-len(".nii.gz")])
        os.makedirs(path_sample, exist_ok=True)
        # Extract volume and store file into the sample directory.
        # os.replace overwrites a file left over from an earlier run on every
        # platform, whereas os.rename fails on Windows if the target exists
        zip_vol.extract(sample, path_sample)
        os.replace(os.path.join(path_sample, sample),
                   os.path.join(path_sample, "imaging.nii.gz"))
        # Extract segmentation and store file into the sample directory
        # (the segmentation archive is looked up by the same member name)
        zip_seg.extract(sample, path_sample)
        os.replace(os.path.join(path_sample, sample),
                   os.path.join(path_sample, "segmentation.nii.gz"))
|
|
106 |
|
|
|
107 |
# Delete both downloaded ZIP archives to free disk space
for zip_path in (path_vol_zip, path_seg_zip):
    os.remove(zip_path)

# Report completion on the console
print("INFO:", "Finished file structure creation")