Diff of /scripts/download_data.py [000000] .. [2afb35]

Switch to unified view

a b/scripts/download_data.py
1
#==============================================================================#
2
#  Author:       Dominik Müller                                                #
3
#  Copyright:    2020 IT-Infrastructure for Translational Medical Research,    #
4
#                University of Augsburg                                        #
5
#                                                                              #
6
#  This program is free software: you can redistribute it and/or modify        #
7
#  it under the terms of the GNU General Public License as published by        #
8
#  the Free Software Foundation, either version 3 of the License, or           #
9
#  (at your option) any later version.                                         #
10
#                                                                              #
11
#  This program is distributed in the hope that it will be useful,             #
12
#  but WITHOUT ANY WARRANTY; without even the implied warranty of              #
13
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
14
#  GNU General Public License for more details.                                #
15
#                                                                              #
16
#  You should have received a copy of the GNU General Public License           #
17
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
18
#==============================================================================#
19
#-----------------------------------------------------#
20
#                   Library imports                   #
21
#-----------------------------------------------------#
22
import requests
23
from tqdm import tqdm
24
import os
25
import zipfile
26
27
#-----------------------------------------------------#
28
#                    Configurations                   #
29
#-----------------------------------------------------#
30
# Local directory that receives the downloaded archives and, later, one
# sub-directory per extracted sample
path_data = "data"
# Zenodo download links of the data set (record 3757476):
#   url_vol -> ZIP archive with the CT volumes
#   url_seg -> ZIP archive with the lung and infection masks
url_vol = "https://zenodo.org/record/3757476/files/COVID-19-CT-Seg_20cases.zip?download=1"
url_seg = "https://zenodo.org/record/3757476/files/Lung_and_Infection_Mask.zip?download=1"
35
36
#-----------------------------------------------------#
37
#                  Download Function                  #
38
#-----------------------------------------------------#
39
# Author: Shenghan Gao (wy193777)
40
# Modifications: MCrazy
41
# Source: https://gist.github.com/wy193777/0e2a4932e81afc6aa4c8f7a2984f34e2
42
def download_from_url(url, dst):
    """
    Download a file via HTTP(S) into dst while showing a progress bar.

    If dst already exists and is smaller than the remote file, the missing
    part is requested with an HTTP Range header and appended. If dst is
    already at least as large as the remote file, nothing is downloaded.

    @param: url to download file
    @param: dst place to put the file
    @return: size of the remote file in bytes (Content-Length header)
    @raise: KeyError if the server does not send a Content-Length header
    @raise: requests.HTTPError if the server answers with an error status
    """
    # requests.head() does not follow redirects by default; follow them so
    # that Content-Length describes the actual file and not a redirect reply
    head = requests.head(url, allow_redirects=True)
    head.raise_for_status()
    file_size = int(head.headers["Content-Length"])

    # Number of bytes which are already on disk from a previous attempt
    first_byte = os.path.getsize(dst) if os.path.exists(dst) else 0
    if first_byte >= file_size:
        print("WARNING: Skipping download due to files are already there.")
        return file_size

    # Open-ended range: byte offsets are inclusive, so the last valid offset
    # would be file_size-1; "N-" simply asks for everything from N onwards
    header = {"Range": "bytes=%s-" % first_byte}
    with requests.get(url, headers=header, stream=True) as req:
        # Never write the body of an error response into the target file
        req.raise_for_status()
        if req.status_code == 206:
            # Partial content: the server honoured the range -> append
            mode = 'ab'
        else:
            # The server ignored the range and sends the complete file, so
            # appending would corrupt dst -> start over from the beginning
            mode = 'wb'
            first_byte = 0
        with tqdm(total=file_size, initial=first_byte,
                  unit='B', unit_scale=True,
                  desc=url.split('/')[-1]) as pbar, open(dst, mode) as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    # Chunks can be shorter than chunk_size (e.g. the last
                    # one), therefore count the bytes actually received
                    pbar.update(len(chunk))
    return file_size
67
68
#-----------------------------------------------------#
69
#                     Runner Code                     #
70
#-----------------------------------------------------#
71
# Create the data directory (no error if it already exists)
os.makedirs(path_data, exist_ok=True)

# Download CT volumes and save them into the data directory
path_vol_zip = os.path.join(path_data, "volumes.zip")
print("INFO:", "Downloading Volumes")
download_from_url(url_vol, path_vol_zip)
# Download segmentations and save them into the data directory
path_seg_zip = os.path.join(path_data, "segmentations.zip")
print("INFO:", "Downloading Segmentations")
download_from_url(url_seg, path_seg_zip)

# Open both archives only once instead of once per sample, which avoids
# re-reading the ZIP directory of each archive in every loop iteration
print("INFO:", "Obtain sample list from the volumes ZIP file")
with zipfile.ZipFile(path_vol_zip, "r") as zip_vol, \
     zipfile.ZipFile(path_seg_zip, "r") as zip_seg:
    # Extract sample list from the volumes ZIP file
    sample_list = zip_vol.namelist()

    # Iterate over the sample list and extract each sample from the ZIP files
    print("INFO:", "Extracting data from ZIP files")
    for sample in tqdm(sample_list):
        # Skip if file does not end with nii.gz
        if not sample.endswith(".nii.gz"):
            continue
        # Create sample directory (named like the file without ".nii.gz")
        path_sample = os.path.join(path_data, sample[:-len(".nii.gz")])
        os.makedirs(path_sample, exist_ok=True)
        # Extract volume and store file into the sample directory.
        # extract() returns the normalized path it actually created, and
        # os.replace() overwrites an existing target on every platform, so
        # re-running the script does not fail on Windows
        path_extracted = zip_vol.extract(sample, path_sample)
        os.replace(path_extracted,
                   os.path.join(path_sample, "imaging.nii.gz"))
        # Extract segmentation (same member name as the volume) and store
        # file into the sample directory
        path_extracted = zip_seg.extract(sample, path_sample)
        os.replace(path_extracted,
                   os.path.join(path_sample, "segmentation.nii.gz"))

# Remove the ZIP files to free disk space
os.remove(path_vol_zip)
os.remove(path_seg_zip)

# Final info to console
print("INFO:", "Finished file structure creation")