# dataset_generation.py
"""
# ===============Part_2: Dataset Production===============

To do: <Strategy>
    1. Read images/labels/labelsJSON path list
        Input:
            images[] (.png)
            labelsJSON[] (.json)
            labels[] (.txt)
    2. Create YOLO dataset:
        [1] split set & copy files
        [2] create config file (.yaml)
        Structure:
                dataset:(dir: sub_dir)
                        train/val/test: images
                                        labels -> YOLO TXT (annotation file)
                                        labelsJson
                dataset.yaml (config file)
                        -> format:
                                train: trainset_absolute_path
                                val: valset_absolute_path
                                # test: testset_absolute_path
                                nc: num(classes)
                                names: ['names_class', ...]
"""
27
import os
28
from sklearn.model_selection import train_test_split
29
from tqdm import tqdm  # progress bar
30
import shutil
31
32
# Model families a dataset can be produced for; make_dataset() rejects others.
MODELTYPES = ["YOLO", "UNET"]
33
34
35
def make_dataset(
    model, data_path, train_size=0.8, test_set=True
):
    """Create a ``<model>Dataset`` directory tree next to this script.

    Reads matched image/mask/label/labelJSON files from *data_path*, splits
    them into train/val(/test) sets, copies the files into the dataset tree,
    and (for YOLO) writes a ``dataset.yaml`` config file.

    Parameters:
        model (str): One of MODELTYPES ("YOLO" or "UNET"); anything else
            prints a hint and returns without doing work.
        data_path (str): Directory holding ``images``, ``masks``, ``labels``
            and ``labelsJSON`` sub-directories with matching file stems.
        train_size (float): Fraction of the samples used for training.
        test_set (bool): If True, the held-out fraction is split evenly
            between val and test; if False no test split is materialized.
    """
    # Resolve the output folder for the requested model type.
    # (Fix: dropped the needless `global MODELTYPES` — `global` is not
    # required for read-only access to a module-level name.)
    if model in MODELTYPES:
        dataset_path = os.path.join(os.path.dirname(__file__), f"{str(model)}Dataset")
        if model == "YOLO":
            yaml_path = os.path.join(dataset_path, "dataset.yaml")
    else:
        print(
            f"Missing creating {str(model)}'s Dataset solution. Recommended: {', '.join(MODELTYPES)}"
        )
        return
    ensure_dir(dataset_path)

    images_path = os.path.join(data_path, "images")
    masks_path = os.path.join(data_path, "masks")
    labels_path = os.path.join(data_path, "labels")
    labelsJSON_path = os.path.join(data_path, "labelsJSON")

    # 1. Collect only samples whose mask/label/labelJSON counterparts exist.
    images, masks, labels, labelsJSON = [], [], [], []
    for image_name in os.listdir(images_path):
        image_path = os.path.join(images_path, image_name)
        mask_path = os.path.join(masks_path, image_name)

        name, _ = os.path.splitext(image_name)
        # Fix: os.path.join() with a single argument was a no-op; plain
        # concatenation states the intent directly.
        label_path = os.path.join(labels_path, name + ".txt")
        labelJSON_path = os.path.join(labelsJSON_path, name + ".json")

        if all(map(os.path.exists, [mask_path, label_path, labelJSON_path])):
            images.append(image_path)
            masks.append(mask_path)
            labels.append(label_path)
            labelsJSON.append(labelJSON_path)
        else:
            print("The initial data in images/masks/labels/labelsJSON do not correspond")

    # 2.1.1 Split the aligned quadruples into train/val/test sets.
    data = list(zip(images, masks, labels, labelsJSON))

    if test_set:
        # val and test each receive half of the held-out fraction (a count,
        # which sklearn interprets as an absolute number of samples).
        holdout_count = int(len(images) * (1 - train_size) / 2)
        train_val_data, test_data = train_test_split(
            data, test_size=holdout_count, random_state=42
        )
        train_data, val_data = train_test_split(
            train_val_data, test_size=holdout_count, random_state=42
        )
    else:
        train_data, val_data = train_test_split(
            data, test_size=1 - train_size, random_state=42
        )  # test_size here is a ratio
        test_data = []

    def _unpack(split):
        # Robustness fix: zip(*[]) yields nothing, which would break the
        # 4-way unpacking below — guard empty splits explicitly.
        return tuple(zip(*split)) if split else ([], [], [], [])

    train_images, train_masks, train_labels, train_labelsJSON = _unpack(train_data)
    val_images, val_masks, val_labels, val_labelsJSON = _unpack(val_data)
    test_images, test_masks, test_labels, test_labelsJSON = _unpack(test_data)

    # Which files belong to which split/sub-folder.
    dataset_structure = {
        "train": {
            "images": train_images,
            "masks": train_masks,
            "labels": train_labels,
            "labelsJson": train_labelsJSON,
        },
        "val": {
            "images": val_images,
            "masks": val_masks,
            "labels": val_labels,
            "labelsJson": val_labelsJSON,
        },
        "test": {
            "images": test_images,
            "masks": test_masks,
            "labels": test_labels,
            "labelsJson": test_labelsJSON,
        },
    }

    # 2.1.2 Create the sub-folders and copy the files into them.
    dataset_paths = {}
    for base_dir, sub_dirs in dataset_structure.items():
        dataset_paths[base_dir] = {}
        for sub_dir, file_list in sub_dirs.items():
            # Record the path even for skipped folders: the YAML writer
            # below still reads dataset_paths["test"]["images"].
            sub_dir_path = os.path.join(dataset_path, base_dir, sub_dir)
            dataset_paths[base_dir][sub_dir] = sub_dir_path
            if base_dir == "test" and not test_set:
                continue
            # YOLO consumes txt/json labels; UNET consumes masks only.
            if sub_dir == "masks" and model == "YOLO":
                continue
            if sub_dir in ("labels", "labelsJson") and model == "UNET":
                continue

            ensure_dir(sub_dir_path)
            copy_files(sub_dir_path, file_list)

    # 2.2 Create config file (.yaml)
    if model == "YOLO":
        """ YAML
        format:
            train: trainset_absolute_path
            val: valset_absolute_path
            # test: testset_absolute_path
            nc: num(classes)
            names: ['names_class', ...]
        """
        # NOTE(review): class id 1 with nc=1 looks off for YOLO (class ids
        # are conventionally 0-based) — confirm against the label files.
        class_mapping = {"pneumonia": 1}
        yaml_entries = {
            "train": os.path.abspath(dataset_paths["train"]["images"]),
            "val": os.path.abspath(dataset_paths["val"]["images"]),
            "test": os.path.abspath(dataset_paths["test"]["images"]),
            "nc": len(class_mapping),
            "names": list(class_mapping.keys()),
        }
        if not test_set:
            del yaml_entries["test"]
        with open(yaml_path, "w") as yaml_output:
            # Fix: replaced the fragile manual idx/newline counter with a
            # join — same layout (newline between rows, none after the last).
            yaml_output.write(
                "\n".join(f"{key}: {value}" for key, value in yaml_entries.items())
            )

    print(f"\n{str(model)}Dataset was successfully created.")
178
179
def ensure_dir(file_path):
    """Create directory *file_path* (including parents) if it does not exist.

    Fix: uses ``os.makedirs(..., exist_ok=True)`` instead of a separate
    ``os.path.exists`` check, closing the check-then-create (TOCTOU) race
    of the original.
    """
    os.makedirs(file_path, exist_ok=True)
182
183
def copy_files(sub_dir_path, file_list):
    """Copy each file in *file_list* into *sub_dir_path*, with a progress bar."""
    for src_path in tqdm(file_list, desc=sub_dir_path):
        dst_path = os.path.join(sub_dir_path, os.path.basename(src_path))
        shutil.copy(src_path, dst_path)
190
191
192
# Fix: guard the script entry point so importing this module does not
# trigger dataset creation as a side effect.
if __name__ == "__main__":
    data_path = "./data/mosmed/data_preprocessing/"
    # make_dataset("YOLO", data_path, train_size=0.8, test_set=False)
    make_dataset("UNET", data_path, train_size=0.9, test_set=False)