|
a |
|
b/toolscripts/train_valid_split.py |
|
|
1 |
import os, glob |
|
|
2 |
import random |
|
|
3 |
import shutil |
|
|
4 |
from tqdm.auto import tqdm |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
def print_info(string, writepath):
    """Print ``string`` to stdout and append it as a line to ``writepath``.

    Args:
        string: Message to emit.
        writepath: Path of the log file. It is opened in append mode, so
            earlier content is kept and the file is created if missing.
    """
    print(string)
    # Close the handle deterministically instead of leaving it to the
    # garbage collector (the previous bare ``open`` was never closed).
    with open(writepath, "a") as log_file:
        print(string, file=log_file)
|
|
10 |
|
|
|
11 |
|
|
|
12 |
## Use when all you care about is a plain random split |
|
|
13 |
def split_image_folder(source_root, target_root, portion=2, ratio=[0.8, 0.2]):
    """Randomly split each class folder's PNG files into ``portion`` groups.

    Every sub-directory of ``source_root`` is treated as one class. Its
    ``*.png`` files are shuffled with a fixed seed and copied to
    ``<target_root>/group<k>/<class>/`` for ``k = 1..portion``. Per-class and
    per-group counts are printed and appended to
    ``<target_root>/split-info.txt``.

    Args:
        source_root: Directory containing one sub-directory per class.
        target_root: Output directory; created if it does not exist.
        portion: Number of groups to produce.
        ratio: Fraction of each class assigned to each group; needs
            ``portion`` entries summing to 1. The default list is only
            read, never mutated.

    Returns:
        None

    Raises:
        AssertionError: If ``ratio`` has the wrong length or does not sum to 1.
    """
    random.seed(73)
    assert len(ratio) == portion, "ratio needs exactly one entry per portion"
    # Compare with a tolerance: ratios such as [0.7, 0.2, 0.1] do not add up
    # to exactly 1.0 in floating point.
    assert abs(sum(ratio) - 1) < 1e-9, "ratio must sum to 1"

    os.makedirs(target_root, exist_ok=True)
    info_path = os.path.join(target_root, "split-info.txt")

    # Sort so the seeded shuffle is reproducible (listdir order is arbitrary),
    # and ignore stray files sitting next to the class directories.
    class_folders = sorted(
        entry for entry in os.listdir(source_root)
        if os.path.isdir(os.path.join(source_root, entry))
    )

    for cf in class_folders:
        class_dir = os.path.join(source_root, cf)
        # glob order is filesystem dependent; sort before the seeded shuffle.
        imnames = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(class_dir, "*.png"))
        )
        random.shuffle(imnames)

        group_sizes = [round(len(imnames) * r) for r in ratio]
        begin = 0
        outnames = []
        for size in group_sizes:
            # Slicing clamps at the end of the list, so an over-estimate
            # caused by rounding simply leaves later groups shorter.
            outnames.append(imnames[begin:begin + size])
            begin += size

        # Rounding may also under-count; hand the leftovers out round-robin.
        for j in range(begin, len(imnames)):
            outnames[j % portion].append(imnames[j])

        print_info(f"ClassFolder: {cf}, Count: {len(imnames)}", info_path)

        for o, grp in tqdm(enumerate(outnames), total=len(outnames)):
            # The count is formatted into the message: print_info takes
            # exactly (string, writepath), so passing it separately fails.
            print_info(f"Group-{o+1}: {len(grp)}", info_path)

            opath = os.path.join(target_root, f"group{o+1}", cf)
            os.makedirs(opath, exist_ok=True)
            for g in grp:
                shutil.copy(os.path.join(class_dir, g), opath)

    return None
|
|
44 |
|
|
|
45 |
|
|
|
46 |
|
|
|
47 |
## Ensures a given ID never appears in both the train and valid splits within a class |
|
|
48 |
def split_image_folder_byID(source_root, target_root,
                            ratio=[0.8, 0.2], ):
    """Split each class folder into train/valid groups without sharing IDs.

    Written just for the two-folder logic (train, valid). Within every class
    sub-directory of ``source_root`` the ``*.png`` files are grouped by ID,
    where the ID is the part of the file name before the first underscore.
    Whole ID groups are drawn at random (fixed seed) into the valid set until
    it holds at least ``ratio[-1]`` of the class's images; all remaining IDs
    form the train set. Train files are copied to
    ``<target_root>/group1/<class>/`` and valid files to
    ``<target_root>/group2/<class>/``. A summary line per class is printed
    and appended to ``<target_root>/split-info.txt``.

    Args:
        source_root: Directory containing one sub-directory per class.
        target_root: Output directory; created if it does not exist.
        ratio: Split fractions summing to 1 with at most two entries; only
            the last entry (valid fraction) is used. The default list is
            only read, never mutated.

    Returns:
        None

    Raises:
        AssertionError: If ``ratio`` does not sum to 1 or has more than two
            entries.
    """

    def parse_study_id(string):
        # The ID is whatever precedes the first underscore in the file name.
        return string.split("_")[0]

    def grouper(name_list):
        """Map each ID to ``{"count": n, "files": [...]}``, largest first."""
        groups = {}
        for name in name_list:
            entry = groups.setdefault(parse_study_id(name),
                                      {"count": 0, "files": []})
            entry["count"] += 1
            entry["files"].append(name)
        return dict(sorted(groups.items(),
                           key=lambda item: item[1]["count"],
                           reverse=True))

    random.seed(73)
    # Compare with a tolerance: float ratios rarely sum to exactly 1.0.
    assert abs(sum(ratio) - 1) < 1e-9, "ratio must sum to 1"
    assert len(ratio) <= 2, "only a train/valid split is supported"

    os.makedirs(target_root, exist_ok=True)
    info_path = os.path.join(target_root, "split-info.txt")

    # Sort so the seeded draw is reproducible (listdir order is arbitrary),
    # and ignore stray files sitting next to the class directories.
    class_folders = sorted(
        entry for entry in os.listdir(source_root)
        if os.path.isdir(os.path.join(source_root, entry))
    )

    for cf in class_folders:
        class_dir = os.path.join(source_root, cf)
        imnames = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(class_dir, "*.png"))
        )
        total_count = len(imnames)
        grouped_dic = grouper(imnames)

        #TODO: rewrite logic for N splits when life bestows time
        # Shuffle a sorted list of IDs and consume it in order. This replaces
        # random.sample(dict.keys(), 1), which raises TypeError on
        # Python 3.11+ because the population must be a sequence.
        candidate_ids = sorted(grouped_dic)
        random.shuffle(candidate_ids)

        valid_target = total_count * ratio[-1]
        curr_count = 0
        valid_dic = {}
        for study_id in candidate_ids:
            if curr_count >= valid_target:
                break
            item = grouped_dic.pop(study_id)
            valid_dic[study_id] = item
            curr_count += item["count"]

        # Guard the reported ratio against classes that contain no PNG files.
        valid_ratio = curr_count / total_count if total_count else 0.0
        print_info(f'Class:{cf},Tot:{total_count} Train:{total_count - curr_count} Valid:{curr_count} ratio:{valid_ratio}',
                   info_path)

        # group1 receives the IDs left in grouped_dic (train), group2 the
        # IDs moved to valid_dic.
        for o, dic in enumerate([grouped_dic, valid_dic]):
            opath = os.path.join(target_root, f"group{o+1}", cf)
            os.makedirs(opath, exist_ok=True)
            for k in tqdm(dic.keys()):
                for g in dic[k]["files"]:
                    shutil.copy(os.path.join(class_dir, g), opath)

    return None
|
|
110 |
|
|
|
111 |
|
|
|
112 |
|
|
|
113 |
|
|
|
114 |
if __name__ == "__main__":

    # Run the ID-aware train/valid split. The previous call targeted
    # split_image_folder_by_nameid, which is not defined in this file and
    # raised NameError; split_image_folder_byID accepts these arguments.
    split_image_folder_byID(
        "/home/joseph.benjamin/WERK/UltraSound/FetalDataImg",
        "/home/joseph.benjamin/WERK/data/study-split/",
        ratio=[0.8, 0.2]
    )
|
|
121 |
|
|
|
122 |
|
|
|
123 |
|
|
|
124 |
|