# toolscripts/train_valid_split.py
1
import os, glob
2
import random
3
import shutil
4
from tqdm.auto import tqdm
5
6
7
def print_info(string, writepath):
    """Print *string* to stdout and append it as a line to *writepath*.

    Args:
        string: Message to emit.
        writepath: Path of the log file. It is opened in append mode, so
            earlier content is kept.
    """
    print(string)
    # Use a context manager so the handle is flushed and closed on every
    # call instead of being left open until garbage collection.
    with open(writepath, "a") as log_file:
        print(string, file=log_file)
10
11
12
## Use when all you care about is a plain random split
13
def split_image_folder(source_root, target_root, portion=2, ratio=[0.8, 0.2]):
    """Randomly split each class folder of PNG images into ``portion`` groups.

    For every sub-directory ``<source_root>/<class>`` the ``*.png`` files are
    shuffled with a fixed seed and copied into
    ``<target_root>/group<k>/<class>/`` (k = 1..portion) according to
    ``ratio``. A summary is printed and appended to
    ``<target_root>/split-info.txt``.

    Args:
        source_root: Directory containing one sub-directory per class.
        target_root: Output directory; created if missing.
        portion: Number of groups to produce.
        ratio: Fraction of each class assigned to each group. Must have
            ``portion`` entries that sum to 1. It is only read, never mutated.

    Returns:
        None.

    Raises:
        AssertionError: If ``ratio`` does not have ``portion`` entries or does
            not sum to 1 (within floating-point tolerance).
    """
    random.seed(73)
    assert len(ratio) == portion
    # Compare with a tolerance: sum([0.7, 0.2, 0.1]) is not exactly 1.0.
    assert abs(sum(ratio) - 1) < 1e-9
    os.makedirs(target_root, exist_ok=True)
    info_path = os.path.join(target_root, "split-info.txt")

    # Only directories are classes. Sort them (and the file names below) so
    # the seeded shuffle gives the same split regardless of the order in
    # which the filesystem lists entries.
    class_folders = sorted(
        name for name in os.listdir(source_root)
        if os.path.isdir(os.path.join(source_root, name))
    )

    for cf in class_folders:
        class_dir = os.path.join(source_root, cf)
        imnames = sorted(
            os.path.basename(p)
            for p in glob.glob(os.path.join(class_dir, "*.png"))
        )
        random.shuffle(imnames)

        # Consecutive slices of the shuffled list, one per group.
        lsize = [round(len(imnames) * r) for r in ratio]
        begin = 0
        outnames = []
        for size in lsize:
            outnames.append(imnames[begin:begin + size])
            begin += size
        # Rounding may leave a few trailing files unassigned; deal them out.
        for j in range(begin, len(imnames)):
            outnames[j % portion].append(imnames[j])

        print_info(f"ClassFolder: {cf}, Count: {len(imnames)}", info_path)
        for o, grp in tqdm(enumerate(outnames)):
            # print_info takes a single message, so format the count into it.
            print_info(f"Group-{o+1}: {len(grp)}", info_path)

            opath = os.path.join(target_root, f"group{o+1}", cf)
            os.makedirs(opath, exist_ok=True)
            for g in grp:
                shutil.copy(os.path.join(class_dir, g), opath)

    return None
44
45
46
47
## Ensures a given ID never appears in both the train and valid splits within a class
48
def split_image_folder_byID(source_root, target_root, ratio=[0.8, 0.2]):
    """Split each class folder into train/valid groups without sharing IDs.

    Written just for the two-folder logic (train, valid). A file's ID is the
    part of its name before the first underscore (presumably the patient ID;
    verify against the dataset naming). Within a class, all files with the
    same ID go to the same group. Whole IDs are drawn at random into the
    validation group until it holds at least ``ratio[-1]`` of the class's
    images; the remaining IDs form the training group.

    Files are copied to ``<target_root>/group1/<class>/`` (train) and
    ``<target_root>/group2/<class>/`` (valid). A per-class summary is printed
    and appended to ``<target_root>/split-info.txt``.

    Args:
        source_root: Directory containing one sub-directory per class.
        target_root: Output directory; created if missing.
        ratio: Split fractions summing to 1, at most two entries. Only the
            last entry (validation share) is used. It is never mutated.

    Returns:
        None.

    Raises:
        AssertionError: If ``ratio`` does not sum to 1 (within floating-point
            tolerance) or has more than two entries.
    """

    def parse_study_id(string):
        # The ID is everything before the first underscore.
        return string.split("_")[0]

    def grouper(name_list):
        # Map ID -> {"count": number of files, "files": [names]}.
        grouped = {}
        for name in name_list:
            entry = grouped.setdefault(
                parse_study_id(name), {"count": 0, "files": []}
            )
            entry["count"] += 1
            entry["files"].append(name)
        # IDs with the most files first; the sort is stable, so ties keep
        # their input order.
        return dict(
            sorted(grouped.items(), key=lambda item: item[1]["count"], reverse=True)
        )

    random.seed(73)
    # Compare with a tolerance instead of exact float equality.
    assert abs(sum(ratio) - 1) < 1e-9
    assert len(ratio) <= 2

    os.makedirs(target_root, exist_ok=True)
    info_path = os.path.join(target_root, "split-info.txt")

    # Only directories are classes. Sort them (and the file names below) so
    # the seeded draw gives the same split regardless of the order in which
    # the filesystem lists entries.
    class_folders = sorted(
        name for name in os.listdir(source_root)
        if os.path.isdir(os.path.join(source_root, name))
    )

    for cf in class_folders:
        class_dir = os.path.join(source_root, cf)
        imnames = sorted(
            os.path.basename(p)
            for p in glob.glob(os.path.join(class_dir, "*.png"))
        )
        total_count = len(imnames)
        if total_count == 0:
            # Nothing to split; also avoids dividing by zero in the summary.
            print_info(f"Class:{cf},Tot:0 no png files found, skipped", info_path)
            continue
        grouped_dic = grouper(imnames)

        # TODO: rewrite logic for N splits
        curr_count = 0
        valid_dic = {}
        valid_target = total_count * ratio[-1]
        while grouped_dic and curr_count < valid_target:
            # random.sample() rejects dict views on Python 3.11+, so draw
            # from an explicit list of the remaining IDs.
            k = random.choice(list(grouped_dic))
            item = grouped_dic.pop(k)
            valid_dic[k] = item
            curr_count += item["count"]
        print_info(
            f'Class:{cf},Tot:{total_count} Train:{total_count - curr_count} Valid:{curr_count} ratio:{curr_count/total_count}',
            info_path,
        )

        # group1 = IDs left in grouped_dic (train), group2 = valid_dic.
        for o, dic in enumerate([grouped_dic, valid_dic]):
            opath = os.path.join(target_root, f"group{o+1}", cf)
            for k in tqdm(dic.keys()):
                os.makedirs(opath, exist_ok=True)
                for g in dic[k]["files"]:
                    shutil.copy(os.path.join(class_dir, g), opath)

    return None
110
111
112
113
114
if __name__ == "__main__":
    # Run the ID-aware train/valid split; split_image_folder_byID is the
    # function this file defines for that purpose.
    split_image_folder_byID(
        "/home/joseph.benjamin/WERK/UltraSound/FetalDataImg",
        "/home/joseph.benjamin/WERK/data/study-split/",
        ratio=[0.8, 0.2]
    )
121
122
123
124