[c3444c]: / test / mantra / data_util.py

Download this file

36 lines (28 with data), 1.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os
def load_file(filepath):
    """Extract (term, CUI) pairs from the ``<e ...>`` lines of an annotation file.

    Only lines whose stripped form starts with ``<e`` are considered. For each
    such line the term is the text after the first ``>`` with a trailing
    ``</e>`` removed, and the CUI is the 8 characters that follow ``cui=`` and
    its opening quote character.

    Lines that start with ``<e`` but contain no ``>`` or no ``cui=`` are
    skipped, so the two returned lists always stay aligned and never contain
    values sliced from an unrelated part of the line.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(output_text, output_label, label_set)``:
        the terms in file order, the CUI of each term (same order and length),
        and the set of distinct CUIs.

    Raises:
        OSError: If the file cannot be opened.
    """
    closing_tag = "</e>"
    cui_attr = "cui="
    cui_width = 8  # the original slice took exactly 8 characters for the CUI

    output_text = []
    output_label = []
    label_set = set()

    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line.startswith("<e"):
                continue

            term_start = line.find(">")
            cui_start = line.find(cui_attr)
            if term_start == -1 or cui_start == -1:
                # str.find() returns -1 on a miss; slicing with that offset
                # would silently record garbage, so skip the line instead.
                continue

            # +1 skips the quote character that opens the attribute value.
            value_start = cui_start + len(cui_attr) + 1
            cui = line[value_start:value_start + cui_width]

            term = line[term_start + 1:]
            # Strip the closing tag only when it is really there, rather than
            # blindly dropping the last four characters of the term.
            if term.endswith(closing_tag):
                term = term[:-len(closing_tag)]

            output_text.append(term)
            output_label.append(cui)
            label_set.add(cui)

    return output_text, output_label, label_set
def load(dataset, lang, data_dir="dataset"):
    """Load one ``<dataset>_GSC_<lang>_man.xml`` file and print a short summary.

    The file name is built as ``dataset + "_GSC_" + lang + "_man.xml"`` and
    looked up inside ``data_dir``. After loading, the dataset name and
    language, the number of loaded terms and the number of distinct CUIs are
    printed to stdout.

    Args:
        dataset: Dataset name used as the file-name prefix (e.g. ``"EMEA"``).
        lang: Language code used in the file name (e.g. ``"de"``).
        data_dir: Directory containing the XML files. Defaults to
            ``"dataset"``, relative to the current working directory, which
            was previously the only supported location.

    Returns:
        The ``(output_text, output_label, label_set)`` tuple produced by
        ``load_file`` for the resolved path.
    """
    file_path = os.path.join(data_dir, f"{dataset}_GSC_{lang}_man.xml")
    output_text, output_label, label_set = load_file(file_path)

    print(dataset, lang)
    print(f"Load count: {len(output_text)}")
    print(f"Different cui: {len(label_set)}")
    return output_text, output_label, label_set
if __name__ == "__main__":
    # Manual smoke run: load the German EMEA file and show the first five
    # terms followed by the first five labels.
    texts, labels, _ = load("EMEA", "de")
    for preview in (texts, labels):
        print(preview[0:5])