"""Load entity annotations from ``*_GSC_*_man.xml`` files.

Module path: test/mantra/data_util.py

Each annotation is expected to sit on its own line as an element of the form
``<e ... cui="C1234567" ...>surface text</e>``. The loader collects the
surface text and the value of the ``cui`` attribute of every such line.
"""
import os
import re
from typing import List, Optional, Set, Tuple

# One complete ``<e ...>text</e>`` element at the start of a stripped line.
# Group 1 is the raw attribute string, group 2 the element text (up to the
# first closing tag, so trailing content on the line is ignored).
_ENTITY_RE = re.compile(r"<e\b([^>]*)>(.*?)</e>")

# The ``cui`` attribute inside the opening tag, single- or double-quoted.
_CUI_RE = re.compile(r"""\bcui=(["'])(.*?)\1""")


def _parse_entity_line(line: str) -> Optional[Tuple[str, str]]:
    """Return ``(term, cui)`` for an entity line, or ``None`` if it is not one.

    Args:
        line: A single line with surrounding whitespace already stripped.

    Returns:
        The element text and the value of its ``cui`` attribute, or ``None``
        when the line is not a complete ``<e ...>...</e>`` element or the
        opening tag carries no ``cui`` attribute.
    """
    entity = _ENTITY_RE.match(line)
    if entity is None:
        return None
    attributes, term = entity.groups()
    # Only look inside the opening tag so that "cui=" occurring in the
    # element text can never be mistaken for the attribute.
    cui = _CUI_RE.search(attributes)
    if cui is None:
        return None
    return term, cui.group(2)


def load_file(filepath: str) -> Tuple[List[str], List[str], Set[str]]:
    """Read all entity annotations from one annotation file.

    Lines that are not well-formed entity elements, or that have no ``cui``
    attribute, are skipped rather than producing a bogus label.

    Args:
        filepath: Path of the UTF-8 encoded XML file to read.

    Returns:
        A tuple ``(output_text, output_label, label_set)`` where
        ``output_text[i]`` is the text of the i-th entity,
        ``output_label[i]`` is its ``cui`` value, and ``label_set`` is the
        set of distinct ``cui`` values.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    output_text: List[str] = []
    output_label: List[str] = []

    with open(filepath, "r", encoding="utf-8") as f:
        # Stream the file line by line; no need to hold it in memory twice.
        for raw_line in f:
            parsed = _parse_entity_line(raw_line.strip())
            if parsed is None:
                continue
            term, cui = parsed
            output_text.append(term)
            output_label.append(cui)

    return output_text, output_label, set(output_label)


def load(
    dataset: str, lang: str, data_dir: str = "dataset"
) -> Tuple[List[str], List[str], Set[str]]:
    """Load ``<data_dir>/<dataset>_GSC_<lang>_man.xml`` and print a summary.

    Args:
        dataset: Dataset name used as the file name prefix (e.g. ``"EMEA"``).
        lang: Language code used in the file name (e.g. ``"de"``).
        data_dir: Directory containing the files. Defaults to ``"dataset"``,
            resolved relative to the current working directory.

    Returns:
        The ``(output_text, output_label, label_set)`` tuple produced by
        :func:`load_file`.
    """
    file_path = os.path.join(data_dir, f"{dataset}_GSC_{lang}_man.xml")
    output_text, output_label, label_set = load_file(file_path)
    print(dataset, lang)
    print(f"Load count: {len(output_text)}")
    print(f"Different cui: {len(label_set)}")
    return output_text, output_label, label_set


if __name__ == "__main__":
    output_text, output_label, label_set = load("EMEA", "de")
    print(output_text[0:5])
    print(output_label[0:5])