|
a |
|
b/development/rasa/synonym_generator.py |
|
|
1 |
import ast |
|
|
2 |
import pandas as pd |
|
|
3 |
from tqdm import tqdm |
|
|
4 |
from ruamel import yaml |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
class nlu_generator():
    """Build a Rasa-style NLU synonym document from a drug or lab-test CSV.

    Every CSV row supplies a canonical name (emitted as ``synonym``) and one
    cell of alternative names (emitted as the synonym's ``examples``).
    Rows where either value is missing are skipped.

    NOTE(review): the function-style ``yaml.load(..., Loader=...)`` /
    ``yaml.dump(..., Dumper=...)`` API used here is ruamel's legacy
    interface; newer ruamel.yaml releases appear to have dropped it in
    favour of ``YAML(typ='rt')`` - confirm the pinned version.
    """

    # YAML skeletons handed to the round-trip loader. They are constants on
    # purpose: no dataset text is ever spliced into YAML source, so names
    # containing ':', '#', quotes, etc. cannot corrupt the document.
    _HEADER_TEMPLATE = 'version: "2.0"\n\nnlu:\n'
    _SYNONYM_TEMPLATE = 'nlu:\n- synonym: placeholder\n  examples: |\n'

    def __init__(self, mode, med_dataset):
        """Load the dataset and select the name/alias columns for *mode*.

        Args:
            mode: ``'drug'`` selects the ``drug_name`` / ``Brand names``
                columns; any other value selects ``Lab test`` /
                ``Also Known As``.
            med_dataset: Path (or buffer) accepted by ``pandas.read_csv``.

        NOTE(review): alias cells are only parsed as Python literals when
        ``mode == 'lab'`` exactly (see ``_parse_aliases``); other non-drug
        modes read the lab columns but keep each cell as a single alias.
        """
        super().__init__()
        df = pd.read_csv(med_dataset)
        if mode == 'drug':
            col, aka_col = 'drug_name', 'Brand names'
        else:
            col, aka_col = 'Lab test', 'Also Known As'
        # Lower-cased canonical names, one per CSV row.
        self.list = df[col].str.lower()
        # Lower-cased alias cells with the registered-trademark sign removed.
        # regex=False makes this a literal replacement on every pandas
        # version instead of depending on the changing default.
        self.aka_list = df[aka_col].str.lower().str.replace(
            '®', '', regex=False)
        self.mode = mode

    def _parse_aliases(self, raw):
        """Return the alias values contained in one alias cell.

        In ``'lab'`` mode the cell text is evaluated with
        ``ast.literal_eval`` (presumably the repr of a list of strings -
        confirm against the dataset). In every other mode the whole cell is
        treated as a single alias.

        Raises:
            ValueError, SyntaxError: propagated from ``ast.literal_eval``
                when a lab cell is not a valid Python literal.
        """
        if self.mode != 'lab':
            return [raw]
        parsed = ast.literal_eval(raw)
        # A cell holding one quoted string would otherwise be iterated
        # character by character.
        if isinstance(parsed, str):
            return [parsed]
        return list(parsed)

    def _iter_entries(self):
        """Yield ``(name, alias_cell)`` for each row that has both values.

        Rows are paired by position, so the result does not depend on the
        Series index labels, and the NaN check applies to every row
        including the first one.
        """
        for item, raw in zip(self.list, self.aka_list):
            if self.__isnan(item) or self.__isnan(raw):
                continue
            yield item, raw

    # Synonym key generator
    def __syn_generator(self, item, aka_list):
        """Build a one-entry ``nlu`` document for *item*.

        Args:
            item: Canonical name stored under ``synonym``.
            aka_list: Raw alias cell; interpreted by ``_parse_aliases``.

        Returns:
            The parsed round-trip document; the entry is ``['nlu'][0]``.
        """
        code = yaml.load(self._SYNONYM_TEMPLATE, Loader=yaml.RoundTripLoader)
        entry = code['nlu'][0]
        # Assign the name on the parsed structure rather than formatting it
        # into YAML text. strip() mirrors the whitespace trimming a plain
        # YAML scalar received when the name was parsed from the template.
        entry['synonym'] = str(item).strip()
        examples = ''.join(
            f'- {aka}\n' for aka in self._parse_aliases(aka_list))
        if examples:
            # ``+`` on a str subclass returns a plain str, which would drop
            # whatever block-scalar style the loader attached to the empty
            # ``examples: |`` value. Rebuild with the loaded value's type.
            entry['examples'] = type(entry['examples'])(examples)
        return code

    # First keys generator
    def __block_generator(self):
        """Return the document skeleton: ``version`` plus an ``nlu`` key.

        The ``nlu`` value is filled in by ``generate``.
        """
        return yaml.load(self._HEADER_TEMPLATE, Loader=yaml.RoundTripLoader)

    # NaN check
    def __isnan(self, num):
        """Return True when *num* is NaN (the only value unequal to itself)."""
        return num != num

    # YAML generator
    def generate(self):
        """Build the complete synonym document for the loaded dataset.

        Returns:
            The round-trip YAML document with one ``nlu`` entry per usable
            row (an empty list when no row has both a name and aliases).
        """
        code = self.__block_generator()
        entries = None
        for item, raw in tqdm(self._iter_entries()):
            parsed = self.__syn_generator(item, raw)['nlu']
            if entries is None:
                # Reuse the loader-produced sequence as the container so the
                # dumped layout matches a hand-written block sequence.
                entries = parsed
            else:
                entries.append(parsed[0])
        code['nlu'] = entries if entries is not None else []
        return code

    # Write data in the disk
    def write_data(self, code, nlu_file):
        """Dump *code* to *nlu_file* as YAML.

        The file is written as UTF-8 explicitly so non-ASCII names do not
        depend on the platform's default encoding.
        """
        with open(nlu_file, 'w', encoding='utf-8') as f:
            yaml.dump(code, f, Dumper=yaml.RoundTripDumper)
|
|
71 |
|
|
|
72 |
|
|
|
73 |
def main():
    """Generate the lab-test synonym file and write it to disk.

    To produce the drug variant instead, use mode 'drug' with the dataset
    '../medlineplus_drug_dataset/dataset_files/MedlinePlus_2.csv' and write
    the result to 'data/synonym_drug.yml'.
    """
    dataset_path = '../labtestonline_datasets/dataset_files/labtest_dataset.csv'
    output_path = 'data/synonym_lab.yml'

    builder = nlu_generator('lab', dataset_path)
    document = builder.generate()
    builder.write_data(document, output_path)


# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()