[507a54]: / development / paraphrase / nlu_augment.py

Download this file

94 lines (76 with data), 3.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from tqdm import tqdm
from ruamel import yaml
from paraphrase import wordtune
from decouple import config
class DataAugment:
    """Augment the examples of an NLU YAML file with paraphrases.

    The loaded document is expected to hold a top-level ``nlu`` list whose
    entries carry an ``examples`` string: one example per line, each line
    prefixed with ``"- "`` and optionally containing an ``[entity](tag)``
    annotation. Only the first annotation of a line is handled.
    """

    def __init__(self, nlu_file, token, draft_id):
        """Load ``nlu_file`` and create the paraphrasing client.

        Args:
            nlu_file: Path of the YAML file to read.
            token: Credential passed on to ``wordtune``.
            draft_id: Draft identifier passed on to ``wordtune``.
        """
        super().__init__()
        self.token = token
        self.paraphrase = wordtune(token, draft_id)
        # Explicit UTF-8 so that non-ASCII examples are read the same way on
        # every platform instead of depending on the locale's default codec.
        with open(nlu_file, 'r', encoding='utf-8') as f:
            self.nlu_dict = yaml.load(f, Loader=yaml.RoundTripLoader)

    # Function to remove entity tags
    def __tag_remove(self, inp):
        """Strip the first ``[entity](tag)`` annotation from ``inp``.

        Args:
            inp: One example line (without the leading ``"- "``).

        Returns:
            A ``(text, entity_tag, entity)`` tuple. ``text`` is ``inp`` with
            the annotation replaced by the bare entity, ``entity_tag`` is the
            full ``[entity](tag)`` span and ``entity`` the text between the
            brackets. When no well-formed annotation is found, ``inp`` is
            returned unchanged together with two empty strings.
        """
        start = inp.find("[")
        if start == -1:
            return inp, "", ""

        # Search for each delimiter *after* the previous one. Looking for the
        # first ")" anywhere in the line yields an empty tag when a
        # parenthesis precedes the bracket (or is missing), and replacing the
        # empty string would splice the entity between every character.
        close = inp.find("]", start)
        end = inp.find(")", close) if close != -1 else -1
        if end == -1:
            return inp, "", ""

        # entity and its tag
        entity_tag = inp[start:end + 1]
        # entity
        entity = inp[start + 1:close]
        # remove tag from input. only entity exists in input
        return inp.replace(entity_tag, entity), entity_tag, entity

    # augment nlu.yml data with wordtune suggestions
    def data_augment(self):
        """Append paraphrases of every example to its entry, in place.

        Each example is stripped of its ``"- "`` prefix and entity
        annotation, sent through the ``wordtune`` client, and every returned
        suggestion is appended to the entry's ``examples`` string with the
        annotation restored around the entity.
        """
        for intent in tqdm(self.nlu_dict['nlu']):
            # The examples string ends with a newline, so the last split
            # element is empty and is dropped.
            inps = intent['examples'].split('\n')[:-1]
            # give each intent's examples to wordtune
            for inp in inps:
                # strip dash and space from example
                inp = inp[2:]
                # remove tags from words
                inp, entity_tag, entity = self.__tag_remove(inp)
                # paraphrase examples
                payload = self.paraphrase.payload_generator(inp)
                headers = self.paraphrase.headers_generator()
                response = self.paraphrase.requests(payload, headers)
                # list of wordtune suggestions
                suggestion_list = self.paraphrase.get_suggestion(response, inp)

                # The pattern only depends on the example, so build it once
                # per example. An empty entity would match at every position,
                # so no re-tagging is attempted in that case.
                prog = None
                if entity:
                    prog = re.compile(re.escape(entity), re.IGNORECASE)

                for suggestion in suggestion_list:
                    if prog is not None:
                        # add tags to each entity case insensitively; a
                        # callable replacement inserts the tag literally,
                        # whereas a string would have its backslashes
                        # interpreted as regex escapes.
                        suggestion = prog.sub(
                            lambda _match: entity_tag, suggestion)
                    intent['examples'] += f'- {suggestion}\n'

    # remove all lines without entity tags
    def data_clean(self):
        """Drop, in place, every example line that has no ``[`` in it.

        Entries whose ``intent`` is ``request_drug`` or ``request_lab`` are
        left untouched, as are entries that have no ``intent`` key at all.
        """
        # exclude request_drug and request_lab intents. there are no entity
        # tags in these intents' examples
        untagged_intents = ('request_drug', 'request_lab')
        for intent in self.nlu_dict['nlu']:
            # Entries without an 'intent' key are skipped rather than
            # aborting the whole clean-up with a KeyError.
            if 'intent' not in intent:
                continue
            if intent['intent'] in untagged_intents:
                continue
            example_list = intent['examples'].split('\n')
            intent['examples'] = ''.join(
                example + '\n' for example in example_list if "[" in example)

    # write nlu data to the disk
    def write_data(self, out_name):
        """Dump the current NLU document to ``out_name`` as YAML.

        Args:
            out_name: Path of the file to create or overwrite.
        """
        # Same explicit encoding as used for reading.
        with open(out_name, 'w', encoding='utf-8') as f:
            yaml.dump(self.nlu_dict, f, Dumper=yaml.RoundTripDumper)
def main():
    """Run the augmentation pipeline on ``nlu.yml``.

    Writes the augmented examples to ``nlu_augment.yml`` and, after the
    untagged examples have been removed, to ``nlu_cleaned.yml``.
    """
    # TOKEN and DRAFTID are environment variables; they must be in a .env
    # file in the same dir as this file
    augmenter = DataAugment(
        'nlu.yml',
        config('TOKEN'),
        config('DRAFTID'),
    )

    # rephrase the input examples with wordtune, then save the result
    augmenter.data_augment()
    augmenter.write_data('nlu_augment.yml')

    # drop the examples without tags (removing them by hand is preferred),
    # then save the cleaned result
    augmenter.data_clean()
    augmenter.write_data('nlu_cleaned.yml')
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()