"""Replace annotated entity values in an NLU YAML file with random lookup values.

The NLU document is expected to have a top-level ``nlu`` list whose items carry
an ``examples`` string with one example per line. Inside an example, entities
are annotated as ``[value](entity_name)``. Each annotation whose entity name is
listed in ``ENTITY_NAMES`` gets its value swapped for a randomly chosen entry
from that entity's lookup file.
"""
import re
import sys

import numpy as np

NLU_FILE = './nlu_cleaned.yml'
# One lookup file per entity is read from ./<entity>.yml, i.e. relative to the
# current working directory the script is started from.
ENTITY_NAMES = ['drug', 'lab']
OUTPUT_FILE = 'nlu_random.yml'


def _read_yaml(path):
    """Parse the YAML file at *path* with ruamel's round-trip loader."""
    # Imported here rather than at module level so that the pure
    # text-processing helpers stay importable without ruamel installed.
    # NOTE(review): the module-level ``yaml.load`` function is deprecated in
    # newer ruamel.yaml releases -- confirm the version this project pins.
    from ruamel import yaml

    with open(path, 'r', encoding='utf-8') as f:
        return yaml.load(f, Loader=yaml.RoundTripLoader)


def _write_yaml(data, path):
    """Dump *data* to *path* with ruamel's round-trip dumper."""
    from ruamel import yaml  # see _read_yaml for why this import is local

    with open(path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, Dumper=yaml.RoundTripDumper, default_flow_style=None)


def _parse_example_rows(raw_examples):
    """Turn a ``- item`` per line string into a list of clean item strings."""
    rows = []
    for line in raw_examples.split('\n'):
        line = line.strip()
        if line.startswith('-'):
            # Drop the list bullet whether or not a space follows it.
            line = line[1:].strip()
        if line:
            # Blank rows (e.g. the one after a trailing newline) are skipped so
            # that an empty string can never be picked as an entity value.
            rows.append(line)
    return rows


def load_entities(entity_names):
    """Load the lookup values for every entity in *entity_names*.

    Args:
        entity_names: Iterable of entity names; each is read from
            ``./<entity>.yml``.

    Returns:
        Dict mapping each entity name to its list of lookup values.
    """
    return {
        entity: load_entities_list(f'./{entity}.yml')
        for entity in entity_names
    }


def load_entities_list(file_adr):
    """Read the lookup values stored in the first ``nlu`` item of a YAML file.

    Args:
        file_adr: Path of the lookup YAML file.

    Returns:
        List of non-empty values taken from the ``examples`` string of the
        first item under ``nlu``, with list bullets and surrounding whitespace
        removed.
    """
    document = _read_yaml(file_adr)
    return _parse_example_rows(document['nlu'][0]['examples'])


def _make_replacer(entity, candidates):
    """Return a ``re.sub`` callback producing a random ``[value](entity)``."""
    def replace(_match):
        value = str(np.random.choice(candidates)).strip()
        return f'[{value}]({entity})'
    return replace


def randomize_entity_names(nlu_dict, entities_dict):
    """Swap every annotated entity value for a random one from its lookup list.

    Every ``[value](entity)`` annotation whose entity is a key of
    *entities_dict* is replaced independently, so several annotations of the
    same entity inside one example each get their own random draw.

    Args:
        nlu_dict: Parsed NLU document; modified in place.
        entities_dict: Dict mapping entity name to a list of candidate values.

    Returns:
        The same *nlu_dict* object, with updated ``examples`` strings.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when an annotation is
            found for an entity whose candidate list is empty.
    """
    # Compile one pattern per entity up front; the entity name is escaped so
    # regex metacharacters in it are matched literally.
    substitutions = [
        (
            re.compile(rf'\[[^\]]*\]\({re.escape(entity)}\)'),
            _make_replacer(entity, candidates),
        )
        for entity, candidates in entities_dict.items()
    ]

    for item in nlu_dict['nlu']:
        raw_examples = item.get('examples')
        if not isinstance(raw_examples, str):
            # Nothing to randomize in items without an examples string.
            continue

        randomized = []
        for example in raw_examples.split('\n'):
            for pattern, replacer in substitutions:
                example = pattern.sub(replacer, example)
            randomized.append(example)

        # Joining the pieces of split('\n') reproduces the original line
        # layout, including a trailing newline if there was one. The value is
        # rebuilt with the original string type so that str subclasses (such
        # as the scalar-string types of a round-trip YAML loader) are kept.
        item['examples'] = type(raw_examples)('\n'.join(randomized))
    return nlu_dict


def main():
    """Randomize the entities of ``NLU_FILE`` and write ``OUTPUT_FILE``."""
    nlu = _read_yaml(NLU_FILE)
    entities = load_entities(ENTITY_NAMES)
    randomized_nlu = randomize_entity_names(nlu, entities)
    _write_yaml(randomized_nlu, OUTPUT_FILE)


if __name__ == '__main__':
    main()