|
a |
|
b/code/bert_code/preprocessor.py |
|
|
1 |
import os |
|
|
2 |
import numpy as np |
|
|
3 |
import pickle |
|
|
4 |
import re |
|
|
5 |
|
|
|
6 |
class PreProcessor(object):
    """Convert concept annotations and text summaries into per-word label data.

    For every text file that has a matching ``.con`` concept file, the words of
    the text are paired with integer BIO label indices and the result is
    pickled into ``save_path``.
    """

    def __init__(self, concept_path, text_path, save_path):
        """Remember the input and output directories.

        Training paths -
            concept_path = "../dnc_code/medical_data/train_data/concept"
            text_path = "../dnc_code/medical_data/train_data/txt"
        Testing paths -
            concept_path = "../dnc_code/medical_data/test_data/concept"
            text_path = "../dnc_code/medical_data/test_data/txt"

        Args:
            concept_path: Directory holding the ``.con`` concept files.
            text_path: Directory holding the text summaries.
            save_path: Directory the pickled ``.dat`` files are written to.
        """
        self.concept_path = concept_path
        self.text_path = text_path
        self.save_path = save_path

    def initialize_labels(self):
        """Build the label<->index dictionaries and pickle them.

        Sets ``self.labelDict`` (label -> index) and ``self.reverseDict``
        (index -> label), then saves both as a tuple in
        ``<save_path>/label_dicts_bio.dat``.
        """
        # BIO labelling scheme: b-* begins an entity, i-* continues it,
        # 'o' marks a word outside any entity.
        self.labelDict = {
            'b-problem': 0,    # Problem - Beginning
            'i-problem': 1,    # Problem - Inside
            'b-test': 2,       # Test - Beginning
            'i-test': 3,       # Test - Inside
            'b-treatment': 4,  # Treatment - Beginning
            'i-treatment': 5,  # Treatment - Inside
            'o': 6,            # Outside token
        }

        # Inverse dictionary: index -> label
        self.reverseDict = dict(
            (index, name) for name, index in self.labelDict.items()
        )

        # Saving the dictionaries into a file
        self.save_data(
            [self.labelDict, self.reverseDict],
            os.path.join(self.save_path, "label_dicts_bio.dat"),
        )

    def _parse_concept_line(self, line):
        """Turn one cleaned concept line into its concept dictionary.

        The line is expected to look like
        ``c="entity words" sLine:sWord eLine:eWord||t="label"``.
        Raises IndexError, KeyError or ValueError when it does not.
        """
        fields = line.split('||')
        tokens = fields[0].split(' ')
        label = fields[1].split('=')[1][1:-1]  # drop the surrounding quotes

        # At least one entity word plus the start and end positions.
        if len(tokens) < 3:
            raise ValueError("concept line has no entity or positions")

        entity = tokens[:-2]
        entity[0] = entity[0][3:]     # strip the leading  c="
        entity[-1] = entity[-1][:-1]  # strip the closing  "

        bio_labels = ['b-' + label] + ['i-' + label] * (len(entity) - 1)
        label_index = [self.labelDict[tag] for tag in bio_labels]

        start_parts = tokens[-2].split(':')
        end_parts = tokens[-1].split(':')

        return {
            'entity': entity,             # Entity name as a list of words
            'label': label,               # Common label of the entity
            'BIO_labels': bio_labels,     # BIO label for each word
            'label_index': label_index,   # BIO labels in index form
            'start_line': int(start_parts[0]),     # 1-based line in the text
            'start_word_no': int(start_parts[1]),  # word offset in that line
            'end_line': int(end_parts[0]),         # 1-based line in the text
            'end_word_no': int(end_parts[1]),      # word offset in that line
        }

    def parse_concepts(self, file_path):
        """Parse a concept file into a list of concept dictionaries.

        Blank lines are skipped. Requires ``initialize_labels`` to have been
        called, because labels are translated through ``self.labelDict``.

        Args:
            file_path: Path of the concept file.

        Returns:
            A list with one dictionary per concept, holding the keys
            ``entity``, ``label``, ``BIO_labels``, ``label_index``,
            ``start_line``, ``start_word_no``, ``end_line`` and
            ``end_word_no``.

        Raises:
            ValueError: If a non-blank line is not in the expected format or
                uses a label missing from ``self.labelDict``.
        """
        with open(file_path) as handle:
            raw_lines = handle.readlines()

        concept_list = []
        for line_no, raw in enumerate(raw_lines, 1):
            # Same clean-up as before: newline -> space, collapse space runs.
            line = re.sub(r'\ +', ' ', re.sub('\n', ' ', raw)).strip()
            if not line:
                continue  # a blank line carries no concept

            try:
                concept = self._parse_concept_line(line)
            except (IndexError, KeyError, ValueError):
                raise ValueError(
                    "Data in File: {}, not in expected format (line {}): {!r}"
                    .format(file_path, line_no, line)
                )
            concept_list.append(concept)

        return concept_list

    def parse_summary(self, file_path):
        """Split a text summary into words and give every word the 'o' label.

        Requires ``initialize_labels`` to have been called.

        Args:
            file_path: Path of the text summary.

        Returns:
            ``(file_lines, tags)`` where ``file_lines`` is a list of word
            lists (one per line of the file) and ``tags`` has the same shape,
            filled with the index of the 'o' label.
        """
        default_label = self.labelDict['o']  # index 6, the "outside" label

        with open(file_path) as handle:
            raw_lines = handle.readlines()

        file_lines = []
        tags = []
        for raw in raw_lines:
            line = re.sub(r'\ +', ' ', re.sub('\n', ' ', raw))
            words = line.strip().split(" ")
            file_lines.append(words)
            tags.append([default_label] * len(words))
        return file_lines, tags

    def modify_labels(self, conceptList, tags):
        """Overwrite the default tags with the true labels of each concept.

        ``tags`` is modified in place and also returned. A concept may span
        several lines; its labels are consumed in order across those lines.
        Rows never change length: labels that do not fit the span are
        dropped, and span positions without a label keep their current tag.

        Args:
            conceptList: Concept dictionaries as built by ``parse_concepts``.
            tags: Per-line lists of label indices as built by
                ``parse_summary``.

        Returns:
            The same ``tags`` object.
        """
        for concept in conceptList:
            labels = concept['label_index']
            first_line = concept['start_line']
            last_line = concept['end_line']
            cursor = 0  # how many labels have been consumed so far

            for line_no in range(first_line, last_line + 1):
                row = tags[line_no - 1]  # line numbers are 1-based
                lo = concept['start_word_no'] if line_no == first_line else 0
                if line_no == last_line:
                    hi = concept['end_word_no'] + 1
                else:
                    hi = len(row)
                hi = min(hi, len(row))

                width = max(hi - lo, 0)
                chunk = labels[cursor:cursor + width]
                # Equal-length slice assignment keeps the row aligned with
                # the words of the line.
                row[lo:lo + len(chunk)] = chunk
                cursor += width
        return tags

    def save_data(self, obj_list, s_path):
        """Pickle ``obj_list`` (as a tuple) into the binary file ``s_path``."""
        with open(s_path, 'wb') as handle:
            pickle.dump(tuple(obj_list), handle)

    def process_data(self, c_path, t_path, s_path):
        """Label every text file that has a concept file and save the result.

        For each file in ``t_path`` whose name (minus extension) plus ``.con``
        exists in ``c_path``, the tuple ``(conceptList, file_lines, tags)`` is
        pickled to ``<s_path>/<name>.dat``. Text files without a concept file
        are skipped.

        Args:
            c_path: Directory holding the concept files.
            t_path: Directory holding the text summaries.
            s_path: Directory the ``.dat`` files are written to.
        """
        for text_name in sorted(os.listdir(t_path)):
            # splitext keeps dots inside the name ("a.b.txt" -> "a.b").
            stem = os.path.splitext(text_name)[0]
            concept_file = os.path.join(c_path, stem + ".con")
            if not os.path.isfile(concept_file):
                continue

            conceptList = self.parse_concepts(concept_file)
            file_lines, tags = self.parse_summary(
                os.path.join(t_path, text_name)
            )
            tags = self.modify_labels(conceptList, tags)
            self.save_data(
                [conceptList, file_lines, tags],
                os.path.join(s_path, stem + ".dat"),
            )

    def pre_process(self):
        """Run the whole pipeline: build the label dictionaries, then label
        and save every document.

        Any other preprocessing needed can be called from this method.
        """
        self.initialize_labels()
        self.process_data(self.concept_path, self.text_path, self.save_path)
|
|
173 |
|
|
|
174 |
|