|
a |
|
b/code/data_preprocessing/data_processing.py |
|
|
1 |
import os
import pickle
import sys

import numpy as np
|
|
4 |
|
|
|
5 |
# Label dictionary: label name -> integer index (filled in by initialize_labels)
labelDict = dict()
# Inverse label dictionary: integer index -> label name (filled in by initialize_labels)
reverseDict = dict()
|
|
7 |
|
|
|
8 |
def initialize_labels(s_path):
    """Fill the module-level label dictionaries and save them to disk.

    ``labelDict`` maps each label name to an integer index and
    ``reverseDict`` maps each index back to its label name. Both are
    written as one pickled tuple to ``<s_path>/label_dicts.dat``.

    Args:
        s_path: Directory in which ``label_dicts.dat`` is written.
    """
    # BIEOS labelling scheme: Beginning / Inside / End / Single per concept
    # type, plus a single Outside label. The position in this tuple is the
    # index assigned to the label.
    ordered_labels = (
        'problem_b', 'problem_i', 'problem_e', 'problem_s',
        'test_b', 'test_i', 'test_e', 'test_s',
        'treatment_b', 'treatment_i', 'treatment_e', 'treatment_s',
        'o',
    )
    for index, name in enumerate(ordered_labels):
        labelDict[name] = index

    # Build the inverse mapping from whatever labelDict currently holds
    for name, index in labelDict.items():
        reverseDict[index] = name

    # Persist both dictionaries together
    target = os.path.join(s_path, "label_dicts.dat")
    save_data([labelDict, reverseDict], target)
|
|
30 |
|
|
|
31 |
def parse_concepts(file_path):
    """Parse a concept file into a list of concept dictionaries.

    Every non-blank line is split on ``'||'``. The left part is split on
    single spaces: its last two tokens are ``line:word`` positions and the
    tokens before them are the entity words (the first 3 characters of the
    first word and the last character of the last word are dropped --
    presumably the ``c="`` prefix and the closing quote; confirm against
    the data). The label is the text after ``'='`` in the right part with
    its first and last characters removed.

    Args:
        file_path: Path of the concept file to read.

    Returns:
        A list with one dict per concept, holding the keys ``entity``,
        ``label``, ``BIEOS_labels``, ``label_index``, ``start_line``,
        ``start_word_no``, ``end_line`` and ``end_word_no``.

    Raises:
        SystemExit: If a non-blank line does not have the expected shape
            (a message naming the file is printed first).
        KeyError: If a derived label name is not present in ``labelDict``.
    """
    conceptList = []  # Stores all the concepts found in the file

    # 'with' releases the handle even if reading raises
    with open(file_path) as f:
        content = f.readlines()

    for raw_line in content:
        raw_line = raw_line.strip()
        if not raw_line:
            # Blank or trailing empty lines carry no concept; skip them
            # instead of failing on the missing '||' field.
            continue

        fields = raw_line.split('||')
        tokens = fields[0].split(' ')

        # Validate before indexing so the format error below is reachable:
        # one or more entity words plus two positions, and a 'x=...' label.
        if len(fields) < 2 or len(tokens) < 3 or '=' not in fields[1]:
            print("Data in File: " + file_path + ", not in expected format..")
            # sys.exit instead of the site-provided exit(); non-zero status
            # because this is an error path.
            sys.exit(1)

        label = fields[1].split('=')[1][1:-1]

        tokens[0] = tokens[0][3:]        # Drop the 3-character prefix of the first word
        tokens[-3] = tokens[-3][0:-1]    # Drop the trailing character of the last word
        entity = tokens[0:-2]

        # BIEOS tagging: b, i..., e for multi-word entities, s for single words
        if len(entity) > 1:
            lab = [label + "_b"] + [label + "_i"] * (len(entity) - 2) + [label + "_e"]
        else:
            lab = [label + "_s"]

        noLab = [labelDict[l] for l in lab]

        start_parts = tokens[-2].split(":")
        end_parts = tokens[-1].split(":")
        sLine, sCol = int(start_parts[0]), int(start_parts[1])
        eLine, eCol = int(end_parts[0]), int(end_parts[1])

        conceptList.append({
            'entity': entity,          # Entity name as a list of words
            'label': label,            # Common label
            'BIEOS_labels': lab,       # BIEOS label for each word
            'label_index': noLab,      # Labels in index form
            'start_line': sLine,       # Start line of the concept in the text summary
            'start_word_no': sCol,     # Starting word number within the start line
            'end_line': eLine,         # End line of the concept in the text summary
            'end_word_no': eCol,       # Ending word number within the end line
        })

    return conceptList
|
|
91 |
|
|
|
92 |
def parse_summary(file_path):
    """Read a text summary and give every word the default 'outside' tag.

    Each line is stripped and split on single spaces, so a blank line
    yields ``['']`` and consecutive spaces yield empty-string words. One
    tag is produced per resulting word.

    Args:
        file_path: Path of the text summary to read.

    Returns:
        A ``(file_lines, tags)`` tuple: ``file_lines`` is a list of word
        lists (one per line) and ``tags`` is a parallel list of lists
        holding the default label index 12 for every word.
    """
    outside_index = 12  # Index assigned to the 'o' (outside) label in initialize_labels

    # 'with' releases the handle even if reading raises
    with open(file_path) as f:
        content = f.readlines()

    file_lines = [line.strip().split(" ") for line in content]
    # Tags are built from the word lists themselves, so lengths always match
    tags = [[outside_index] * len(words) for words in file_lines]
    return file_lines, tags
|
|
116 |
|
|
|
117 |
def modify_labels(conceptList, tags):
    """Overwrite the default tags with the label indices of each concept.

    Line numbers in the concepts are 1-based (``tags[line - 1]`` is used);
    word numbers are used directly as list offsets. ``tags`` is modified
    in place through slice assignment and also returned.

    Args:
        conceptList: Concept dicts providing ``start_line``, ``end_line``,
            ``start_word_no``, ``end_word_no`` and ``label_index``.
        tags: Per-line lists of label indices.

    Returns:
        The same ``tags`` object after modification.
    """
    for concept in conceptList:
        indices = concept['label_index']
        first_line, last_line = concept['start_line'], concept['end_line']
        first_word, last_word = concept['start_word_no'], concept['end_word_no']

        # Concept contained in a single line: replace that word span
        if first_line == last_line:
            tags[first_line - 1][first_word:last_word + 1] = indices[:]
            continue

        # Concept spread over several lines: hand out the indices line by line
        used = 0  # Number of label indices already placed
        for line_no in range(first_line, last_line + 1):
            row = tags[line_no - 1]
            if line_no == first_line:
                # From the start word to the end of the first line
                row[first_word:] = indices[0:len(row) - first_word]
                used = len(row) - first_word
            elif line_no == last_line:
                # From the start of the last line up to the end word
                row[0:last_word + 1] = indices[used:]
            else:
                # Lines in between are covered completely
                row[:] = indices[used:used + len(row)]
                used += len(row)
    return tags
|
|
135 |
|
|
|
136 |
def print_data(file, file_lines, tags):
    """Print every line of a file with its label names and label indices.

    Args:
        file: Name shown in the banner line.
        file_lines: List of word lists, one per line.
        tags: Per-line lists of label indices, parallel to ``file_lines``;
            each index is looked up in ``reverseDict`` for display.
    """
    divider = "------------------------------------------------------------"

    print("\n************ Printing details of the file: " + file + " ************\n")
    for line_no, words in enumerate(file_lines, start=1):
        print(divider)
        print("File Lines No: " + str(line_no))
        print(words)
        print("\nCorresponding labels:")
        print([reverseDict[i] for i in tags[line_no - 1]])
        print("\nCorresponding Label Indices:")
        print(tags[line_no - 1])
        print(divider)
|
|
150 |
|
|
|
151 |
def save_data(obj_list, s_path):
    """Pickle the given objects, as a single tuple, into a binary file.

    Args:
        obj_list: Iterable of objects; it is converted to a tuple before
            being pickled.
        s_path: Path of the file to write (opened in ``'wb'`` mode, so an
            existing file is overwritten).
    """
    # 'with' closes (and therefore flushes) the file deterministically
    # instead of leaving the handle to the garbage collector.
    with open(s_path, 'wb') as out_file:
        pickle.dump(tuple(obj_list), out_file)
|
|
153 |
|
|
|
154 |
def process_data(c_path, t_path, s_path):
    """Label every text summary that has a matching concept file and save it.

    For each entry ``<stem>.<ext>`` in ``t_path`` (``stem`` is the part
    before the first dot) the file ``<stem>.con`` is looked up in
    ``c_path``. When it exists, the concepts and the summary are parsed,
    the tags are updated, and ``[conceptList, file_lines, tags]`` is saved
    to ``<s_path>/<stem>.dat``. Entries without a concept file are skipped.

    Args:
        c_path: Directory holding the ``.con`` concept files.
        t_path: Directory holding the text summaries.
        s_path: Directory receiving the ``.dat`` output files.
    """
    for summary_name in os.listdir(t_path):
        stem = summary_name.split('.')[0]
        concept_file = os.path.join(c_path, stem + ".con")
        if not os.path.isfile(concept_file):
            continue  # No annotations for this summary

        # Concepts and labels from the corresponding concept file
        conceptList = parse_concepts(concept_file)
        # Words of the summary with default tags
        file_lines, tags = parse_summary(os.path.join(t_path, summary_name))
        # Replace the default tags with the true labels
        tags = modify_labels(conceptList, tags)
        # Save the objects into a file
        save_data([conceptList, file_lines, tags], os.path.join(s_path, stem + ".dat"))
        # print_data(summary_name, file_lines, tags)  # Printing the details
|
|
163 |
|
|
|
164 |
if __name__ == '__main__':

    # File paths
    save_path = "./cleaned_files"
    concept_path = "../dnc_code/medical_data/train/concept"
    text_path = "../dnc_code/medical_data/train/txt"

    # The output files are opened with open(..., 'wb'), which does not
    # create missing directories, so make sure the target exists first.
    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    initialize_labels(save_path)                      # Initializing and saving the label dictionaries
    process_data(concept_path, text_path, save_path)  # Processing the data