|
a |
|
b/code/data_preprocessing/data_analysis.py |
|
|
1 |
import os |
|
|
2 |
import numpy as np |
|
|
3 |
import pickle |
|
|
4 |
import matplotlib.pyplot as plt |
|
|
5 |
from matplotlib.ticker import MultipleLocator, FormatStrFormatter |
|
|
6 |
|
|
|
7 |
labelDict = {}    # Label Dictionary - Labels to Index
reverseDict = {}  # Inverse Label Dictionary - Index to Labels

# Running totals across every processed file; concept_metric() updates
# these in place via `global` statements.
tot_prob = 0       # total 'problem' concepts seen in the dataset
tot_treatment = 0  # total 'treatment' concepts seen in the dataset
tot_test = 0       # total 'test' concepts seen in the dataset
|
|
13 |
|
|
|
14 |
def initialize_labels(s_path):  # Initializing label dictionaries for Labels->IDX and IDX->Labels
    """Populate the global label dictionaries using the BIEOS tagging
    scheme and persist both mappings under *s_path*."""
    # Each concept category gets Beginning / Inside / End / Single tags,
    # numbered consecutively: problem 0-3, test 4-7, treatment 8-11.
    index = 0
    for category in ('problem', 'test', 'treatment'):
        for suffix in ('b', 'i', 'e', 's'):
            labelDict[category + '_' + suffix] = index
            index += 1
    labelDict['o'] = 12  # Outside token

    # Build the inverse mapping (index -> label name)
    for name, number in labelDict.items():
        reverseDict[number] = name

    # Save both dictionaries into a file for later pipeline stages
    save_data([labelDict, reverseDict], os.path.join(s_path, "label_dicts.dat"))
|
|
36 |
|
|
|
37 |
def parse_concepts(file_path):  # Parses the concept file to extract concepts and labels
    """Parse one i2b2 concept (.con) file.

    Each line has the shape:
        c="<entity words>" <sLine>:<sCol> <eLine>:<eCol>||t="<label>"

    Returns a list of dictionaries, one per concept, holding the entity
    words, the common label, the per-word BIEOS labels (and their integer
    indices from labelDict), and the start/end line/word positions inside
    the corresponding text summary.

    Raises SystemExit(1) when a line yields an empty entity (malformed
    input); the original code called exit(), which exits with status 0
    even though this is an error path.
    """
    conceptList = []  # Stores all the concepts in the file

    # Context manager guarantees the file handle is closed even on error
    with open(file_path) as f:
        content = f.readlines()

    for x in content:  # Reading each line in the concept file
        dic = {}

        # Split into the concept half and the label half
        x = x.strip().split('||')

        temp1, label = x[0].split(' '), x[1].split('=')[1][1:-1]

        temp1[0] = temp1[0][3:]      # drop the leading 'c="'
        temp1[-3] = temp1[-3][0:-1]  # drop the trailing '"'
        entity = temp1[0:-2]         # remaining tokens are the entity words

        # Derive per-word BIEOS labels from the entity length
        if len(entity) > 1:
            lab = ['i'] * len(entity)
            lab[0] = 'b'
            lab[-1] = 'e'
            lab = [label + "_" + l for l in lab]
        elif len(entity) == 1:
            lab = [label + "_" + "s"]
        else:
            print("Data in File: " + file_path + ", not in expected format..")
            raise SystemExit(1)  # non-zero status: malformed input is an error

        noLab = [labelDict[l] for l in lab]  # BIEOS labels -> integer indices
        # "<line>:<word>" coordinates of the concept in the text summary
        sLine, sCol = int(temp1[-2].split(":")[0]), int(temp1[-2].split(":")[1])
        eLine, eCol = int(temp1[-1].split(":")[0]), int(temp1[-1].split(":")[1])

        # Storing the information as a dictionary
        dic['entity'] = entity          # Entity name (list of words)
        dic['label'] = label            # Common label
        dic['BIEOS_labels'] = lab       # BIEOS label for each word
        dic['label_index'] = noLab      # Labels in index form
        dic['start_line'] = sLine       # Start line of the concept in the summary
        dic['start_word_no'] = sCol     # Starting word number on the start line
        dic['end_line'] = eLine         # End line of the concept in the summary
        dic['end_word_no'] = eCol       # Ending word number on the end line

        # Appending the concept dictionary to the list
        conceptList.append(dic)

    return conceptList  # All concepts of this file as a list of dictionaries
|
|
97 |
|
|
|
98 |
def parse_summary(file_path):  # Parses the text summaries
    """Parse a clinical text summary file.

    Returns two parallel lists:
      file_lines -- each line of the file split into words
      tags       -- one label index per word, initialised to the default
                    'outside' label (index 12, mirroring labelDict['o'])
    """
    file_lines = []  # Lines of the file, each as a list of words
    tags = []        # Corresponding label index per word

    # Context manager guarantees the file handle is closed
    with open(file_path) as f:
        content = f.readlines()

    for x in content:
        file_lines.append(x.strip().split(" "))     # words of this line
        tags.append([12] * len(file_lines[-1]))     # default 'o' label for every word
        # Sanity check: exactly one label per word on each line
        assert len(tags[-1]) == len(file_lines[-1]), "Line length is not matching labels length..."

    return file_lines, tags
|
|
122 |
|
|
|
123 |
def modify_labels(conceptList, tags):
    """Overwrite the default 'outside' tags with each concept's true label
    indices, handling concepts that span one line or several lines of the
    text summary. Mutates *tags* in place and also returns it."""
    for concept in conceptList:
        first = concept['start_line']
        last = concept['end_line']
        labels = concept['label_index']
        s_word = concept['start_word_no']
        e_word = concept['end_word_no']

        if first == last:
            # Concept sits entirely on a single line of the summary
            tags[first - 1][s_word:e_word + 1] = labels[:]
            continue

        # Concept spans multiple lines: hand out label indices line by line
        consumed = 0  # number of label indices already placed
        for lineno in range(first, last + 1):
            row = tags[lineno - 1]
            if lineno == first:
                row[s_word:] = labels[0:len(row) - s_word]
                consumed = len(row) - s_word
            elif lineno == last:
                row[0:e_word + 1] = labels[consumed:]
            else:
                row[:] = labels[consumed:consumed + len(row)]
                consumed = consumed + len(row)
    return tags
|
|
141 |
|
|
|
142 |
def print_data(file, file_lines, tags):  # Prints the given data
    """Pretty-print a summary file line by line, together with the BIEOS
    label names (via reverseDict) and label indices assigned to each word.

    file       -- file name shown in the banner
    file_lines -- list of lines, each a list of words
    tags       -- parallel list of label-index lists
    """
    print("\n************ Printing details of the file: " + file + " ************\n")
    # enumerate replaces the hand-maintained counter of the original
    for line_no, words in enumerate(file_lines, start=1):
        print("------------------------------------------------------------")
        print("File Lines No: " + str(line_no))
        print(words)
        print("\nCorresponding labels:")
        print([reverseDict[i] for i in tags[line_no - 1]])
        print("\nCorresponding Label Indices:")
        print(tags[line_no - 1])
        print("------------------------------------------------------------")
|
|
156 |
|
|
|
157 |
def save_data(obj_list, s_path):  # Saves the objects into a binary file using Pickle
    """Pickle *obj_list* (converted to a tuple) into the file at *s_path*.

    The original passed an anonymous open() handle straight to
    pickle.dump, leaking the handle; the context manager guarantees the
    file is flushed and closed.
    """
    with open(s_path, 'wb') as f:
        pickle.dump(tuple(obj_list), f)
|
|
159 |
|
|
|
160 |
def concept_metric(conceptList):  # Gathering concepts metadata
    """Collect statistics for one file's concepts and update the global
    running totals (tot_prob / tot_treatment / tot_test).

    Returns (concept_lengths, problem_count, treatment_count, test_count)
    for this file only.
    """
    global tot_prob
    global tot_test
    global tot_treatment

    loc_prob = loc_treatment = loc_test = 0
    # Word count of every concept in this file
    avg_concept_length = [len(c['entity']) for c in conceptList]

    for c in conceptList:
        kind = c['label']
        if kind == 'problem':
            loc_prob += 1
            tot_prob += 1
        elif kind == 'treatment':
            loc_treatment += 1
            tot_treatment += 1
        else:
            # Anything that is neither problem nor treatment counts as test
            loc_test += 1
            tot_test += 1

    return avg_concept_length, loc_prob, loc_treatment, loc_test
|
|
184 |
|
|
|
185 |
def plot_histogram(data, title, xlab, bin_size=5):
    """Plot a bar-style histogram of *data*, appending the sample mean and
    standard deviation to the title.

    data     -- sequence of numbers (converted to a NumPy array)
    title    -- base plot title
    xlab     -- x-axis label
    bin_size -- spacing of the x-axis major ticks and target bin width
    """
    data = np.asarray(data)
    mean = "{:.2f}".format(data.mean())
    std_dev = "{:.2f}".format(data.std())

    # Statistics string appended to the plot title
    line = ', Mean: ' + str(mean) + ', Standard Deviation: ' + str(std_dev)

    # Calculating the histogram.
    # NOTE(review): np.linspace's `num` is the number of bin EDGES, so
    # int(range/bin_size) edges gives one bin fewer than range/bin_size and
    # bins slightly wider than bin_size; num also becomes 0 when
    # data.max() == data.min() -- confirm this is intended.
    hist, bin_edges = np.histogram(data, bins=np.linspace(start = data.min(), stop = data.max(), num = int((data.max()-data.min())/bin_size)))

    # Plotting the histogram
    # plt.figure(figsize=[10,8])
    fig, ax = plt.subplots()
    plt.bar(bin_edges[:-1], hist, width = 1, color='#0504aa')  # bars anchored at left bin edges
    plt.xlim(min(bin_edges)-1, max(bin_edges)+1)
    ax.xaxis.set_major_locator(MultipleLocator(bin_size))  # major tick every bin_size units
    plt.xlabel(xlab,fontsize=15)
    plt.ylabel('Counts',fontsize=15)
    plt.title(title + line,fontsize=15)
    plt.show()  # blocks until the plot window is closed
|
|
206 |
|
|
|
207 |
def process_data(c_path, t_path, s_path, counter):
    """Process every summary file in *t_path* that has a matching .con
    file in *c_path*: parse the concepts, parse the summary, and apply
    the true labels.

    Returns (prob_list, treat_list, test_list, avg_length_list, counter)
    where the first three hold the per-file concept counts, the fourth
    holds every concept's length, and *counter* is the updated number of
    processed files. (*s_path* is only used by the commented-out
    save_data call below.)
    """
    prob_list = []
    treat_list = []
    test_list = []
    avg_length_list = []

    for fname in os.listdir(t_path):
        concept_file = os.path.join(c_path, fname.split('.')[0] + ".con")
        if not os.path.isfile(concept_file):
            continue  # summary with no matching concept file -- skip

        conceptList = parse_concepts(concept_file)                     # concepts and labels
        file_lines, tags = parse_summary(os.path.join(t_path, fname))  # written notes
        tags = modify_labels(conceptList, tags)                        # apply the true labels
        lengths, loc_prob, loc_treatment, loc_test = concept_metric(conceptList)

        counter += 1
        prob_list.append(loc_prob)
        treat_list.append(loc_treatment)
        test_list.append(loc_test)
        avg_length_list.extend(lengths)
        # save_data([conceptList, file_lines, tags], os.path.join(s_path, fname.split('.')[0]+".dat"))  # Saving the objects into a file
        # print_data(fname, file_lines, tags)  # Printing the details
    return prob_list, treat_list, test_list, avg_length_list, counter
|
|
228 |
|
|
|
229 |
if __name__ == '__main__':

    # Dataset locations: output directory plus (concept, txt) pairs for
    # the two training collections (partners, beth)
    save_path = "../../Medical Data/cleaned_files"
    dataset_paths = [
        ("../../Medical Data/training_data/partners/concept",
         "../../Medical Data/training_data/partners/txt"),
        ("../../Medical Data/training_data/beth/concept",
         "../../Medical Data/training_data/beth/txt"),
    ]
    counter = 0

    super_prob_list = []
    super_treat_list = []
    super_test_list = []
    super_len_list = []

    initialize_labels(save_path)  # build and persist the label dictionaries

    # Process each dataset in turn, accumulating the per-file statistics
    for concept_dir, text_dir in dataset_paths:
        prob_list, treat_list, test_list, avg_length_list, counter = process_data(concept_dir, text_dir, save_path, counter)
        super_prob_list.extend(prob_list)
        super_treat_list.extend(treat_list)
        super_test_list.extend(test_list)
        super_len_list.extend(avg_length_list)

    # Histograms of the gathered statistics
    plot_histogram(super_prob_list, 'Average Problem Concepts Distribution', 'Average Problem concepts per file', 3)
    plot_histogram(super_treat_list, 'Average Treatment Concepts Distribution', 'Average Treatment concepts per file', 3)
    plot_histogram(super_test_list, 'Average Test Concepts Distribution', 'Average Test concepts per file', 3)
    plot_histogram(super_len_list, 'Concept Length Distribution', 'Concepts length', 1)

    # Overall per-file averages across both datasets
    avg_prob = tot_prob / counter
    avg_treat = tot_treatment / counter
    avg_test = tot_test / counter

    print("Total Concepts: " + str(len(super_len_list)))
    print("Total Files: " + str(counter))
    print("Total Problem concepts in Dataset: " + "{:.0f}".format(tot_prob))
    print("Average Problem concepts per file in Dataset: " + "{:.2f}".format(avg_prob))
    print("Total Treatment concepts in Dataset: " + "{:.0f}".format(tot_treatment))
    print("Average Treatment concepts per file in Dataset: " + "{:.2f}".format(avg_treat))
    print("Total Test concepts in Dataset: " + "{:.0f}".format(tot_test))
    print("Average Test concepts per file in Dataset: " + "{:.2f}".format(avg_test))