# code/data_preprocessing/data_analysis.py
import os
2
import numpy as np
3
import pickle
4
import matplotlib.pyplot as plt
5
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
6
7
labelDict = {}     # Label Dictionary - Labels to Index
8
reverseDict = {}   # Inverse Label Dictionary - Index to Labels
9
10
tot_prob = 0
11
tot_treatment = 0
12
tot_test = 0
13
14
def initialize_labels(s_path):
    """Initialize the label dictionaries (Labels->IDX and IDX->Labels) and save them.

    Uses the BIEOS labelling scheme: Beginning / Inside / End / Single for each
    of the three concept categories, plus a single Outside tag. The resulting
    mapping is identical to the original hand-written table:
    problem_{b,i,e,s} -> 0..3, test_{b,i,e,s} -> 4..7,
    treatment_{b,i,e,s} -> 8..11, o -> 12.

    Parameters:
        s_path: directory into which the pickled dictionaries are written
                (file name "label_dicts.dat").
    """
    categories = ['problem', 'test', 'treatment']
    positions = ['b', 'i', 'e', 's']        # Beginning, Inside, End, Single

    idx = 0
    for cat in categories:
        for pos in positions:
            labelDict[cat + '_' + pos] = idx
            idx += 1
    labelDict['o'] = idx                    # Outside token (index 12)

    # Making the inverse label dictionary (index -> label)
    for k, v in labelDict.items():
        reverseDict[v] = k

    # Saving the dictionaries into a file
    save_data([labelDict, reverseDict], os.path.join(s_path, "label_dicts.dat"))
def parse_concepts(file_path):
    """Parse one concept (.con) file into a list of concept dictionaries.

    Each line is expected to look like (presumably i2b2 format — confirm):
        c="entity words" sLine:sCol eLine:eCol||t="label"
    where line numbers are 1-based and word numbers are 0-based.

    Parameters:
        file_path: path of the concept file to read.

    Returns:
        List of dicts, one per concept, with keys: 'entity', 'label',
        'BIEOS_labels', 'label_index', 'start_line', 'start_word_no',
        'end_line', 'end_word_no'.

    Exits the program if a line yields an empty entity (unexpected format).
    """
    conceptList = []                # Stores all the concepts in the file

    # 'with' guarantees the file is closed even if parsing below fails
    with open(file_path) as f:
        content = f.readlines()

    for x in content:               # Reading each line in the concept file
        dic = {}

        # Cleaning and extracting the entities, labels and their positions
        x = x.strip().split('||')
        temp1, label = x[0].split(' '), x[1].split('=')[1][1:-1]

        temp1[0] = temp1[0][3:]     # strip the leading 'c="'
        temp1[-3] = temp1[-3][0:-1] # strip the trailing '"' from the last entity word
        entity = temp1[0:-2]        # everything except the two position tokens

        if len(entity) > 1:
            # Multi-word concept: Beginning, Inside..., End
            lab = ['i'] * len(entity)
            lab[0] = 'b'
            lab[-1] = 'e'
            lab = [label + "_" + l for l in lab]
        elif len(entity) == 1:
            lab = [label + "_" + "s"]       # Single-word concept
        else:
            print("Data in File: " + file_path + ", not in expected format..")
            exit()

        noLab = [labelDict[l] for l in lab]             # BIEOS labels -> indices

        # Split each "line:word" position token exactly once
        s_parts = temp1[-2].split(":")
        e_parts = temp1[-1].split(":")
        sLine, sCol = int(s_parts[0]), int(s_parts[1])
        eLine, eCol = int(e_parts[0]), int(e_parts[1])

        # Storing the information as a dictionary
        dic['entity'] = entity      # Entity name (as a list of words)
        dic['label'] = label        # Common label for the whole concept
        dic['BIEOS_labels'] = lab   # List of BIEOS labels, one per word
        dic['label_index'] = noLab  # Labels in index form
        dic['start_line'] = sLine   # Start line of the concept in the text summary (1-based)
        dic['start_word_no'] = sCol # Starting word number within the start line (0-based)
        dic['end_line'] = eLine     # End line of the concept in the text summary (1-based)
        dic['end_word_no'] = eCol   # Ending word number within the end line (0-based)

        conceptList.append(dic)

    return conceptList  # All concepts of the current file as a list of dicts
def parse_summary(file_path):
    """Parse a text summary file into tokenized lines plus default tags.

    Parameters:
        file_path: path of the summary (.txt) file to read.

    Returns:
        (file_lines, tags) where file_lines is a list of lines, each a list of
        words (split on single spaces), and tags mirrors that structure with
        every word assigned the default label index 12 ('o' / Outside — see
        initialize_labels()).
    """
    file_lines = []                 # Tokenized lines of the file
    tags = []                       # Parallel default labels for each word

    # 'with' guarantees the file handle is released
    with open(file_path) as f:
        content = f.readlines()

    for x in content:
        words = x.strip().split(" ")
        file_lines.append(words)
        tags.append([12] * len(words))      # default 'o' (Outside) label per word
        # Sanity check: one label per word
        assert len(tags[-1]) == len(file_lines[-1]), "Line length is not matching labels length..."
    return file_lines, tags
def modify_labels(conceptList, tags):
    """Overwrite the default 'o' tags with each concept's true label indices.

    Concept line numbers are 1-based, word numbers are 0-based. A concept may
    span several summary lines; its label indices are distributed across the
    spanned lines in order.

    Parameters:
        conceptList: list of concept dicts from parse_concepts().
        tags: per-line lists of label indices from parse_summary(); mutated
              in place.

    Returns:
        The (mutated) tags structure.
    """
    for concept in conceptList:
        first, last = concept['start_line'], concept['end_line']
        labels = concept['label_index']

        if first == last:
            # Whole concept sits on one line: splice labels over the word span
            row = tags[first - 1]
            row[concept['start_word_no']:concept['end_word_no'] + 1] = labels[:]
            continue

        # Concept spans several lines: hand out labels line by line
        consumed = 0
        for line_no in range(first, last + 1):
            row = tags[line_no - 1]
            if line_no == first:
                # First line: from the start word to the end of the line
                take = len(row) - concept['start_word_no']
                row[concept['start_word_no']:] = labels[0:take]
                consumed = take
            elif line_no == last:
                # Last line: the remaining labels cover words up to the end word
                row[0:concept['end_word_no'] + 1] = labels[consumed:]
            else:
                # Middle lines are covered entirely by the concept
                row[:] = labels[consumed:consumed + len(row)]
                consumed += len(row)
    return tags
def print_data(file, file_lines, tags):
    """Print each line of a summary file with its labels and label indices.

    Parameters:
        file: file name, used only in the banner.
        file_lines: tokenized lines from parse_summary().
        tags: per-line label indices, parallel to file_lines.
    """
    print("\n************ Printing details of the file: " + file + " ************\n")
    # enumerate/zip replaces the original manual counter bookkeeping
    for counter, (line, line_tags) in enumerate(zip(file_lines, tags), start=1):
        print("------------------------------------------------------------")
        print("File Lines No: " + str(counter))
        print(line)
        print("\nCorresponding labels:")
        print([reverseDict[i] for i in line_tags])
        print("\nCorresponding Label Indices:")
        print(line_tags)
        print("------------------------------------------------------------")
def save_data(obj_list, s_path):
    """Pickle obj_list (converted to a tuple) into the binary file s_path.

    The original opened the file without closing it; 'with' guarantees the
    handle is flushed and released even if pickling fails.
    """
    with open(s_path, 'wb') as f:
        pickle.dump(tuple(obj_list), f)
def concept_metric(conceptList):
    """Gather per-file concept statistics and update the dataset-wide totals.

    Parameters:
        conceptList: list of concept dicts from parse_concepts(). Any label
        other than 'problem' or 'treatment' is counted as 'test'.

    Returns:
        (avg_concept_length, loc_prob, loc_treatment, loc_test) — the list of
        entity lengths and the per-category counts for this file. Also
        increments the module-level totals tot_prob / tot_treatment / tot_test.
    """
    global tot_prob
    global tot_test
    global tot_treatment

    # Entity length of every concept in this file
    avg_concept_length = [len(c['entity']) for c in conceptList]

    counts = {'problem': 0, 'treatment': 0, 'test': 0}
    for c in conceptList:
        label = c['label']
        if label == 'problem':
            counts['problem'] += 1
        elif label == 'treatment':
            counts['treatment'] += 1
        else:
            counts['test'] += 1     # anything else is counted as a test concept

    # Fold this file's counts into the dataset-wide totals
    tot_prob += counts['problem']
    tot_treatment += counts['treatment']
    tot_test += counts['test']

    return avg_concept_length, counts['problem'], counts['treatment'], counts['test']
def plot_histogram(data, title, xlab, bin_size=5):
    """Plot a histogram of *data* with mean/std-dev shown in the title.

    Parameters:
        data: sequence of numbers (converted to a NumPy array).
        title: plot title prefix.
        xlab: x-axis label.
        bin_size: approximate width of each histogram bin (also the x-axis
                  major tick spacing).
    """
    data = np.asarray(data)
    mean = "{:.2f}".format(data.mean())
    std_dev = "{:.2f}".format(data.std())

    # Statistics string appended to the title
    line = ', Mean: ' + str(mean) + ', Standard Deviation: ' + str(std_dev)

    # Calculating the histogram. np.linspace needs at least 2 edges to define
    # one bin; the original int((max-min)/bin_size) yields 0 or 1 when the data
    # range is smaller than bin_size, which breaks np.histogram. Guard with 2.
    num_edges = max(int((data.max() - data.min()) / bin_size), 2)
    hist, bin_edges = np.histogram(data, bins=np.linspace(start=data.min(), stop=data.max(), num=num_edges))

    # Plotting the histogram
    fig, ax = plt.subplots()
    plt.bar(bin_edges[:-1], hist, width=1, color='#0504aa')
    plt.xlim(min(bin_edges) - 1, max(bin_edges) + 1)
    ax.xaxis.set_major_locator(MultipleLocator(bin_size))
    plt.xlabel(xlab, fontsize=15)
    plt.ylabel('Counts', fontsize=15)
    plt.title(title + line, fontsize=15)
    plt.show()
def process_data(c_path, t_path, s_path, counter):
    """Process every summary file in t_path that has a matching concept file.

    For each pair, parses concepts and summary, applies true labels, and
    collects per-file concept statistics.

    Parameters:
        c_path: directory holding the concept (.con) files.
        t_path: directory holding the text summary files.
        s_path: save directory (only used by the commented-out save step).
        counter: running count of processed files; incremented per pair.

    Returns:
        (prob_list, treat_list, test_list, avg_length_list, counter) — per-file
        category counts, all concept lengths, and the updated counter.
    """
    prob_list, treat_list, test_list, avg_length_list = [], [], [], []

    for f in os.listdir(t_path):
        concept_name = f.split('.')[0] + ".con"
        concept_full = os.path.join(c_path, concept_name)
        if not os.path.isfile(concept_full):
            continue        # no matching concept file — skip this summary

        # Parse concepts/labels and the corresponding written notes
        conceptList = parse_concepts(concept_full)
        file_lines, tags = parse_summary(os.path.join(t_path, f))
        # Replace default labels with the true labels from the concept file
        tags = modify_labels(conceptList, tags)
        avg_concept_length, loc_prob, loc_treatment, loc_test = concept_metric(conceptList)

        counter += 1
        prob_list.append(loc_prob)
        treat_list.append(loc_treatment)
        test_list.append(loc_test)
        avg_length_list.extend(avg_concept_length)
        # save_data([conceptList, file_lines, tags], os.path.join(s_path, f.split('.')[0] + ".dat"))
        # print_data(f, file_lines, tags)

    return prob_list, treat_list, test_list, avg_length_list, counter
if __name__ == '__main__':

    # File paths (relative to this script's directory)
    save_path = "../../Medical Data/cleaned_files"
    concept_path = "../../Medical Data/training_data/partners/concept"
    text_path = "../../Medical Data/training_data/partners/txt"
    concept_path1 = "../../Medical Data/training_data/beth/concept"
    text_path1 = "../../Medical Data/training_data/beth/txt"
    counter = 0                 # number of summary/concept file pairs processed

    # Dataset-wide per-file statistics, aggregated over both corpora
    super_prob_list = []
    super_treat_list = []
    super_test_list = []
    super_len_list = []

    initialize_labels(save_path)                        # Initializing and saving the label dictionaries

    # 1: first corpus (partners)
    prob_list, treat_list, test_list, avg_length_list, counter = process_data(concept_path, text_path, save_path, counter)    # Processing the data

    super_prob_list.extend(prob_list)
    super_treat_list.extend(treat_list)
    super_test_list.extend(test_list)
    super_len_list.extend(avg_length_list)

    # 2: second corpus (beth)
    prob_list, treat_list, test_list, avg_length_list, counter = process_data(concept_path1, text_path1, save_path, counter)    # Processing the data

    super_prob_list.extend(prob_list)
    super_treat_list.extend(treat_list)
    super_test_list.extend(test_list)
    super_len_list.extend(avg_length_list)

    # Plotting histograms of the gathered statistics
    plot_histogram(super_prob_list, 'Average Problem Concepts Distribution', 'Average Problem concepts per file', 3)
    plot_histogram(super_treat_list, 'Average Treatment Concepts Distribution', 'Average Treatment concepts per file', 3)
    plot_histogram(super_test_list, 'Average Test Concepts Distribution', 'Average Test concepts per file', 3)
    plot_histogram(super_len_list, 'Concept Length Distribution', 'Concepts length', 1)

    # Calculating overall per-file averages from the global totals
    # NOTE(review): raises ZeroDivisionError when no file pairs were found
    # (counter == 0) — confirm the data paths exist before running
    avg_prob = tot_prob/counter
    avg_treat = tot_treatment/counter
    avg_test = tot_test/counter

    print("Total Concepts: " + str(len(super_len_list)))
    print("Total Files: " + str(counter))
    print("Total Problem concepts in Dataset: " + "{:.0f}".format(tot_prob))
    print("Average Problem concepts per file in Dataset: " + "{:.2f}".format(avg_prob))
    print("Total Treatment concepts in Dataset: " + "{:.0f}".format(tot_treatment))
    print("Average Treatment concepts per file in Dataset: " + "{:.2f}".format(avg_treat))
    print("Total Test concepts in Dataset: " + "{:.0f}".format(tot_test))
    print("Average Test concepts per file in Dataset: " + "{:.2f}".format(avg_test))