a b/code/data_preprocessing/data_processing.py
1
import os
2
import numpy as np
3
import pickle
4
5
# Module-level lookup tables; both start empty and are filled in place
# by initialize_labels().
labelDict = dict()      # label name -> integer index
reverseDict = dict()    # integer index -> label name
7
8
def initialize_labels(s_path):
    """Fill the module-level label tables and pickle them under s_path.

    Uses the BIEOS scheme: each concept category (problem, test,
    treatment) gets four position tags -- b (beginning), i (inside),
    e (end), s (single) -- and 'o' marks a word outside any concept.
    ``labelDict`` maps tag name to index and ``reverseDict`` maps index
    back to tag name.  Both dictionaries are updated in place and then
    handed, as a pair, to ``save_data`` with the target path
    ``<s_path>/label_dicts.dat``.
    """
    # Position in this tuple is the integer index assigned to the tag.
    ordered_tags = (
        'problem_b', 'problem_i', 'problem_e', 'problem_s',
        'test_b', 'test_i', 'test_e', 'test_s',
        'treatment_b', 'treatment_i', 'treatment_e', 'treatment_s',
        'o',
    )
    for index, tag in enumerate(ordered_tags):
        labelDict[tag] = index

    # Build the inverse table from everything labelDict currently holds.
    for tag, index in labelDict.items():
        reverseDict[index] = tag

    # Persist both tables together in one file.
    target = os.path.join(s_path, "label_dicts.dat")
    save_data([labelDict, reverseDict], target)
30
31
def parse_concepts(file_path):
    """Parse a concept file into a list of concept dictionaries.

    Every non-blank line is split on '||'.  The left part is split on
    single spaces: its last two tokens are the ``line:word`` start and end
    positions and the remaining tokens are the entity words (the first
    three characters of the first word and the last character of the last
    word are dropped).  The label is the text after '=' in the right part,
    minus its first and last character.  Lines presumably look like
    ``c="chest pain" 3:1 3:2||t="problem"`` -- confirm against the data.

    Blank lines are skipped.  A line that cannot be split this way prints
    a "not in expected format" message and raises ``SystemExit(1)``.
    A label that is missing from ``labelDict`` raises ``KeyError``.

    Returns a list of dicts with the keys 'entity', 'label',
    'BIEOS_labels', 'label_index', 'start_line', 'start_word_no',
    'end_line' and 'end_word_no'.
    """
    conceptList = []

    # The context manager closes the file even if reading fails.
    with open(file_path) as f:
        content = f.readlines()

    for raw in content:
        raw = raw.strip()
        if not raw:
            continue    # tolerate blank lines instead of crashing on them

        parts = raw.split('||')
        tokens = parts[0].split(' ')
        label_field = parts[1].split('=') if len(parts) > 1 else []

        # Validate before indexing so that malformed lines reach the
        # intended error message rather than an IndexError.
        if len(tokens) < 3 or len(label_field) < 2:
            print("Data in File: " + file_path + ", not in expected format..")
            raise SystemExit(1)

        label = label_field[1][1:-1]

        # Entity words: everything except the two trailing position tokens.
        entity = tokens[:-2]
        entity[0] = entity[0][3:]       # drop the three-character prefix
        entity[-1] = entity[-1][:-1]    # drop the trailing character

        # One BIEOS tag per entity word.
        if len(entity) == 1:
            lab = [label + "_s"]
        else:
            lab = ([label + "_b"]
                   + [label + "_i"] * (len(entity) - 2)
                   + [label + "_e"])
        noLab = [labelDict[tag] for tag in lab]

        start_pos = tokens[-2].split(":")
        end_pos = tokens[-1].split(":")

        conceptList.append({
            'entity': entity,                   # entity as a list of words
            'label': label,                     # common label of the concept
            'BIEOS_labels': lab,                # BIEOS tag for each word
            'label_index': noLab,               # the same tags as indices
            'start_line': int(start_pos[0]),    # line where the concept starts
            'start_word_no': int(start_pos[1]), # word number on the start line
            'end_line': int(end_pos[0]),        # line where the concept ends
            'end_word_no': int(end_pos[1]),     # word number on the end line
        })

    return conceptList
91
92
def parse_summary(file_path):
    """Read a text summary and give every word the default 'outside' tag.

    Each line is stripped and split on single spaces, so a blank line
    becomes ``['']`` and consecutive spaces produce empty-string words.

    Returns ``(file_lines, tags)``: ``file_lines`` holds one list of words
    per line and ``tags`` holds a list of the same length per line, filled
    with 12 (the index initialize_labels assigns to the 'o' label).
    """
    file_lines = []     # words of every line
    tags = []           # one label index per word, parallel to file_lines

    # The context manager closes the file even if reading fails.
    with open(file_path) as f:
        for line in f:
            words = line.strip().split(" ")
            file_lines.append(words)
            tags.append([12] * len(words))

    return file_lines, tags
116
117
def modify_labels(conceptList, tags):
    """Overwrite the default tags with the label indices of each concept.

    conceptList: dicts carrying 'start_line', 'end_line', 'start_word_no',
        'end_word_no' and 'label_index'.  Line numbers are 1-based, word
        numbers are used directly as 0-based positions within a line.
    tags: list of per-line tag lists; modified in place.

    A concept covers the words from its start position to its end position
    inclusive, continuing across whole lines when it spans several.  Labels
    are written one position at a time, so a concept whose label count does
    not match its span can never change the length of a tag line; surplus
    labels or surplus positions are simply left untouched.

    Returns the same ``tags`` object.
    """
    for concept in conceptList:
        first_line = concept['start_line']
        last_line = concept['end_line']

        # Collect every (row, column) the concept covers, in reading order.
        targets = []
        for line_no in range(first_line, last_line + 1):
            row = tags[line_no - 1]
            col_from = concept['start_word_no'] if line_no == first_line else 0
            col_to = concept['end_word_no'] + 1 if line_no == last_line else len(row)
            col_to = min(col_to, len(row))      # never write past the line
            targets.extend((row, col) for col in range(col_from, col_to))

        # zip stops at the shorter side, so row lengths stay untouched.
        for (row, col), label in zip(targets, concept['label_index']):
            row[col] = label

    return tags
135
136
def print_data(file, file_lines, tags):
    """Print every line of a file together with its labels.

    file: name shown in the header line.
    file_lines: list of word lists, one per line.
    tags: list of label-index lists, parallel to file_lines.

    For each line the words, the label names (looked up in the
    module-level ``reverseDict``) and the raw label indices are printed
    between two separator rows; lines are numbered from 1.
    """
    separator = "------------------------------------------------------------"

    print("\n************ Printing details of the file: " + file + " ************\n")
    for position, words in enumerate(file_lines):
        print(separator)
        print("File Lines No: " + str(position + 1))
        print(words)
        print("\nCorresponding labels:")
        line_tags = tags[position]
        print([reverseDict[index] for index in line_tags])
        print("\nCorresponding Label Indices:")
        print(line_tags)
        print(separator)
150
151
def save_data(obj_list, s_path):
    """Pickle the given objects, as one tuple, into the file s_path.

    obj_list: iterable of objects; it is converted to a tuple before
        anything is written.
    s_path: destination file.  Its parent directory is created when it
        does not exist yet, and an existing file is overwritten.
    """
    payload = tuple(obj_list)

    parent = os.path.dirname(s_path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)

    # The context manager flushes and closes the file deterministically.
    with open(s_path, 'wb') as out_file:
        pickle.dump(payload, out_file)
153
154
def process_data(c_path, t_path, s_path):
    """Label every text file that has a matching concept file and save it.

    c_path: directory holding the '.con' concept files.
    t_path: directory holding the text summaries.
    s_path: directory receiving one '.dat' file per processed summary.

    A summary is paired with the concept file that has the same name with
    the final extension replaced by '.con'; summaries without such a file
    are skipped.  For each pair the concepts are parsed, the summary is
    split into words with default tags, the tags are overwritten from the
    concepts, and ``[conceptList, file_lines, tags]`` is handed to
    ``save_data``.
    """
    for text_name in os.listdir(t_path):
        # splitext removes only the last extension, so names that contain
        # further dots keep their full stem.
        stem = os.path.splitext(text_name)[0]
        concept_file = os.path.join(c_path, stem + ".con")
        if not os.path.isfile(concept_file):
            continue

        conceptList = parse_concepts(concept_file)
        file_lines, tags = parse_summary(os.path.join(t_path, text_name))
        tags = modify_labels(conceptList, tags)
        save_data([conceptList, file_lines, tags],
                  os.path.join(s_path, stem + ".dat"))
163
164
if __name__ == '__main__':

    # Input directories (text summaries and their concept files) and the
    # directory that receives the pickled results.
    text_path = "../dnc_code/medical_data/train/txt"
    concept_path = "../dnc_code/medical_data/train/concept"
    save_path = "./cleaned_files"

    # The label tables are filled first: parse_concepts() looks every
    # label up in labelDict.
    initialize_labels(save_path)
    process_data(c_path=concept_path, t_path=text_path, s_path=save_path)