# File: code/bert_code/preprocessor.py
1
import os
2
import numpy as np
3
import pickle
4
import re
5
6
class PreProcessor(object):
    """Convert concept annotations and text summaries into pickled BIO tags.

    For every text file in ``text_path`` that has a matching ``.con`` file in
    ``concept_path``, the concepts are parsed, every word of the text receives
    a BIO label index, and the result is pickled into ``save_path``.
    """

    # Runs of one or more spaces; collapsed to a single space before splitting.
    _SPACE_RUN = re.compile(r' +')

    def __init__(self, concept_path, text_path, save_path):
        """Store the input and output directories.

        Example paths used for training:
            concept_path = "../dnc_code/medical_data/train_data/concept"
            text_path = "../dnc_code/medical_data/train_data/txt"

        Example paths used for testing:
            concept_path = "../dnc_code/medical_data/test_data/concept"
            text_path = "../dnc_code/medical_data/test_data/txt"

        Args:
            concept_path: Directory holding the ``.con`` concept files.
            text_path: Directory holding the text summaries.
            save_path: Directory the pickled ``.dat`` files are written to.
        """
        self.concept_path = concept_path
        self.text_path = text_path
        self.save_path = save_path

    @classmethod
    def _clean_line(cls, raw_line):
        """Return ``raw_line`` with newlines replaced by spaces, space runs
        collapsed to one space, and surrounding whitespace stripped."""
        return cls._SPACE_RUN.sub(' ', raw_line.replace('\n', ' ')).strip()

    @staticmethod
    def _parse_position(token):
        """Split a ``line:word`` token into a pair of ints.

        Raises:
            ValueError: If the token has no ``:`` or a part is not an integer.
        """
        parts = token.split(":")
        if len(parts) < 2:
            raise ValueError("position %r is not of the form line:word" % (token,))
        return int(parts[0]), int(parts[1])

    def initialize_labels(self):
        """Build the label<->index dictionaries and pickle them.

        Sets ``self.labelDict`` (label -> index) and ``self.reverseDict``
        (index -> label), then saves both as a tuple to
        ``label_dicts_bio.dat`` inside ``self.save_path``.
        """
        # BIO labelling scheme: Beginning / Inside for each entity type,
        # plus a single Outside label.
        self.labelDict = {
            'b-problem': 0,     # Problem - Beginning
            'i-problem': 1,     # Problem - Inside
            'b-test': 2,        # Test - Beginning
            'i-test': 3,        # Test - Inside
            'b-treatment': 4,   # Treatment - Beginning
            'i-treatment': 5,   # Treatment - Inside
            'o': 6,             # Outside token
        }

        # Inverse dictionary: index -> label
        self.reverseDict = {index: name for name, index in self.labelDict.items()}

        # Saving the dictionaries into a file
        self.save_data([self.labelDict, self.reverseDict],
                       os.path.join(self.save_path, "label_dicts_bio.dat"))

    def _parse_concept_line(self, line):
        """Parse one cleaned, non-empty concept line into a dictionary.

        The line is expected to look like
        ``c="some entity" 12:3 12:4||t="problem"``.

        Raises:
            ValueError: If the line does not have the expected structure.
            KeyError: If the resulting BIO label is not in ``self.labelDict``.
        """
        fields = line.split('||')
        if len(fields) < 2:
            raise ValueError("missing '||' separator")

        tokens = fields[0].split(' ')
        if len(tokens) < 3:
            raise ValueError("expected entity text followed by start and end positions")

        label_fields = fields[1].split('=')
        if len(label_fields) < 2:
            raise ValueError("missing '=' in the label field")
        label = label_fields[1][1:-1]           # Drop the surrounding quotes

        # Everything except the two trailing position tokens is entity text.
        entity = tokens[:-2]
        entity[0] = entity[0][3:]               # Drop the leading 'c="'
        entity[-1] = entity[-1][:-1]            # Drop the closing quote

        # First word is 'b-<label>', every following word is 'i-<label>'.
        lab = ['b-' + label] + ['i-' + label] * (len(entity) - 1)
        noLab = [self.labelDict[l] for l in lab]

        sLine, sCol = self._parse_position(tokens[-2])
        eLine, eCol = self._parse_position(tokens[-1])

        return {
            'entity': entity,           # Entity name (list of words)
            'label': label,             # Common label
            'BIO_labels': lab,          # BIO label for each word
            'label_index': noLab,       # Labels in index form
            'start_line': sLine,        # Start line of the concept in the text summary
            'start_word_no': sCol,      # Word number of the first word in the start line
            'end_line': eLine,          # End line of the concept in the text summary
            'end_word_no': eCol,        # Word number of the last word in the end line
        }

    def parse_concepts(self, file_path):
        """Parse a concept file into a list of concept dictionaries.

        Blank lines are skipped. ``initialize_labels`` must have been called
        first because the labels are mapped through ``self.labelDict``.

        Args:
            file_path: Path of the ``.con`` file to read.

        Returns:
            A list with one dictionary per concept, holding the keys
            ``entity``, ``label``, ``BIO_labels``, ``label_index``,
            ``start_line``, ``start_word_no``, ``end_line`` and ``end_word_no``.

        Raises:
            ValueError: If a non-blank line is not in the expected format.
            KeyError: If a concept label has no entry in ``self.labelDict``.
        """
        conceptList = []

        with open(file_path) as handle:
            for line_no, raw_line in enumerate(handle, start=1):
                line = self._clean_line(raw_line)
                if not line:
                    continue                    # Tolerate blank / trailing lines

                try:
                    conceptList.append(self._parse_concept_line(line))
                except ValueError as err:
                    message = "Data in File: " + file_path + ", not in expected format.."
                    raise ValueError(
                        "{} (line {}: {})".format(message, line_no, err)
                    ) from err

        return conceptList

    def parse_summary(self, file_path):
        """Split a text summary into words and give every word the 'o' label.

        Args:
            file_path: Path of the text summary to read.

        Returns:
            A tuple ``(file_lines, tags)``. ``file_lines`` holds one word list
            per line of the file; ``tags`` has the same shape and contains the
            index of the 'o' (outside) label for every word. An empty line
            yields the single empty word ``''`` so that line numbers stay
            aligned with the concept files.
        """
        default_label = self.labelDict['o']     # Index 6, the Outside label
        file_lines = []
        tags = []

        with open(file_path) as handle:
            for raw_line in handle:
                words = self._clean_line(raw_line).split(" ")
                file_lines.append(words)
                tags.append([default_label] * len(words))

        return file_lines, tags

    def modify_labels(self, conceptList, tags):
        """Overwrite the default tags with the labels of each concept.

        Line numbers in the concepts are 1-based, word numbers are 0-based and
        the end word is inclusive. ``tags`` is modified in place.

        Args:
            conceptList: Concept dictionaries as returned by ``parse_concepts``.
            tags: Per-line label lists as returned by ``parse_summary``.

        Returns:
            The same ``tags`` object, after modification.
        """
        for concept in conceptList:
            labels = concept['label_index']
            first_line, last_line = concept['start_line'], concept['end_line']
            first_word, last_word = concept['start_word_no'], concept['end_word_no']

            if first_line == last_line:         # Concept sits on a single line
                tags[first_line - 1][first_word:last_word + 1] = labels[:]
                continue

            # Concept spans several lines: hand the labels out line by line.
            consumed = 0                        # Labels already assigned
            for line_no in range(first_line, last_line + 1):
                row = tags[line_no - 1]
                if line_no == first_line:
                    row[first_word:] = labels[0:len(row) - first_word]
                    consumed = len(row) - first_word
                elif line_no == last_line:
                    row[0:last_word + 1] = labels[consumed:]
                else:
                    row[:] = labels[consumed:consumed + len(row)]
                    consumed = consumed + len(row)
        return tags

    def save_data(self, obj_list, s_path):
        """Pickle ``obj_list`` (converted to a tuple) into the file ``s_path``."""
        # The context manager guarantees the data is flushed and the handle closed.
        with open(s_path, 'wb') as handle:
            pickle.dump(tuple(obj_list), handle)

    def process_data(self, c_path, t_path, s_path):
        """Process every text file that has a matching concept file.

        For each file in ``t_path`` whose ``<name>.con`` counterpart exists in
        ``c_path``, pickles ``(conceptList, file_lines, tags)`` to
        ``<name>.dat`` in ``s_path``. ``<name>`` is the part of the file name
        before its first dot. Text files without a concept file are skipped.
        """
        for f in os.listdir(t_path):
            stem = f.split('.')[0]
            concept_file = os.path.join(c_path, stem + ".con")
            if not os.path.isfile(concept_file):
                continue

            conceptList = self.parse_concepts(concept_file)                 # Concepts and their labels
            file_lines, tags = self.parse_summary(os.path.join(t_path, f))  # Words with default labels
            tags = self.modify_labels(conceptList, tags)                    # Apply the true labels
            self.save_data([conceptList, file_lines, tags],
                           os.path.join(s_path, stem + ".dat"))

    def pre_process(self):
        """Run the full pipeline: build the label dictionaries, then process
        all files using the paths given to the constructor.

        Any other preprocessing needed can be called from this method.
        """
        self.initialize_labels()
        self.process_data(self.concept_path, self.text_path, self.save_path)
173
174