#AUTHOR: RAHUL VERMA and SPIRO RAZIS
import sys
import pprint
import numpy
from sklearn import svm
from sklearn import linear_model
import time

start_time = time.time()

# Print full arrays without truncation.  numpy.nan is no longer a valid
# threshold (numpy >= 1.14 raises ValueError for non-integer thresholds);
# sys.maxsize is the documented way to disable summarization.
numpy.set_printoptions(threshold=sys.maxsize)
def parseTextViaPMCID(textFile, pmcidFeatureList, uniqueWordsDictionary,lim):
    """Parse one tagged relation-extraction text file and accumulate features.

    The file is a sequence of records, each a run of tagged lines
    ("pmcid   : ", "sentence: ", "entities: ", "offsets : ", "relation: ")
    terminated by a blank line.  For each record, the words lying between
    the two entity offsets (longer than 3 chars) plus the entities, the
    relation and the file-derived class label are appended as one feature
    list to pmcidFeatureList.  Entities and (for the first `lim` records)
    in-between words are registered as keys of uniqueWordsDictionary.

    Args:
        textFile: path to the input file; a "beneficial"/"harmful" name
            prefix determines the class label stored with each record.
        pmcidFeatureList: list mutated in place (and returned).
        uniqueWordsDictionary: dict used as a set of known words/entities,
            mutated in place (and returned).
        lim: cap on how many records contribute their in-between words to
            uniqueWordsDictionary; entities are always added.

    Returns:
        (pmcidFeatureList, entryCount, uniqueWordsDictionary) where
        entryCount is the number of "pmcid   : " lines seen.
    """
    # Class label comes from the file NAME prefix; empty when unrecognised
    # (the original authors chose to continue rather than exit).
    if textFile.startswith("beneficial"):
        #print("beneficial")
        fileType = "beneficial".encode('utf-8')
    elif textFile.startswith("harmful"):
        #print("harmful")
        fileType = "harmful".encode('utf-8')
    else:
        fileType = ""
        #print("invalid file name")
        #sys.exit(2)

    entryCount       = 0
    disease          = ""
    causeOrTreatment = ""
    relation         = ""
    sentence = ""
    newEntry = False
    limit = 0  # records flushed so far; gates the `limit < lim` word intake


    with open(textFile, "r") as openedTextFile:
        for line in openedTextFile:
            if line.startswith("pmcid   : "): #it's the idNumber
                entryCount += 1
                newEntry = True
            elif line.startswith("sentence: "): #it's a sentence
                # Slice off the 10-char tag and the trailing "\n".
                sentence = line[10:-1]
                pass

            elif line.startswith("entities: "): #it's the two in a relationship
                # Line shape: entities: {asthmatic, budesonide}
                # Index 11 skips "entities: {"; -2 drops "}\n".
                disease = line[11:line.index(",")].lower().encode('utf-8')
                causeOrTreatment = line[(line.index(",")+2):-2].lower().encode('utf-8')
                #print("fileType: %s" %fileType)
                #print("disease: %s" %disease)
                #print("causeOrTreatment: %s" %causeOrTreatment)

                #feature 1: 11 to comma
                #feature 2: comma+2 to -2, since -1 is "\n"
                #entities: {asthmatic, budesonide}
                #add disease and cause/treatment to dictionary of unique words/phrases
                if disease not in uniqueWordsDictionary:
                    uniqueWordsDictionary[disease] = {}
                    #uniqueWordsDictionary[disease] = disease
                if causeOrTreatment not in uniqueWordsDictionary:
                    uniqueWordsDictionary[causeOrTreatment] = {}
                    #uniqueWordsDictionary[causeOrTreatment] = causeOrTreatment

            elif line.startswith("offsets : "): #the position of the entities
                actualsentencefeaturelist = []
                # In this part I will try to add all the words in between the disease and causeOrTreatment
                # Line shape mirrors "entities:": two comma-separated ints in braces.
                firstoffset = int(line[11:line.index(",")])
                secondoffset = int(line[(line.index(",")+2):-2])
                beginning = min(firstoffset,secondoffset)
                ending =  max(firstoffset,secondoffset)
                #print(beginning,ending)
                # NOTE(review): offsets are assumed to index into the previously
                # seen "sentence: " line — verify a sentence always precedes
                # its offsets in the input format.
                sentencefeaturelist = sentence[beginning:ending].split(" ")
                #print(sentencefeaturelist)
                #sys.exit(0)
                # Drop the last fragment (the slice usually ends mid-entity).
                sentencefeaturelist.pop()
                # Keep only words longer than 3 chars as features.
                for word in sentencefeaturelist:
                    if (len(word)>3):
                        actualsentencefeaturelist.append(word.encode('utf-8'))
                # Only the first `lim` records grow the unique-word vocabulary.
                if limit < lim:
                    for word in actualsentencefeaturelist:
                        if word not in uniqueWordsDictionary:
                            uniqueWordsDictionary[word]={}
                #print(actualsentencefeaturelist)
                #sys.exit(0)
                pass
                #offset
                #pmcidData.append(line[10:-1])
                #currentLine += 1
            elif line.startswith("relation: "): #the actual relationship
                #feature3, but not processed for this assignment
                relation = line[10:-1].lower().encode('utf-8')
                #if relation not in uniqueWordsDictionary:
                #    uniqueWordsDictionary[relation] = {}
                #pmcidData.append(line[10:-1])
            else:
                # A blank line closes the current record: flush its features.
                if line.startswith("\n") and (newEntry == True):
                    # NOTE(review): actualsentencefeaturelist is only bound by
                    # the "offsets : " branch; a record with no offsets line
                    # before its blank line would raise NameError here —
                    # confirm the input format guarantees it.
                    features = []
                    for word in actualsentencefeaturelist:
                        features.append(word)
                    features = features + [disease, causeOrTreatment, relation, fileType]

                    pmcidFeatureList.append(features)
                    #print(pmcidFeatureList)
                    #sys.exit(0)
                    disease          = ""
                    causeOrTreatment = ""
                    relation         = ""
                    sentence         = ""
                    newEntry = False
                    limit += 1
                else:
                    # Any unrecognised line outside a record boundary is fatal.
                    print("invalid line: %s" %(line))
                    sys.exit(2)

    return (pmcidFeatureList, entryCount, uniqueWordsDictionary)
def printFeatureWithCellValue(numpyRow, featureRow):
    """Print every feature name alongside its cell value in the row,
    followed by the class label stored in the row's final cell."""
    for position, label in enumerate(featureRow):
        print("%s: %d" % (label, numpyRow[position]))
    print("harmfulOrBeneficial: %d" % (numpyRow[-1]))
    return
def main(argv):
    """Build one-hot feature matrices from the two input files, then train
    and evaluate a logistic-regression model and an SVM on them.

    Args:
        argv: sys.argv-style list — argv[1] is the beneficial data file,
            argv[2] the harmful data file.

    Exits with status 2 on bad usage, 0 after evaluation.
    """
    #Python3 training.py beneficial.txt harmful.txt
    if len(argv) != 3:
        print("invalid number of arguments")
        sys.exit(2)

    #two separate lists because don't know how many entries in each, so dividing one list will be difficult
    pmcidBeneficialData = []
    pmcidHarmfulData = []
    uniqueWordsDictionary = {}

    # The hard-coded 10356/9797 limits cap vocabulary intake per file.
    (pmcidBeneficialData, beneficialCount, uniqueWordsDictionary)    = parseTextViaPMCID(argv[1], pmcidBeneficialData, uniqueWordsDictionary,10356)
    (pmcidHarmfulData, harmfulCount, uniqueWordsDictionary)       = parseTextViaPMCID(argv[2], pmcidHarmfulData, uniqueWordsDictionary,9797)
    # NOTE(review): benprec * beneficialCount == 10356 by construction, so the
    # "80Percent" split below is really (limit - 1) — confirm the constants
    # match the record counts of the data files being used.
    benprec = 10356/beneficialCount
    harmprec = 9797/harmfulCount

    # One row of fixed-width byte strings holding the sorted vocabulary.
    uniqueFeaturesArray = numpy.empty(shape = (1, len(uniqueWordsDictionary)), dtype="S128")
    #place the dictionary words into the array
    for index, feature in enumerate(uniqueWordsDictionary):
        uniqueFeaturesArray[0, index] = feature

    # Sorted so numpy.searchsorted can binary-search features below.
    uniqueFeaturesArray.sort()



    #now to create the three individual arrays
    numFeatures = len(uniqueWordsDictionary) + 1  #plus 1 for harmful or beneficial

    #from 20 to 80%:
    #beneficial60Percent = int(beneficialCount * 0.6)
    beneficial80Percent = int(beneficialCount * benprec)-1
    beneficial20Percent = int(beneficialCount - beneficial80Percent)

    #harmful60Percent    = int(harmfulCount * 0.6)
    harmful80Percent    = int(harmfulCount * harmprec)-1
    harmful20Percent    = int(harmfulCount - harmful80Percent)

    #shape = (rows, columns)
    # NOTE(review): numpy.empty leaves these arrays uninitialized; cells that
    # are never explicitly set to 1 contain garbage, not 0.  numpy.zeros
    # would guarantee the "remains 0" comments below — verify intent.
    trainArray = numpy.empty(shape=((beneficial80Percent + harmful80Percent), numFeatures), dtype=numpy.int8) #Default is numpy.float64
    testArray   = numpy.empty(shape=((beneficial20Percent + harmful20Percent), numFeatures), dtype=numpy.int8)
    #

    #training data
    # NOTE(review): `index < 2` one-hot encodes only the FIRST TWO elements of
    # each feature list, which for this parser are in-between words, not the
    # disease/treatment entities appended at the end — confirm intended.
    for entry in range(0, beneficial80Percent):
        #for each entry, find the index of the given feature
        for index, feature in enumerate(pmcidBeneficialData[entry]):
            if index < 2:
                #get the index of the given feature
                featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
                if uniqueFeaturesArray[0][featureColumn] == feature:
                    trainArray[entry, featureColumn] = 1
                else: print("trainArray: incorrect beneficial feature match"); sys.exit(0)
            else: break
        # Last column is the class label: 1 = beneficial.
        trainArray[entry, -1] = 1

    for entry in range(0, harmful80Percent):
        for index, feature in enumerate(pmcidHarmfulData[entry]):
            if index < 2:
                trainingEntry = entry + beneficial80Percent
                #get the index of the given feature
                featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
                #set it to 1
                if uniqueFeaturesArray[0][featureColumn] == feature:
                    trainArray[trainingEntry, featureColumn] = 1
                else: print("trainArray: incorrect harmful feature match"); sys.exit(0)
            else: break #beneficialOrHarmful column remains 0


    #test data
    # Test rows use a linear scan instead of searchsorted and silently skip
    # unseen features rather than exiting.
    for entry in range(0, beneficial20Percent):
        dataEntry = entry + beneficial80Percent #finding next beneficial entry, starting from 60% until 80%
        for index, feature in enumerate(pmcidBeneficialData[dataEntry]):
            if index < 2:
                for featureColumn in range(0,len(uniqueFeaturesArray[0])):
                    if uniqueFeaturesArray[0][featureColumn] == feature:
                        testArray[entry, featureColumn] = 1
            else: break #index == 3 and the column should remain 0
        testArray[entry, -1] = 1
    for entry in range(0, harmful20Percent):
        dataEntry = entry + harmful80Percent # finding the next harmful entry starting from 60% until 80%
        for index, feature in enumerate(pmcidHarmfulData[dataEntry]):
            if index < 2:
                devEntry  = entry + beneficial20Percent #because the prior data entered ended with beneficial20Percent
                for featureColumn in range(0,len(uniqueFeaturesArray[0])):
                    if uniqueFeaturesArray[0][featureColumn] == feature:
                        testArray[devEntry, featureColumn] = 1
            else: break #index == 3 and column should remain 0



    ###########################################CLASSIFICATION SECTION#############################################################

    #Here we set up our list for support vectors and our  list for classes.
    #We will setup lists to hold our support vectors our classes.
    supportVectorsL = []
    classesListL = []

    # Split each training row into features (all but last column) and label.
    for row in trainArray:
        y1 = row[len(row)-1]
        supportVectorsL.append(row[:-1])
        classesListL.append(y1)
    #Here we initialize our Linear classifier
    supportVectors = numpy.asarray(supportVectorsL)
    classesList = numpy.asarray(classesListL)
    #Here we try out the linear regresion stuff
    classifier = linear_model.LogisticRegression()
    classifier.fit(supportVectors,classesList)
    ############Test our sets through our logisitc model##################
    print("--------------------LOGISTIC------------------------")
    logistic(classifier,testArray,"TEST")

    print("--------------------SVM------------------------")
    #Here we set up the svm
    classifier = svm.SVC()
    classifier.fit(supportVectors,classesList)
    # NOTE(review): setting kernel AFTER fit() does not retrain the model;
    # the fitted SVC still uses its construction-time (rbf) kernel — confirm
    # whether svm.SVC(kernel="linear") before fit was intended.
    classifier.kernel="linear"
    ############Test our sets through our SVM model##################
    SVC(classifier,testArray,"TEST")




    sys.exit(0)
def SVC(classifier, testArray,t):
    """Predict every test row with the fitted SVM and report accuracies.

    Rows whose feature portion contains no 1 never matched a known
    feature, so they get the sentinel prediction -1 instead of being sent
    to the classifier.  Prints overall accuracy plus accuracy restricted
    to rows with exactly one and exactly two 1-cells.
    """
    predictions = []
    for row in testArray:
        featureVector = row[:-1]
        if 1 not in featureVector:
            predictions.append(-1)
            continue
        # classifier.predict expects a 2-D input: wrap the single sample.
        outcome = classifier.predict([featureVector])
        predictions.append(int(outcome[0]))
    totalAccuray(testArray, predictions, t)
    featAccuracy(testArray, predictions, t, 1)
    featAccuracy(testArray, predictions, t, 2)
def logistic(classifier, testArray,t):
    """Predict every test row with the fitted logistic-regression model
    and report accuracies.

    A row with an all-zero feature portion is assigned -1 without calling
    the classifier.  Prints overall accuracy and accuracy restricted to
    rows having exactly one and exactly two 1-cells.
    """
    predicted = []
    for row in testArray:
        vector = row[:-1]
        label = -1
        if 1 in vector:
            # Wrap the sample so predict() receives a 2-D array.
            label = int(classifier.predict([vector])[0])
        predicted.append(label)
    totalAccuray(testArray, predicted, t)
    featAccuracy(testArray, predicted, t, 1)
    featAccuracy(testArray, predicted, t, 2)
def totalAccuray(testArray,testpredictionarray,t):
    """Print the overall prediction accuracy over the whole test set.

    Args:
        testArray: rows whose LAST element is the true class label.
        testpredictionarray: predicted labels aligned with testArray
            (-1 marks rows that were never sent to a classifier).
        t: label prefix for the printed line (e.g. "TEST").
    """
    # Guard the empty set instead of raising ZeroDivisionError, mirroring
    # featAccuracy's handling of an undefined ratio.
    if len(testArray) == 0:
        print(t+" set accuracy = UNDEFINED")
        return
    testcounter = 0
    #here we test for accuracy in the test set results.
    for x in range(0,len(testArray)):
        t1 = int(testArray[x][len(testArray[x])-1])
        if t1 == testpredictionarray[x]:
            testcounter = testcounter + 1
    accuracy = testcounter/len(testArray)
    print(t+" set accuracy = " + str(accuracy))
def featAccuracy(testArray,testpredictionarray,t,y):
    """Print prediction accuracy restricted to rows containing exactly
    y cells equal to 1 (the true label in the last column counts toward
    that total).  Prints UNDEFINED when no row qualifies."""
    eligible = 0
    correct = 0
    for row, predicted in zip(testArray, testpredictionarray):
        if list(row).count(1) != y:
            continue
        eligible += 1
        if int(row[len(row)-1]) == predicted:
            correct += 1
    if eligible == 0:
        # Ratio is undefined when no row has exactly y ones.
        print(t+" set accuracy for only "+str(y)+" feature vectors = UNDEFINED")
        return
    accuracy = correct/eligible
    print(t+" set accuracy for only "+str(y)+" feature vectors = " + str(accuracy))
# Run the pipeline only when executed as a script, so importing this module
# (e.g. for testing) does not trigger argument parsing and training.
if __name__ == "__main__":
    main(sys.argv)