# MedicalRelationExtractor/entitiesbased.py
1
#AUTHOR: RAHUL VERMA and SPIRO RAZIS
2
import sys
3
import pprint
4
import numpy
5
from sklearn import svm
6
from sklearn import linear_model
7
import time
8
9
start_time = time.time()

# BUG FIX: numpy >= 1.22 raises ValueError for non-integer thresholds
# (threshold=numpy.nan only ever worked by accident); sys.maxsize is the
# documented way to disable array summarization and print arrays in full.
numpy.set_printoptions(threshold=sys.maxsize)
13
def parseTextViaPMCID(textFile, pmcidFeatureList, uniqueWordsDictionary,lim):
    """Parse one relation-dump text file and accumulate its entries.

    Each entry in the file is a run of prefixed lines ("pmcid   : ",
    "sentence: ", "entities: ", "offsets : ", "relation: ") terminated by a
    blank line.  For every completed entry, one record
    [disease, causeOrTreatment, relation, fileType] (all utf-8 byte strings)
    is appended to pmcidFeatureList.  The two entity words of the first
    `lim` completed entries are also added as keys of uniqueWordsDictionary.

    The class label fileType is derived from the file NAME prefix
    ("randomBeni"/"b" -> beneficial, otherwise harmful).

    Returns (pmcidFeatureList, entry_count, uniqueWordsDictionary);
    exits the process via sys.exit(2) on any unrecognized line.
    """
    # Label the whole file from its name prefix.
    if textFile.startswith("randomBeni"):
        print("beneficial")
        fileType = "beneficial".encode('utf-8')
    elif textFile.startswith("randomHarm"):
        print("harmful")
        fileType = "harmful".encode('utf-8')
    elif textFile.startswith("b"):
        fileType = "beneficial".encode('utf-8')
    else:
        fileType = "harmful".encode('utf-8')

    flushed = 0          # completed entries so far; gates vocabulary growth
    records = 0          # number of "pmcid" header lines seen
    disease = ""
    partner = ""
    relation = ""
    in_entry = False
    line_no = 0

    with open(textFile, "r") as handle:
        for line in handle:
            line_no += 1
            if line.startswith("pmcid   : "):
                records += 1
                in_entry = True
            elif line.startswith(("sentence: ", "offsets : ")):
                # Sentence text and entity offsets are not used by this
                # entities-only feature extractor.
                pass
            elif line.startswith("entities: "):
                comma = line.index(",")
                # NOTE(review): slice starts at 11 (prefix is 10 chars) and
                # the partner drops its last character via [:-2] -- presumably
                # matched to the data files' exact layout; confirm there.
                disease = line[11:comma].lower().encode('utf-8')
                partner = line[comma + 2:-2].lower().encode('utf-8')
                if disease not in uniqueWordsDictionary and flushed < lim:
                    uniqueWordsDictionary[disease] = {}
                if partner not in uniqueWordsDictionary and flushed < lim:
                    uniqueWordsDictionary[partner] = {}
            elif line.startswith("relation: "):
                relation = line[10:-1].lower().encode('utf-8')
            elif line.startswith("\n") and in_entry:
                # Blank line ends the current entry: flush one record.
                pmcidFeatureList.append([disease, partner, relation, fileType])
                disease = ""
                partner = ""
                relation = ""
                in_entry = False
                flushed += 1
            else:
                print("invalid line: %s" % (line))
                print(textFile, line_no)
                sys.exit(2)

    return (pmcidFeatureList, records, uniqueWordsDictionary)
71
72
73
def printFeatureWithCellValue(numpyRow, featureRow):
    """Print every feature name with its cell value, then the final
    harmful/beneficial label column of the row."""
    for name, cell in zip(featureRow, numpyRow):
        print("%s: %d" % (name, cell))
    print("harmfulOrBeneficial: %d" % (numpyRow[-1]))
    return
78
79
def printFeaturesWithValuesEqualOne(numpyRow, featureRow):
    """Print only the features whose cell value is exactly 1, then the
    final harmful/beneficial label column of the row."""
    for name, cell in zip(featureRow, numpyRow):
        if cell == 1:
            print("%s: %d" % (name, cell))
    print("harmfulOrBeneficial: %d" % (numpyRow[-1]))
    return
85
86
def main(argv):
    """Train and evaluate relation classifiers from two parsed entity files.

    Usage: python3 entitiesbased.py beneficial.txt harmful.txt

    Builds a binary bag-of-entities matrix (one column per unique
    disease/treatment byte string; last column = 1 for beneficial, 0 for
    harmful), trains a logistic-regression model and a linear SVM on the
    bulk of the data, and prints accuracy reports on the held-out rows.
    Exits the process when done.
    """
    #Python3 training.py beneficial.txt harmful.txt
    if len(argv) != 3:
        print("invalid number of arguments")
        sys.exit(2)

    #two separate lists because don't know how many entries in each, so dividing one list will be difficult
    pmcidBeneficialData = []
    pmcidHarmfulData = []
    uniqueWordsDictionary = {}

    # NOTE(review): 10356 / 9797 look like the known entry counts of the two
    # shipped data files (only the first `lim` entries contribute vocabulary)
    # -- confirm against the actual data files.
    (pmcidBeneficialData, beneficialCount, uniqueWordsDictionary)    = parseTextViaPMCID(argv[1], pmcidBeneficialData, uniqueWordsDictionary,10356)
    (pmcidHarmfulData, harmfulCount, uniqueWordsDictionary)       = parseTextViaPMCID(argv[2], pmcidHarmfulData, uniqueWordsDictionary,9797)
    benprec = 10356/beneficialCount
    harmprec = 9797/harmfulCount

    # One row of fixed-width byte strings holding the vocabulary; sorting it
    # enables the numpy.searchsorted lookups below.
    uniqueFeaturesArray = numpy.empty(shape = (1, len(uniqueWordsDictionary)), dtype="S128")
    #place the dictionary words into the array
    for index, feature in enumerate(uniqueWordsDictionary):
        uniqueFeaturesArray[0, index] = feature

    uniqueFeaturesArray.sort()

    #now to create the three individual arrays
    numFeatures = len(uniqueWordsDictionary) + 1  #plus 1 for harmful or beneficial

    #from 20 to 80%:
    #beneficial60Percent = int(beneficialCount * 0.6)
    # NOTE(review): with lim equal to the full file size benprec is ~1, so the
    # "80Percent" split is really (count - 1) training rows and 1 test row;
    # the percent names look stale.
    beneficial80Percent = int(beneficialCount * benprec)-1
    beneficial20Percent = int(beneficialCount - beneficial80Percent)

    #harmful60Percent    = int(harmfulCount * 0.6)
    harmful80Percent    = int(harmfulCount * harmprec)-1
    harmful20Percent    = int(harmfulCount - harmful80Percent)

    #shape = (rows, columns)
    # BUG FIX: these were numpy.empty, i.e. uninitialized memory, yet the
    # loops below deliberately leave untouched cells alone ("column remains
    # 0").  numpy.zeros makes that assumption actually hold.
    trainArray = numpy.zeros(shape=((beneficial80Percent + harmful80Percent), numFeatures), dtype=numpy.int8) #Default is numpy.float64
    testArray   = numpy.zeros(shape=((beneficial20Percent + harmful20Percent), numFeatures), dtype=numpy.int8)

    #training data
    for entry in range(0, beneficial80Percent):
        #for each entry, find the index of the given feature
        for index, feature in enumerate(pmcidBeneficialData[entry]):
            if index < 2:
                #get the index of the given feature (vocabulary is sorted)
                featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
                if uniqueFeaturesArray[0][featureColumn] == feature:
                    trainArray[entry, featureColumn] = 1
                else: print("trainArray: incorrect beneficial feature match"); sys.exit(0)
            else: break
        trainArray[entry, -1] = 1

    for entry in range(0, harmful80Percent):
        for index, feature in enumerate(pmcidHarmfulData[entry]):
            if index < 2:
                trainingEntry = entry + beneficial80Percent
                #get the index of the given feature
                featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
                #set it to 1
                if uniqueFeaturesArray[0][featureColumn] == feature:
                    trainArray[trainingEntry, featureColumn] = 1
                else: print("trainArray: incorrect harmful feature match"); sys.exit(0)
            else: break #beneficialOrHarmful column remains 0

    #test data
    for entry in range(0, beneficial20Percent):
        dataEntry = entry + beneficial80Percent #finding next beneficial entry, starting from 60% until 80%
        for index, feature in enumerate(pmcidBeneficialData[dataEntry]):
            if index < 2:
                for featureColumn in range(0,len(uniqueFeaturesArray[0])):
                    if uniqueFeaturesArray[0][featureColumn] == feature:
                        testArray[entry, featureColumn] = 1
            else: break #index == 3 and the column should remain 0
        testArray[entry, -1] = 1
    for entry in range(0, harmful20Percent):
        dataEntry = entry + harmful80Percent # finding the next harmful entry starting from 60% until 80%
        for index, feature in enumerate(pmcidHarmfulData[dataEntry]):
            if index < 2:
                devEntry  = entry + beneficial20Percent #because the prior data entered ended with beneficial20Percent
                for featureColumn in range(0,len(uniqueFeaturesArray[0])):
                    if uniqueFeaturesArray[0][featureColumn] == feature:
                        testArray[devEntry, featureColumn] = 1
            else: break #index == 3 and column should remain 0

    ###########################################CLASSIFICATION SECTION#############################################################

    #Here we set up our list for support vectors and our  list for classes.
    #We will setup lists to hold our support vectors our classes.
    supportVectorsL = []
    classesListL = []

    for row in trainArray:
        y1 = row[len(row)-1]
        supportVectorsL.append(row[:-1])
        classesListL.append(y1)
    #Here we initialize our Linear classifier
    supportVectors = numpy.asarray(supportVectorsL)
    classesList = numpy.asarray(classesListL)
    #Here we try out the linear regresion stuff
    classifier = linear_model.LogisticRegression()
    classifier.fit(supportVectors,classesList)
    ############Test our sets through our logisitc model##################
    print("--------------------LOGISTIC------------------------")
    logistic(classifier,testArray,"TEST")

    print("--------------------SVM------------------------")
    #Here we set up the svm
    # BUG FIX: the kernel must be chosen BEFORE fit().  The original code fit
    # a default (RBF) model and then reassigned classifier.kernel="linear",
    # leaving predict() inconsistent with the trained support vectors.
    classifier = svm.SVC(kernel="linear")
    classifier.fit(supportVectors,classesList)
    ############Test our sets through our SVM model##################
    SVC(classifier,testArray,"TEST")

    sys.exit(0)
209
210
def SVC(classifier, testArray,t):
    """Evaluate `classifier` on testArray and print accuracy reports.

    This body was a byte-for-byte duplicate of logistic(); it now delegates
    so the predict-and-report logic lives in one place.  Both names are kept
    because main() calls each with a different fitted classifier.
    """
    logistic(classifier, testArray, t)
224
225
def logistic(classifier, testArray,t):
    """Predict every row of testArray with `classifier` and print the
    overall accuracy plus the accuracies restricted to rows holding
    exactly 1 and exactly 2 ones.

    A row whose feature part (everything but the final label column) is
    all zeros gets the sentinel prediction -1, which matches neither
    class label.
    """
    predictions = []
    for row in testArray:
        features = row[:-1]
        if 1 not in features:
            predictions.append(-1)
        else:
            guess = classifier.predict([features])
            predictions.append(int(guess[0]))
    totalAccuray(testArray, predictions, t)
    featAccuracy(testArray, predictions, t, 1)
    featAccuracy(testArray, predictions, t, 2)
239
240
241
def totalAccuray(testArray,testpredictionarray,t):
    """Print the overall accuracy: the fraction of rows whose final
    label column equals the corresponding prediction.

    (Name typo "Accuray" kept -- callers use it.)
    """
    hits = 0
    for x, row in enumerate(testArray):
        if int(row[-1]) == testpredictionarray[x]:
            hits += 1
    accuracy = hits / len(testArray)
    print(t+" set accuracy = " + str(accuracy))
251
252
def featAccuracy(testArray,testpredictionarray,t,y):
    """Print the accuracy restricted to rows containing exactly `y` ones.

    NOTE: the count includes the final label column, so the label value
    itself contributes to `y`.  Prints UNDEFINED when no row qualifies.
    """
    eligible = 0
    correct = 0
    for x, row in enumerate(testArray):
        if list(row).count(1) != y:
            continue
        eligible += 1
        if int(row[-1]) == testpredictionarray[x]:
            correct += 1
    if eligible == 0:
        print(t+" set accuracy for only "+str(y)+" feature vectors = UNDEFINED")
        return
    accuracy = correct / eligible
    print(t+" set accuracy for only "+str(y)+" feature vectors = " + str(accuracy))
271
272
273
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv)