#AUTHOR: RAHUL VERMA and SPIRO RAZIS
import sys
import pprint
import numpy
from sklearn import svm
from sklearn import linear_model
import time
start_time = time.time()
numpy.set_printoptions(threshold=sys.maxsize) #print arrays in full; numpy.nan is no longer accepted as a threshold
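#parseTextViaPMCID: parse one relation-extraction text file into per-entry features.
#Input records are expected to look roughly like this (illustrative sketch, not verbatim data):
#   pmcid : 1234567
#   sentence: ...
#   entities:  <disease>, <cause or treatment>
#   offsets : ...
#   relation: <relation label>
#   <blank line ends the entry>
#Returns the accumulated [disease, causeOrTreatment, relation, fileType] list, the number of
#entries seen, and the dictionary of unique entity strings (new strings are only added while
#fewer than `lim` entries have been completed).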
def parseTextViaPMCID(textFile, pmcidFeatureList, uniqueWordsDictionary,lim):
if textFile.startswith("randomBeni"):
print("beneficial")
fileType = "beneficial".encode('utf-8')
elif textFile.startswith("randomHarm"):
print("harmful")
fileType = "harmful".encode('utf-8')
else:
if textFile.startswith("b"):
fileType = "beneficial".encode('utf-8')
else:
fileType = "harmful".encode('utf-8')
limit = 0
entryCount = 0
disease = ""
causeOrTreatment = ""
relation = ""
newEntry = False
lindex = 0
with open(textFile, "r") as openedTextFile:
for line in openedTextFile:
lindex+=1
if line.startswith("pmcid : "): #it's the idNumber
entryCount += 1
newEntry = True
elif line.startswith("sentence: "): #it's a sentence
pass
elif line.startswith("entities: "): #it's the two in a relationship
disease = line[11:line.index(",")].lower().encode('utf-8')
causeOrTreatment = line[(line.index(",")+2):-2].lower().encode('utf-8')
if disease not in uniqueWordsDictionary:
if limit < lim:
uniqueWordsDictionary[disease] = {}
if causeOrTreatment not in uniqueWordsDictionary:
if limit < lim:
uniqueWordsDictionary[causeOrTreatment] = {}
elif line.startswith("offsets : "): #the position of the entities
pass
elif line.startswith("relation: "): #the actual relationship
relation = line[10:-1].lower().encode('utf-8')
else:
if line.startswith("\n") and (newEntry == True):
pmcidFeatureList.append([disease, causeOrTreatment, relation, fileType])
disease = ""
causeOrTreatment = ""
relation = ""
newEntry = False
limit += 1
else:
print("invalid line: %s" %(line))
print(textFile,lindex)
sys.exit(2)
return (pmcidFeatureList, entryCount, uniqueWordsDictionary)
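#Debugging helpers: print a single one-hot encoded row alongside its feature names.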
def printFeatureWithCellValue(numpyRow, featureRow):
for index, feature in enumerate(featureRow):
print("%s: %d" %(feature, numpyRow[index]))
print("harmfulOrBeneficial: %d" %(numpyRow[-1]))
return
def printFeaturesWithValuesEqualOne(numpyRow, featureRow):
for index, feature in enumerate(featureRow):
if numpyRow[index] == 1:
print("%s: %d" %(feature, numpyRow[index]))
print("harmfulOrBeneficial: %d" %(numpyRow[-1]))
return
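#Pipeline: parse both input files, build one-hot train/test matrices over the unique
#disease/treatment strings, then train and evaluate a logistic regression classifier
#and a linear-kernel SVM.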
def main(argv):
#Usage: python3 training.py beneficial.txt harmful.txt
if len(argv) != 3:
print("invalid number of arguments")
sys.exit(2)
#two separate lists because we don't know how many entries are in each file, so splitting a single list later would be difficult
pmcidBeneficialData = []
pmcidHarmfulData = []
uniqueWordsDictionary = {}
(pmcidBeneficialData, beneficialCount, uniqueWordsDictionary) = parseTextViaPMCID(argv[1], pmcidBeneficialData, uniqueWordsDictionary,10356)
(pmcidHarmfulData, harmfulCount, uniqueWordsDictionary) = parseTextViaPMCID(argv[2], pmcidHarmfulData, uniqueWordsDictionary,9797)
benprec = 10356/beneficialCount #fraction of beneficial entries used for training (cap passed to the parser / total parsed)
harmprec = 9797/harmfulCount #fraction of harmful entries used for training (cap passed to the parser / total parsed)
uniqueFeaturesArray = numpy.empty(shape = (1, len(uniqueWordsDictionary)), dtype="S128")
#place the dictionary words into the array
for index, feature in enumerate(uniqueWordsDictionary):
uniqueFeaturesArray[0, index] = feature
uniqueFeaturesArray.sort()
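#keeping the feature array sorted lets numpy.searchsorted do binary-search lookups when filling the training matrix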
#now to create the three individual arrays
numFeatures = len(uniqueWordsDictionary) + 1 #plus 1 for harmful or beneficial
#train/test split sizes (the commented-out lines are from an earlier 60/40 split):
#beneficial60Percent = int(beneficialCount * 0.6)
beneficial80Percent = int(beneficialCount * benprec)-1
beneficial20Percent = int(beneficialCount - beneficial80Percent)
#harmful60Percent = int(harmfulCount * 0.6)
harmful80Percent = int(harmfulCount * harmprec)-1
harmful20Percent = int(harmfulCount - harmful80Percent)
#shape = (rows, columns)
trainArray = numpy.zeros(shape=((beneficial80Percent + harmful80Percent), numFeatures), dtype=numpy.int8) #zeros, not empty: cells never written below must stay 0
testArray = numpy.zeros(shape=((beneficial20Percent + harmful20Percent), numFeatures), dtype=numpy.int8)
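#Each row is a one-hot encoding: 1 in the column of every matched disease/treatment string,
#with the last column holding the class label (1 = beneficial, 0 = harmful).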
#
#training data
for entry in range(0, beneficial80Percent):
#for each entry, find the index of the given feature
for index, feature in enumerate(pmcidBeneficialData[entry]):
if index < 2:
#get the index of the given feature
featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
if uniqueFeaturesArray[0][featureColumn] == feature:
trainArray[entry, featureColumn] = 1
else: print("trainArray: incorrect beneficial feature match"); sys.exit(0)
else: break
trainArray[entry, -1] = 1
for entry in range(0, harmful80Percent):
for index, feature in enumerate(pmcidHarmfulData[entry]):
if index < 2:
trainingEntry = entry + beneficial80Percent
#get the index of the given feature
featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
#set it to 1
if uniqueFeaturesArray[0][featureColumn] == feature:
trainArray[trainingEntry, featureColumn] = 1
else: print("trainArray: incorrect harmful feature match"); sys.exit(0)
else: break #beneficialOrHarmful column remains 0
#test data
for entry in range(0, beneficial20Percent):
dataEntry = entry + beneficial80Percent #remaining beneficial entries: from the training cutoff to the end
for index, feature in enumerate(pmcidBeneficialData[dataEntry]):
if index < 2:
for featureColumn in range(0,len(uniqueFeaturesArray[0])):
if uniqueFeaturesArray[0][featureColumn] == feature:
testArray[entry, featureColumn] = 1
else: break #index >= 2 (relation/fileType); the label column is set just below
testArray[entry, -1] = 1
for entry in range(0, harmful20Percent):
dataEntry = entry + harmful80Percent #remaining harmful entries: from the training cutoff to the end
for index, feature in enumerate(pmcidHarmfulData[dataEntry]):
if index < 2:
devEntry = entry + beneficial20Percent #because the prior data entered ended with beneficial20Percent
for featureColumn in range(0,len(uniqueFeaturesArray[0])):
if uniqueFeaturesArray[0][featureColumn] == feature:
testArray[devEntry, featureColumn] = 1
else: break #index >= 2 (relation/fileType); label column stays 0 for harmful entries
###########################################CLASSIFICATION SECTION#############################################################
#Set up a list for the support vectors (feature rows) and a list for the class labels.
supportVectorsL = []
classesListL = []
for row in trainArray:
y1 = row[len(row)-1]
supportVectorsL.append(row[:-1])
classesListL.append(y1)
#Convert the lists to numpy arrays for sklearn
supportVectors = numpy.asarray(supportVectorsL)
classesList = numpy.asarray(classesListL)
#Fit a logistic regression classifier
classifier = linear_model.LogisticRegression()
classifier.fit(supportVectors,classesList)
############Test our sets through our logistic model##################
print("--------------------LOGISTIC------------------------")
logistic(classifier,testArray,"TEST")
print("--------------------SVM------------------------")
#Here we set up the SVM with a linear kernel (the kernel must be chosen before fitting)
classifier = svm.SVC(kernel="linear")
classifier.fit(supportVectors,classesList)
############Test our sets through our SVM model##################
SVC(classifier,testArray,"TEST")
sys.exit(0)
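#SVC: run the fitted SVM over each test row; rows with no matched features get a sentinel
#prediction of -1 (always counted as wrong), then report overall and per-feature-count accuracies.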
def SVC(classifier, testArray,t):
testpredictionarray = []
for row in testArray:
predictionvector = row[:-1]
if 1 in predictionvector:
predictionvector = [predictionvector]
prediction = classifier.predict(predictionvector)
pre = int(prediction[0])
else:
pre = -1
testpredictionarray.append(pre)
totalAccuracy(testArray,testpredictionarray,t)
featAccuracy(testArray,testpredictionarray,t,1)
featAccuracy(testArray,testpredictionarray,t,2)
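#logistic: same evaluation loop as SVC above, but using the fitted logistic regression model.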
def logistic(classifier, testArray,t):
testpredictionarray = []
for row in testArray:
predictionvector = row[:-1]
if 1 in predictionvector:
predictionvector = [predictionvector]
prediction = classifier.predict(predictionvector)
pre = int(prediction[0])
else:
pre = -1
testpredictionarray.append(pre)
totalAccuracy(testArray,testpredictionarray,t)
featAccuracy(testArray,testpredictionarray,t,1)
featAccuracy(testArray,testpredictionarray,t,2)
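#totalAccuracy: fraction of test rows whose prediction matches the label column.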
def totalAccuracy(testArray,testpredictionarray,t):
testcounter = 0
#here we test for accuracy in the test set results.
for x in range(0,len(testArray)):
t1= testArray[x][len(testArray[x])-1]
t1 = int(t1)
if t1 == testpredictionarray[x]:
testcounter = testcounter + 1
accuracy= testcounter/len(testArray)
print(t+" set accuracy = " + str(accuracy))
def featAccuracy(testArray,testpredictionarray,t,y):
actual = 0
testcounter = 0
for x in range(0,len(testArray)):
c = list(testArray[x][:-1]).count(1) #count matched feature columns only, excluding the label column
if c == y:
actual+=1
t1= testArray[x][len(testArray[x])-1]
t1 = int(t1)
if t1 == testpredictionarray[x]:
testcounter = testcounter + 1
try:
accuracy= testcounter/actual
except ZeroDivisionError:
print(t+" set accuracy for only "+str(y)+" feature vectors = UNDEFINED")
return
print(t+" set accuracy for only "+str(y)+" feature vectors = " + str(accuracy))
if __name__ == "__main__":
    main(sys.argv)
#