#AUTHOR: RAHUL VERMA and SPIRO RAZIS
import sys
import pprint
import numpy
from sklearn import svm
from sklearn import linear_model
import time

start_time = time.time()

# Print full arrays without truncation.  numpy.nan is no longer a valid
# threshold (numpy >= 1.14 raises ValueError for non-integer thresholds);
# sys.maxsize is the documented way to disable summarization.
numpy.set_printoptions(threshold=sys.maxsize)
def parseTextViaPMCID(textFile, pmcidFeatureList, uniqueWordsDictionary,lim):
    """Parse one tagged relation-extraction text file and accumulate features.

    The file is a sequence of records, each a run of tagged lines
    ("pmcid   : ", "sentence: ", "entities: ", "offsets : ", "relation: ")
    terminated by a blank line.  For each record, the words lying between
    the two entity offsets (longer than 3 chars) plus the entities, the
    relation and the file-derived class label are appended as one feature
    list to pmcidFeatureList.  Entities and (for the first `lim` records)
    in-between words are registered as keys of uniqueWordsDictionary.

    Args:
        textFile: path to the input file; a "beneficial"/"harmful" name
            prefix determines the class label stored with each record.
        pmcidFeatureList: list mutated in place (and returned).
        uniqueWordsDictionary: dict used as a set of known words/entities,
            mutated in place (and returned).
        lim: cap on how many records contribute their in-between words to
            uniqueWordsDictionary; entities are always added.

    Returns:
        (pmcidFeatureList, entryCount, uniqueWordsDictionary) where
        entryCount is the number of "pmcid   : " lines seen.
    """
    # Class label comes from the file NAME prefix; empty when unrecognised
    # (the original authors chose to continue rather than exit).
    if textFile.startswith("beneficial"):
        #print("beneficial")
        fileType = "beneficial".encode('utf-8')
    elif textFile.startswith("harmful"):
        #print("harmful")
        fileType = "harmful".encode('utf-8')
    else:
        fileType = ""
        #print("invalid file name")
        #sys.exit(2)

    entryCount       = 0
    disease          = ""
    causeOrTreatment = ""
    relation         = ""
    sentence = ""
    newEntry = False
    limit = 0  # records flushed so far; gates the `limit < lim` word intake


    with open(textFile, "r") as openedTextFile:
        for line in openedTextFile:
            if line.startswith("pmcid   : "): #it's the idNumber
                entryCount += 1
                newEntry = True
            elif line.startswith("sentence: "): #it's a sentence
                # Slice off the 10-char tag and the trailing "\n".
                sentence = line[10:-1]
                pass

            elif line.startswith("entities: "): #it's the two in a relationship
                # Line shape: entities: {asthmatic, budesonide}
                # Index 11 skips "entities: {"; -2 drops "}\n".
                disease = line[11:line.index(",")].lower().encode('utf-8')
                causeOrTreatment = line[(line.index(",")+2):-2].lower().encode('utf-8')
                #print("fileType: %s" %fileType)
                #print("disease: %s" %disease)
                #print("causeOrTreatment: %s" %causeOrTreatment)

                #feature 1: 11 to comma
                #feature 2: comma+2 to -2, since -1 is "\n"
                #entities: {asthmatic, budesonide}
                #add disease and cause/treatment to dictionary of unique words/phrases
                if disease not in uniqueWordsDictionary:
                    uniqueWordsDictionary[disease] = {}
                    #uniqueWordsDictionary[disease] = disease
                if causeOrTreatment not in uniqueWordsDictionary:
                    uniqueWordsDictionary[causeOrTreatment] = {}
                    #uniqueWordsDictionary[causeOrTreatment] = causeOrTreatment

            elif line.startswith("offsets : "): #the position of the entities
                actualsentencefeaturelist = []
                # In this part I will try to add all the words in between the disease and causeOrTreatment
                # Line shape mirrors "entities:": two comma-separated ints in braces.
                firstoffset = int(line[11:line.index(",")])
                secondoffset = int(line[(line.index(",")+2):-2])
                beginning = min(firstoffset,secondoffset)
                ending =  max(firstoffset,secondoffset)
                #print(beginning,ending)
                # NOTE(review): offsets are assumed to index into the previously
                # seen "sentence: " line — verify a sentence always precedes
                # its offsets in the input format.
                sentencefeaturelist = sentence[beginning:ending].split(" ")
                #print(sentencefeaturelist)
                #sys.exit(0)
                # Drop the last fragment (the slice usually ends mid-entity).
                sentencefeaturelist.pop()
                # Keep only words longer than 3 chars as features.
                for word in sentencefeaturelist:
                    if (len(word)>3):
                        actualsentencefeaturelist.append(word.encode('utf-8'))
                # Only the first `lim` records grow the unique-word vocabulary.
                if limit < lim:
                    for word in actualsentencefeaturelist:
                        if word not in uniqueWordsDictionary:
                            uniqueWordsDictionary[word]={}
                #print(actualsentencefeaturelist)
                #sys.exit(0)
                pass
                #offset
                #pmcidData.append(line[10:-1])
                #currentLine += 1
            elif line.startswith("relation: "): #the actual relationship
                #feature3, but not processed for this assignment
                relation = line[10:-1].lower().encode('utf-8')
                #if relation not in uniqueWordsDictionary:
                #    uniqueWordsDictionary[relation] = {}
                #pmcidData.append(line[10:-1])
            else:
                # A blank line closes the current record: flush its features.
                if line.startswith("\n") and (newEntry == True):
                    # NOTE(review): actualsentencefeaturelist is only bound by
                    # the "offsets : " branch; a record with no offsets line
                    # before its blank line would raise NameError here —
                    # confirm the input format guarantees it.
                    features = []
                    for word in actualsentencefeaturelist:
                        features.append(word)
                    features = features + [disease, causeOrTreatment, relation, fileType]

                    pmcidFeatureList.append(features)
                    #print(pmcidFeatureList)
                    #sys.exit(0)
                    disease          = ""
                    causeOrTreatment = ""
                    relation         = ""
                    sentence         = ""
                    newEntry = False
                    limit += 1
                else:
                    # Any unrecognised line outside a record boundary is fatal.
                    print("invalid line: %s" %(line))
                    sys.exit(2)

    return (pmcidFeatureList, entryCount, uniqueWordsDictionary)
def printFeatureWithCellValue(numpyRow, featureRow):
    """Print every feature name alongside its cell value in the row,
    followed by the class label stored in the row's final cell."""
    for position, label in enumerate(featureRow):
        print("%s: %d" % (label, numpyRow[position]))
    print("harmfulOrBeneficial: %d" % (numpyRow[-1]))
    return
def main(argv):
    """Build one-hot feature matrices from the two input files, then train
    and evaluate a logistic-regression model and an SVM on them.

    Args:
        argv: sys.argv-style list — argv[1] is the beneficial data file,
            argv[2] the harmful data file.

    Exits with status 2 on bad usage, 0 after evaluation.
    """
    #Python3 training.py beneficial.txt harmful.txt
    if len(argv) != 3:
        print("invalid number of arguments")
        sys.exit(2)

    #two separate lists because don't know how many entries in each, so dividing one list will be difficult
    pmcidBeneficialData = []
    pmcidHarmfulData = []
    uniqueWordsDictionary = {}

    # The hard-coded 10356/9797 limits cap vocabulary intake per file.
    (pmcidBeneficialData, beneficialCount, uniqueWordsDictionary)    = parseTextViaPMCID(argv[1], pmcidBeneficialData, uniqueWordsDictionary,10356)
    (pmcidHarmfulData, harmfulCount, uniqueWordsDictionary)       = parseTextViaPMCID(argv[2], pmcidHarmfulData, uniqueWordsDictionary,9797)
    # NOTE(review): benprec * beneficialCount == 10356 by construction, so the
    # "80Percent" split below is really (limit - 1) — confirm the constants
    # match the record counts of the data files being used.
    benprec = 10356/beneficialCount
    harmprec = 9797/harmfulCount

    # One row of fixed-width byte strings holding the sorted vocabulary.
    uniqueFeaturesArray = numpy.empty(shape = (1, len(uniqueWordsDictionary)), dtype="S128")
    #place the dictionary words into the array
    for index, feature in enumerate(uniqueWordsDictionary):
        uniqueFeaturesArray[0, index] = feature

    # Sorted so numpy.searchsorted can binary-search features below.
    uniqueFeaturesArray.sort()



    #now to create the three individual arrays
    numFeatures = len(uniqueWordsDictionary) + 1  #plus 1 for harmful or beneficial

    #from 20 to 80%:
    #beneficial60Percent = int(beneficialCount * 0.6)
    beneficial80Percent = int(beneficialCount * benprec)-1
    beneficial20Percent = int(beneficialCount - beneficial80Percent)

    #harmful60Percent    = int(harmfulCount * 0.6)
    harmful80Percent    = int(harmfulCount * harmprec)-1
    harmful20Percent    = int(harmfulCount - harmful80Percent)

    #shape = (rows, columns)
    # NOTE(review): numpy.empty leaves these arrays uninitialized; cells that
    # are never explicitly set to 1 contain garbage, not 0.  numpy.zeros
    # would guarantee the "remains 0" comments below — verify intent.
    trainArray = numpy.empty(shape=((beneficial80Percent + harmful80Percent), numFeatures), dtype=numpy.int8) #Default is numpy.float64
    testArray   = numpy.empty(shape=((beneficial20Percent + harmful20Percent), numFeatures), dtype=numpy.int8)
    #

    #training data
    # NOTE(review): `index < 2` one-hot encodes only the FIRST TWO elements of
    # each feature list, which for this parser are in-between words, not the
    # disease/treatment entities appended at the end — confirm intended.
    for entry in range(0, beneficial80Percent):
        #for each entry, find the index of the given feature
        for index, feature in enumerate(pmcidBeneficialData[entry]):
            if index < 2:
                #get the index of the given feature
                featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
                if uniqueFeaturesArray[0][featureColumn] == feature:
                    trainArray[entry, featureColumn] = 1
                else: print("trainArray: incorrect beneficial feature match"); sys.exit(0)
            else: break
        # Last column is the class label: 1 = beneficial.
        trainArray[entry, -1] = 1

    for entry in range(0, harmful80Percent):
        for index, feature in enumerate(pmcidHarmfulData[entry]):
            if index < 2:
                trainingEntry = entry + beneficial80Percent
                #get the index of the given feature
                featureColumn = numpy.searchsorted(uniqueFeaturesArray[0], feature)
                #set it to 1
                if uniqueFeaturesArray[0][featureColumn] == feature:
                    trainArray[trainingEntry, featureColumn] = 1
                else: print("trainArray: incorrect harmful feature match"); sys.exit(0)
            else: break #beneficialOrHarmful column remains 0


    #test data
    # Test rows use a linear scan instead of searchsorted and silently skip
    # unseen features rather than exiting.
    for entry in range(0, beneficial20Percent):
        dataEntry = entry + beneficial80Percent #finding next beneficial entry, starting from 60% until 80%
        for index, feature in enumerate(pmcidBeneficialData[dataEntry]):
            if index < 2:
                for featureColumn in range(0,len(uniqueFeaturesArray[0])):
                    if uniqueFeaturesArray[0][featureColumn] == feature:
                        testArray[entry, featureColumn] = 1
            else: break #index == 3 and the column should remain 0
        testArray[entry, -1] = 1
    for entry in range(0, harmful20Percent):
        dataEntry = entry + harmful80Percent # finding the next harmful entry starting from 60% until 80%
        for index, feature in enumerate(pmcidHarmfulData[dataEntry]):
            if index < 2:
                devEntry  = entry + beneficial20Percent #because the prior data entered ended with beneficial20Percent
                for featureColumn in range(0,len(uniqueFeaturesArray[0])):
                    if uniqueFeaturesArray[0][featureColumn] == feature:
                        testArray[devEntry, featureColumn] = 1
            else: break #index == 3 and column should remain 0



    ###########################################CLASSIFICATION SECTION#############################################################

    #Here we set up our list for support vectors and our  list for classes.
    #We will setup lists to hold our support vectors our classes.
    supportVectorsL = []
    classesListL = []

    # Split each training row into features (all but last column) and label.
    for row in trainArray:
        y1 = row[len(row)-1]
        supportVectorsL.append(row[:-1])
        classesListL.append(y1)
    #Here we initialize our Linear classifier
    supportVectors = numpy.asarray(supportVectorsL)
    classesList = numpy.asarray(classesListL)
    #Here we try out the linear regresion stuff
    classifier = linear_model.LogisticRegression()
    classifier.fit(supportVectors,classesList)
    ############Test our sets through our logisitc model##################
    print("--------------------LOGISTIC------------------------")
    logistic(classifier,testArray,"TEST")

    print("--------------------SVM------------------------")
    #Here we set up the svm
    classifier = svm.SVC()
    classifier.fit(supportVectors,classesList)
    # NOTE(review): setting kernel AFTER fit() does not retrain the model;
    # the fitted SVC still uses its construction-time (rbf) kernel — confirm
    # whether svm.SVC(kernel="linear") before fit was intended.
    classifier.kernel="linear"
    ############Test our sets through our SVM model##################
    SVC(classifier,testArray,"TEST")




    sys.exit(0)
def SVC(classifier, testArray,t):
    """Predict every test row with the fitted SVM and report accuracies.

    Rows whose feature portion contains no 1 never matched a known
    feature, so they get the sentinel prediction -1 instead of being sent
    to the classifier.  Prints overall accuracy plus accuracy restricted
    to rows with exactly one and exactly two 1-cells.
    """
    predictions = []
    for row in testArray:
        featureVector = row[:-1]
        if 1 not in featureVector:
            predictions.append(-1)
            continue
        # classifier.predict expects a 2-D input: wrap the single sample.
        outcome = classifier.predict([featureVector])
        predictions.append(int(outcome[0]))
    totalAccuray(testArray, predictions, t)
    featAccuracy(testArray, predictions, t, 1)
    featAccuracy(testArray, predictions, t, 2)
def logistic(classifier, testArray,t):
    """Predict every test row with the fitted logistic-regression model
    and report accuracies.

    A row with an all-zero feature portion is assigned -1 without calling
    the classifier.  Prints overall accuracy and accuracy restricted to
    rows having exactly one and exactly two 1-cells.
    """
    predicted = []
    for row in testArray:
        vector = row[:-1]
        label = -1
        if 1 in vector:
            # Wrap the sample so predict() receives a 2-D array.
            label = int(classifier.predict([vector])[0])
        predicted.append(label)
    totalAccuray(testArray, predicted, t)
    featAccuracy(testArray, predicted, t, 1)
    featAccuracy(testArray, predicted, t, 2)
def totalAccuray(testArray,testpredictionarray,t):
    """Print the overall prediction accuracy over the whole test set.

    Args:
        testArray: rows whose LAST element is the true class label.
        testpredictionarray: predicted labels aligned with testArray
            (-1 marks rows that were never sent to a classifier).
        t: label prefix for the printed line (e.g. "TEST").
    """
    # Guard the empty set instead of raising ZeroDivisionError, mirroring
    # featAccuracy's handling of an undefined ratio.
    if len(testArray) == 0:
        print(t+" set accuracy = UNDEFINED")
        return
    testcounter = 0
    #here we test for accuracy in the test set results.
    for x in range(0,len(testArray)):
        t1 = int(testArray[x][len(testArray[x])-1])
        if t1 == testpredictionarray[x]:
            testcounter = testcounter + 1
    accuracy = testcounter/len(testArray)
    print(t+" set accuracy = " + str(accuracy))
def featAccuracy(testArray,testpredictionarray,t,y):
    """Print prediction accuracy restricted to rows containing exactly
    y cells equal to 1 (the true label in the last column counts toward
    that total).  Prints UNDEFINED when no row qualifies."""
    eligible = 0
    correct = 0
    for row, predicted in zip(testArray, testpredictionarray):
        if list(row).count(1) != y:
            continue
        eligible += 1
        if int(row[len(row)-1]) == predicted:
            correct += 1
    if eligible == 0:
        # Ratio is undefined when no row has exactly y ones.
        print(t+" set accuracy for only "+str(y)+" feature vectors = UNDEFINED")
        return
    accuracy = correct/eligible
    print(t+" set accuracy for only "+str(y)+" feature vectors = " + str(accuracy))
# Run the pipeline only when executed as a script, so importing this module
# (e.g. for testing) does not trigger argument parsing and training.
if __name__ == "__main__":
    main(sys.argv)