# MedicalRelationExtractor/unibiased.py
1
#AUTHOR: RAHUL VERMA and SPIRO RAZIS
2
import sys
3
import re
4
import pprint
5
import numpy
6
from sklearn import svm
7
from sklearn import linear_model
8
import time
9
from random import shuffle
10
11
# Record the script start time so total runtime can be reported if needed.
start_time = time.time()

# Print full numpy arrays without truncation.  Modern numpy rejects a
# non-integer threshold (numpy.nan raised "threshold must be numeric"),
# so use the documented idiom: threshold=sys.maxsize.
numpy.set_printoptions(threshold=sys.maxsize)
15
def parseTextViaPMCID(textFile, pmcidFeatureList, uniqueWordsDictionary, lim):
    """Parse up to `lim` labeled records from a beneficial/harmful corpus file.

    Each record contributes [disease, causeOrTreatment, relation, fileType]
    (all UTF-8 bytes) to pmcidFeatureList, and both entity phrases are
    registered as keys of uniqueWordsDictionary.

    Returns (pmcidFeatureList, number of pmcid headers seen,
    uniqueWordsDictionary).  Exits with status 2 on an unrecognized file
    name or an unparseable line.
    """
    # The class label applied to every record comes from the file name.
    if textFile.startswith("beneficial"):
        fileType = "beneficial".encode('utf-8')
    elif textFile.startswith("harmful"):
        fileType = "harmful".encode('utf-8')
    else:
        sys.exit(2)  # invalid file name

    parsed = 0                  # completed records so far, capped at lim
    entryCount = 0              # "pmcid   : " headers encountered
    disease = ""
    causeOrTreatment = ""
    relation = ""
    inEntry = False             # True between a pmcid header and its blank line

    with open(textFile, "r") as corpus:
        for line in corpus:
            if parsed >= lim:
                break
            if line.startswith("pmcid   : "):          # record header
                entryCount += 1
                inEntry = True
            elif line.startswith("sentence: ") or line.startswith("offsets : "):
                # Sentence text and entity offsets are not used by this pass.
                continue
            elif line.startswith("entities: "):
                # NOTE(review): slicing starts at 11 here (vs 10 for
                # "relation: ") — presumably the entities line carries an
                # extra leading space; confirm against the corpus format.
                comma = line.index(",")
                disease = line[11:comma].lower().encode('utf-8')
                causeOrTreatment = line[(comma + 2):-2].lower().encode('utf-8')
                # Register both entity phrases as unique words/phrases.
                uniqueWordsDictionary.setdefault(disease, {})
                uniqueWordsDictionary.setdefault(causeOrTreatment, {})
            elif line.startswith("relation: "):
                relation = line[10:-1].lower().encode('utf-8')
            elif line.startswith("\n") and inEntry:
                # Blank line terminates the record: flush accumulated fields.
                pmcidFeatureList.append([disease, causeOrTreatment, relation, fileType])
                disease = ""
                causeOrTreatment = ""
                relation = ""
                inEntry = False
                parsed += 1
            else:
                print("invalid line: %s" % (line))
                sys.exit(2)

    return (pmcidFeatureList, entryCount, uniqueWordsDictionary)
71
72
73
def printFeatureWithCellValue(numpyRow, featureRow):
    """Print every feature name with its cell value, then the class label."""
    position = 0
    for name in featureRow:
        print("%s: %d" % (name, numpyRow[position]))
        position += 1
    # The final cell of the row holds the harmful/beneficial label.
    print("harmfulOrBeneficial: %d" % (numpyRow[-1]))
    return
78
79
def printFeaturesWithValuesEqualOne(numpyRow, featureRow):
    """Print only the features whose cell value is exactly 1, then the label."""
    for position, name in enumerate(featureRow):
        value = numpyRow[position]
        if value == 1:
            print("%s: %d" % (name, value))
    # The final cell of the row holds the harmful/beneficial label.
    print("harmfulOrBeneficial: %d" % (numpyRow[-1]))
    return
85
86
87
def parseEntitiesIntoUnigrams(beneficialFile, harmfulFile, beneficialLimit, harmfulLimit):
    """Build the feature vocabulary from the beneficial and harmful corpora.

    Two passes over each file:
      1. For the first beneficialLimit / harmfulLimit "entities:" lines,
         collect every full entity phrase (as bytes) and every unigram that
         occurs inside an entity.
      2. Split every sentence into unigrams; for words NOT appearing inside
         entities, track how many distinct pmcids each word occurs under,
         separately per corpus.  A word seen in more than one document and
         more than twice as often in one corpus than the other becomes a
         "beneficial" or "harmful" discriminative unigram.

    Returns a 9-tuple:
      (entitiesTrainingDictionary, beneficialUnigrams, harmfulUnigrams,
       beneficialEntry, harmfulEntry, beneficialSplitSentences,
       harmfulSplitSentences, beneficialFullEntitiesList,
       harmfulFullEntitiesList)
    where the Entry counts are TOTAL sentence counts per corpus (the
    counters are reset before the second pass and incremented for every
    sentence, not only the limited ones).
    """

    beneficialEntry = 0
    harmfulEntry = 0
    # Keys: full entity phrases (bytes) from both corpora; values unused.
    entitiesTrainingDictionary= {}

    entityUnigramList           = []

    # Per-corpus [disease, causeOrTreatment] byte-string pairs, one per record.
    beneficialFullEntitiesList  = []
    harmfulFullEntitiesList     = []

    sentenceUnigramList         = []

    # Per-corpus tokenized sentences, in file order.
    beneficialSplitSentences    = []
    harmfulSplitSentences       = []

    # Unigrams occurring inside entity phrases (excluded from sentence features).
    entityUnigrams              = {}
    harmfulUnigrams             = {}
    beneficialUnigrams          = {}
    # word -> {"beneficial"/"harmful" -> {"pmcid": {...}, "count": int}}
    sentenceFeatureUnigrams             = {}

    # 1x1 byte-string array used purely as an ASCII filter: assigning a
    # non-ASCII word raises UnicodeEncodeError, which is caught below to
    # skip words that could not be stored in an S128 feature array.
    testArrayForWritingEntries = numpy.empty(shape = (1, 1), dtype = "S128")

    # PASS 1a: unigrams of the training beneficial entities.
    with open(beneficialFile, "r") as openedBeneficialFile:
        for line in openedBeneficialFile:
            if beneficialEntry < beneficialLimit:
                if line.startswith("entities: "):
                    # Individual entities (slices presumably skip the
                    # "entities:  " prefix and trailing ".\n" — TODO confirm
                    # against the corpus format).
                    disease = line[11:line.index(",")].lower().encode('utf-8')
                    causeOrTreatment = line[(line.index(",")+2):-2].lower().encode('utf-8')
                    if disease not in entitiesTrainingDictionary:
                        entitiesTrainingDictionary[disease] = {}
                    if causeOrTreatment not in entitiesTrainingDictionary:
                        entitiesTrainingDictionary[causeOrTreatment] = {}
                    # Unigrams composing the entities.
                    entityUnigramList = re.split("-|, |\. |\/| ", line[11:-2].lower())
                    for entry in entityUnigramList:
                        if (entry != "") and (entry not in entityUnigrams):
                            entityUnigrams[entry] = {}
                    beneficialEntry += 1
            else: break
    # PASS 1b: unigrams of the training harmful entities.
    with open(harmfulFile, "r") as openedHarmfulFile:
        for line in openedHarmfulFile:
            if harmfulEntry < harmfulLimit:
                if line.startswith("entities: "):
                    # Individual entities.
                    disease = line[11:line.index(",")].lower().encode('utf-8')
                    causeOrTreatment = line[(line.index(",")+2):-2].lower().encode('utf-8')
                    if disease not in entitiesTrainingDictionary:
                        entitiesTrainingDictionary[disease] = {}
                    if causeOrTreatment not in entitiesTrainingDictionary:
                        entitiesTrainingDictionary[causeOrTreatment] = {}
                    entityUnigramList = re.split("-|, |\. |\/| ", line[11:-2].lower())
                    for entry in entityUnigramList:
                        if (entry != "")  and (entry not in entityUnigrams):
                            entityUnigrams[entry] = {}
                    harmfulEntry += 1
            else: break

    # PASS 2a: tokenize beneficial sentences; build per-pmcid document
    # frequency for words outside the entity vocabulary.
    beneficialEntry = 0
    mostRecentPMCID = ""
    with open(beneficialFile, "r") as openedBeneficialFile:
        for line in openedBeneficialFile:
            if line.startswith("pmcid   : "): #it's the pmcid line
                mostRecentPMCID = line[11:-1]
            elif line.startswith("sentence: "):
                sentenceUnigramList = re.split("\—|\-|\, |\.|\/|\(|\)|\'|\"|\[|\]|\ |\“|\”|\,|\d|\<|\>|\:|\$|\%|\*|\′", line[10:-2].lower())
                beneficialSplitSentences.append(sentenceUnigramList)

                # Only sentences within the training limit contribute to
                # the discriminative-unigram statistics.
                if beneficialEntry < beneficialLimit: 
                    for word in sentenceUnigramList:
                        if (word != "") and (word not in entityUnigrams):
                            if word not in sentenceFeatureUnigrams:
                                try: 
                                    # ASCII filter: raises UnicodeEncodeError
                                    # for words unrepresentable as S128 bytes.
                                    testArrayForWritingEntries[0,0] = word
                                    sentenceFeatureUnigrams[word] = {}
                                    sentenceFeatureUnigrams[word]["beneficial"] = {}
                                    sentenceFeatureUnigrams[word]["beneficial"]["pmcid"] = {}
                                    sentenceFeatureUnigrams[word]["beneficial"]["pmcid"][mostRecentPMCID] = {}
                                    sentenceFeatureUnigrams[word]["beneficial"]["count"] = 0

                                    sentenceFeatureUnigrams[word]["harmful"] = {}
                                    sentenceFeatureUnigrams[word]["harmful"]["pmcid"] = {}
                                    sentenceFeatureUnigrams[word]["harmful"]["count"] = 0
                                except UnicodeEncodeError: pass 
                            else: # already a feature word: record this document
                                if mostRecentPMCID not in sentenceFeatureUnigrams[word]["beneficial"]["pmcid"]: #and the same pmcid isn't already there
                                    sentenceFeatureUnigrams[word]["beneficial"]["pmcid"][mostRecentPMCID] = {}

                beneficialEntry += 1
            elif line.startswith("entities: "):
                # Individual entities, kept in file order to align with
                # beneficialSplitSentences.
                disease = line[11:line.index(",")].lower().encode('utf-8')
                causeOrTreatment = line[(line.index(",")+2):-2].lower().encode('utf-8')

                beneficialFullEntitiesList.append([disease, causeOrTreatment]) 

            else: pass

    # PASS 2b: same for the harmful corpus.
    harmfulEntry = 0
    mostRecentPMCID = ""
    with open(harmfulFile, "r") as openedHarmfulFile:
        for line in openedHarmfulFile:
            if line.startswith("pmcid   : "): #it's the pmcid line
                mostRecentPMCID = line[11:-1]                
            elif line.startswith("sentence: "):
                sentenceUnigramList = re.split("\—|\-|\, |\.|\/|\(|\)|\'|\"|\[|\]|\ |\“|\”|\,|\d|\<|\>|\:|\$|\%|\*|\′", line[10:-2].lower())
                harmfulSplitSentences.append(sentenceUnigramList)
                if harmfulEntry < harmfulLimit: 
                    for word in sentenceUnigramList:
                        if (word != "") and (word not in entityUnigrams):
                            if word not in sentenceFeatureUnigrams:
                                try:
                                    # ASCII filter (see above).
                                    testArrayForWritingEntries[0,0] = word
                                    sentenceFeatureUnigrams[word] = {}
                                    sentenceFeatureUnigrams[word]["beneficial"] = {}
                                    sentenceFeatureUnigrams[word]["beneficial"]["pmcid"] = {}
                                    sentenceFeatureUnigrams[word]["beneficial"]["count"] = 0

                                    sentenceFeatureUnigrams[word]["harmful"] = {}
                                    sentenceFeatureUnigrams[word]["harmful"]["pmcid"] = {}
                                    sentenceFeatureUnigrams[word]["harmful"]["pmcid"][mostRecentPMCID] = {}
                                    sentenceFeatureUnigrams[word]["harmful"]["count"] = 0
                                except UnicodeEncodeError: pass 
                            else:
                                if mostRecentPMCID not in sentenceFeatureUnigrams[word]["harmful"]["pmcid"]: #and the same pmcid isn't already there
                                    sentenceFeatureUnigrams[word]["harmful"]["pmcid"][mostRecentPMCID] = {}
                harmfulEntry += 1
            elif line.startswith("entities: "):
                disease = line[11:line.index(",")].lower().encode('utf-8')
                causeOrTreatment = line[(line.index(",")+2):-2].lower().encode('utf-8')

                harmfulFullEntitiesList.append([disease, causeOrTreatment])

            else: pass

    # Turn the per-pmcid sets into document-frequency counts, then keep the
    # words that lean strongly (more than 2x) toward one corpus.
    for word in sentenceFeatureUnigrams:
        for benefitHarmfulOrEntity in sentenceFeatureUnigrams[word]:
            # Count distinct pmcids per corpus.
            for pmcid in sentenceFeatureUnigrams[word][benefitHarmfulOrEntity]["pmcid"]:
                sentenceFeatureUnigrams[word][benefitHarmfulOrEntity]["count"] += 1

        if (sentenceFeatureUnigrams[word]["beneficial"]["count"] > 1) or (sentenceFeatureUnigrams[word]["harmful"]["count"] > 1):
            if sentenceFeatureUnigrams[word]["beneficial"]["count"] > (2*sentenceFeatureUnigrams[word]["harmful"]["count"]):
                beneficialUnigrams[word] = {}
            elif sentenceFeatureUnigrams[word]["harmful"]["count"] > (2*sentenceFeatureUnigrams[word]["beneficial"]["count"]):
                harmfulUnigrams[word] = {}
            else: pass #the words can't be categorized one way or the other

    return (entitiesTrainingDictionary, 
            beneficialUnigrams, harmfulUnigrams, 
            beneficialEntry, harmfulEntry, 
            beneficialSplitSentences, harmfulSplitSentences,
            beneficialFullEntitiesList, harmfulFullEntitiesList)
249
250
251
def main(argv):
    """Train and evaluate logistic-regression and SVM relation classifiers.

    Usage: python3 unibiased.py beneficial.txt harmful.txt

    Builds binary feature matrices (entity phrases + discriminative
    sentence unigrams + one trailing label column, beneficial=1/harmful=0),
    fits both classifiers, prints their accuracies, and exits.
    """
    if len(argv) != 3:
        print("invalid number of arguments")
        sys.exit(2)

    # Two separate lists because we don't know how many entries each file
    # has, so dividing one combined list would be difficult.
    (entitiesTrainingDictionary, beneficialUnigrams, harmfulUnigrams,
        beneficialCount, harmfulCount, pmcidBeneficialSentences, pmcidHarmfulSentences,
        beneficialFullEntitiesList, harmfulFullEntitiesList) = parseEntitiesIntoUnigrams(argv[1], argv[2], 10356, 9797)

    # Fraction of each corpus covered by the hard-coded training limits.
    benprec = 10356 / beneficialCount
    harmprec = 9797 / harmfulCount

    # +1 column for the harmful/beneficial label.
    numFeatures = len(entitiesTrainingDictionary) + len(beneficialUnigrams) + len(harmfulUnigrams) + 1
    uniqueFeaturesArray = numpy.empty(shape=(1, numFeatures), dtype="S128")

    # Lay the entity phrases, then the beneficial and harmful unigrams,
    # into the feature-name row.
    for index, feature in enumerate(entitiesTrainingDictionary):
        uniqueFeaturesArray[0, index] = feature

    finalColumn = len(entitiesTrainingDictionary)
    for index, feature in enumerate(beneficialUnigrams):
        uniqueFeaturesArray[0, index + finalColumn] = feature
    finalColumn += len(beneficialUnigrams)

    for index, feature in enumerate(harmfulUnigrams):
        uniqueFeaturesArray[0, index + finalColumn] = feature

    # Sort all feature names (excluding the label slot) so binary search
    # via numpy.searchsorted works below.
    uniqueFeaturesArray[0][:-1].sort()

    beneficial80Percent = int(beneficialCount * benprec) - 1
    beneficial20Percent = int(beneficialCount - beneficial80Percent)
    harmful80Percent    = int(harmfulCount * harmprec) - 1
    harmful20Percent    = int(harmfulCount - harmful80Percent)

    # BUGFIX: these were numpy.empty, leaving garbage in every cell that
    # was never explicitly written — notably the label column of every
    # harmful row and all absent-feature cells.  Zero-initialize so
    # absent features and the harmful label are 0.
    trainArray = numpy.zeros(shape=((beneficial80Percent + harmful80Percent), numFeatures), dtype=numpy.int8)
    testArray  = numpy.zeros(shape=((beneficial20Percent + harmful20Percent), numFeatures), dtype=numpy.int8)

    featureNames = uniqueFeaturesArray[0][:-1]

    def _markFeature(matrix, row, token):
        # Set matrix[row, col] = 1 iff `token` (bytes) is a known feature.
        featureColumn = numpy.searchsorted(featureNames, token)
        if uniqueFeaturesArray[0][featureColumn] == token:
            matrix[row, featureColumn] = 1

    # ---- training rows: beneficial (label 1) then harmful (label 0) ----
    for entry in range(0, beneficial80Percent):
        for word in pmcidBeneficialSentences[entry]:
            _markFeature(trainArray, entry, word.encode("utf-8"))
        for entity in beneficialFullEntitiesList[entry]:
            _markFeature(trainArray, entry, entity)
        trainArray[entry, -1] = 1

    for entry in range(0, harmful80Percent):
        trainingEntry = entry + beneficial80Percent
        for word in pmcidHarmfulSentences[entry]:
            _markFeature(trainArray, trainingEntry, word.encode("utf-8"))
        for entity in harmfulFullEntitiesList[entry]:
            _markFeature(trainArray, trainingEntry, entity)
        # Label column stays 0 (harmful) from zero-initialization.

    # ---- test rows ----
    for entry in range(0, beneficial20Percent):
        dataEntry = entry + beneficial80Percent  # next beneficial entries
        for word in pmcidBeneficialSentences[dataEntry]:
            _markFeature(testArray, entry, word.encode("utf-8"))
        for entity in beneficialFullEntitiesList[dataEntry]:
            _markFeature(testArray, entry, entity)
        testArray[entry, -1] = 1

    for entry in range(0, harmful20Percent):
        dataEntry = entry + harmful80Percent       # next harmful entries
        testEntry = entry + beneficial20Percent    # appended after beneficial
        for word in pmcidHarmfulSentences[dataEntry]:
            # BUGFIX: originally searched on stale `feature` (bytes left
            # over from the enumerate loops above) instead of `word`,
            # which raised AttributeError (bytes has no .encode).
            _markFeature(testArray, testEntry, word.encode("utf-8"))
        for entity in harmfulFullEntitiesList[dataEntry]:
            _markFeature(testArray, testEntry, entity)

    # ----------------------- classification section -----------------------
    # Feature vectors are every column but the last; classes are the label.
    supportVectorsL = []
    classesListL = []
    for row in trainArray:
        supportVectorsL.append(row[:-1])
        classesListL.append(row[len(row) - 1])
    supportVectors = numpy.asarray(supportVectorsL)
    classesList = numpy.asarray(classesListL)

    # Logistic regression.
    classifier = linear_model.LogisticRegression()
    classifier.fit(supportVectors, classesList)
    print("--------------------LOGISTIC------------------------")
    logistic(classifier, testArray, "TEST")

    print("--------------------SVM------------------------")
    classifier = svm.SVC()
    classifier.fit(supportVectors, classesList)
    # NOTE(review): switching the kernel AFTER fit() looks unintended —
    # the model was trained with the default kernel; confirm.
    classifier.kernel = "linear"
    SVC(classifier, testArray, "TEST")

    sys.exit(0)
383
384
def SVC(classifier, testArray, t):
    """Predict each test row with the fitted SVM and report accuracies."""
    predictions = []
    for row in testArray:
        features = row[:-1]
        if 1 not in features:
            # A row with no active feature cannot be classified; use a
            # sentinel label that never matches the truth column.
            predictions.append(-1)
        else:
            predictions.append(int(classifier.predict([features])[0]))
    totalAccuray(testArray, predictions, t)
    featAccuracy(testArray, predictions, t, 1)
    featAccuracy(testArray, predictions, t, 2)
398
399
def logistic(classifier, testArray, t):
    """Predict each test row with the fitted logistic model and report accuracies."""
    predicted = []
    for row in testArray:
        vector = row[:-1]
        # Rows with no active feature get a sentinel label (-1) that can
        # never match the truth column.
        label = int(classifier.predict([vector])[0]) if 1 in vector else -1
        predicted.append(label)
    totalAccuray(testArray, predicted, t)
    featAccuracy(testArray, predicted, t, 1)
    featAccuracy(testArray, predicted, t, 2)
413
414
415
def totalAccuray(testArray, testpredictionarray, t):
    """Print overall accuracy: the fraction of rows whose prediction equals
    the truth label stored in the row's final column."""
    correct = sum(
        1
        for row, predicted in zip(testArray, testpredictionarray)
        if int(row[len(row) - 1]) == predicted
    )
    accuracy = correct / len(testArray)
    print(t + " set accuracy = " + str(accuracy))
425
426
def featAccuracy(testArray, testpredictionarray, t, y):
    """Print accuracy restricted to rows having exactly `y` cells equal to 1
    (the label column counts toward that total); UNDEFINED if no such rows."""
    eligible = 0
    correct = 0
    for position, row in enumerate(testArray):
        if list(row).count(1) != y:
            continue
        eligible += 1
        truth = int(row[len(row) - 1])
        if truth == testpredictionarray[position]:
            correct += 1
    if eligible == 0:
        # No row qualifies: accuracy is undefined rather than an error.
        print(t + " set accuracy for only " + str(y) + " feature vectors = UNDEFINED")
        return
    print(t + " set accuracy for only " + str(y) + " feature vectors = " + str(correct / eligible))
445
446
447
448
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)