Medical-Relation-Extract / Git / Diff of /MedicalRelationExtractor/80_20

Models:

philipB/

Medical-Relation-Extract

Downloads: 1

Diff of /MedicalRelationExtractor/80_20_split.py [000000] .. [9397fa]

Switch to unified view

 b/MedicalRelationExtractor/80_20_split.py
+#AUTHOR: SPIRO RAZIS
+import sys
+import os
+from operator import itemgetter
# Prefix that marks the line naming the two related entities of a record.
_ENTITIES_PREFIX = "entities: "


def _entity_pair_key(entities_line):
    """Return the case-insensitive entity-pair key of an ``entities: `` line."""
    return entities_line[len(_ENTITIES_PREFIX):].rstrip("\n").lower()


def _entities_line(record):
    """Return the first ``entities: `` line of *record*, or "" if it has none."""
    for line in record:
        if line.startswith(_ENTITIES_PREFIX):
            return line
    return ""


def _read_pseor_records(path):
    """Read blank-line-separated records from *path*.

    Returns ``(records, ordered_pairs)``: *records* is a list of records, each
    a list of newline-terminated lines; *ordered_pairs* lists every distinct
    lower-cased entity pair in order of first appearance.
    """
    records = []
    ordered_pairs = []
    seen_pairs = set()
    current = []
    with open(path, "r") as handle:
        for line in handle:
            if line == "\n":
                # Runs of blank lines must not create empty records.
                if current:
                    records.append(current)
                    current = []
                continue
            if not line.endswith("\n"):
                # Last line of a file without a trailing newline.
                line += "\n"
            if line.startswith(_ENTITIES_PREFIX):
                pair = _entity_pair_key(line)
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    ordered_pairs.append(pair)
            current.append(line)
    # Keep the final record even when the file lacks a closing blank line.
    if current:
        records.append(current)
    return records, ordered_pairs


def _split_records_by_pair(records, ordered_pairs, fraction=0.8):
    """Partition *records* by whether their pair is in the leading *fraction*.

    The first ``int(len(ordered_pairs) * fraction)`` pairs form the leading
    group. Both returned lists are sorted by their raw ``entities: `` line.
    """
    cutoff = int(len(ordered_pairs) * fraction)
    leading_pairs = set(ordered_pairs[:cutoff])
    leading = []
    trailing = []
    for record in records:
        line = _entities_line(record)
        if line and _entity_pair_key(line) in leading_pairs:
            leading.append(record)
        else:
            trailing.append(record)
    leading.sort(key=_entities_line)
    trailing.sort(key=_entities_line)
    return leading, trailing


def _write_split(path, leading, trailing):
    """Write *leading* then *trailing* records to *path*, blank line after each."""
    # Best-effort removal of a stale output file, as the original script did.
    try:
        os.remove(path)
    except OSError:
        pass
    with open(path, "w") as output:
        for record in leading + trailing:
            output.writelines(record)
            output.write("\n")


def file_80_20Splitter(beneficialFile, harmfulFile):
    """Reorder two relation files so the first 80% of entity pairs come first.

    Each input holds records separated by blank lines (pmcid, sentence,
    entities, offsets, relation). For each file, the distinct entity pairs are
    collected in order of first appearance; records whose pair is among the
    first 80% of those pairs are written first, followed by the remaining
    records. Each group is sorted by its ``entities: `` line.

    Output goes to ``beneficial_80_20_Split.txt`` and
    ``harmful_80_20_Split.txt`` in the current working directory, and the
    number of records in each leading group is printed.

    Args:
        beneficialFile: path of the beneficial-relation input file.
        harmfulFile: path of the harmful-relation input file.

    Returns:
        None.
    """
    jobs = (
        ("Beneficial", beneficialFile, "beneficial_80_20_Split.txt"),
        ("Harmful", harmfulFile, "harmful_80_20_Split.txt"),
    )
    for label, source, target in jobs:
        records, ordered_pairs = _read_pseor_records(source)
        leading, trailing = _split_records_by_pair(records, ordered_pairs)
        print("%s Entries: First 80 Percent of Entity Pairs: %d" % (label, len(leading)))
        _write_split(target, leading, trailing)
    return
def main(argv):
    """Run the 80/20 splitter from a command-line argument vector.

    Args:
        argv: argument list in ``sys.argv`` form; ``argv[1]`` is the
            beneficial-relation file and ``argv[2]`` the harmful-relation file.

    Exits with status 0 after a successful split, or with status 2 and a
    usage message on stderr when fewer than two file paths are supplied.
    """
    if len(argv) < 3:
        program = argv[0] if argv else "80_20_split.py"
        sys.stderr.write("usage: %s <beneficialFile> <harmfulFile>\n" % program)
        sys.exit(2)
    file_80_20Splitter(argv[1], argv[2])
    sys.exit(0)


# Only run when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main(sys.argv)
+#