[9397fa]: / MedicalRelationExtractor / 80_20_split.py

Download this file

147 lines (100 with data), 4.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#AUTHOR: SPIRO RAZIS
import sys
import os
from operator import itemgetter
# Prefix that marks the line naming the two entities in a relationship.
_ENTITY_PREFIX = "entities: "


def _entityPairKey(entityLine):
    """Return the lower-cased entity pair found after the ``entities: `` prefix."""
    return entityLine[len(_ENTITY_PREFIX):].rstrip("\n").lower()


def _readPseorEntries(path):
    """Read *path* and group its lines into blank-line-separated entries.

    Returns a tuple ``(entries, entityPairs)``:
      * ``entries`` is a list of entries, each a list of the raw lines of
        one record (pseor: pmcid, sentence, entities, offset, relation);
      * ``entityPairs`` lists every distinct lower-cased entity pair in the
        order it was first seen in the file.
    """
    entries = []
    entityPairs = []
    seenPairs = set()
    currentEntry = []
    with open(path, "r") as inputFile:
        for line in inputFile:
            if line == "\n":
                # A blank line closes the current entry. Repeated blank
                # lines must not produce empty entries.
                if currentEntry:
                    entries.append(currentEntry)
                    currentEntry = []
                continue
            if not line.endswith("\n"):
                # Only the final line of a file can lack a newline; add it
                # so the entry is written back out in the same shape as
                # every other entry.
                line += "\n"
            if line.startswith(_ENTITY_PREFIX):
                pairKey = _entityPairKey(line)
                if pairKey not in seenPairs:
                    seenPairs.add(pairKey)
                    entityPairs.append(pairKey)
            currentEntry.append(line)
    # Keep the last entry even when the file does not end with a blank line.
    if currentEntry:
        entries.append(currentEntry)
    return entries, entityPairs


def _write80_20Split(inputPath, outputPath, label):
    """Split the entries of *inputPath* by entity pair and write *outputPath*.

    Entries whose entity pair is among the first 80 percent of distinct
    pairs (first-seen order) are written first, followed by the remaining
    entries. Each group is sorted by its third line (the entities line).
    *label* is only used in the progress message printed to stdout.
    """
    entries, entityPairs = _readPseorEntries(inputPath)
    cutoff = int(len(entityPairs) * 0.8)
    first80Pairs = set(entityPairs[:cutoff])

    data80Percent = []
    data20Percent = []
    for entry in entries:
        # Each entry is expected to be laid out as
        # ["pmcid...", "sentence...", "entities...", "offsets...", "relation..."]
        if _entityPairKey(entry[2]) in first80Pairs:
            data80Percent.append(entry)
        else:
            data20Percent.append(entry)
    data80Percent.sort(key=itemgetter(2))
    data20Percent.sort(key=itemgetter(2))

    # Best effort: start from a fresh output file; a missing file is fine.
    try:
        os.remove(outputPath)
    except OSError:
        pass
    print("%s Entries: First 80 Percent of Entity Pairs: %d" % (label, len(data80Percent)))
    with open(outputPath, "w") as splitOutput:
        for entry in data80Percent + data20Percent:
            splitOutput.writelines(entry)
            splitOutput.write("\n")


def file_80_20Splitter(beneficialFile, harmfulFile):
    """Reorder two relation files into an 80/20 split by entity pair.

    Both inputs are text files of blank-line-separated entries
    (pmcid, sentence, entities, offset, relation). For each input, the
    distinct entity pairs are collected in first-seen order; entries whose
    pair falls in the first 80 percent of those pairs are written first and
    the rest after them, each group sorted by its entities line.

    Output is written to ``beneficial_80_20_Split.txt`` and
    ``harmful_80_20_Split.txt`` in the current working directory, and the
    number of entries in each 80 percent group is printed.

    Raises IndexError if an entry has fewer than three lines, and the usual
    IOError/OSError if a file cannot be opened.
    """
    _write80_20Split(beneficialFile, "beneficial_80_20_Split.txt", "Beneficial")
    _write80_20Split(harmfulFile, "harmful_80_20_Split.txt", "Harmful")
    return
def main(argv):
    """Command-line entry point.

    *argv* is the full argument vector: ``argv[1]`` is the path of the
    beneficial-relations file and ``argv[2]`` the path of the
    harmful-relations file. Exits with status 0 after the split files are
    written, or with status 2 and a usage message on stderr when either
    path is missing.
    """
    if len(argv) < 3:
        programName = argv[0] if argv else "80_20_split.py"
        sys.stderr.write("usage: %s <beneficialFile> <harmfulFile>\n" % programName)
        sys.exit(2)
    file_80_20Splitter(argv[1], argv[2])
    sys.exit(0)
# Run the splitter only when executed as a script, so that importing this
# module does not trigger file I/O or sys.exit().
if __name__ == "__main__":
    main(sys.argv)
#