Switch to unified view

a b/MedicalRelationExtractor/80_20_split.py
1
#AUTHOR: SPIRO RAZIS
2
import sys
3
import os
4
from operator import itemgetter
5
6
# Prefix that marks the line naming the two entities in a relationship.
_ENTITIES_PREFIX = "entities: "


def _read_records(path):
    """Read *path* and return its records, each a list of lines.

    Records are separated by lines that are exactly a newline. Empty records
    (from consecutive blank lines) are skipped, and a final record that is
    not followed by a blank line is still returned. Every stored line is
    guaranteed to end with a newline.
    """
    records = []
    current = []
    with open(path, "r") as handle:
        for line in handle:
            if line == "\n":
                if current:
                    records.append(current)
                    current = []
                continue
            # Only the very last line of a file can lack its newline; add it
            # so the record separator written later stays on its own line.
            if not line.endswith("\n"):
                line += "\n"
            current.append(line)
    # Flush the trailing record when the file does not end with a blank line.
    if current:
        records.append(current)
    return records


def _pair_key(entities_line):
    """Return the lower-cased entity pair named on an ``entities: `` line."""
    return entities_line[len(_ENTITIES_PREFIX):].rstrip("\n").lower()


def _entities_line(record):
    """Return the first ``entities: `` line of *record*, or None if absent."""
    for line in record:
        if line.startswith(_ENTITIES_PREFIX):
            return line
    return None


def _split_records(records, fraction=0.8):
    """Split *records* by entity pair into (leading, trailing) lists.

    The distinct entity pairs are taken in order of first appearance; the
    first ``int(count * fraction)`` of them select the leading list, and all
    other records go to the trailing list. Both lists are sorted by their
    ``entities: `` line.
    """
    ordered_pairs = []
    seen = set()
    for record in records:
        for line in record:
            if line.startswith(_ENTITIES_PREFIX):
                key = _pair_key(line)
                if key not in seen:
                    seen.add(key)
                    ordered_pairs.append(key)

    cutoff = int(len(ordered_pairs) * fraction)
    leading_pairs = set(ordered_pairs[:cutoff])

    leading = []
    trailing = []
    for record in records:
        line = _entities_line(record)
        if line is not None and _pair_key(line) in leading_pairs:
            leading.append(record)
        else:
            trailing.append(record)

    def sort_key(record):
        # Records without an entities line sort first instead of crashing.
        return _entities_line(record) or ""

    leading.sort(key=sort_key)
    trailing.sort(key=sort_key)
    return leading, trailing


def _split_file(source_path, output_path, label):
    """Split the records of *source_path* 80/20 and write them to *output_path*.

    The leading (80 percent of entity pairs) records are written first,
    followed by the remaining records; each record is followed by one blank
    line. The number of leading records is printed, tagged with *label*.
    """
    leading, trailing = _split_records(_read_records(source_path))

    # Best-effort removal of a previous output; a missing file is not an error.
    try:
        os.remove(output_path)
    except OSError:
        pass

    print("%s Entries: First 80 Percent of Entity Pairs: %d" % (label, len(leading)))

    with open(output_path, "w") as out:
        for record in leading + trailing:
            out.writelines(record)
            out.write("\n")


def file_80_20Splitter(beneficialFile, harmfulFile):
    """Reorder two relation files so the first 80% of entity pairs come first.

    Each input file holds blank-line-separated records (pmcid, sentence,
    entities, offset, relation). For each file, the distinct entity pairs are
    collected case-insensitively in order of first appearance; records whose
    pair is among the first 80 percent are written first, then all remaining
    records. Within each group the records are sorted by their entities line.

    Args:
        beneficialFile: path of the beneficial-relation input file.
        harmfulFile: path of the harmful-relation input file.

    Writes ``beneficial_80_20_Split.txt`` and ``harmful_80_20_Split.txt`` in
    the current working directory and prints the size of each leading group.
    The beneficial file is fully processed before the harmful file is opened.

    Returns:
        None.
    """
    _split_file(beneficialFile, "beneficial_80_20_Split.txt", "Beneficial")
    _split_file(harmfulFile, "harmful_80_20_Split.txt", "Harmful")
128
129
130
def main(argv):
    """Run the 80/20 splitter from a command-line argument vector.

    Args:
        argv: argument list in ``sys.argv`` form; ``argv[1]`` is the
            beneficial file path and ``argv[2]`` the harmful file path.

    Exits the interpreter with status 0 after a successful split. When fewer
    than two paths are supplied, exits with a usage message (non-zero status)
    instead of failing with an IndexError.
    """
    if len(argv) < 3:
        program = argv[0] if argv else "80_20_split.py"
        sys.exit("usage: %s BENEFICIAL_FILE HARMFUL_FILE" % program)

    file_80_20Splitter(argv[1], argv[2])

    sys.exit(0)
135
136
137
138
139
140
# Script entry point: runs whenever this module is executed (there is no
# ``if __name__ == "__main__"`` guard), passing the raw command-line arguments.
main(sys.argv)
141
#
142
143
144
145
146