Medical-Relation-Extract / Git / [9397fa] /MedicalRelationExtractor/80_20

Models:
philipB/
Medical-Relation-Extract
Downloads: 1
[9397fa]: / MedicalRelationExtractor / 80_20_split.py
History
Download this file
147 lines (100 with data), 4.6 kB

#AUTHOR: SPIRO RAZIS
import sys
import os
from operator import itemgetter

def file_80_20Splitter(beneficialFile, harmfulFile):
    
    pseorData = [] #pseor: pmcid, sentence, entities, offset, relation
    beneficialData = []
    beneficialEntityPairs = {}
    
    with open(beneficialFile, "r") as openedBeneficialFile:
        for line in openedBeneficialFile:
            if line != "\n":
                if line.startswith("entities: "): #it's the two in a relationship
                    entityPair = line[10:-1].lower().encode('utf-8')
                    if entityPair not in beneficialEntityPairs:
                        beneficialEntityPairs[entityPair] = {}
                        #print(entityPair)
                pseorData.append(line)
            else: 
                beneficialData.append(pseorData)
                pseorData = []
                
    beneficial80Percent = int(len(beneficialEntityPairs) * 0.8)
    beneficialEntityPairs_80 = {}

    for index, pair in enumerate(beneficialEntityPairs):
        if index < beneficial80Percent:
            beneficialEntityPairs_80[pair] = {}

    #["pmcid...", "sentence...", "entities...", "offsets...", "relation..."]

    beneficialData_80Percent = []
    beneficialData_20Percent = []

    for entry in beneficialData:
        if entry[2][10:-1].lower().encode('utf-8') in beneficialEntityPairs_80:
            beneficialData_80Percent.append(entry)
            #print("80 Percent: ")
            #print(entry)
        else: 
            beneficialData_20Percent.append(entry)
            #print("20 Percent:" )
            #print(entry)


    
    beneficialData_80Percent.sort(key=itemgetter(2), reverse=False)
    beneficialData_20Percent.sort(key=itemgetter(2), reverse=False)
    
    try: os.remove("beneficial_80_20_Split.txt")
    except OSError: pass
    
    print("Beneficial Entries: First 80 Percent of Entity Pairs: %d" %(len(beneficialData_80Percent)))
    
    with open("beneficial_80_20_Split.txt", "w") as splitBeneficialOutput:
        for pseorEntry in beneficialData_80Percent:
            for line in pseorEntry:
                splitBeneficialOutput.write(line)
            splitBeneficialOutput.write("\n")
    
        for pseorEntry in beneficialData_20Percent:
            for line in pseorEntry:
                splitBeneficialOutput.write(line)
            splitBeneficialOutput.write("\n")            
    
    
    ##########
    #HARMFUL VERSION
    
    pseorData = [] #pseor: pmcid, sentence, entities, offset, relation
    harmfulData = []
    harmfulEntityPairs = {}
    
    with open(harmfulFile, "r") as openedHarmfulFile:
        for line in openedHarmfulFile:
            if line != "\n":
                if line.startswith("entities: "): #it's the two in a relationship
                    entityPair = line[10:-1].lower().encode('utf-8')
                    if entityPair not in harmfulEntityPairs:
                        harmfulEntityPairs[entityPair] = {}
                pseorData.append(line)
            else: 
                harmfulData.append(pseorData)
                pseorData = []
                
    harmful80Percent = int(len(harmfulEntityPairs) * 0.8)
    harmfulEntityPairs_80 = {}

    for index, pair in enumerate(harmfulEntityPairs):
        if index < harmful80Percent:
            harmfulEntityPairs_80[pair] = {}

    #["pmcid...", "sentence...", "entities...", "offsets...", "relation..."]

    harmfulData_80Percent = []
    harmfulData_20Percent = []

    for entry in harmfulData:
        if entry[2][10:-1].lower().encode('utf-8') in harmfulEntityPairs_80:
            harmfulData_80Percent.append(entry)
        else: 
            harmfulData_20Percent.append(entry)
    
    harmfulData_80Percent.sort(key=itemgetter(2), reverse=False)
    harmfulData_20Percent.sort(key=itemgetter(2), reverse=False)
    
    
    
    try: os.remove("harmful_80_20_Split.txt")
    except OSError: pass
    
    print("Harmful Entries: First 80 Percent of Entity Pairs: %d" %(len(harmfulData_80Percent)))
    
    with open("harmful_80_20_Split.txt", "w") as splitHarmfulOutput:
        for pseorEntry in harmfulData_80Percent:
            for line in pseorEntry:
                splitHarmfulOutput.write(line)
            splitHarmfulOutput.write("\n")
    
        for pseorEntry in harmfulData_20Percent:
            for line in pseorEntry:
                splitHarmfulOutput.write(line)
            splitHarmfulOutput.write("\n")            

    return


def main(argv):

    file_80_20Splitter(argv[1], argv[2])

    sys.exit(0)





main(sys.argv)
#