--- a +++ b/loader/load.py @@ -0,0 +1,93 @@ +"""Define report loader class.""" +import warnings +import re +import bioc +import pandas as pd +from negbio.pipeline import text2bioc, ssplit, section_split + +from constants import * + + +class Loader(object): + """Report impression loader.""" + def __init__(self, reports_path, sections_to_extract, extract_strict): + self.reports_path = reports_path + self.sections_to_extract = sections_to_extract + self.extract_strict = extract_strict + self.punctuation_spacer = str.maketrans({key: f"{key} " + for key in ".,"}) + self.splitter = ssplit.NegBioSSplitter(newline=False) + + def load(self): + """Load and clean the reports.""" + collection = bioc.BioCCollection() + reports = pd.read_csv(self.reports_path, + header=None, + names=[REPORTS])[REPORTS].tolist() + + for i, report in enumerate(reports): + clean_report = self.clean(report) + document = text2bioc.text2document(str(i), clean_report) + + if self.sections_to_extract: + document = self.extract_sections(document) + + split_document = self.splitter.split_doc(document) + + assert len(split_document.passages) == 1,\ + ('Each document must be given as a single passage.') + + collection.add_document(split_document) + + self.reports = reports + self.collection = collection + + def extract_sections(self, document): + """Extract the Impression section from a Bioc Document.""" + split_document = section_split.split_document(document) + passages = [] + for i, passage in enumerate(split_document.passages): + if 'title' in passage.infons: + if (passage.infons['title'] in self.sections_to_extract and + len(split_document.passages) > i+1): + next_passage = split_document.passages[i+1] + if 'title' not in next_passage.infons: + passages.append(next_passage) + + if passages or self.extract_strict: + extracted_passages = bioc.BioCPassage() + if passages: + extracted_passages.offset = passages[0].offset + extracted_passages.text = ' '.join(map(lambda x: x.text, passages)) + else: + extracted_passages.offset = 0 + extracted_passages.text = '' + split_document.passages = [extracted_passages] + return split_document + else: + warnings.warn('Loader found document containing none of the ' + + 'provided sections to extract. Returning original ' + + 'document.') + return document + + def clean(self, report): + """Clean the report text.""" + lower_report = report.lower() + # Change `and/or` to `or`. + corrected_report = re.sub('and/or', + 'or', + lower_report) + # Change any `XXX/YYY` to `XXX or YYY`. + corrected_report = re.sub('(?<=[a-zA-Z])/(?=[a-zA-Z])', + ' or ', + corrected_report) + # Clean double periods + clean_report = corrected_report.replace("..", ".") + # Insert space after commas and periods. + clean_report = clean_report.translate(self.punctuation_spacer) + # Convert any multi white spaces to single white spaces. + clean_report = ' '.join(clean_report.split()) + # Remove empty sentences + clean_report = re.sub(r'\.\s+\.', '.', clean_report) + + return clean_report