Diff of /loader/load.py [000000] .. [5e0db2]

Switch to side-by-side view

--- a
+++ b/loader/load.py
@@ -0,0 +1,93 @@
+"""Define report loader class."""
+import warnings
+import re
+import bioc
+import pandas as pd
+from negbio.pipeline import text2bioc, ssplit, section_split
+
+from constants import *
+
+
+class Loader(object):
+    """Load reports from a CSV file into a cleaned BioC collection."""
+    def __init__(self, reports_path, sections_to_extract, extract_strict):
+        """Store the settings and build the reusable helpers.
+
+        Args:
+            reports_path: Header-less CSV file that `load` reads.
+            sections_to_extract: Section titles to keep; falsy keeps all text.
+            extract_strict: If true, a report holding none of those sections
+                is reduced to empty text instead of being kept whole.
+        """
+        self.reports_path = reports_path
+        self.sections_to_extract = sections_to_extract
+        self.extract_strict = extract_strict
+        self.punctuation_spacer = str.maketrans({'.': '. ', ',': ', '})
+        self.splitter = ssplit.NegBioSSplitter(newline=False)
+
+    def load(self):
+        """Read, clean and split every report.
+
+        Sets `self.reports` (values as read) and `self.collection`; raises
+        AssertionError if a processed document is not a single passage.
+        """
+        collection = bioc.BioCCollection()
+        reports = pd.read_csv(self.reports_path, header=None,
+                              names=[REPORTS])[REPORTS].tolist()
+        for i, report in enumerate(reports):
+            document = text2bioc.text2document(str(i), self.clean(report))
+            if self.sections_to_extract:
+                document = self.extract_sections(document)
+            split_document = self.splitter.split_doc(document)
+            if len(split_document.passages) != 1:  # not stripped by -O
+                raise AssertionError(
+                    'Each document must be given as a single passage.')
+            collection.add_document(split_document)
+        self.reports = reports
+        self.collection = collection
+
+    def extract_sections(self, document):
+        """Reduce a BioC document to the text of the requested sections.
+
+        A passage is kept when it directly follows a passage titled with one
+        of `self.sections_to_extract` and has no title itself; kept texts are
+        space-joined into one passage. With nothing kept, strict mode yields
+        an empty passage, else a warning is issued and `document` returned.
+        """
+        split_document = section_split.split_document(document)
+        sections = split_document.passages
+        passages = []
+        for i, passage in enumerate(sections):
+            if ('title' in passage.infons and i + 1 < len(sections)
+                    and passage.infons['title'] in self.sections_to_extract
+                    and 'title' not in sections[i + 1].infons):
+                passages.append(sections[i + 1])
+        if not passages and not self.extract_strict:
+            warnings.warn('Loader found document containing none of the '
+                          'provided sections to extract. Returning original '
+                          'document.')
+            return document
+        extracted_passages = bioc.BioCPassage()
+        extracted_passages.offset = passages[0].offset if passages else 0
+        extracted_passages.text = ' '.join(p.text for p in passages)
+        split_document.passages = [extracted_passages]
+        return split_document
+
+    def clean(self, report):
+        """Normalize one report and return the cleaned text.
+
+        Lower-cases, rewrites `and/or` and letter/letter as `or`, puts a
+        space after periods and commas, collapses whitespace and drops empty
+        sentences. None/NaN becomes ''; other non-strings go through `str`.
+        """
+        if not isinstance(report, str):
+            # read_csv can yield NaN (empty cell) or numbers instead of str.
+            report = '' if pd.isna(report) else str(report)
+        text = report.lower().replace('and/or', 'or')
+        # Change any `XXX/YYY` to `XXX or YYY`.
+        text = re.sub(r'(?<=[a-zA-Z])/(?=[a-zA-Z])', ' or ', text)
+        # Clean double periods, then insert a space after commas and periods.
+        text = text.replace('..', '.').translate(self.punctuation_spacer)
+        text = ' '.join(text.split())  # Collapse runs of whitespace.
+        # Remove empty sentences, including longer runs such as ". . .".
+        return re.sub(r'\.(?:\s+\.)+', '.', text)