Diff of /loader/load.py [000000] .. [5e0db2]

Switch to unified view

a b/loader/load.py
1
"""Define report loader class."""
2
import warnings
3
import re
4
import bioc
5
import pandas as pd
6
from negbio.pipeline import text2bioc, ssplit, section_split
7
8
from constants import *
9
10
11
class Loader(object):
    """Load free-text reports from a CSV file into a BioC collection.

    After :meth:`load` has run, ``self.reports`` holds the raw values read
    from the CSV and ``self.collection`` holds one sentence-split BioC
    document per report, added in the same order and identified by the
    report's position (``"0"``, ``"1"``, ...).
    """

    # Patterns used by ``clean``; compiled once rather than on every report.
    _AND_OR = re.compile(r'and/or')
    _LETTER_SLASH = re.compile(r'(?<=[a-zA-Z])/(?=[a-zA-Z])')
    # A period followed by one or more further periods, optionally separated
    # by whitespace. Matching the whole run at once collapses it completely,
    # no matter how many periods it contains.
    _PERIOD_RUN = re.compile(r'\.(?:\s*\.)+')

    def __init__(self, reports_path, sections_to_extract, extract_strict):
        """Store the configuration and build the helpers used while loading.

        Args:
            reports_path: Location of the reports CSV; handed to
                ``pandas.read_csv`` unchanged.
            sections_to_extract: Container of section titles to keep. A falsy
                value disables section extraction altogether.
            extract_strict: When truthy, a report containing none of the
                requested sections is reduced to a single empty passage
                instead of being passed through with a warning.
        """
        self.reports_path = reports_path
        self.sections_to_extract = sections_to_extract
        self.extract_strict = extract_strict
        # Translation table mapping "." -> ". " and "," -> ", ".
        self.punctuation_spacer = str.maketrans({key: f"{key} "
                                                 for key in ".,"})
        self.splitter = ssplit.NegBioSSplitter(newline=False)

    def load(self):
        """Read, clean and sentence-split every report.

        The CSV is read without a header row into a single column named
        ``REPORTS``. Results are stored on ``self.reports`` (raw values) and
        ``self.collection`` (BioC collection); nothing is returned.

        Raises:
            AssertionError: If a document does not consist of exactly one
                passage after sentence splitting.
        """
        collection = bioc.BioCCollection()
        reports = pd.read_csv(self.reports_path,
                              header=None,
                              names=[REPORTS])[REPORTS].tolist()

        for i, report in enumerate(reports):
            clean_report = self.clean(report)
            document = text2bioc.text2document(str(i), clean_report)

            if self.sections_to_extract:
                document = self.extract_sections(document)

            split_document = self.splitter.split_doc(document)

            # Raised explicitly (rather than via ``assert``) so the check
            # still runs under ``python -O``; the exception type and message
            # are the same as before.
            if len(split_document.passages) != 1:
                raise AssertionError(
                    'Each document must be given as a single passage.')

            collection.add_document(split_document)

        self.reports = reports
        self.collection = collection

    def extract_sections(self, document):
        """Keep only the passages that follow a requested section title.

        The document is split into sections; every untitled passage that
        directly follows a passage whose ``'title'`` infon is listed in
        ``self.sections_to_extract`` is selected. The selected texts are
        joined with single spaces into one passage that replaces all
        passages of the split document.

        If nothing is selected, the outcome depends on
        ``self.extract_strict``: when truthy, the document is reduced to one
        empty passage at offset 0; otherwise a warning is issued and
        ``document`` is returned.

        Args:
            document: BioC document to extract sections from.

        Returns:
            The split document holding a single extracted passage, or
            ``document`` when nothing matched and strict mode is off.
        """
        split_document = section_split.split_document(document)
        all_passages = split_document.passages

        # Pair each passage with its successor: a title passage selects the
        # body passage right after it, provided that one is not a title too.
        passages = [
            following
            for current, following in zip(all_passages, all_passages[1:])
            if 'title' in current.infons
            and current.infons['title'] in self.sections_to_extract
            and 'title' not in following.infons
        ]

        if not passages and not self.extract_strict:
            # NOTE(review): if section_split.split_document modifies its
            # argument in place, ``document`` is no longer the unsplit
            # original at this point -- confirm against negbio.
            warnings.warn('Loader found document containing none of the '
                          'provided sections to extract. Returning original '
                          'document.')
            return document

        extracted_passages = bioc.BioCPassage()
        if passages:
            extracted_passages.offset = passages[0].offset
            extracted_passages.text = ' '.join(p.text for p in passages)
        else:
            extracted_passages.offset = 0
            extracted_passages.text = ''
        split_document.passages = [extracted_passages]
        return split_document

    def clean(self, report):
        """Normalise the text of one report.

        Steps, in order: lower-case; rewrite ``and/or`` as ``or``; rewrite a
        slash between two ASCII letters as `` or ``; insert a space after
        every period and comma; collapse all whitespace runs to single
        spaces (also trimming both ends); collapse every run of periods,
        with or without whitespace between them, to a single period.

        Args:
            report: Report text. Missing values (``None``/NaN, e.g. an empty
                CSV cell) are treated as an empty report and other
                non-string values are converted with ``str`` instead of
                failing on ``.lower()``.

        Returns:
            The cleaned report as a string.
        """
        if not isinstance(report, str):
            report = '' if pd.isna(report) else str(report)

        text = report.lower()
        # Change `and/or` to `or`.
        text = self._AND_OR.sub('or', text)
        # Change any `XXX/YYY` to `XXX or YYY`.
        text = self._LETTER_SLASH.sub(' or ', text)
        # Insert space after commas and periods.
        text = text.translate(self.punctuation_spacer)
        # Convert any multi white spaces to single white spaces.
        text = ' '.join(text.split())
        # Remove empty sentences. Doubled periods need no separate step:
        # after the spacing above they appear as ". ." and are collapsed
        # here together with longer runs.
        return self._PERIOD_RUN.sub('.', text)