|
a |
|
b/loader/load.py |
|
|
1 |
"""Define report loader class.""" |
|
|
2 |
import warnings |
|
|
3 |
import re |
|
|
4 |
import bioc |
|
|
5 |
import pandas as pd |
|
|
6 |
from negbio.pipeline import text2bioc, ssplit, section_split |
|
|
7 |
|
|
|
8 |
from constants import * |
|
|
9 |
|
|
|
10 |
|
|
|
11 |
class Loader(object):
    """Load radiology reports from a CSV file into a BioC collection.

    Each report is cleaned (see `clean`), wrapped in a BioC document,
    optionally reduced to the requested sections (see `extract_sections`)
    and sentence-split before being added to the collection.

    Attributes set by `__init__`:
        reports_path: path (or buffer) handed to `pandas.read_csv`.
        sections_to_extract: container of section titles to keep; a falsy
            value disables section extraction entirely.
        extract_strict: when true, a document containing none of the
            requested sections is replaced by a single empty passage
            instead of being returned unchanged.
        punctuation_spacer: `str.translate` table appending a space after
            every period and comma.
        splitter: sentence splitter applied to every document.

    Attributes set by `load`:
        reports: list of the raw report strings, in file order.
        collection: `bioc.BioCCollection` holding one document per report.
    """

    # Patterns used by `clean`, compiled once when the class is created.
    _AND_OR = re.compile(r'and/or')
    _LETTER_SLASH = re.compile(r'(?<=[a-zA-Z])/(?=[a-zA-Z])')
    # A period followed by one or more whitespace-separated periods, i.e. a
    # run of empty sentences of any length.
    _EMPTY_SENTENCES = re.compile(r'\.(?:\s+\.)+')

    def __init__(self, reports_path, sections_to_extract, extract_strict):
        """Store the configuration and build the reusable helpers."""
        self.reports_path = reports_path
        self.sections_to_extract = sections_to_extract
        self.extract_strict = extract_strict
        self.punctuation_spacer = str.maketrans({key: f"{key} "
                                                 for key in ".,"})
        self.splitter = ssplit.NegBioSSplitter(newline=False)

    def load(self):
        """Load and clean the reports.

        Reads a header-less, single-column CSV from `self.reports_path`,
        then stores the raw reports in `self.reports` and the processed
        documents in `self.collection`.

        Raises:
            AssertionError: if a processed document does not consist of
                exactly one passage.
        """
        collection = bioc.BioCCollection()
        # dtype=str / keep_default_na=False keep every cell as text: without
        # them pandas turns empty or NA-like cells ("NA", "null", ...) into
        # float NaN and numeric cells into numbers, and `clean` then fails
        # on `.lower()`.
        reports = pd.read_csv(self.reports_path,
                              header=None,
                              names=[REPORTS],
                              dtype=str,
                              keep_default_na=False)[REPORTS].tolist()

        for i, report in enumerate(reports):
            clean_report = self.clean(report)
            # The row index (as a string) is passed as the first argument,
            # presumably the document id.
            document = text2bioc.text2document(str(i), clean_report)

            if self.sections_to_extract:
                document = self.extract_sections(document)

            split_document = self.splitter.split_doc(document)

            # Raised explicitly (rather than via an `assert` statement) so
            # the check still runs under `python -O`; the exception type is
            # unchanged.
            if len(split_document.passages) != 1:
                raise AssertionError(
                    'Each document must be given as a single passage.')

            collection.add_document(split_document)

        self.reports = reports
        self.collection = collection

    def extract_sections(self, document):
        """Keep only the text of the sections named in `sections_to_extract`.

        The document is split into passages with
        `section_split.split_document`. For every passage whose `title`
        infon is one of the requested sections, the passage immediately
        following it is kept, provided that passage is not itself a title.

        Returns:
            The split document with its passages replaced by one merged
            passage when at least one section was found, or when
            `extract_strict` is set (in which case the merged passage may
            be empty). Otherwise a warning is issued and `document` is
            returned.
        """
        split_document = section_split.split_document(document)
        all_passages = split_document.passages

        # Pair each passage with its successor; a trailing title has no
        # successor and is therefore never selected.
        passages = [
            following
            for current, following in zip(all_passages, all_passages[1:])
            if 'title' in current.infons
            and current.infons['title'] in self.sections_to_extract
            and 'title' not in following.infons
        ]

        if not passages and not self.extract_strict:
            warnings.warn('Loader found document containing none of the ' +
                          'provided sections to extract. Returning original ' +
                          'document.')
            return document

        extracted_passages = bioc.BioCPassage()
        if passages:
            extracted_passages.offset = passages[0].offset
            extracted_passages.text = ' '.join(p.text for p in passages)
        else:
            # Strict mode with nothing found: emit an empty passage.
            extracted_passages.offset = 0
            extracted_passages.text = ''
        split_document.passages = [extracted_passages]
        return split_document

    def clean(self, report):
        """Clean the report text.

        Steps, in order: lowercase; `and/or` -> `or`; a slash between two
        letters -> ` or `; `..` -> `.`; add a space after every period and
        comma; collapse whitespace runs to single spaces (also trimming the
        ends); collapse runs of empty sentences (`. . .`) to one period.

        Args:
            report: the raw report text (a `str`).

        Returns:
            The cleaned report string.
        """
        lower_report = report.lower()
        # Change `and/or` to `or`.
        corrected_report = self._AND_OR.sub('or', lower_report)
        # Change any `XXX/YYY` to `XXX or YYY`.
        corrected_report = self._LETTER_SLASH.sub(' or ', corrected_report)
        # Clean double periods.
        clean_report = corrected_report.replace("..", ".")
        # Insert space after commas and periods.
        clean_report = clean_report.translate(self.punctuation_spacer)
        # Convert any multi white spaces to single white spaces.
        clean_report = ' '.join(clean_report.split())
        # Remove empty sentences. The whole run is matched at once: a
        # two-period pattern leaves `. .` behind for longer runs because
        # regex matches cannot overlap.
        clean_report = self._EMPTY_SENTENCES.sub('.', clean_report)

        return clean_report