[cad161]: edsnlp/pipes/misc/consultation_dates/consultation_dates.py

Download this file

219 lines (177 with data), 7.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
from typing import List, Optional, Union
from loguru import logger
from spacy.tokens import Doc, Span
from edsnlp.core import PipelineProtocol
from edsnlp.pipes.core.matcher.matcher import GenericMatcher
from edsnlp.pipes.misc.dates.factory import DEFAULT_CONFIG, DatesMatcher
from ...base import SpanSetterArg
from . import patterns as consult_regex
class ConsultationDatesMatcher(GenericMatcher):
    '''
    The `eds.consultation_dates` matcher consists of two main parts:

    - A **matcher** which finds mentions of _consultation events_ (more details below)
    - A **date parser** (see the corresponding pipe) that links a date to those events

    Examples
    --------
    !!! note

        The matcher has been built to run on **consultation notes**
        (`CR-CONS` at APHP), so please filter accordingly before proceeding.

    ```python
    import edsnlp, edsnlp.pipes as eds

    nlp = edsnlp.blank("eds")
    nlp.add_pipe(eds.sentences())
    nlp.add_pipe(
        eds.normalizer(
            lowercase=True,
            accents=True,
            quotes=True,
            pollution=False,
        ),
    )
    nlp.add_pipe(eds.consultation_dates())

    text = """
    XXX
    Objet : Compte-Rendu de Consultation du 03/10/2018.
    XXX
    """

    doc = nlp(text)

    doc.spans["consultation_dates"]
    # Out: [Consultation du 03/10/2018]

    doc.spans["consultation_dates"][0]._.consultation_date.to_datetime()
    # Out: 2018-10-03 00:00:00
    ```

    Extensions
    ----------
    The `eds.consultation_dates` pipeline declares one extension on the `Span` object:
    the `consultation_date` attribute, which is a Python `datetime` object.

    Parameters
    ----------
    nlp : PipelineProtocol
        Language pipeline object
    consultation_mention : Union[List[str], bool]
        List of RegEx for consultation mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False

        This list contains terms directly referring to consultations, such as
        "_Consultation du..._" or "_Compte rendu du..._". This list is the only one
        enabled by default since it is fairly precise and not error-prone.
    town_mention : Union[List[str], bool]
        List of RegEx for all AP-HP hospitals' towns mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False

        This list contains the towns of each AP-HP's hospital. Its goal is to fetch
        dates mentioned as "_Paris, le 13 décembre 2015_". It has a high recall but
        poor precision, since those dates can often be dates of letter redaction
        instead of consultation dates.
    document_date_mention : Union[List[str], bool]
        List of RegEx for document date.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False

        This list contains expressions mentioning the date of creation/edition of a
        document, such as "_Date du rapport: 13/12/2015_" or "_Signé le 13/12/2015_".
        Like `town_mention` patterns, it has a high recall but is prone to errors since
        document date and consultation date aren't necessary similar.

    Authors and citation
    --------------------
    The `eds.consultation_dates` pipeline was developed by AP-HP's Data Science team.
    '''

    def __init__(
        self,
        nlp: PipelineProtocol,
        name: Optional[str] = "consultation_dates",
        *,
        consultation_mention: Union[bool, List[str]] = True,
        town_mention: Union[bool, List[str]] = False,
        document_date_mention: Union[bool, List[str]] = False,
        attr: str = "NORM",
        ignore_excluded: bool = False,
        ignore_space_tokens: bool = False,
        label: str = "consultation_date",
        span_setter: SpanSetterArg = {"ents": True, "consultation_dates": True},
    ):
        logger.warning("This pipeline is still in beta")
        logger.warning(
            "This pipeline should ONLY be used on notes "
            "where `note_class_source_value == 'CR-CONS'`"
        )
        logger.warning(
            """This pipeline requires to use the normalizer pipeline with:
        lowercase=True,
        accents=True,
        quotes=True"""
        )

        # If the pipeline already runs a document-wide `dates` pipe, reuse its
        # output (see `process`); otherwise instantiate a standalone matcher.
        if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):
            self.date_matcher = DatesMatcher(nlp, **DEFAULT_CONFIG)
        else:
            self.date_matcher = None

        # Each pattern list can be a bool (use/skip the default patterns)
        # or an explicit list of regexes overriding the default.
        if not consultation_mention:
            consultation_mention = []
        elif consultation_mention is True:
            consultation_mention = consult_regex.consultation_mention

        if not document_date_mention:
            document_date_mention = []
        elif document_date_mention is True:
            document_date_mention = consult_regex.document_date_mention

        if not town_mention:
            town_mention = []
        elif town_mention is True:
            town_mention = consult_regex.town_mention

        regex = dict(
            consultation_mention=consultation_mention,
            town_mention=town_mention,
            document_date_mention=document_date_mention,
        )

        self.label = label

        super().__init__(
            nlp=nlp,
            name=name,
            regex=regex,
            terms=dict(),
            attr=attr,
            ignore_excluded=ignore_excluded,
            ignore_space_tokens=ignore_space_tokens,
            term_matcher="exact",
            term_matcher_config=dict(),
            span_setter=span_setter,
        )

        self.set_extensions()

    def set_extensions(self) -> None:
        """Declare the `consultation_date` extension on `Span` (idempotent)."""
        super().set_extensions()
        if not Span.has_extension(self.label):
            Span.set_extension(self.label, default=None)

    def process(self, doc: Doc) -> List[Span]:
        """
        Finds entities

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: Doc
            spaCy Doc object with additional
            `doc.spans['consultation_dates']` `SpanGroup`
        """
        matches = list(super().process(doc))

        if self.date_matcher is not None:
            # Restrict the standalone date matcher to the sentences that
            # contain a consultation mention, then run it on the doc.
            self.date_matcher.span_getter = lambda d: [m.sent for m in matches]
            dates = [s for s in self.date_matcher.process(doc) if s.label_ == "date"]
            self.date_matcher.span_getter = None
        else:
            # BUGFIX: the original dereferenced `self.date_matcher`
            # unconditionally, raising AttributeError whenever __init__ set it
            # to None (i.e. a document-wide `dates` pipe already ran). In that
            # case, reuse the dates that pipe stored on the doc.
            dates = [s for s in doc.spans.get("dates", []) if s.label_ == "date"]

        for match in matches:
            # Looking for a date
            # - In the same sentence
            # - Not less than 10 tokens AFTER the consultation mention
            matching_dates = [
                date
                for date in dates
                if (
                    (match.sent == date.sent)
                    and (date.start > match.start)
                    and (date.start - match.end <= 10)
                )
            ]

            if matching_dates:
                # We keep the first mention of a date
                kept_date = min(matching_dates, key=lambda d: d.start)
                span = doc[match.start : kept_date.end]
                span.label_ = self.label
                span._.consultation_date = kept_date._.date

                yield span