[cad161]: / tests / pipelines / misc / test_dates.py

Download this file

298 lines (243 with data), 10.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import datetime
import pytest
import spacy
from pytest import fixture
from edsnlp.core import PipelineProtocol
from edsnlp.pipes.misc.dates.models import AbsoluteDate, Relative
from edsnlp.utils.examples import parse_example
# Reference date (2021-09-04 at midnight, timezone-naive) used as the note
# date in the parametrized note-datetime test.
BASE_DATE = datetime.datetime(year=2021, month=9, day=4)
# Annotated sentences consumed by the date-component test. Each ``<ent ...>``
# tag delimits one expected span; its ``norm`` attribute is the expected
# normalized form and the remaining attributes are the expected parsed fields.
examples = [
    # --- absolute and relative dates in running text ---
    (
        "Le patient est venu en <ent norm='2019-??-??' year=2019>2019</ent> "
        "pour une consultation"
    ),
    "Le patient est venu <ent norm='-1 day' direction=past day=1>hier</ent>",
    "le <ent norm='2021-09-04' day=4 month=9 year=2021>04/09/2021</ent>",
    (
        "Il est cas contact "
        "<ent norm='-7 days' direction=past week=1>depuis la semaine dernière</ent>"
    ),
    "le <ent norm='????-08-09' day=9 month=8>09/08</ent>",
    "Le patient est venu le <ent norm='????-08-04' day=4 month=8>4 août</ent>",
    # date with a time of day
    (
        "Le patient est venu le "
        "<ent norm='????-08-04 11h13m' day=4 month=8 hour=11 minute=13>"
        "4 août à 11h13</ent>"
    ),
    "Il est venu le <ent norm='????-09-01' day=1 month=9>1er Septembre</ent> pour",
    (
        "Il est venu en <ent norm='2020-10-??' month=10 year=2020>octobre 2020</ent> "
        "pour..."
    ),
    (
        "Il est venu "
        "<ent norm='-90 days' direction=past month=3>il y a trois mois</ent>"
        " pour..."
    ),
    (
        "Il lui était arrivé la même chose "
        "<ent norm='-365 days' direction=past year=1>il y a un an</ent>."
    ),
    (
        "Il est venu le "
        "<ent norm='2001-09-20' day=20 month=9 year=2001>20/09/2001</ent>"
        " pour..."
    ),
    # bounded date ("du ...") written with a two-digit year
    (
        "Consultation du "
        "<ent norm='2019-07-03' bound=from day=3 month=7 year=2019>03 07 19</ent>"
    ),
    # --- short, list-like or header-like fragments ---
    "En <ent norm='2017-11-??' month=11 year=2017>11/2017</ent> stabilité sur...",
    "<ent norm='-90 days' direction=past month=3>depuis 3 mois</ent>",
    "- <ent norm='2004-12-??' month=12 year=2004>Décembre 2004</ent> :",
    "- <ent norm='2005-06-??' month=6 year=2005>Juin 2005</ent>: ",
    # NOTE: the variant without a space after the leading dash
    # ("-Juin 2005: ") is disabled because of issues with the "fr" language.
    "<ent norm='2017-09-??' month=9 year=2017>sept 2017</ent> :",
    # a relative date immediately followed by a duration
    (
        "<ent norm='-365 days' direction=past year=1>il y a 1 an</ent> "
        "<ent norm='during 30 days' mode=duration month=1>pdt 1 mois</ent>"
    ),
    # date followed by a line break and an unrelated number
    (
        "Prélevé le : "
        "<ent norm='2016-04-22' day=22 month=4 year=2016>22/04/2016</ent>"
        " \n78 rue du Général Leclerc"
    ),
    "Le <ent norm='????-01-07' day=7 month=1>07/01</ent>.",
    "Il est venu en <ent norm='????-08-??' month=8>août</ent>.",
    "Il est venu <ent norm='~0 day' day=0 direction=current>ce jour</ent>.",
    "CS le <ent norm='2017-01-11' day=11 month=1 year=2017>11-01-2017</ent> 1/3",
    # date split across a line break
    "Vu le <ent norm='2017-01-11' day=11 month=1 year=2017>11 janvier\n2017</ent> .",
]
@fixture(autouse=True)
def add_date_pipeline(blank_nlp: PipelineProtocol):
    """Add the ``eds.dates`` component to the ``blank_nlp`` pipeline.

    Declared ``autouse`` so it is applied without being requested explicitly.
    The component is configured with ``detect_periods``, ``as_ents`` and
    ``explain`` all enabled.
    """
    dates_config = {"detect_periods": True, "as_ents": True, "explain": True}
    blank_nlp.add_pipe("eds.dates", config=dates_config)
def test_dates_component(blank_nlp: PipelineProtocol):
    """Run every entry of ``examples`` through the pipeline and check the output.

    For each expected entity the test compares the span text, the presence of
    date cues, the parsed fields (``date.dict``) and the normalized form
    (``date.norm``). Absolute dates are also converted with ``to_datetime``:
    a complete date must convert directly, while a partial one must yield
    ``None`` unless the missing parts are inferred from ``note_datetime``.
    """
    note_datetime = datetime.datetime(year=1993, month=9, day=23)
    calendar_keys = frozenset({"year", "month", "day"})

    # Values expected to be filled in for partial absolute dates when
    # ``infer_from_context=True``, keyed by the calendar fields that ARE given.
    inferred_parts = {
        frozenset({"month", "day"}): {"year": note_datetime.year},  # no year
        frozenset({"month", "year"}): {"day": 1},  # no day
        frozenset({"year"}): {"day": 1, "month": 1},  # year only
        frozenset({"month"}): {"day": 1, "year": note_datetime.year},  # month only
    }

    for example in examples:
        text, entities = parse_example(example)
        doc = blank_nlp(text)

        spans = sorted(doc.spans["dates"] + doc.spans["durations"])
        assert len(spans) == len(entities)
        assert len(doc.ents) == len(entities)

        for span, entity in zip(spans, entities):
            assert span.text == text[entity.start_char : entity.end_char]
            assert bool(span._.date_cues)

            if span.label_ == "date":
                date = span._.date
            else:
                date = span._.duration

            # Expected fields come from the <ent ...> attributes; ``norm`` is
            # checked separately and ``mode`` is implied when not annotated.
            expected = {mod.key: mod.value for mod in entity.modifiers}
            norm = expected.pop("norm")
            if "direction" in expected:
                expected["mode"] = "relative"
            elif "mode" not in expected:
                expected["mode"] = "absolute"

            assert date.dict(exclude_none=True) == expected
            assert date.norm() == norm

            # Record which calendar fields were annotated, then drop the keys
            # that ``datetime.datetime(**expected)`` would not accept.
            given = calendar_keys.intersection(expected)
            for key in ("mode", "direction", "bound"):
                expected.pop(key, None)

            if isinstance(date, AbsoluteDate):
                if given == calendar_keys:
                    assert date.to_datetime() == datetime.datetime(**expected)
                else:
                    assert date.to_datetime() is None

                    filled_in = inferred_parts.get(given)
                    if filled_in is not None:
                        expected.update(filled_in)
                        assert date.to_datetime(
                            note_datetime=note_datetime, infer_from_context=True
                        ) == datetime.datetime(**expected)
            elif isinstance(date, Relative):
                assert date.to_datetime() is None
            else:
                assert date.to_duration()
                assert date.to_datetime(note_datetime=note_datetime)
def test_periods(blank_nlp: PipelineProtocol):
    """Check that each annotated span is found in ``doc.spans["periods"]``."""
    period_examples = [
        "à partir de <ent>juin 2017 pendant trois semaines</ent>",
        "du <ent>5 juin au 6 juillet</ent>",
    ]

    for annotated in period_examples:
        text, entities = parse_example(annotated)
        doc = blank_nlp(text)

        assert len(doc.spans["periods"]) == len(entities)

        for period, entity in zip(doc.spans["periods"], entities):
            expected_text = text[entity.start_char : entity.end_char]
            assert period.text == expected_text
@pytest.mark.parametrize("with_time", [False, True])
def test_time(with_time: bool):
    """Check the effect of the ``detect_time`` option on the matched span.

    With ``detect_time`` on, the expected span covers the time of day and the
    expected norm includes it; with it off, only the date part is expected.
    """
    nlp = spacy.blank("eds")
    nlp.add_pipe("eds.dates", config={"detect_time": with_time})

    time_examples = (
        ["Vu le <ent norm='2012-01-11 11h34m'>11/01/2012 à 11h34</ent> pour radio."]
        if with_time
        else ["Vu le <ent norm='2012-01-11'>11/01/2012</ent> à 11h34 pour radio."]
    )

    for annotated in time_examples:
        text, entities = parse_example(annotated)
        doc = nlp(text)

        spans = sorted(doc.spans["dates"] + doc.spans["durations"])
        assert len(spans) == len(entities)

        for span, entity in zip(spans, entities):
            expected_text = text[entity.start_char : entity.end_char]
            assert span.text == expected_text

            # First ``norm`` attribute of the annotation is the expected value.
            expected_norm = next(
                mod.value for mod in entity.modifiers if mod.key == "norm"
            )
            assert span._.date.norm() == expected_norm
def test_false_positives(blank_nlp: PipelineProtocol):
    """Check that date-looking strings produce no date or duration span."""
    counter_examples = [
        "page 1/1",  # Often found in the form `1/1` only
        "40 00",
        "06 12 34 56 78",
        "bien mais",
        "thierry",
        "436",
        "12.0-16",
        "27.0-33",
        "7.0-11",
        "03-0.70",
        "4.09-11",
        "2/2CR Urgences PSL",
        "Dextro : 5.7 mmol/l",
        "2.5",
    ]

    for snippet in counter_examples:
        doc = blank_nlp(snippet)
        detected = [*doc.spans["dates"], *doc.spans["durations"]]
        assert not detected
def test_dates_on_ents_only():
    """Check the spans found when ``eds.dates`` runs with ``on_ents_only=True``.

    Only the three annotated spans, all on the line containing the matched
    term "contact", are expected; the date expressions on the first line are
    not annotated and therefore must not be returned.
    """
    annotated = (
        "Le patient est venu hier (le 04/09/2021) pour un test PCR.\n"
        "Il est cas contact <ent>depuis la semaine dernière</ent>, "
        "le <ent>09/08</ent> (<ent>2021-08-09</ent>)."
    )

    nlp = spacy.blank("eds")
    nlp.add_pipe("eds.sentences")
    nlp.add_pipe("eds.matcher", config={"terms": {"contact": "contact"}})
    nlp.add_pipe("eds.dates", config={"on_ents_only": True})

    text, entities = parse_example(annotated)
    doc = nlp(text)

    # The matcher is expected to produce a single entity ("contact").
    assert len(doc.ents) == 1

    spans = sorted(doc.spans["dates"] + doc.spans["durations"])
    assert len(spans) == len(entities)

    for span, entity in zip(spans, entities):
        expected_text = text[entity.start_char : entity.end_char]
        assert span.text == expected_text
def test_illegal_dates(blank_nlp):
    """Check that impossible calendar dates convert to ``None``.

    The inputs mention 31 June and 30 February; the first detected span's
    ``to_datetime()`` must return ``None`` for both.
    """
    texts = (
        " Le 31/06/17, la dernière dose.",
        " Le 30/02/18 n'est pas une vraie date",
    )

    for text in texts:
        doc = blank_nlp(text)
        candidates = [*doc.spans["dates"], *doc.spans["durations"]]
        candidates.sort()
        first_span = candidates[0]
        assert first_span._.date.to_datetime() is None
@pytest.mark.parametrize(
    "dt",
    [
        BASE_DATE.strftime("%Y-%m-%d"),
        BASE_DATE.strftime("%Y-%m-%d %H:%M:%S"),
        BASE_DATE,
        BASE_DATE.timestamp(),
        BASE_DATE.date(),
        "un truc",
        None,
    ],
)
def test_note_datetime(blank_nlp, dt):
    """Check date resolution for several representations of the note date.

    ``dt`` is ``BASE_DATE`` given as a string, datetime, timestamp or date,
    plus a non-date string and ``None``. For the last two, the test only
    checks that the pipeline runs; otherwise it checks the stored note date
    and the datetimes/durations derived from it.
    """
    doc = blank_nlp.make_doc(
        "On le voit en décembre, dans 3 mois puis dans 2 ans pendant 1 an."
    )
    doc._.note_datetime = dt
    doc = blank_nlp(doc)

    # Nothing further to verify when no usable note date was supplied.
    if dt is None or dt == "un truc":
        return

    assert doc._.note_datetime.year == BASE_DATE.year
    assert doc._.note_datetime.month == BASE_DATE.month
    assert doc._.note_datetime.day == BASE_DATE.day

    dates = doc.spans["dates"]
    durations = doc.spans["durations"]

    # "décembre"
    december = dates[0]
    assert december._.date.datetime.year == BASE_DATE.year
    assert december._.date.duration.days == 88
    assert str(december._.value) == "????-12-??"

    # "dans 3 mois"
    in_three_months = dates[1]
    assert in_three_months._.date.datetime.month == BASE_DATE.month + 3
    assert in_three_months._.date.duration.days == 90
    assert str(in_three_months._.date) == "+90 days"

    # "dans 2 ans"
    in_two_years = dates[2]
    assert in_two_years._.date.datetime.year == BASE_DATE.year + 2
    assert in_two_years._.date.duration.days == 730
    assert str(in_two_years._.date) == "+730 days"

    # "pendant 1 an"
    one_year = durations[0]
    assert one_year._.duration.duration.days == 365
    assert str(one_year._.duration) == "during 365 days"