[6c353a]: / medacy / pipeline_components / units / time_unit_component.py

Download this file

54 lines (47 with data), 2.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token
from medacy.pipeline_components.feature_overlayers.base import BaseOverlayer
class TimeUnitOverlayer(BaseOverlayer):
    """
    A pipeline component that tags time units.

    Every token whose lowercase text is one of ``TIME_UNIT_TERMS`` gets the
    custom attribute ``token._.feature_is_time_unit`` set to ``True`` and is
    added to ``doc.ents`` as a ``time_unit`` entity, unless the token already
    belongs to another entity.
    """

    name = "time_unit_annotator"
    dependencies = []

    # Lowercase token texts recognised as units of time. Each term becomes a
    # single-token ``LOWER`` pattern, so matching is case-insensitive.
    TIME_UNIT_TERMS = (
        'sec', 'second', 'seconds',
        'min', 'minute', 'minutes',
        'hr', 'hour', 'hours',
        'day', 'days',
        'week', 'weeks',
        'month', 'months',
        'year', 'years', 'yrs',
    )

    def __init__(self, spacy_pipeline):
        """
        Register the token extension and the entity label, and build the matcher.

        :param spacy_pipeline: the spaCy language object whose vocab backs the
            matcher and whose entity recognizer receives the ``time_unit`` label
        """
        self.nlp = spacy_pipeline

        # Token extensions are registered globally; registering the same name
        # again raises, so only register it when it does not exist yet. This
        # keeps a second instantiation of the component from failing.
        if not Token.has_extension('feature_is_time_unit'):
            Token.set_extension('feature_is_time_unit', default=False)

        self.nlp.entity.add_label('time_unit')

        self.time_matcher = Matcher(self.nlp.vocab)
        patterns = [[{'LOWER': term}] for term in self.TIME_UNIT_TERMS]
        # Same call form as before (key, on_match callback, *patterns).
        self.time_matcher.add('UNIT_OF_TIME', None, *patterns)

    def __call__(self, doc):
        """
        Flag time-unit tokens in ``doc`` and record them as entities.

        :param doc: the spaCy Doc to annotate (modified in place)
        :return: the same ``doc``
        """
        label = self.nlp.vocab.strings['time_unit']

        # Token indices that already belong to an entity. A token can only be
        # part of one entity, so spans touching these are not added to doc.ents.
        taken = {i for ent in doc.ents for i in range(ent.start, ent.end)}
        new_ents = []

        with doc.retokenize() as retokenizer:
            # match and tag time units
            for _match_id, start, end in self.time_matcher(doc):
                span = Span(doc, start, end, label=label)
                for token in span:
                    token._.feature_is_time_unit = True
                if len(span) > 1:
                    retokenizer.merge(span)

                covered = range(start, end)
                if taken.isdisjoint(covered):
                    new_ents.append(span)
                    taken.update(covered)

            # Assign once instead of rebuilding doc.ents for every match.
            if new_ents:
                doc.ents = list(doc.ents) + new_ents

        return doc