medacy/pipeline_components/tokenizers/clinical_tokenizer.py (revision 6c353a)

import re

from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex


class ClinicalTokenizer:
    """
    A tokenizer for clinical text.
    """

    def __init__(self, nlp):
        if not isinstance(nlp, Language):
            raise ValueError("nlp must be an instance of spacy.language.Language")
        self.nlp = nlp
        # Build a spaCy Tokenizer that keeps the default tokenizer exceptions but
        # swaps in custom prefix, infix, and suffix rules tuned for clinical text.
        self.tokenizer = Tokenizer(
            nlp.vocab,
            nlp.Defaults.tokenizer_exceptions,
            prefix_search=self._get_prefix_regex().search,
            infix_finditer=self._get_infix_regex().finditer,
            suffix_search=self._get_suffix_regex().search,
            token_match=None
        )
    def add_exceptions(self, exceptions):
        """
        Adds exceptions for the tokenizer to ignore.

        :param exceptions: an array of terms not to split on during tokenization
        :return:
        """
        raise NotImplementedError()
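    # A possible implementation (an assumption, not part of medaCy): spaCy's
    # Tokenizer.add_special_case registers whole-token exceptions, so each term
    # could be mapped to itself as a single token:
    #
    #     from spacy.attrs import ORTH
    #
    #     def add_exceptions(self, exceptions):
    #         for term in exceptions:
    #             self.tokenizer.add_special_case(term, [{ORTH: term}])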
    def _get_prefix_regex(self):
        """
        Custom prefix tokenization rules.

        :return: a compiled regex combining spaCy's default prefixes with custom ones
        """
        # Also split off leading brackets, quotes, backslashes, slashes, and '@'.
        custom_prefixes = [r"""^[\[\("'\\/@]"""]
        all_prefixes_re = compile_prefix_regex(tuple(list(self.nlp.Defaults.prefixes) + custom_prefixes))
        return all_prefixes_re
def _get_infix_regex(self):
"""
Custom infix tokenization rules
:return:
"""
custom_infixes = ['\[', '(?<=[0-9])-(?=[0-9])', '[!&:,()\*/-><]']
infix_re = compile_infix_regex(tuple(list(self.nlp.Defaults.infixes) + custom_infixes))
return infix_re
    def _get_suffix_regex(self):
        """
        Custom suffix tokenization rules.

        :return: a compiled regex for trailing punctuation and dose units (mg, mcg, mL, cap)
        """
        suffix_re = re.compile(r'''[\]\)"',.x\-%\?\\n]|\*|(mg$)|(mcg$)|(mL$)|(cap$)|(-+$)$''')
        return suffix_re
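
A minimal usage sketch (not part of the file): the tokenizer is attached to a
spaCy pipeline by replacing the pipeline's default tokenizer. The model name
"en_core_web_sm" and the sample sentence are assumptions for illustration.

    import spacy
    from medacy.pipeline_components.tokenizers.clinical_tokenizer import ClinicalTokenizer

    nlp = spacy.load("en_core_web_sm")  # any installed English model (assumption)
    nlp.tokenizer = ClinicalTokenizer(nlp).tokenizer  # swap in the clinical rules
    doc = nlp("Metoprolol 25mg PO twice daily")
    print([token.text for token in doc])  # dose units like "mg" split into their own tokens

Because the Tokenizer is built from nlp.vocab and keeps the default tokenizer
exceptions, downstream components such as the tagger and NER continue to work
unchanged.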