|
a |
|
b/edsnlp/utils/regex.py |
|
|
1 |
import re |
|
|
2 |
from typing import List, Optional |
|
|
3 |
|
|
|
4 |
import regex |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
def make_pattern(
    patterns: List[str],
    with_breaks: bool = False,
    name: Optional[str] = None,
) -> str:
    r"""
    Create OR pattern from a list of patterns.

    The alternatives are joined with `|` inside a single group, ordered
    from the longest to the shortest string. The input list is not
    modified.

    Parameters
    ----------
    patterns : List[str]
        List of patterns to merge.
    with_breaks : bool, optional
        Whether to add breaks (`\b`) on each side, by default False
    name: str, optional
        Name of the group, using regex `?P<>` directive. When omitted
        (or empty), a plain unnamed group is produced.

    Returns
    -------
    str
        Merged pattern.
    """

    prefix = f"(?P<{name}>" if name else "("

    # Longest alternatives go first: alternation tries branches from left to
    # right, so a short pattern listed early could shadow a longer one that
    # starts with it. `sorted` works on a copy, leaving the caller's list in
    # its original order (the previous in-place `.sort()` reordered it).
    ordered = sorted(patterns, key=len, reverse=True)

    pattern = prefix + "|".join(ordered) + ")"

    if with_breaks:
        pattern = r"\b" + pattern + r"\b"

    return pattern
|
|
44 |
|
|
|
45 |
|
|
|
46 |
def compile_regex(reg: str, flags: re.RegexFlag):
    """
    Compile `reg`, preferring the standard `re` module and falling back
    to the more permissive `regex` module when `re` rejects the pattern.

    Parameters
    ----------
    reg: str
        Regular expression source to compile.
    flags: re.RegexFlag
        Compilation flags, forwarded unchanged to whichever module
        performs the compilation.

    Returns
    -------
    Union[re.Pattern, regex.Pattern]
        The compiled pattern, from `re` when possible, otherwise from
        `regex`.
    """
    try:
        compiled = re.compile(reg, flags=flags)
    except re.error:
        # `re` could not parse the expression; let `regex` try.
        # NOTE(review): `flags` are `re` flags handed as-is to `regex` --
        # confirm both modules interpret the values identically.
        compiled = regex.compile(reg, flags=flags)
    return compiled