a b/edsnlp/utils/regex.py
1
import re
2
from typing import List, Optional
3
4
import regex
5
6
7
def make_pattern(
    patterns: List[str],
    with_breaks: bool = False,
    name: Optional[str] = None,
) -> str:
    r"""
    Create an OR pattern from a list of patterns.

    The alternatives are joined with `|` inside a single group, longest
    alternative first. The input list is left untouched.

    Parameters
    ----------
    patterns : List[str]
        List of patterns to merge.
    with_breaks : bool, optional
        Whether to add breaks (`\b`) on each side, by default False
    name: str, optional
        Name of the group, using regex `?P<>` directive. When omitted
        (or empty), a plain unnamed group is used.

    Returns
    -------
    str
        Merged pattern.
    """

    prefix = f"(?P<{name}>" if name else "("

    # Longest alternatives first (sorting by length might be more efficient).
    # `sorted` builds a new list, so the caller's `patterns` is not reordered
    # as a side effect of building the pattern.
    ordered = sorted(patterns, key=len, reverse=True)

    pattern = prefix + "|".join(ordered) + ")"

    if with_breaks:
        pattern = r"\b" + pattern + r"\b"

    return pattern
44
45
46
def compile_regex(reg: str, flags: int = 0):
    """
    This function tries to compile `reg` using the `re` module, and
    falls back to the `regex` module that is more permissive.

    Parameters
    ----------
    reg: str
        Pattern to compile.
    flags: int, optional
        Compilation flags (e.g. a `re.RegexFlag` combination), passed to
        whichever module compiles the pattern. By default 0 (no flags).

    Returns
    -------
    Union[re.Pattern, regex.Pattern]
    """
    try:
        return re.compile(reg, flags=flags)
    except re.error:
        # The standard library rejected the pattern: retry with `regex`.
        # NOTE(review): `flags` is forwarded unchanged; confirm that `re` and
        # `regex` agree on the numeric values of the flags used by callers.
        return regex.compile(reg, flags=flags)