Diff of /edsnlp/utils/regex.py [000000] .. [cad161]

Switch to side-by-side view

--- a
+++ b/edsnlp/utils/regex.py
@@ -0,0 +1,62 @@
+import re
+from typing import List, Optional
+
+import regex
+
+
+def make_pattern(
+    patterns: List[str],
+    with_breaks: bool = False,
+    name: Optional[str] = None,
+) -> str:
+    r"""
+    Create OR pattern from a list of patterns.
+
+    The input list is left untouched: alternatives are ordered on a copy.
+
+    Parameters
+    ----------
+    patterns : List[str]
+        List of patterns to merge.
+    with_breaks : bool, optional
+        Whether to add breaks (`\b`) on each side, by default False
+    name: str, optional
+        Name of the group, using regex `?P<>` directive.
+
+    Returns
+    -------
+    str
+        Merged pattern.
+    """
+    prefix = f"(?P<{name}>" if name else "("
+
+    # Longest first: alternation takes the first branch that matches, so a
+    # short pattern must not shadow a longer one that starts the same way.
+    # `sorted` works on a copy, keeping the caller's list unmodified.
+    ordered = sorted(patterns, key=len, reverse=True)
+
+    pattern = prefix + "|".join(ordered) + ")"
+
+    if with_breaks:
+        pattern = r"\b" + pattern + r"\b"
+
+    return pattern
+
+
+def compile_regex(reg: str, flags: re.RegexFlag):
+    """
+    Compile `reg` with the standard `re` module, and fall back to the
+    more permissive `regex` module when `re` rejects the pattern.
+
+    Parameters
+    ----------
+    reg: str
+        Pattern to compile.
+    flags: re.RegexFlag
+        Flags forwarded unchanged to whichever module compiles the pattern.
+
+    Returns
+    -------
+    Union[re.Pattern, regex.Pattern]
+    """
+    try:
+        compiled = re.compile(reg, flags=flags)
+    except re.error:
+        # `re` could not parse the pattern: give `regex` a chance.
+        compiled = regex.compile(reg, flags=flags)
+    return compiled