deidentify / Git / Diff of /deidentify/util/replace

Models:

philipB/

deidentify

Downloads: 1

Diff of /deidentify/util/replace_phi.py [000000] .. [7fc5df]

Switch to unified view

 b/deidentify/util/replace_phi.py
+from typing import Callable, List
+from deidentify.base import Annotation, Document
+from deidentify.surrogates.dataset_deidentifier import DatasetDeidentifier
+from deidentify.surrogates.dataset_deidentifier import Document as SurrogateDocument
+from deidentify.surrogates.rewrite_dataset import apply_surrogates
+from deidentify.surrogates.generators import RandomData
+def _uppercase_formatter(annotation: Annotation):
+    return '[{}]'.format(annotation.tag.upper())
+def mask_annotations(document: Document,
+                     replacement_formatter: Callable[[Annotation], str] = _uppercase_formatter
+                     ) -> Document:
+    """Utility function to replace sensitive PHI spans with a placeholder.
+    Parameters
+    ----------
+    document : Document
+        The document whose PHI annotations should be replaced.
+    replacement_formatter : Callable[[Annotation], str]
+        A callable that can be used to configure the formatting of the replacement.
+        The default formatter replaces an annotation with `[annotation.tag.upper()]`.
+    Returns
+    -------
+    Document
+        The document with masked annotations.
+    """
+    # Amount of characters by which start point of annotation is adjusted
+    # Positive shift if replacement is longer than original annotation
+    # Negative shift if replacement is shorter
+    shift = 0
+    original_text_pointer = 0
+    text_rewritten = ''
+    annotations_rewritten = []
+    for annotation in document.annotations:
+        replacement = replacement_formatter(annotation)
+        part = document.text[original_text_pointer:annotation.start]
+        start = annotation.start + shift
+        end = start + len(replacement)
+        shift += len(replacement) - len(annotation.text)
+        text_rewritten += part + replacement
+        original_text_pointer = annotation.end
+        annotations_rewritten.append(annotation._replace(
+            start=start,
+            end=end,
+            text=replacement
+        ))
+    text_rewritten += document.text[original_text_pointer:]
+    return Document(name=document.name, text=text_rewritten, annotations=annotations_rewritten)
+def surrogate_annotations(docs: List[Document], seed=42, errors='raise') -> List[Document]:
+    """Replaces PHI annotations in documents with random surrogates.
+    Parameters
+    ----------
+    seed : int
+        Set this seed to make the random generation deterministic.
+    errors : str {'ignore', 'raise', 'coerce'}, default 'raise'
+        - If 'raise',  errors during surrogate generation will raise an exception.
+        - If 'ignore', failing annotations are skipped (they and PHI remains in text)
+        - If 'coerce', failing annotations are replaced with pattern `[annotation.tag]`
+    Returns
+    -------
+    List[Document]
+        A copy of `docs` with with text and annotations rewritten to their surrogates.
+        If errors is 'ignore' or 'coerce', an extra property of type List is added to the returned
+        documents (`Document.annotations_without_surrogates`), which includes annotations of the
+        *input document* that could not be replaced with a surrogate.
+    """
+    random_data = RandomData(seed=seed)
+    dataset_deidentifier = DatasetDeidentifier(random_data=random_data)
+    surrogate_docs = [SurrogateDocument(doc.annotations, doc.text) for doc in docs]
+    surrogate_docs = dataset_deidentifier.generate_surrogates(documents=surrogate_docs)
+    for doc in surrogate_docs:
+        annotations, surrogates = doc.annotation_surrogate_pairs()
+        doc_rewritten = apply_surrogates(doc.text, annotations, surrogates, errors=errors)
+        yield doc_rewritten