deidentify / Git / [7fc5df] /deidentify/util/replace

Models:
philipB/
deidentify
Downloads: 1
[7fc5df]: / deidentify / util / replace_phi.py
History
Download this file
93 lines (72 with data), 3.7 kB

from typing import Callable, Iterator, List

from deidentify.base import Annotation, Document
from deidentify.surrogates.dataset_deidentifier import DatasetDeidentifier
from deidentify.surrogates.dataset_deidentifier import Document as SurrogateDocument
from deidentify.surrogates.generators import RandomData
from deidentify.surrogates.rewrite_dataset import apply_surrogates


def _uppercase_formatter(annotation: Annotation):
    """Default placeholder formatter: the upper-cased tag wrapped in square brackets."""
    tag_upper = annotation.tag.upper()
    return f'[{tag_upper}]'


def mask_annotations(document: Document,
                     replacement_formatter: Callable[[Annotation], str] = _uppercase_formatter
                     ) -> Document:
    """Utility function to replace sensitive PHI spans with a placeholder.

    Annotations are processed in the order in which they appear in
    `document.annotations`. For each annotation, the text between the end of the
    previous annotation and the start of the current one is copied verbatim, so
    the annotations are expected to be sorted by start offset and to not overlap.

    Parameters
    ----------
    document : Document
        The document whose PHI annotations should be replaced.
    replacement_formatter : Callable[[Annotation], str]
        A callable that can be used to configure the formatting of the replacement.
        The default formatter replaces an annotation with `[annotation.tag.upper()]`.

    Returns
    -------
    Document
        A new document with the same name, the masked text and one rewritten
        annotation per input annotation. `start`, `end` and `text` of each
        rewritten annotation describe the placeholder within the masked text.
    """
    # Fragments of the rewritten text; joined once at the end instead of growing
    # a string with repeated `+=`.
    pieces = []
    # Number of characters emitted so far. New offsets are derived from this
    # (rather than from `len(annotation.text)`) so that they always point at the
    # placeholder in the rewritten text, even if an annotation's `text` does not
    # have the same length as its `[start, end)` span.
    rewritten_length = 0

    original_text_pointer = 0
    annotations_rewritten = []

    for annotation in document.annotations:
        replacement = replacement_formatter(annotation)
        # Unannotated text between the previous annotation and this one.
        untouched = document.text[original_text_pointer:annotation.start]

        start = rewritten_length + len(untouched)
        end = start + len(replacement)

        pieces.append(untouched)
        pieces.append(replacement)
        rewritten_length = end
        original_text_pointer = annotation.end

        annotations_rewritten.append(annotation._replace(
            start=start,
            end=end,
            text=replacement
        ))

    # Remaining text after the last annotation.
    pieces.append(document.text[original_text_pointer:])
    return Document(name=document.name, text=''.join(pieces), annotations=annotations_rewritten)


def surrogate_annotations(docs: List[Document], seed=42, errors='raise') -> Iterator[Document]:
    """Replaces PHI annotations in documents with random surrogates.

    This function is a generator: none of the work below (including surrogate
    generation) is executed until the returned iterator is consumed, and any
    exception raised while rewriting surfaces during iteration. Wrap the call in
    `list(...)` to obtain all rewritten documents at once.

    Parameters
    ----------
    docs : List[Document]
        The documents whose annotations should be replaced. Their `annotations`
        and `text` are used to generate the surrogates.
    seed : int
        Set this seed to make the random generation deterministic.
    errors : str {'ignore', 'raise', 'coerce'}, default 'raise'
        Passed on to `apply_surrogates`.

        - If 'raise',  errors during surrogate generation will raise an exception.
        - If 'ignore', failing annotations are skipped (they and their PHI remain in the text).
        - If 'coerce', failing annotations are replaced with pattern `[annotation.tag]`.

    Yields
    ------
    Document
        One rewritten document per input document, with text and annotations
        rewritten to their surrogates.

        If errors is 'ignore' or 'coerce', an extra property of type List is added to the yielded
        documents (`Document.annotations_without_surrogates`), which includes annotations of the
        *input document* that could not be replaced with a surrogate.

    """
    random_data = RandomData(seed=seed)
    dataset_deidentifier = DatasetDeidentifier(random_data=random_data)

    # Convert to the document type used by the surrogate generation code.
    surrogate_docs = [SurrogateDocument(doc.annotations, doc.text) for doc in docs]
    surrogate_docs = dataset_deidentifier.generate_surrogates(documents=surrogate_docs)

    for doc in surrogate_docs:
        annotations, surrogates = doc.annotation_surrogate_pairs()
        yield apply_surrogates(doc.text, annotations, surrogates, errors=errors)