Switch to unified view

a b/deidentify/util/replace_phi.py
1
from typing import Callable, Iterator, List

from deidentify.base import Annotation, Document
from deidentify.surrogates.dataset_deidentifier import DatasetDeidentifier
from deidentify.surrogates.dataset_deidentifier import Document as SurrogateDocument
from deidentify.surrogates.rewrite_dataset import apply_surrogates
from deidentify.surrogates.generators import RandomData
8
9
10
def _uppercase_formatter(annotation: Annotation):
    """Default placeholder: the annotation's upper-cased tag in square brackets."""
    placeholder_template = '[{}]'
    return placeholder_template.format(annotation.tag.upper())
12
13
14
def mask_annotations(document: Document,
                     replacement_formatter: Callable[[Annotation], str] = _uppercase_formatter
                     ) -> Document:
    """Utility function to replace sensitive PHI spans with a placeholder.

    The text is rebuilt from left to right in the order of
    `document.annotations`. The annotations therefore have to be ordered by
    their start offset and must not overlap; otherwise the masked text will
    not be a faithful rewrite of the input.

    Parameters
    ----------
    document : Document
        The document whose PHI annotations should be replaced.
    replacement_formatter : Callable[[Annotation], str]
        A callable that can be used to configure the formatting of the replacement.
        The default formatter replaces an annotation with `[annotation.tag.upper()]`.

    Returns
    -------
    Document
        A new document with the same name and the masked text. Every annotation
        is copied with `text` set to its replacement and `start`/`end` set to
        the position of that replacement within the masked text.
    """
    # Position in the original text up to which content has been consumed.
    original_text_pointer = 0
    # Pieces of the rewritten text; joined once at the end instead of growing
    # a string with `+=` on every annotation.
    pieces = []
    # Number of characters emitted into `pieces` so far.
    rewritten_length = 0
    annotations_rewritten = []

    for annotation in document.annotations:
        replacement = replacement_formatter(annotation)
        # Untouched text between the previous annotation and this one.
        part = document.text[original_text_pointer:annotation.start]

        # Offsets are derived from what has actually been written, so they
        # always point at the replacement, even if `annotation.text` does not
        # have the same length as the span `annotation.start:annotation.end`.
        start = rewritten_length + len(part)
        end = start + len(replacement)

        pieces.append(part)
        pieces.append(replacement)
        rewritten_length = end
        original_text_pointer = annotation.end

        annotations_rewritten.append(annotation._replace(
            start=start,
            end=end,
            text=replacement
        ))

    # Remainder of the document after the last annotation.
    pieces.append(document.text[original_text_pointer:])
    text_rewritten = ''.join(pieces)
    return Document(name=document.name, text=text_rewritten, annotations=annotations_rewritten)
59
60
61
def surrogate_annotations(docs: List[Document], seed=42, errors='raise') -> Iterator[Document]:
    """Replaces PHI annotations in documents with random surrogates.

    This function is a generator: no work is done until the result is
    iterated. Wrap the call in `list(...)` to obtain all rewritten documents
    at once.

    Parameters
    ----------
    docs : List[Document]
        The documents whose annotations should be replaced with surrogates.
    seed : int
        Set this seed to make the random generation deterministic.
    errors : str {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise',  errors during surrogate generation will raise an exception.
        - If 'ignore', failing annotations are skipped (they and their PHI remain in the text)
        - If 'coerce', failing annotations are replaced with pattern `[annotation.tag]`

    Yields
    ------
    Document
        One rewritten document per input document, with text and annotations
        rewritten to their surrogates.

        If errors is 'ignore' or 'coerce', an extra property of type List is added to the returned
        documents (`Document.annotations_without_surrogates`), which includes annotations of the
        *input document* that could not be replaced with a surrogate.

    """
    random_data = RandomData(seed=seed)
    dataset_deidentifier = DatasetDeidentifier(random_data=random_data)

    # Convert to the document type used by the surrogate generation code.
    surrogate_docs = [SurrogateDocument(doc.annotations, doc.text) for doc in docs]
    surrogate_docs = dataset_deidentifier.generate_surrogates(documents=surrogate_docs)

    for doc in surrogate_docs:
        annotations, surrogates = doc.annotation_surrogate_pairs()
        # `errors` is handed through unchanged; apply_surrogates implements the policy.
        yield apply_surrogates(doc.text, annotations, surrogates, errors=errors)