"""Demo script (demo.py): annotate, mask and surrogate a sample document.

The script wraps a short Dutch example text in a ``Document``, annotates it
with a ``FlairTagger`` backed by a downloaded model, and then prints:

1. the annotations found in the first document,
2. the text of the document returned by ``mask_annotations``,
3. the text of the document returned by ``surrogate_annotations``.
"""
from pprint import pprint

from deidentify.base import Document
from deidentify.taggers import FlairTagger
from deidentify.tokenizer import TokenizerFactory
from deidentify.util import mask_annotations, surrogate_annotations

# Create some text
text = (
    "Dit is stukje tekst met daarin de naam Jan Jansen. De patient J. Jansen (e: "
    "j.jnsen@email.com, t: 06-12345678) is 64 jaar oud en woonachtig in Utrecht. Hij werd op 10 "
    "oktober door arts Peter de Visser ontslagen van de kliniek van het UMCU."
)

# Wrap text in document
documents = [
    Document(name='doc_01', text=text)
]

# Select downloaded model
model = 'model_bilstmcrf_ons_fast-v0.2.0'

# Instantiate tokenizer
tokenizer = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))

# Load tagger with a downloaded model file and tokenizer
tagger = FlairTagger(model=model, tokenizer=tokenizer, verbose=False)

# Annotate your documents
annotated_docs = tagger.annotate(documents)

# Show the annotations of the first (and only) document
first_doc = annotated_docs[0]
pprint(first_doc.annotations)

# Print the text of the masked document
masked_doc = mask_annotations(first_doc)
print(masked_doc.text)

# Print the text of the surrogate document; the result of
# surrogate_annotations is iterated, so materialize it to take the first item.
# A fixed seed is passed (presumably for reproducible surrogates -- confirm
# against the deidentify documentation).
iter_docs = surrogate_annotations(docs=[first_doc], seed=1)
surrogate_doc = list(iter_docs)[0]
print(surrogate_doc.text)