Download this file

334 lines (271 with data), 13.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
"""Generate random surrogates for names that are of the same syntactic pattern.
Examples:
- Daniel MT de Groot => Jurrien HD Nguyen
- Dr. Annemarie van Heijer, Ph.D => Dr. Clara van der Linden, Ph.D
Algorithm:
1. Generate two random mappings for characters of the latin alphabet: one for firstnames and one
for lastnames (e.g., A->E, B->F...). This helps to maintain continuity across documents by
always replacing names and initials with the same character mapping.
2. For each name:
2.1 Normalize the name into its parts (title, first, middle, last, suffix) using `nameparser`
2.2 Check lookup table if a surrogate for this name already exists. If not, continue.
2.3 Extract the first letter of firstname, middle name and lastname, respectively.
2.4 Map these letters using the appropriate character mapping from step 1.
2.5 Lookup a random name that starts with the mapped character.
2.6 Place (name: surrogate) mapping in the lookup list.
References:
- https://nameparser.readthedocs.io/en/latest/
- Stubbs, A., Uzuner, Ö., Kotfila, C., Goldstein, I., & Szolovits, P. (2015). Challenges in
Synthesizing Surrogate PHI in Narrative EMRs. https://doi.org/10.1007/978-3-319-23633-9_27
"""
import re
import string
from collections import defaultdict
from os.path import dirname, join
import nameparser.config
import pandas as pd
from loguru import logger
from nameparser import HumanName
from unidecode import unidecode
from .base import ExactMatchGenerator, SurrogateGenerator
# Directory (next to this module) that holds the bundled name lists.
RESOURCES_PATH = join(dirname(__file__), 'resources')

# Add common Dutch titles for Mr. and Mrs.
nameparser.config.CONSTANTS.titles.add('mw', 'dhr', 'mevr', 'mr')

# Add common Dutch prepositions to prefixes such that they are parsed as part of last names as
# opposed to middle names.
# See: https://en.wikipedia.org/wiki/Dutch_name#Tussenvoegsels
PREPOSITIONS = ['van', 'den', 'v.d.', 'vd', 'de', 'der', "'t", 'ten', 'ter', 'at', 'op']
nameparser.config.CONSTANTS.prefixes.add(*PREPOSITIONS)

# Matches a run of prepositions, each followed by a single whitespace character.
# - Every preposition is escaped so that the dots in 'v.d.' only match literal dots.
# - The negative lookbehind keeps a preposition from matching in the middle of a word
#   (e.g., the "ten " in "Houten Bos"). `\b` is not used because it cannot match in front of
#   "'t", which starts with a non-word character.
PREPOSITIONS_REGEX = re.compile(
    r'((?<!\w)(?:{})\s)*'.format('|'.join(re.escape(preposition) for preposition in PREPOSITIONS))
)

# Matches any run of uppercase ASCII letters and dots (e.g., "M.T.", "HD", "J.").
INITIALS_REGEX = re.compile(r'([A-Z]?\.?)+')
def random_char_mapping(random_data):
    """Build a random substitution table for the lowercase ASCII alphabet.

    Parameters
    ----------
    random_data : deidentify.surrogates.RandomData
        The random operations provider. Its `shuffle` method is called with the alphabet.

    Returns
    -------
    dict(str: str)
        Maps every lowercase ASCII letter to the element found at the same position in the
        shuffled alphabet.
    """
    letters = string.ascii_lowercase
    permuted = random_data.shuffle(letters)
    return {source: target for source, target in zip(letters, permuted)}
def _load_firstnames(filename):
    """Read a list of firstnames from a file without a header row.

    Parameters
    ----------
    filename : str
        Location of the file; passed on to `pandas.read_csv`.

    Returns
    -------
    numpy.ndarray
        The values of the `name` column in file order.
    """
    frame = pd.read_csv(filename, names=['name'])
    name_column = frame['name']
    return name_column.values
def _inverted_name_index(names, index_getter=lambda x: x[0]):
    """Group names into buckets keyed by a normalized index key.

    Parameters
    ----------
    names : iterable
        The entries to index.
    index_getter : callable
        Extracts the raw index key from an entry. Defaults to the first element of the entry
        (the first character if the entry is a string).

    Returns
    -------
    defaultdict(list)
        Maps `NameDatabase.normalize_index_key(index_getter(entry))` to the list of entries
        sharing that key, in input order.
    """
    buckets = defaultdict(list)
    for entry in names:
        normalized_key = NameDatabase.normalize_index_key(index_getter(entry))
        buckets[normalized_key].append(entry)
    return buckets
class NameDatabase:
    """First-letter indexes over male firstnames, female firstnames and lastnames."""

    def __init__(self, firstnames_male=None, firstnames_female=None, lastnames=None):
        """Provides access to the 10,000 most common Dutch firstnames/lastnames fetched from the
        Meertens Instituut (see: http://www.naamkunde.net).

        Name lists are stored in a form of "inverted index" where the key is the first letter of
        the firstname/lastname and the values are a list of names starting with this letter.

        Example for firstnames [Anne, Alieke, Jan, Thomas]:
        ```
        {
            'a': [Anne, Alieke],
            'j': [Jan],
            't': [Thomas]
        }
        ```

        Lastnames are stored as (prefix, lastname) tuples. Example: ('de', 'Groot').

        Any argument that is omitted or empty is replaced by the corresponding list bundled in
        `RESOURCES_PATH`.

        Parameters
        ----------
        firstnames_male : iterable of type `str`
            A list of male firstnames.
        firstnames_female : iterable of type `str`
            A list of female firstnames.
        lastnames : iterable of (str, str) tuples
            A list of lastname tuples in form of (prefix: str, lastname: str).
            Example: `('de', 'Groot')`.
        """
        # Inputs are materialized as lists up front for two reasons:
        # - each list is iterated twice below, which would exhaust a generator after one pass
        # - the emptiness check on numpy arrays / pandas Series would raise "truth value is
        #   ambiguous"
        firstnames_male = [] if firstnames_male is None else list(firstnames_male)
        if not firstnames_male:
            firstnames_male = list(
                _load_firstnames(join(RESOURCES_PATH, 'firstnames_male.txt')))
        self.__male_normalized = {name.lower() for name in firstnames_male}
        self.male_index = _inverted_name_index(firstnames_male)

        firstnames_female = [] if firstnames_female is None else list(firstnames_female)
        if not firstnames_female:
            firstnames_female = list(
                _load_firstnames(join(RESOURCES_PATH, 'firstnames_female.txt')))
        self.__female_normalized = {name.lower() for name in firstnames_female}
        self.female_index = _inverted_name_index(firstnames_female)

        lastnames = [] if lastnames is None else list(lastnames)
        if not lastnames:
            df_lastnames = pd.read_csv(join(RESOURCES_PATH, 'lastnames.csv'))
            # Assign the filled column back. `df.prefix.fillna('', inplace=True)` is a chained
            # in-place update that no longer modifies the frame under pandas Copy-on-Write, which
            # would leave NaN prefixes in the lastname tuples.
            df_lastnames['prefix'] = df_lastnames['prefix'].fillna('')
            lastnames = list(zip(df_lastnames['prefix'], df_lastnames['name']))

        # Given (prefix, lastname) tuple select first character of lastname and use as index
        def lastname_index_getter(prefix_lastname_tuple):
            return prefix_lastname_tuple[1][0]

        self.lastname_index = _inverted_name_index(lastnames, index_getter=lastname_index_getter)

    def gender_index_for_name(self, firstname):
        """Make a best guess at the gender of the given firstname and return the matching
        firstname index.

        Performs a case-insensitive lookup in the male firstnames. If the name is not present
        there, it is assumed that the gender is female.

        Parameters
        ----------
        firstname : str
            The firstname to look up.

        Returns
        -------
        dict(str: [str])
            The name index corresponding to the gender of `firstname`.
        """
        if firstname.lower() in self.__male_normalized:
            return self.male_index
        return self.female_index

    @staticmethod
    def normalize_index_key(key):
        """Transliterate `key` with `unidecode` and lowercase it (e.g., Ö => o)."""
        return unidecode(key).lower()
class InitialsSurrogates(ExactMatchGenerator):
    """Replace initials character by character using a fixed character mapping."""

    def __init__(self, annotations, char_mapping):
        """Store the character mapping and pass `annotations` on to the base class.

        Parameters
        ----------
        annotations : list
            Forwarded unchanged to `ExactMatchGenerator`.
        char_mapping : dict(str: str)
            Maps a lowercase character to its replacement character.
        """
        super().__init__(annotations=annotations)
        self.char_mapping = char_mapping

    def replace_one(self, annotation):
        """Map every character of `annotation` and keep the casing of the original character.

        Characters without an entry in the mapping (e.g., ".") are copied unchanged.

        Parameters
        ----------
        annotation : str
            The initials to replace (e.g., "M.T.").

        Returns
        -------
        str
            The replaced initials; same length as `annotation` when the mapping values are
            single characters.
        """
        mapped_chars = []
        for char in annotation:
            # Lookup is done on the lowercase form; unknown characters fall back to themselves.
            mapped = self.char_mapping.get(char.lower(), char)
            # Re-apply the casing of the source character; caseless characters stay as mapped.
            if char.isupper():
                mapped = mapped.upper()
            elif char.islower():
                mapped = mapped.lower()
            mapped_chars.append(mapped)
        return ''.join(mapped_chars)
class NameSurrogates(SurrogateGenerator):
    """Generate surrogates for person names that keep the syntactic pattern of the original.

    Each annotation is parsed with `nameparser.HumanName`. The first, middle and last name parts
    are then replaced one by one inside the original string, so titles, suffixes and punctuation
    stay untouched.
    """

    # Default database shared by all instances. It is created on first use instead of in the
    # signature of `__init__`, because a `NameDatabase()` default argument is evaluated at import
    # time and would read the bundled name lists whenever this module is imported.
    _default_name_database = None

    def __init__(self, annotations, random_data, firstname_char_mapping, lastname_char_mapping,
                 name_database=None):
        """
        Parameters
        ----------
        annotations : list of str
            The names to replace. Forwarded to `SurrogateGenerator`.
        random_data : deidentify.surrogates.RandomData
            The random operations provider. Forwarded to `SurrogateGenerator`.
        firstname_char_mapping : dict(str: str)
            Character mapping used for firstnames, middle names and initials.
        lastname_char_mapping : dict(str: str)
            Character mapping used for lastnames.
        name_database : NameDatabase, optional
            Source of surrogate names. If omitted, a lazily created `NameDatabase()` shared by
            all instances is used.
        """
        super().__init__(annotations=annotations, random_data=random_data)
        self.firstname_char_mapping = firstname_char_mapping
        self.lastname_char_mapping = lastname_char_mapping
        if name_database is None:
            name_database = self._shared_name_database()
        self.name_database = name_database
        self.initials_surrogates = InitialsSurrogates(annotations=[],
                                                      char_mapping=firstname_char_mapping)

    @staticmethod
    def _shared_name_database():
        """Return the shared default `NameDatabase`, creating it on first use."""
        if NameSurrogates._default_name_database is None:
            NameSurrogates._default_name_database = NameDatabase()
        return NameSurrogates._default_name_database

    @staticmethod
    def normalize_name(annotation):
        """Parse `annotation` into its name parts with `nameparser.HumanName`."""
        return HumanName(annotation)

    @staticmethod
    def remove_prepositions(lastname):
        """Remove everything matched by `PREPOSITIONS_REGEX` from `lastname`."""
        return PREPOSITIONS_REGEX.sub('', lastname)

    @staticmethod
    def is_initials(part_of_name):
        """Return True if nothing is left of `part_of_name` after removing `INITIALS_REGEX`
        matches."""
        return not INITIALS_REGEX.sub('', part_of_name)

    def _get_surrogate_name(self, index, index_key, char_mapping):
        """Retrieve random surrogate from the given index according to a character mapped index.

        Index keys are normalized by transliterating unicode characters to their ascii equivalent
        and lowercasing (e.g., Ö => o).

        Parameters
        ----------
        index : dict(str: [str])
            The name index to retrieve a random name from. The first letter of the values equals
            the index key.
        index_key : str
            The first letter of a name.
        char_mapping : dict(str: str)
            A random character mapping.

        Returns
        -------
        str
            A random name starting with the value of `char_mapping[index_key]`.

        Raises
        ------
        KeyError
            If the normalized `index_key` is not part of `char_mapping`, or if `index` holds no
            names for the mapped letter.
        """
        index_key = self.name_database.normalize_index_key(index_key)
        first_letter_surrogate = char_mapping[index_key]
        # `.get` avoids inserting an empty bucket when `index` is a defaultdict. A missing or
        # empty bucket is reported as KeyError, which `replace_all` handles, rather than leaving
        # the failure to `random_data.choice` on an empty list.
        candidates = index.get(first_letter_surrogate)
        if candidates is None or len(candidates) == 0:
            raise KeyError(first_letter_surrogate)
        return self.random_data.choice(candidates)

    @staticmethod
    def restore_case(original_char, new_string):
        """Give the first character of `new_string` the casing of `original_char`.

        The first character is lowercased if `original_char` is lowercase and uppercased
        otherwise. The remainder of `new_string` is not changed.
        """
        # Slicing (instead of indexing) keeps an empty `new_string` from raising.
        head, tail = new_string[:1], new_string[1:]
        if original_char.islower():
            return head.lower() + tail
        return head.upper() + tail

    def surrogate_firstname(self, firstname):
        """Pick a random firstname from the index matching the guessed gender of `firstname`."""
        name_index = self.name_database.gender_index_for_name(firstname)
        first_letter = firstname[0]
        return self._get_surrogate_name(name_index, first_letter, self.firstname_char_mapping)

    def surrogate_lastname(self, lastname):
        """Pick a random entry of the lastname index; prepositions of `lastname` are ignored
        when determining its first letter."""
        lastname = self.remove_prepositions(lastname)
        first_letter = lastname[0]
        return self._get_surrogate_name(self.name_database.lastname_index,
                                        first_letter, self.lastname_char_mapping)

    @staticmethod
    def cached_surrogate(cache, name_string, replacement_generator):
        """Generate a new surrogate for the given name or use an existing one if it already
        exists.

        Parameters
        ----------
        cache : dict(str: str)
            Lookup table of already replaced names. Keys are lowercased original names and values
            are surrogates. A newly generated surrogate is stored in this table.
        name_string : str
            The name to generate a replacement for
        replacement_generator : callable accepting `name_string`
            A callable that generates a new surrogate for `name_string`, in case no previous
            replacement exists.

        Returns
        -------
        str
            The surrogate name.
        """
        cache_key = name_string.lower()
        replacement = cache.get(cache_key)
        if not replacement:
            replacement = replacement_generator(name_string)
            cache[cache_key] = replacement
        return replacement

    @staticmethod
    def strict_replace(part, replacement, whole):
        """Replace all standalone occurrences of `part` in `whole` with `replacement`.

        Parameters
        ----------
        part : str
            The literal text to replace.
        replacement : str
            The literal text to insert.
        whole : str
            The string to search in.

        Returns
        -------
        str
            `whole` with every standalone occurrence of `part` replaced.
        """
        # initials may end on ., so matching on a word boundary \b is insufficient.
        # If we match on \W, we need to re-insert this in the substitute.
        # - `(?<!\w)` is used on the left instead of `\b` so that parts starting with a non-word
        #   character (e.g., "'t Hart") can match.
        # - `$` is added on the right so that initials at the very end of the string (e.g., "J.")
        #   can match.
        pattern = r'(?<!\w)(?:{})(\W|\b|$)'.format(re.escape(part))
        # A function is used as substitute so that backslashes in `replacement` are inserted
        # literally and not interpreted as regex escapes / group references.
        return re.sub(pattern, lambda match: replacement + match.group(1), whole)

    def _surrogate_given_name_part(self, part, firstname_mapping):
        """Return the surrogate for one first/middle name token (either initials or a name)."""
        if self.is_initials(part):
            return self.initials_surrogates.replace_one(part)
        # If part is not an initial, we assume it is a first or middle name
        surrogate = self.cached_surrogate(firstname_mapping, part, self.surrogate_firstname)
        return self.restore_case(part[0], surrogate)

    def _surrogate_full_lastname(self, lastname, lastname_mapping):
        """Return the surrogate "<prefix> <lastname>" string for a parsed last name."""
        original_lastname = self.remove_prepositions(lastname)
        # The cache is keyed on the full last name, including its prepositions.
        prefix, surrogate = self.cached_surrogate(lastname_mapping, lastname,
                                                  self.surrogate_lastname)
        surrogate = self.restore_case(original_lastname[0], surrogate)
        if prefix:
            return prefix + ' ' + surrogate
        return surrogate

    def _replace_name(self, annotation, firstname_mapping, lastname_mapping):
        """Replace the first, middle and last name found in `annotation`.

        Parameters
        ----------
        annotation : str
            The name to replace.
        firstname_mapping : dict(str: str)
            Cache of firstname/middle name surrogates; updated in place.
        lastname_mapping : dict(str: (str, str))
            Cache of lastname surrogates as (prefix, lastname) tuples; updated in place.

        Returns
        -------
        str
            `annotation` with its name parts replaced.

        Raises
        ------
        AssertionError
            If the result is identical to `annotation`, i.e., nothing was replaced.
        KeyError
            If no surrogate can be looked up for one of the name parts.
        """
        new_name = annotation
        name = self.normalize_name(annotation)

        if name.first:
            replacement = self._surrogate_given_name_part(name.first, firstname_mapping)
            new_name = self.strict_replace(part=name.first, replacement=replacement,
                                           whole=new_name)

        if name.middle:
            replacement = ' '.join(
                self._surrogate_given_name_part(part, firstname_mapping)
                for part in name.middle.split()
            )
            new_name = self.strict_replace(part=name.middle, replacement=replacement,
                                           whole=new_name)

        if name.last:
            replacement = self._surrogate_full_lastname(name.last, lastname_mapping)
            new_name = self.strict_replace(part=name.last, replacement=replacement,
                                           whole=new_name)

        # Raised explicitly instead of using an `assert` statement: asserts are removed when
        # Python runs with -O, which would silently return the unchanged name.
        if new_name == annotation:
            raise AssertionError('Name was not changed by the replacement')
        return new_name

    def replace_all(self):
        """Generate a surrogate for every annotation.

        Returns
        -------
        list
            One entry per annotation, in order. An entry is `None` if the annotation could not
            be replaced.
        """
        firstname_mapping = {}
        lastname_mapping = {}
        replaced = []

        for annotation in self.annotations:
            # TODO: Add an annotation object that encapsules automatic replacement errors
            new_name = None
            try:
                new_name = self._replace_name(annotation, firstname_mapping, lastname_mapping)
            except (AssertionError, KeyError):
                logger.opt(exception=False).debug('Could not process name {}', annotation)
            replaced.append(new_name)

        return replaced