--- a +++ b/deidentify/surrogates/generators/name.py @@ -0,0 +1,333 @@ +"""Generate random surrogates for names that are of the same syntactic pattern. + +Examples: +- Daniel MT de Groot => Jurrien HD Nguyen +- Dr. Annemarie van Heijer, Ph.D => Dr. Clara van der Linden, Ph.D + +Algorithm: +1. Generate two random mappings for characters of the latin alphabet: one for firstnames and one + for lastnames (e.g., A->E, B->F...). This helps to maintain continuity across documents by + always replacing names and initials with the same character mapping. +2. For each name: + 2.1 Normalize the name into it's parts (title, first, middle, last, suffix) using `nameparser` + 2.2 Check lookup table if a surrogate for this name already exists. If not, continue. + 2.3 Extract the first letter of firstname, middle name and lastname, respectively. + 2.4 Map these letters using the appropriate character mapping from step 1. + 2.5 Lookup a random name that starts with the mapped character. + 2.6 Place (name: surrogate) mapping in the lookup list. + +References: +- https://nameparser.readthedocs.io/en/latest/ +- Stubbs, A., Uzuner, Ö., Kotfila, C., Goldstein, I., & Szolovits, P. (2015). Challenges in + Synthesizing Surrogate PHI in Narrative EMRs. https://doi.org/10.1007/978-3-319-23633-9_27 +""" +import re +import string +from collections import defaultdict +from os.path import dirname, join + +import nameparser.config +import pandas as pd +from loguru import logger +from nameparser import HumanName +from unidecode import unidecode + +from .base import ExactMatchGenerator, SurrogateGenerator + +RESOURCES_PATH = join(dirname(__file__), 'resources') + +# Add common Dutch titles for Mr. and Mrs. +nameparser.config.CONSTANTS.titles.add('mw', 'dhr', 'mevr', 'mr') +# Add common Dutch prepositions to prefixes such that they are parsed as part of last names as +# opposed to middle names. +# See: https://en.wikipedia.org/wiki/Dutch_name#Tussenvoegsels +PREPOSITIONS = ['van', 'den', 'v.d.', 'vd', 'de', 'der', "'t", 'ten', 'ter', 'at', 'op'] +nameparser.config.CONSTANTS.prefixes.add(*PREPOSITIONS) + +PREPOSITIONS_REGEX = re.compile(r'((?:{})\s)*'.format('|'.join(PREPOSITIONS))) +INITIALS_REGEX = re.compile(r'([A-Z]?\.?)+') + + +def random_char_mapping(random_data): + """Generate a random mapping between for the characters of the lowercase ASCII alphabet. + + Parameters + ---------- + random_data : deidentify.surrogates.RandomData + The random operations provider. + + Returns + ------- + dict(str: str) + The random character mapping. + """ + alphabet = string.ascii_lowercase + shuffled = random_data.shuffle(alphabet) + return dict(zip(alphabet, shuffled)) + + +def _load_firstnames(filename): + return pd.read_csv(filename, names=['name'])['name'].values + + +def _inverted_name_index(names, index_getter=lambda x: x[0]): + index = defaultdict(list) + for name in names: + key = index_getter(name) + key = NameDatabase.normalize_index_key(key) + index[key].append(name) + return index + + +class NameDatabase: + + def __init__(self, firstnames_male=None, firstnames_female=None, lastnames=None): + """Provides access to the 10,000 most common Dutch firstnames/lastnames fetched from the + Meertens Instituut (see: http://www.naamkunde.net). + + Name lists are stored in a form of "inverted index" where the key is the first letter of + the firstname/lastname and the values are a list of names starting with this letter. + + Example for firstnames [Anne, Alieke, Jan, Thomas]: + ``` + { + 'a': [Anne, Alieke], + 'j': [Jan], + 't': [Thomas] + } + ``` + + Lastnames are stored as (prefix, lastname) tuples. Example: ('de', 'Groot'). + + Parameters + ---------- + firstnames_male : iterable of type `str` + A list of male firstnames. + firstnames_female : iterable of type `str` + A list of female firstnames. + lastnames : iterable of (str, str) tuples + A list of lastname tuples in form of (prefix: str, lastname: str). Example: `('de', 'Groot')`. + """ + if not firstnames_male: + firstnames_male = _load_firstnames(join(RESOURCES_PATH, 'firstnames_male.txt')) + self.__male_normalized = set(name.lower() for name in firstnames_male) + self.male_index = _inverted_name_index(firstnames_male) + + if not firstnames_female: + firstnames_female = _load_firstnames(join(RESOURCES_PATH, 'firstnames_female.txt')) + self.__female_normalized = set(name.lower() for name in firstnames_female) + self.female_index = _inverted_name_index(firstnames_female) + + if not lastnames: + df_lastnames = pd.read_csv(join(RESOURCES_PATH, 'lastnames.csv')) + df_lastnames.prefix.fillna('', inplace=True) + lastnames = df_lastnames.apply(lambda row: (row['prefix'], row['name']), axis=1) + + # Given (prefix, lastname) tuple select first character of lastname and use as index + def lastname_index_getter(prefix_lastname_tuple): + return prefix_lastname_tuple[1][0] + self.lastname_index = _inverted_name_index(lastnames, index_getter=lastname_index_getter) + + def gender_index_for_name(self, firstname): + """Make a best-guess at the gender of the given firstname and returns the appropriate index + name index. + + Performs a lookup in the firstname database. If name is not present in male index, it is + assumed that the gender is female. + + Returns + ------- + dict(str: [str]) + The name index correspoding to the gender of `firstname`. + """ + if firstname.lower() in self.__male_normalized: + return self.male_index + return self.female_index + + @staticmethod + def normalize_index_key(key): + return unidecode(key).lower() + + +class InitialsSurrogates(ExactMatchGenerator): + + def __init__(self, annotations, char_mapping): + super(InitialsSurrogates, self).__init__(annotations=annotations) + self.char_mapping = char_mapping + + def replace_one(self, annotation): + replacement = '' + for initial in annotation: + replacement_initial = self.char_mapping.get(initial.lower(), initial) + # restore casing of original initial + if initial.islower(): + replacement_initial = replacement_initial.lower() + elif initial.isupper(): + replacement_initial = replacement_initial.upper() + replacement += replacement_initial + return replacement + + +class NameSurrogates(SurrogateGenerator): + + def __init__(self, annotations, random_data, firstname_char_mapping, lastname_char_mapping, + name_database=NameDatabase()): + super(NameSurrogates, self).__init__(annotations=annotations, random_data=random_data) + + self.firstname_char_mapping = firstname_char_mapping + self.lastname_char_mapping = lastname_char_mapping + self.name_database = name_database + self.initials_surrogates = InitialsSurrogates(annotations=[], + char_mapping=firstname_char_mapping) + + @staticmethod + def normalize_name(annotation): + return HumanName(annotation) + + @staticmethod + def remove_prepositions(lastname): + return PREPOSITIONS_REGEX.sub('', lastname) + + @staticmethod + def is_initials(part_of_name): + return not INITIALS_REGEX.sub('', part_of_name) + + def _get_surrogate_name(self, index, index_key, char_mapping): + """Retrieve random surrogate from the given index according to a character mapped index. + + Index keys are normalized by transliterating unicode characters to their ascii equivalent + and lowercasing (e.g., Ö => o). + + Parameters + ---------- + index : dict(str: [str]) + The name index to retrieve a random name from. The first letter of the values equals + the index key. + index_key : str + The first letter of a name. + char_mapping : dict(str: str) + A random character mapping. + + Returns + ------- + str + A random name starting with the value of `char_mapping[index_key]`. + """ + index_key = self.name_database.normalize_index_key(index_key) + first_letter_surrogate = char_mapping[index_key] + return self.random_data.choice(index[first_letter_surrogate]) + + @staticmethod + def restore_case(original_char, new_string): + if original_char.islower(): + return new_string[0].lower() + new_string[1:] + + return new_string[0].upper() + new_string[1:] + + def surrogate_firstname(self, firstname): + name_index = self.name_database.gender_index_for_name(firstname) + first_letter = firstname[0] + return self._get_surrogate_name(name_index, first_letter, self.firstname_char_mapping) + + def surrogate_lastname(self, lastname): + lastname = self.remove_prepositions(lastname) + first_letter = lastname[0] + return self._get_surrogate_name(self.name_database.lastname_index, + first_letter, self.lastname_char_mapping) + + @staticmethod + def cached_surrogate(cache, name_string, replacement_generator): + """Generate a new surrogate for the given name or use an existing one if it already exist. + + Parameters + ---------- + cache : dict(str: str) + Lookup table of already replaced names. Keys are original names and values are + surrogates. + name_string : str + The name to generate a replacement for + replacement_generator : callable accepting `name_string` + A callable that generates a new surrogate for `name_string`, in case no previous + replacement exists. + + Returns + ------- + str + The surrogate name. + + """ + replacement = cache.get(name_string.lower(), None) + if not replacement: + replacement = replacement_generator(name_string) + cache[name_string.lower()] = replacement + return replacement + + @staticmethod + def strict_replace(part, replacement, whole): + # initials may end on ., so matching on a word boundary \b is insufficient. + # If we match on \W, we need to re-insert this in the substitute. + fmt = r'\b(?:{})(\W|\b)'.format(re.escape(part)) + return re.sub(fmt, replacement + r'\1', whole) + + def _replace_name(self, annotation, firstname_mapping, lastname_mapping): + new_name = annotation + name = self.normalize_name(annotation) + + if name.first: + if self.is_initials(name.first): + replacement = self.initials_surrogates.replace_one(name.first) + else: + replacement = self.cached_surrogate(firstname_mapping, + name.first, + self.surrogate_firstname) + replacement = self.restore_case(name.first[0], replacement) + new_name = self.strict_replace(part=name.first, replacement=replacement, whole=new_name) + + if name.middle: + replacement = '' + parts = name.middle.split() + for i, part in enumerate(parts): + if self.is_initials(part): + replacement += self.initials_surrogates.replace_one(part) + else: + # If part is not an initial, we assume it is a middle name + part_replacement = self.cached_surrogate(firstname_mapping, + part, + self.surrogate_firstname) + replacement += self.restore_case(part[0], part_replacement) + if i < len(parts) - 1: + replacement += ' ' + new_name = self.strict_replace(part=name.middle, replacement=replacement, + whole=new_name) + + if name.last: + original_lastname = self.remove_prepositions(name.last) + replacement = self.cached_surrogate(lastname_mapping, + name.last, + self.surrogate_lastname) + prefix, lastname = replacement + lastname = self.restore_case(original_lastname[0], lastname) + if prefix: + replacement = prefix + ' ' + lastname + else: + replacement = lastname + new_name = self.strict_replace(part=name.last, replacement=replacement, whole=new_name) + + assert new_name != annotation + return new_name + + def replace_all(self): + firstname_mapping = {} + lastname_mapping = {} + + replaced = [] + for annotation in self.annotations: + # TODO: Add an annotation object that encapsules automatic replacement errors + new_name = None + try: + new_name = self._replace_name(annotation, firstname_mapping, lastname_mapping) + except (AssertionError, KeyError): + logger.opt(exception=False).debug('Could not process name {}'.format(annotation)) + replaced.append(new_name) + + return replaced