deidentify / Git / Diff of /deidentify/surrogates/generators/name.py

Models:
philipB/
deidentify
Downloads: 1
Diff of /deidentify/surrogates/generators/name.py [000000] .. [7fc5df]
Switch to side-by-side view

--- a
+++ b/deidentify/surrogates/generators/name.py
@@ -0,0 +1,333 @@
+"""Generate random surrogates for names that are of the same syntactic pattern.
+
+Examples:
+- Daniel MT de Groot             => Jurrien HD Nguyen
+- Dr. Annemarie van Heijer, Ph.D => Dr. Clara van der Linden, Ph.D
+
+Algorithm:
+1. Generate two random mappings for characters of the latin alphabet: one for firstnames and one
+   for lastnames (e.g., A->E, B->F...). This helps to maintain continuity across documents by
+   always replacing names and initials with the same character mapping.
+2. For each name:
+   2.1 Normalize the name into its parts (title, first, middle, last, suffix) using `nameparser`
+   2.2 Check lookup table if a surrogate for this name already exists. If not, continue.
+   2.3 Extract the first letter of firstname, middle name and lastname, respectively.
+   2.4 Map these letters using the appropriate character mapping from step 1.
+   2.5 Lookup a random name that starts with the mapped character.
+   2.6 Place (name: surrogate) mapping in the lookup list.
+
+References:
+- https://nameparser.readthedocs.io/en/latest/
+- Stubbs, A., Uzuner, Ö., Kotfila, C., Goldstein, I., & Szolovits, P. (2015). Challenges in
+  Synthesizing Surrogate PHI in Narrative EMRs. https://doi.org/10.1007/978-3-319-23633-9_27
+"""
+import re
+import string
+from collections import defaultdict
+from os.path import dirname, join
+
+import nameparser.config
+import pandas as pd
+from loguru import logger
+from nameparser import HumanName
+from unidecode import unidecode
+
+from .base import ExactMatchGenerator, SurrogateGenerator
+
+RESOURCES_PATH = join(dirname(__file__), 'resources')
+
+# Add common Dutch titles for Mr. and Mrs.
+nameparser.config.CONSTANTS.titles.add('mw', 'dhr', 'mevr', 'mr')
+# Add common Dutch prepositions to prefixes such that they are parsed as part of last names as
+# opposed to middle names.
+# See: https://en.wikipedia.org/wiki/Dutch_name#Tussenvoegsels
+PREPOSITIONS = ['van', 'den', 'v.d.', 'vd', 'de', 'der', "'t", 'ten', 'ter', 'at', 'op']
+nameparser.config.CONSTANTS.prefixes.add(*PREPOSITIONS)
+
+PREPOSITIONS_REGEX = re.compile(r'((?:{})\s)*'.format('|'.join(PREPOSITIONS)))
+INITIALS_REGEX = re.compile(r'([A-Z]?\.?)+')
+
+
+def random_char_mapping(random_data):
+    """Generate a random mapping between for the characters of the lowercase ASCII alphabet.
+
+    Parameters
+    ----------
+    random_data : deidentify.surrogates.RandomData
+        The random operations provider.
+
+    Returns
+    -------
+    dict(str: str)
+        The random character mapping.
+    """
+    alphabet = string.ascii_lowercase
+    shuffled = random_data.shuffle(alphabet)
+    return dict(zip(alphabet, shuffled))
+
+
+def _load_firstnames(filename):
+    return pd.read_csv(filename, names=['name'])['name'].values
+
+
+def _inverted_name_index(names, index_getter=lambda x: x[0]):
+    index = defaultdict(list)
+    for name in names:
+        key = index_getter(name)
+        key = NameDatabase.normalize_index_key(key)
+        index[key].append(name)
+    return index
+
+
+class NameDatabase:
+
+    def __init__(self, firstnames_male=None, firstnames_female=None, lastnames=None):
+        """Provides access to the 10,000 most common Dutch firstnames/lastnames fetched from the
+        Meertens Instituut (see: http://www.naamkunde.net).
+
+        Name lists are stored in a form of "inverted index" where the key is the first letter of
+        the firstname/lastname and the values are a list of names starting with this letter.
+
+        Example for firstnames [Anne, Alieke, Jan, Thomas]:
+        ```
+            {
+                'a': [Anne, Alieke],
+                'j': [Jan],
+                't': [Thomas]
+            }
+        ```
+
+        Lastnames are stored as (prefix, lastname) tuples. Example: ('de', 'Groot').
+
+        Parameters
+        ----------
+        firstnames_male : iterable of type `str`
+            A list of male firstnames.
+        firstnames_female : iterable of type `str`
+            A list of female firstnames.
+        lastnames : iterable of (str, str) tuples
+            A list of lastname tuples in form of (prefix: str, lastname: str). Example: `('de', 'Groot')`.
+        """
+        if not firstnames_male:
+            firstnames_male = _load_firstnames(join(RESOURCES_PATH, 'firstnames_male.txt'))
+        self.__male_normalized = set(name.lower() for name in firstnames_male)
+        self.male_index = _inverted_name_index(firstnames_male)
+
+        if not firstnames_female:
+            firstnames_female = _load_firstnames(join(RESOURCES_PATH, 'firstnames_female.txt'))
+        self.__female_normalized = set(name.lower() for name in firstnames_female)
+        self.female_index = _inverted_name_index(firstnames_female)
+
+        if not lastnames:
+            df_lastnames = pd.read_csv(join(RESOURCES_PATH, 'lastnames.csv'))
+            df_lastnames.prefix.fillna('', inplace=True)
+            lastnames = df_lastnames.apply(lambda row: (row['prefix'], row['name']), axis=1)
+
+        # Given (prefix, lastname) tuple select first character of lastname and use as index
+        def lastname_index_getter(prefix_lastname_tuple):
+            return prefix_lastname_tuple[1][0]
+        self.lastname_index = _inverted_name_index(lastnames, index_getter=lastname_index_getter)
+
+    def gender_index_for_name(self, firstname):
+        """Make a best-guess at the gender of the given firstname and returns the appropriate index
+        name index.
+
+        Performs a lookup in the firstname database. If name is not present in male index, it is
+        assumed that the gender is female.
+
+        Returns
+        -------
+        dict(str: [str])
+            The name index correspoding to the gender of `firstname`.
+        """
+        if firstname.lower() in self.__male_normalized:
+            return self.male_index
+        return self.female_index
+
+    @staticmethod
+    def normalize_index_key(key):
+        return unidecode(key).lower()
+
+
+class InitialsSurrogates(ExactMatchGenerator):
+
+    def __init__(self, annotations, char_mapping):
+        super(InitialsSurrogates, self).__init__(annotations=annotations)
+        self.char_mapping = char_mapping
+
+    def replace_one(self, annotation):
+        replacement = ''
+        for initial in annotation:
+            replacement_initial = self.char_mapping.get(initial.lower(), initial)
+            # restore casing of original initial
+            if initial.islower():
+                replacement_initial = replacement_initial.lower()
+            elif initial.isupper():
+                replacement_initial = replacement_initial.upper()
+            replacement += replacement_initial
+        return replacement
+
+
+class NameSurrogates(SurrogateGenerator):
+
+    def __init__(self, annotations, random_data, firstname_char_mapping, lastname_char_mapping,
+                 name_database=NameDatabase()):
+        super(NameSurrogates, self).__init__(annotations=annotations, random_data=random_data)
+
+        self.firstname_char_mapping = firstname_char_mapping
+        self.lastname_char_mapping = lastname_char_mapping
+        self.name_database = name_database
+        self.initials_surrogates = InitialsSurrogates(annotations=[],
+                                                      char_mapping=firstname_char_mapping)
+
+    @staticmethod
+    def normalize_name(annotation):
+        return HumanName(annotation)
+
+    @staticmethod
+    def remove_prepositions(lastname):
+        return PREPOSITIONS_REGEX.sub('', lastname)
+
+    @staticmethod
+    def is_initials(part_of_name):
+        return not INITIALS_REGEX.sub('', part_of_name)
+
+    def _get_surrogate_name(self, index, index_key, char_mapping):
+        """Retrieve random surrogate from the given index according to a character mapped index.
+
+        Index keys are normalized by transliterating unicode characters to their ascii equivalent
+        and lowercasing (e.g., Ö => o).
+
+        Parameters
+        ----------
+        index : dict(str: [str])
+            The name index to retrieve a random name from. The first letter of the values equals
+            the index key.
+        index_key : str
+            The first letter of a name.
+        char_mapping : dict(str: str)
+            A random character mapping.
+
+        Returns
+        -------
+        str
+            A random name starting with the value of `char_mapping[index_key]`.
+        """
+        index_key = self.name_database.normalize_index_key(index_key)
+        first_letter_surrogate = char_mapping[index_key]
+        return self.random_data.choice(index[first_letter_surrogate])
+
+    @staticmethod
+    def restore_case(original_char, new_string):
+        if original_char.islower():
+            return new_string[0].lower() + new_string[1:]
+
+        return new_string[0].upper() + new_string[1:]
+
+    def surrogate_firstname(self, firstname):
+        name_index = self.name_database.gender_index_for_name(firstname)
+        first_letter = firstname[0]
+        return self._get_surrogate_name(name_index, first_letter, self.firstname_char_mapping)
+
+    def surrogate_lastname(self, lastname):
+        lastname = self.remove_prepositions(lastname)
+        first_letter = lastname[0]
+        return self._get_surrogate_name(self.name_database.lastname_index,
+                                        first_letter, self.lastname_char_mapping)
+
+    @staticmethod
+    def cached_surrogate(cache, name_string, replacement_generator):
+        """Generate a new surrogate for the given name or use an existing one if it already exist.
+
+        Parameters
+        ----------
+        cache : dict(str: str)
+            Lookup table of already replaced names. Keys are original names and values are
+            surrogates.
+        name_string : str
+            The name to generate a replacement for
+        replacement_generator : callable accepting `name_string`
+            A callable that generates a new surrogate for `name_string`, in case no previous
+            replacement exists.
+
+        Returns
+        -------
+        str
+            The surrogate name.
+
+        """
+        replacement = cache.get(name_string.lower(), None)
+        if not replacement:
+            replacement = replacement_generator(name_string)
+            cache[name_string.lower()] = replacement
+        return replacement
+
+    @staticmethod
+    def strict_replace(part, replacement, whole):
+        # initials may end on ., so matching on a word boundary \b is insufficient.
+        # If we match on \W, we need to re-insert this in the substitute.
+        fmt = r'\b(?:{})(\W|\b)'.format(re.escape(part))
+        return re.sub(fmt, replacement + r'\1', whole)
+
+    def _replace_name(self, annotation, firstname_mapping, lastname_mapping):
+        new_name = annotation
+        name = self.normalize_name(annotation)
+
+        if name.first:
+            if self.is_initials(name.first):
+                replacement = self.initials_surrogates.replace_one(name.first)
+            else:
+                replacement = self.cached_surrogate(firstname_mapping,
+                                                    name.first,
+                                                    self.surrogate_firstname)
+                replacement = self.restore_case(name.first[0], replacement)
+            new_name = self.strict_replace(part=name.first, replacement=replacement, whole=new_name)
+
+        if name.middle:
+            replacement = ''
+            parts = name.middle.split()
+            for i, part in enumerate(parts):
+                if self.is_initials(part):
+                    replacement += self.initials_surrogates.replace_one(part)
+                else:
+                    # If part is not an initial, we assume it is a middle name
+                    part_replacement = self.cached_surrogate(firstname_mapping,
+                                                             part,
+                                                             self.surrogate_firstname)
+                    replacement += self.restore_case(part[0], part_replacement)
+                if i < len(parts) - 1:
+                    replacement += ' '
+            new_name = self.strict_replace(part=name.middle, replacement=replacement,
+                                           whole=new_name)
+
+        if name.last:
+            original_lastname = self.remove_prepositions(name.last)
+            replacement = self.cached_surrogate(lastname_mapping,
+                                                name.last,
+                                                self.surrogate_lastname)
+            prefix, lastname = replacement
+            lastname = self.restore_case(original_lastname[0], lastname)
+            if prefix:
+                replacement = prefix + ' ' + lastname
+            else:
+                replacement = lastname
+            new_name = self.strict_replace(part=name.last, replacement=replacement, whole=new_name)
+
+        assert new_name != annotation
+        return new_name
+
+    def replace_all(self):
+        firstname_mapping = {}
+        lastname_mapping = {}
+
+        replaced = []
+        for annotation in self.annotations:
+            # TODO: Add an annotation object that encapsules automatic replacement errors
+            new_name = None
+            try:
+                new_name = self._replace_name(annotation, firstname_mapping, lastname_mapping)
+            except (AssertionError, KeyError):
+                logger.opt(exception=False).debug('Could not process name {}'.format(annotation))
+            replaced.append(new_name)
+
+        return replaced