deidentify / Git / [7fc5df] /deidentify/surrogates/generators/name.py

Models:
philipB/
deidentify
Downloads: 1
[7fc5df]: / deidentify / surrogates / generators / name.py
History
Download this file
334 lines (271 with data), 13.4 kB

"""Generate random surrogates for names that are of the same syntactic pattern.

Examples:
- Daniel MT de Groot             => Jurrien HD Nguyen
- Dr. Annemarie van Heijer, Ph.D => Dr. Clara van der Linden, Ph.D

Algorithm:
1. Generate two random mappings for characters of the Latin alphabet: one for firstnames and one
   for lastnames (e.g., A->E, B->F...). This helps to maintain continuity across documents by
   always replacing names and initials with the same character mapping.
2. For each name:
   2.1 Normalize the name into its parts (title, first, middle, last, suffix) using `nameparser`
   2.2 Check lookup table if a surrogate for this name already exists. If not, continue.
   2.3 Extract the first letter of firstname, middle name and lastname, respectively.
   2.4 Map these letters using the appropriate character mapping from step 1.
   2.5 Lookup a random name that starts with the mapped character.
   2.6 Place (name: surrogate) mapping in the lookup list.

References:
- https://nameparser.readthedocs.io/en/latest/
- Stubbs, A., Uzuner, Ö., Kotfila, C., Goldstein, I., & Szolovits, P. (2015). Challenges in
  Synthesizing Surrogate PHI in Narrative EMRs. https://doi.org/10.1007/978-3-319-23633-9_27
"""
import re
import string
from collections import defaultdict
from os.path import dirname, join

import nameparser.config
import pandas as pd
from loguru import logger
from nameparser import HumanName
from unidecode import unidecode

from .base import ExactMatchGenerator, SurrogateGenerator

# Directory (next to this module) from which the default name lists are loaded.
RESOURCES_PATH = join(dirname(__file__), 'resources')

# Add common Dutch titles for Mr. and Mrs.
nameparser.config.CONSTANTS.titles.add('mw', 'dhr', 'mevr', 'mr')
# Add common Dutch prepositions to prefixes such that they are parsed as part of last names as
# opposed to middle names.
# See: https://en.wikipedia.org/wiki/Dutch_name#Tussenvoegsels
PREPOSITIONS = ['van', 'den', 'v.d.', 'vd', 'de', 'der', "'t", 'ten', 'ter', 'at', 'op']
nameparser.config.CONSTANTS.prefixes.add(*PREPOSITIONS)

# Matches a run of prepositions that are each followed by one whitespace character.
# Every preposition is escaped so that the dots in 'v.d.' only match literal dots instead of
# acting as the regex wildcard.
PREPOSITIONS_REGEX = re.compile(
    r'((?:{})\s)*'.format('|'.join(re.escape(preposition) for preposition in PREPOSITIONS)))
# Matches runs of uppercase ASCII letters and dots (e.g., "J.", "MT", "H.D.").
INITIALS_REGEX = re.compile(r'([A-Z]?\.?)+')


def random_char_mapping(random_data):
    """Build a random mapping for the characters of the lowercase ASCII alphabet.

    Every letter 'a'..'z' becomes a key; its value is the element found at the same position in
    the sequence returned by `random_data.shuffle`.

    Parameters
    ----------
    random_data : deidentify.surrogates.RandomData
        The random operations provider. Its `shuffle` method is called with the alphabet.

    Returns
    -------
    dict(str: str)
        The random character mapping.
    """
    source_letters = string.ascii_lowercase
    target_letters = random_data.shuffle(source_letters)
    return {source: target for source, target in zip(source_letters, target_letters)}


def _load_firstnames(filename):
    """Read a headerless CSV of first names and return its single 'name' column as an array."""
    frame = pd.read_csv(filename, names=['name'])
    name_column = frame['name']
    return name_column.values


def _inverted_name_index(names, index_getter=lambda x: x[0]):
    """Group `names` into buckets keyed by a normalized index key.

    Parameters
    ----------
    names : iterable
        The entries to index.
    index_getter : callable
        Extracts the raw index key from an entry. By default, the first element of the entry
        (i.e., the first character of a string).

    Returns
    -------
    defaultdict(list)
        Maps each key, normalized with `NameDatabase.normalize_index_key`, to the list of
        entries that produced it, in input order.
    """
    buckets = defaultdict(list)
    for entry in names:
        bucket_key = NameDatabase.normalize_index_key(index_getter(entry))
        buckets[bucket_key].append(entry)
    return buckets


class NameDatabase:
    """Lookup tables of firstnames and lastnames, indexed by their (normalized) first letter."""

    def __init__(self, firstnames_male=None, firstnames_female=None, lastnames=None):
        """Provides access to the 10,000 most common Dutch firstnames/lastnames fetched from the
        Meertens Instituut (see: http://www.naamkunde.net).

        Name lists are stored in a form of "inverted index" where the key is the first letter of
        the firstname/lastname and the values are a list of names starting with this letter.

        Example for firstnames [Anne, Alieke, Jan, Thomas]:
        ```
            {
                'a': [Anne, Alieke],
                'j': [Jan],
                't': [Thomas]
            }
        ```

        Lastnames are stored as (prefix, lastname) tuples. Example: ('de', 'Groot').

        Any name list that is omitted (or empty) is loaded from the files in `RESOURCES_PATH`.

        Parameters
        ----------
        firstnames_male : iterable of type `str`
            A list of male firstnames.
        firstnames_female : iterable of type `str`
            A list of female firstnames.
        lastnames : iterable of (str, str) tuples
            A list of lastname tuples in form of (prefix: str, lastname: str). Example: `('de', 'Groot')`.
        """
        # Materialize every input once. This keeps the emptiness checks below well-defined for
        # numpy arrays (whose truth value is ambiguous) and allows one-shot iterables, which
        # would otherwise be exhausted by the first of the two passes over each list.
        firstnames_male = [] if firstnames_male is None else list(firstnames_male)
        if not firstnames_male:
            firstnames_male = _load_firstnames(join(RESOURCES_PATH, 'firstnames_male.txt'))
        self.__male_normalized = {name.lower() for name in firstnames_male}
        self.male_index = _inverted_name_index(firstnames_male)

        firstnames_female = [] if firstnames_female is None else list(firstnames_female)
        if not firstnames_female:
            firstnames_female = _load_firstnames(join(RESOURCES_PATH, 'firstnames_female.txt'))
        self.__female_normalized = {name.lower() for name in firstnames_female}
        self.female_index = _inverted_name_index(firstnames_female)

        lastnames = [] if lastnames is None else list(lastnames)
        if not lastnames:
            df_lastnames = pd.read_csv(join(RESOURCES_PATH, 'lastnames.csv'))
            # Assign the filled column back instead of calling `fillna(inplace=True)` on the
            # selected column: the chained in-place form is not guaranteed to update the frame
            # (it warns on recent pandas versions and is a no-op with Copy-on-Write enabled).
            df_lastnames['prefix'] = df_lastnames['prefix'].fillna('')
            lastnames = list(zip(df_lastnames['prefix'], df_lastnames['name']))

        # Given (prefix, lastname) tuple select first character of lastname and use as index
        def lastname_index_getter(prefix_lastname_tuple):
            return prefix_lastname_tuple[1][0]
        self.lastname_index = _inverted_name_index(lastnames, index_getter=lastname_index_getter)

    def gender_index_for_name(self, firstname):
        """Make a best guess at the gender of the given firstname and return the matching name
        index.

        Performs a case-insensitive lookup in the male firstnames. If the name is not present
        there, it is assumed that the gender is female.

        Parameters
        ----------
        firstname : str
            The firstname to look up.

        Returns
        -------
        dict(str: [str])
            The name index corresponding to the gender of `firstname`.
        """
        if firstname.lower() in self.__male_normalized:
            return self.male_index
        return self.female_index

    @staticmethod
    def normalize_index_key(key):
        """Transliterate `key` with `unidecode` and lowercase the result."""
        return unidecode(key).lower()


class InitialsSurrogates(ExactMatchGenerator):
    """Replace initials character by character according to a fixed character mapping."""

    def __init__(self, annotations, char_mapping):
        """Store the mapping used to substitute the individual characters.

        Parameters
        ----------
        annotations : list
            Forwarded unchanged to the base class constructor.
        char_mapping : dict(str: str)
            Mapping that is looked up with lowercased characters.
        """
        super().__init__(annotations=annotations)
        self.char_mapping = char_mapping

    def replace_one(self, annotation):
        """Return `annotation` with every character substituted through the character mapping.

        Characters without an entry in the mapping (e.g., dots) are kept as they are. The casing
        of each original character is carried over to its substitute.
        """
        def substitute(char):
            mapped = self.char_mapping.get(char.lower(), char)
            # Carry the casing of the original character over to its substitute.
            if char.islower():
                return mapped.lower()
            if char.isupper():
                return mapped.upper()
            return mapped

        return ''.join(substitute(char) for char in annotation)


class NameSurrogates(SurrogateGenerator):
    """Generate surrogate names that follow the same syntactic pattern as the original names.

    Firstnames, middle names and lastnames are replaced by random names from a `NameDatabase`
    whose first letter is determined by a character mapping; initials are mapped character by
    character.
    """

    # Default database, created on first use and shared by all instances that do not supply
    # their own. Creating it lazily avoids loading the name lists when this module is imported.
    _default_name_database = None

    def __init__(self, annotations, random_data, firstname_char_mapping, lastname_char_mapping,
                 name_database=None):
        """
        Parameters
        ----------
        annotations : list of str
            The names to replace. Forwarded to the base class constructor.
        random_data : deidentify.surrogates.RandomData
            The random operations provider. Forwarded to the base class constructor.
        firstname_char_mapping : dict(str: str)
            Character mapping applied to the first letter of firstnames/middle names and to
            initials.
        lastname_char_mapping : dict(str: str)
            Character mapping applied to the first letter of lastnames.
        name_database : NameDatabase, optional
            The database to draw surrogate names from. If omitted, a shared default
            `NameDatabase()` is used.
        """
        super(NameSurrogates, self).__init__(annotations=annotations, random_data=random_data)

        if name_database is None:
            name_database = self._get_default_name_database()

        self.firstname_char_mapping = firstname_char_mapping
        self.lastname_char_mapping = lastname_char_mapping
        self.name_database = name_database
        self.initials_surrogates = InitialsSurrogates(annotations=[],
                                                      char_mapping=firstname_char_mapping)

    @staticmethod
    def _get_default_name_database():
        """Return the shared default `NameDatabase`, creating it on first use."""
        if NameSurrogates._default_name_database is None:
            NameSurrogates._default_name_database = NameDatabase()
        return NameSurrogates._default_name_database

    @staticmethod
    def normalize_name(annotation):
        """Parse `annotation` into its name parts using `nameparser.HumanName`."""
        return HumanName(annotation)

    @staticmethod
    def remove_prepositions(lastname):
        """Return `lastname` with all matches of `PREPOSITIONS_REGEX` removed."""
        return PREPOSITIONS_REGEX.sub('', lastname)

    @staticmethod
    def is_initials(part_of_name):
        """Return True if nothing remains of `part_of_name` after removing `INITIALS_REGEX`
        matches (i.e., it only consists of uppercase letters and dots)."""
        return not INITIALS_REGEX.sub('', part_of_name)

    def _get_surrogate_name(self, index, index_key, char_mapping):
        """Retrieve random surrogate from the given index according to a character mapped index.

        Index keys are normalized by transliterating unicode characters to their ascii equivalent
        and lowercasing (e.g., Ö => o).

        Parameters
        ----------
        index : dict(str: [str])
            The name index to retrieve a random name from. The first letter of the values equals
            the index key.
        index_key : str
            The first letter of a name.
        char_mapping : dict(str: str)
            A random character mapping.

        Returns
        -------
        str
            A random name starting with the value of `char_mapping[index_key]`.

        Raises
        ------
        KeyError
            If the normalized `index_key` is not part of `char_mapping`, or if `index` holds no
            names for the mapped letter.
        """
        index_key = self.name_database.normalize_index_key(index_key)
        first_letter_surrogate = char_mapping[index_key]
        # Use `.get` rather than subscription: the indexes are defaultdicts, so subscripting an
        # unknown letter would insert an empty list and hand it to `random_data.choice`.
        candidates = index.get(first_letter_surrogate)
        if candidates is None or len(candidates) == 0:
            raise KeyError(first_letter_surrogate)
        return self.random_data.choice(candidates)

    @staticmethod
    def restore_case(original_char, new_string):
        """Return `new_string` with its first character lowercased if `original_char` is
        lowercase, and uppercased otherwise."""
        if original_char.islower():
            return new_string[0].lower() + new_string[1:]

        return new_string[0].upper() + new_string[1:]

    def surrogate_firstname(self, firstname):
        """Return a random firstname from the index matching the guessed gender of `firstname`.

        The first letter of the surrogate is determined by the firstname character mapping.
        """
        name_index = self.name_database.gender_index_for_name(firstname)
        first_letter = firstname[0]
        return self._get_surrogate_name(name_index, first_letter, self.firstname_char_mapping)

    def surrogate_lastname(self, lastname):
        """Return a random entry of the lastname index for `lastname`.

        Prepositions are removed from `lastname` before its first letter is mapped with the
        lastname character mapping.
        """
        lastname = self.remove_prepositions(lastname)
        first_letter = lastname[0]
        return self._get_surrogate_name(self.name_database.lastname_index,
                                        first_letter, self.lastname_char_mapping)

    @staticmethod
    def cached_surrogate(cache, name_string, replacement_generator):
        """Generate a new surrogate for the given name or use an existing one if it already exists.

        Parameters
        ----------
        cache : dict(str: str)
            Lookup table of already replaced names. Keys are the lowercased original names and
            values are surrogates.
        name_string : str
            The name to generate a replacement for
        replacement_generator : callable accepting `name_string`
            A callable that generates a new surrogate for `name_string`, in case no previous
            replacement exists.

        Returns
        -------
        str
            The surrogate name.

        """
        cache_key = name_string.lower()
        replacement = cache.get(cache_key, None)
        if not replacement:
            replacement = replacement_generator(name_string)
            cache[cache_key] = replacement
        return replacement

    @staticmethod
    def strict_replace(part, replacement, whole):
        """Replace every occurrence of `part` in `whole` that starts at a word boundary and is
        followed by a non-word character or a word boundary."""
        # initials may end on ., so matching on a word boundary \b is insufficient.
        # If we match on \W, we need to re-insert this in the substitute.
        fmt = r'\b(?:{})(\W|\b)'.format(re.escape(part))
        # A callable replacement inserts `replacement` literally; passed as a template string,
        # any backslash in it would be interpreted as a regex escape or group reference.
        return re.sub(fmt, lambda match: replacement + match.group(1), whole)

    def _replace_name(self, annotation, firstname_mapping, lastname_mapping):
        """Replace the first, middle and last name found in `annotation` with surrogates.

        Parameters
        ----------
        annotation : str
            The original name.
        firstname_mapping : dict
            Cache of firstname/middle name surrogates, see `cached_surrogate`.
        lastname_mapping : dict
            Cache of lastname surrogates, see `cached_surrogate`.

        Returns
        -------
        str
            `annotation` with its name parts replaced.

        Raises
        ------
        AssertionError
            If the result is identical to `annotation`, i.e., nothing was replaced.
        KeyError
            If no surrogate can be looked up for a name part.
        """
        new_name = annotation
        name = self.normalize_name(annotation)

        if name.first:
            if self.is_initials(name.first):
                replacement = self.initials_surrogates.replace_one(name.first)
            else:
                replacement = self.cached_surrogate(firstname_mapping,
                                                    name.first,
                                                    self.surrogate_firstname)
                replacement = self.restore_case(name.first[0], replacement)
            new_name = self.strict_replace(part=name.first, replacement=replacement, whole=new_name)

        if name.middle:
            middle_replacements = []
            for part in name.middle.split():
                if self.is_initials(part):
                    middle_replacements.append(self.initials_surrogates.replace_one(part))
                else:
                    # If part is not an initial, we assume it is a middle name
                    part_replacement = self.cached_surrogate(firstname_mapping,
                                                             part,
                                                             self.surrogate_firstname)
                    middle_replacements.append(self.restore_case(part[0], part_replacement))
            replacement = ' '.join(middle_replacements)
            new_name = self.strict_replace(part=name.middle, replacement=replacement,
                                           whole=new_name)

        if name.last:
            original_lastname = self.remove_prepositions(name.last)
            replacement = self.cached_surrogate(lastname_mapping,
                                                name.last,
                                                self.surrogate_lastname)
            # Lastname surrogates are (prefix, lastname) tuples.
            prefix, lastname = replacement
            lastname = self.restore_case(original_lastname[0], lastname)
            if prefix:
                replacement = prefix + ' ' + lastname
            else:
                replacement = lastname
            new_name = self.strict_replace(part=name.last, replacement=replacement, whole=new_name)

        # Raise explicitly instead of using an `assert` statement: asserts are removed when
        # Python runs with -O, which would let an unchanged name pass as its own surrogate.
        if new_name == annotation:
            raise AssertionError('Name was not changed by the surrogate replacement')
        return new_name

    def replace_all(self):
        """Generate a surrogate for every annotation.

        Returns
        -------
        list
            One entry per annotation, in order: the surrogate name, or `None` if the name could
            not be processed.
        """
        firstname_mapping = {}
        lastname_mapping = {}

        replaced = []
        for annotation in self.annotations:
            # TODO: Add an annotation object that encapsules automatic replacement errors
            new_name = None
            try:
                new_name = self._replace_name(annotation, firstname_mapping, lastname_mapping)
            except (AssertionError, KeyError):
                # Pass the annotation as a formatting argument so that braces inside a name
                # are never interpreted as format fields by the logger.
                logger.opt(exception=False).debug('Could not process name {}', annotation)
            replaced.append(new_name)

        return replaced