deidentify / Git / [7fc5df] /deidentify/surrogates/generators/url.py

Models:

philipB/

deidentify

Downloads: 1

[7fc5df]: / deidentify / surrogates / generators / url.py

History

Download this file

23 lines (15 with data), 835 Bytes

import re
from .base import ExactMatchGenerator
from .identifier import IDSurrogates

# Top-level domains whose ".<tld>" suffix counts as a structural URL element.
TLDS = ['org', 'com', 'nl', 'de', 'be', 'co.uk', 'gov', 'net', 'edu', 'care']
# Matches the structural parts of a URL: the scheme ("http" / "https"), a
# "www." prefix, or a literal dot followed by one of TLDS. Every TLD is passed
# through re.escape so the dot inside multi-part entries such as "co.uk" only
# matches a literal "." instead of any character.
URL_ELEMENTS_REGEX = re.compile(
    r'(https?|www\.|\.(?:{}))'.format('|'.join(re.escape(tld) for tld in TLDS))
)


class URLSurrogates(ExactMatchGenerator):
    """Surrogate generator for URL annotations.

    The surrogate text is produced by an ``IDSurrogates`` generator; every span
    that ``URL_ELEMENTS_REGEX`` matches in the original annotation is then
    written back over the same index range of the surrogate.
    """

    def __init__(self, annotations, random_data=None):
        """Initialise the base generator and the helper ID generator.

        Args:
            annotations: Forwarded unchanged to the base-class initialiser.
            random_data: Forwarded to the base class and to ``IDSurrogates``.
        """
        super(URLSurrogates, self).__init__(annotations, random_data)
        self.id_surrogates = IDSurrogates(annotations=[], random_data=random_data)

    def replace_one(self, annotation):
        """Return a surrogate for ``annotation`` with its URL elements restored.

        Args:
            annotation: Text of the URL annotation to replace.

        Returns:
            The ``IDSurrogates`` replacement in which each regex match found in
            the original annotation has been copied back at the same offsets.
        """
        # The match iterator is created before the surrogate is generated;
        # offsets always refer to the original annotation text.
        preserved = URL_ELEMENTS_REGEX.finditer(annotation)
        surrogate = self.id_surrogates.replace_one(annotation)

        # NOTE(review): presumably the ID surrogate has the same length as the
        # annotation, so the offsets line up -- confirm against IDSurrogates.
        for element in preserved:
            begin, end = element.span()
            head, tail = surrogate[:begin], surrogate[end:]
            surrogate = head + element.group(1) + tail

        return surrogate