--- a +++ b/deidentify/surrogates/generators/url.py @@ -0,0 +1,22 @@ +import re +from .base import ExactMatchGenerator +from .identifier import IDSurrogates + +TLDS = ['org', 'com', 'nl', 'de', 'be', 'co.uk', 'gov', 'net', 'edu', 'care'] +URL_ELEMENTS_REGEX = re.compile(r'(https?|www\.|\.(?:{}))'.format('|'.join(TLDS))) + + +class URLSurrogates(ExactMatchGenerator): + + def __init__(self, annotations, random_data=None): + super(URLSurrogates, self).__init__(annotations, random_data) + self.id_surrogates = IDSurrogates(annotations=[], random_data=random_data) + + def replace_one(self, annotation): + url_components = URL_ELEMENTS_REGEX.finditer(annotation) + + replacement = self.id_surrogates.replace_one(annotation) + for match in url_components: + replacement = replacement[:match.start()] + match.group(1) + replacement[match.end():] + + return replacement