a b/.eggs/bleach-3.1.0-py3.6.egg/bleach/linkifier.py
from __future__ import unicode_literals
import re

import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(protocols), '|'.join(tlds)),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)
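
# Example (illustrative sketch): build_url_re can be used to construct a
# narrower regex, for instance one recognizing only a couple of TLDs and the
# https protocol. The names ``my_tlds``, ``my_protocols`` and ``my_url_re``
# below are made up for the example; they are not part of this module.
#
#     my_tlds = ['com', 'org']
#     my_protocols = ['https']
#     my_url_re = build_url_re(tlds=my_tlds, protocols=my_protocols)
#
#     # my_url_re can then be passed to Linker(url_re=my_url_re) or to
#     # LinkifyFilter(url_re=my_url_re).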


URL_RE = build_url_re()


PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)


EMAIL_RE = re.compile(
    r"""(?<!//)
    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
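
# Rough intuition (illustrative, not exhaustive): URL_RE matches things like
# "example.com", "example.com:8000/path" and "http://example.com?q=1";
# PROTO_RE matches a leading scheme such as "http://" or "mailto:"; EMAIL_RE
# matches addresses like "user@example.com". For instance:
#
#     URL_RE.search('go to example.com today')       # matches "example.com"
#     PROTO_RE.search('https://example.com')          # matches "https://"
#     EMAIL_RE.search('mail me at user@example.com')  # matches the address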


class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This class converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify makes a best-effort attempt and tries to recover from malformed or
    unusual text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes attributes itself (via alphabetize_attributes),
            # so the serializer doesn't need to
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
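
    # Example (illustrative sketch) of the typical round trip through Linker:
    #
    #     linker = Linker(parse_email=True)
    #     linker.linkify(u'mail me at jane@example.com or see example.com')
    #
    # With the default nofollow callback this returns markup along the lines of
    # 'mail me at <a href="mailto:jane@example.com">jane@example.com</a> or see
    # <a href="http://example.com" rel="nofollow">example.com</a>'.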


class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream of html5lib tokens to filter

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to an empty list (``Linker`` passes
            ``bleach.linkifier.DEFAULT_CALLBACKS``)

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
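
    # Example (illustrative sketch) of the callback contract apply_callbacks
    # expects: each callback takes (attrs, is_new) and returns an adjusted
    # attrs dict, or None to drop the link. The domain check and callback name
    # below are made up for the example.
    #
    #     def drop_new_example_links(attrs, is_new):
    #         if is_new and 'example.invalid' in attrs.get((None, u'href'), ''):
    #             return None
    #         return attrs
    #
    #     Linker(callbacks=[drop_new_example_links])  # flows into apply_callbacks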

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return u''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
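
    # Illustrative sketch: given a single token
    # {'type': 'Characters', 'data': 'mail jane@example.com'}, and with no
    # callbacks dropping the match, this generator yields roughly:
    #
    #     {'type': 'Characters', 'data': 'mail '}
    #     {'type': 'StartTag', 'name': 'a',
    #      'data': {(None, 'href'): 'mailto:jane@example.com'}}
    #     {'type': 'Characters', 'data': 'jane@example.com'}
    #     {'type': 'EndTag', 'name': 'a'}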

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith(u'('):
                prefix = prefix + u'('
                fragment = fragment[1:]

                if fragment.endswith(u')'):
                    suffix = u')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(u')') and u'(' not in fragment:
                fragment = fragment[:-1]
                suffix = u')' + suffix
                continue

            # Handle commas
            if fragment.endswith(u','):
                fragment = fragment[:-1]
                suffix = u',' + suffix
                continue

            # Handle periods
            if fragment.endswith(u'.'):
                fragment = fragment[:-1]
                suffix = u'.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
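
    # Illustrative sketch of what this returns for a few over-matched urls:
    #
    #     self.strip_non_url_bits(u'(example.com)')    -> (u'example.com', u'(', u')')
    #     self.strip_non_url_bits(u'example.com,')     -> (u'example.com', u'', u',')
    #     self.strip_non_url_bits(u'example.com/foo.') -> (u'example.com/foo', u'', u'.')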

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        # in_a is True while we're inside an "a" tag; that can happen here if
        # parse_email=True and handle_email_addresses already created a link
        in_a = False
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]
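
    # Illustrative sketch of a callback interacting with handle_a_tag; the
    # ``shorten_text`` callback below is made up for the example.
    #
    #     def shorten_text(attrs, is_new):
    #         if not is_new:
    #             attrs['_text'] = attrs['_text'][:20]
    #         return attrs
    #
    # When the callback actually shortens ``_text``, handle_a_tag drops the
    # original tokens between the start and end "a" tags and yields a single
    # Characters token with the shortened text instead.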

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
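
# Illustrative sketch of using LinkifyFilter directly in an html5lib-style
# pipeline, mirroring what Linker.linkify does above; the input string is just
# placeholder text for the example.
#
#     parser = html5lib_shim.BleachHTMLParser(
#         tags=html5lib_shim.HTML_TAGS, strip=False, consume_entities=True,
#         namespaceHTMLElements=False)
#     walker = html5lib_shim.getTreeWalker('etree')
#     serializer = html5lib_shim.BleachHTMLSerializer(
#         quote_attr_values='always', omit_optional_tags=False,
#         sanitize=False, alphabetical_attributes=False)
#
#     dom = parser.parseFragment(u'some text with example.com in it')
#     filtered = LinkifyFilter(source=walker(dom), callbacks=DEFAULT_CALLBACKS)
#     result = serializer.render(filtered)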