.eggs/bleach-3.1.0-py3.6.egg/bleach/sanitizer.py
from __future__ import unicode_literals

from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'
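
# Illustrative sketch (not part of the original module): each character in the
# INVISIBLE_CHARACTERS range is replaced by INVISIBLE_REPLACEMENT_CHAR, e.g.
#
#   >>> INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, 'a\x01b')
#   'a?b'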


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed for transforming content to be used in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!


    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        """
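        # A minimal sketch of the ``filters`` argument (illustrative only; it
        # assumes bleach.linkifier.LinkifyFilter, which takes the token stream
        # as its first argument, so extra arguments are bound with
        # functools.partial):
        #
        #   from functools import partial
        #   from bleach.linkifier import LinkifyFilter
        #
        #   cleaner = Cleaner(filters=[partial(LinkifyFilter, skip_tags=['pre'])])
        #   cleaner.clean('see http://example.com')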
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
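    # Illustrative only (an assumed usage sketch, not part of the original
    # module)--each of these shapes produces a filter function:
    #
    #   attribute_filter_factory(['href', 'title'])
    #       -> allows href/title on any allowed tag
    #   attribute_filter_factory({'a': ['href'], '*': ['title']})
    #       -> per-tag lists; the '*' entry applies to every tag
    #   attribute_filter_factory(lambda tag, attr, value: attr.startswith('data-'))
    #       -> the callable decides per (tag, attr, value)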
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
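        # Illustrative example (not from the original source): two adjacent
        # tokens {'type': 'Characters', 'data': 'foo'} and
        # {'type': 'Characters', 'data': 'bar'} come out of this generator as a
        # single {'type': 'Characters', 'data': 'foobar'}; any other token type
        # flushes the buffer first and is then passed through unchanged.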
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token

    def __iter__(self):
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, attribute name, and
        attribute value, and returns ``True`` or ``False``.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
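        # Token dicts here follow the html5lib treewalker format (assumed for
        # illustration), roughly:
        #
        #   {'type': 'StartTag', 'name': 'a', 'data': {(None, 'href'): 'http://example.com'}}
        #   {'type': 'Characters', 'data': 'some text'}
        #   {'type': 'EndTag', 'name': 'a'}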
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
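        # Rough illustration (assuming the html5lib_shim helpers split the data
        # at each "&" as described): a Characters token with data
        # 'x &amp; y &lt; z' comes back roughly as the list
        #   [{'type': 'Characters', 'data': 'x '},
        #    {'type': 'Characters', 'data': '&'},   # the &amp; special case below
        #    {'type': 'Characters', 'data': ' y '},
        #    {'type': 'Entity', 'name': 'lt'},
        #    {'type': 'Characters', 'data': ' z'}]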
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
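        # Illustrative outcomes (assuming the default ALLOWED_PROTOCOLS):
        #   'http://example.com/'   -> kept (scheme is allowed)
        #   'jAvAsCrIpT:alert(1)'   -> None (scheme is not allowed)
        #   '#section-1'            -> kept (bare fragment)
        #   'example.com/page'      -> kept, treated as an implicit 'http' uri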
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
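        # Illustrative outcome (not from the original source): a disallowed
        # token such as <script src="x"> is returned as a Characters token
        # whose data is the literal markup '<script src="x">'; the serializer
        # then escapes that text instead of rendering a tag.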
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
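        # Illustrative behavior (assuming 'color' is in allowed_css_properties
        # and 'position' is not):
        #   'color: red; position: fixed'  ->  'color: red;'
        #   'background: url(evil)'        ->  ''   (url() values are stripped first)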
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)