b/.eggs/bleach-3.1.0-py3.6.egg/bleach/linkifier.py
from __future__ import unicode_literals
import re
import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Reverse the list so that .com doesn't get matched as .co
TLDS.reverse()

def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = linkifier.Linker(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(protocols), '|'.join(tlds)),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)

URL_RE = build_url_re()


PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)


EMAIL_RE = re.compile(
    r"""(?<!//)
    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)

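# Illustrative sketch, not part of the original module: roughly what the
# compiled patterns above match. Exact spans depend on the TLD list and the
# allowed protocols in use.
#
#     URL_RE.search('docs at http://example.com/path?q=1 today').group(0)
#     # -> 'http://example.com/path?q=1'
#     URL_RE.search('see example.com for details').group(0)
#     # -> 'example.com'
#     PROTO_RE.match('ftp://example.com').group(0)
#     # -> 'ftp://'
#     EMAIL_RE.search('mail jane.doe@example.com please').group(0)
#     # -> 'jane.doe@example.com'
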
class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This class converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations caused by messy text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows the recognized tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes attributes itself, so the serializer
            # doesn't have to
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)


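# Illustrative usage sketch, not part of the original module. Output depends on
# the callbacks in use; with the default ``nofollow`` callback it is roughly:
#
#     linker = Linker(parse_email=True)
#     linker.linkify(u'visit example.com')
#     # -> u'visit <a href="http://example.com" rel="nofollow">example.com</a>'
#     linker.linkify(u'mail jane@example.com')
#     # -> u'mail <a href="mailto:jane@example.com">jane@example.com</a>'
#
# (the ``nofollow`` callback deliberately leaves ``mailto:`` links alone)

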
class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream of tokens to filter

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to no callbacks (``Linker`` passes
            ``bleach.linkifier.DEFAULT_CALLBACKS``)

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. If a callback
        returns ``None``, we stop running callbacks, return ``None``, and the
        link gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

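    # Sketch of a callback that ``apply_callbacks`` would run; the callback
    # itself is an assumption for illustration, not part of the original
    # module. Callbacks receive the attrs dict (keyed by ``(namespace, name)``
    # plus the ``_text`` pseudo-key) and the ``is_new`` flag; returning
    # ``None`` drops the link entirely:
    #
    #     def drop_internal_links(attrs, new=False):
    #         href = attrs.get((None, u'href'), u'')
    #         if href.startswith(u'http://intranet.example.com'):
    #             return None
    #         attrs[(None, u'target')] = u'_blank'
    #         return attrs
    #
    #     linker = Linker(callbacks=[drop_internal_links])
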
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return u''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

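    # Illustrative sketch, not part of the original module: with
    # ``parse_email=True`` and callbacks that accept the match, the method
    # above rewrites a Characters token roughly like this:
    #
    #     {'type': 'Characters', 'data': u'ping jane@example.com'}
    #
    # becomes
    #
    #     {'type': 'Characters', 'data': u'ping '}
    #     {'type': 'StartTag', 'name': u'a',
    #      'data': {(None, u'href'): u'mailto:jane@example.com'}}
    #     {'type': 'Characters', 'data': u'jane@example.com'}
    #     {'type': 'EndTag', 'name': u'a'}
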
    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith(u'('):
                prefix = prefix + u'('
                fragment = fragment[1:]

                if fragment.endswith(u')'):
                    suffix = u')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try removing extraneous things from the end. For example,
            # sometimes we pick up ) at the end of a url, but the url is in a
            # parenthesized phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(u')') and u'(' not in fragment:
                fragment = fragment[:-1]
                suffix = u')' + suffix
                continue

            # Handle commas
            if fragment.endswith(u','):
                fragment = fragment[:-1]
                suffix = u',' + suffix
                continue

            # Handle periods
            if fragment.endswith(u'.'):
                fragment = fragment[:-1]
                suffix = u'.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

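    # Illustrative sketch, not part of the original module: when the url regex
    # drags in surrounding punctuation, the method above peels it off and
    # returns it separately, roughly:
    #
    #     self.strip_non_url_bits(u'(http://example.com),')
    #     # -> (u'http://example.com', u'(', u'),')
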
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # an "a" tag may already be open if parse_email=True found an email earlier
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # them with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]

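    # Illustrative sketch, not part of the original module: the end-to-end
    # effect of the method above on a pre-existing link, with the default
    # callbacks, is roughly:
    #
    #     Linker().linkify(u'<a href="http://example.com">the site</a>')
    #     # -> u'<a href="http://example.com" rel="nofollow">the site</a>'
    #
    # If a callback had returned ``None``, the whole element would have been
    # replaced by the character data u'the site'.
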
    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
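
# Illustrative sketch, not part of the original module: wiring LinkifyFilter
# into an html5lib-style pipeline by hand, mirroring what Linker.linkify()
# does above:
#
#     parser = html5lib_shim.BleachHTMLParser(
#         tags=html5lib_shim.HTML_TAGS, strip=False, consume_entities=True,
#         namespaceHTMLElements=False,
#     )
#     walker = html5lib_shim.getTreeWalker('etree')
#     serializer = html5lib_shim.BleachHTMLSerializer(
#         quote_attr_values='always', omit_optional_tags=False,
#         sanitize=False, alphabetical_attributes=False,
#     )
#
#     dom = parser.parseFragment(u'see example.com')
#     filtered = LinkifyFilter(
#         source=walker(dom), callbacks=DEFAULT_CALLBACKS, parse_email=True,
#     )
#     serializer.render(filtered)
#     # -> roughly u'see <a href="http://example.com" rel="nofollow">example.com</a>'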