# docs/scripts/clickable_snippets.py
# Based on https://github.com/darwindarak/mdx_bib
import os
import re
from bisect import bisect_right
from typing import Tuple
import jedi
import mkdocs.structure.pages
import parso
import regex
from mkdocs.config.config_options import Type as MkType
from mkdocs.config.defaults import MkDocsConfig
from mkdocs.plugins import BasePlugin
from docs.scripts.autorefs.plugin import AutorefsPlugin
try:
from importlib.metadata import entry_points
except ImportError:
from importlib_metadata import entry_points
from bs4 import BeautifulSoup
# Citation / snippet parsing patterns (based on mdx_bib).
BRACKET_RE = re.compile(r"\[([^\[]+)\]")
CITE_RE = re.compile(r"@([\w_:-]+)")
DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
INDENT_RE = re.compile(r"\A\t| {4}(.*)")

# Matches the value of an `href`/`src` attribute: double-quoted (group 1),
# single-quoted (group 2) or unquoted (group 3). Requires the `regex` module
# because the lookbehind is variable-width.
# Fix: the single-quoted alternative was missing its closing \' — a
# single-quoted value matched without consuming the trailing quote, leaving a
# stray `'` behind once the link was rewritten in double quotes.
HREF_REGEX = (
    r"(?<=<\s*(?:a[^>]*href|img[^>]*src)=)"
    r'(?:"([^"]*)"|\'([^\']*)\'|[ ]*([^ =>]*)(?![a-z]+=))'
)

# Maybe find something less specific ?
# A bare `eds.xxx` component name in plain text.
PIPE_REGEX = r"(?<![a-zA-Z0-9._-])eds[.]([a-zA-Z0-9._-]*)(?![a-zA-Z0-9._-])"

# The same component name once the syntax highlighter has split it into spans.
HTML_PIPE_REGEX = r"""(?x)
(?<![a-zA-Z0-9._-])
<span[^>]*>eds<\/span>
<span[^>]*>[.]<\/span>
<span[^>]*>([a-zA-Z0-9._-]*)<\/span>
(?![a-zA-Z0-9._-])
"""

# One or more comma-separated @citations inside a single bracket pair.
CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"
class ClickableSnippetsPlugin(BasePlugin):
    """
    MkDocs plugin that post-processes rendered pages to:

    1. Turn ``eds.xxx`` component names into links to their reference page.
    2. Link names inside multi-line code snippets to their API documentation,
       resolved with jedi against the concatenated snippets of the page.
    3. Rewrite absolute hrefs/srcs into paths relative to the current page.
    """

    config_scheme: Tuple[Tuple[str, MkType], ...] = ()

    @mkdocs.plugins.event_priority(1000)
    def on_config(self, config: MkDocsConfig):
        """Replace the stock autorefs plugin with our patched AutorefsPlugin,
        preserving the original plugin's configuration."""
        for event_name, events in config.plugins.events.items():
            for event in list(events):
                # Drop the stock autorefs handlers; the replacement plugin
                # registers its own when installed below.
                if "autorefs" in str(event):
                    events.remove(event)
        old_plugin = config["plugins"]["autorefs"]
        plugin_config = dict(old_plugin.config)
        plugin = AutorefsPlugin()
        config.plugins["autorefs"] = plugin
        config["plugins"]["autorefs"] = plugin
        plugin.load_config(plugin_config)

    @classmethod
    def get_ep_namespace(cls, ep, namespace):
        """Return the entry points of *namespace*, supporting both the modern
        ``EntryPoints.select`` API and the legacy dict-of-lists interface."""
        if hasattr(ep, "select"):
            return ep.select(group=namespace)
        else:  # dict
            return ep.get(namespace, [])

    @mkdocs.plugins.event_priority(-1000)
    def on_post_page(
        self,
        output: str,
        page: mkdocs.structure.pages.Page,
        config: mkdocs.config.Config,
    ):
        """
        1. Replace absolute paths with path relative to the rendered page
           This must be performed after all other plugins have run.
        2. Replace component names with links to the component reference

        Parameters
        ----------
        output : str
            Rendered HTML of the page.
        page : mkdocs.structure.pages.Page
            The page being rendered.
        config : mkdocs.config.Config
            Global MkDocs configuration.

        Returns
        -------
        str
            The transformed HTML.
        """
        autorefs: AutorefsPlugin = config["plugins"]["autorefs"]
        ep = entry_points()
        page_url = os.path.join("/", page.file.url)

        # Map factory name -> entry-point value for spaCy's and edsnlp's
        # factories. The comprehension's `ep` shadows the outer one only
        # inside the comprehension scope.
        spacy_factories_entry_points = {
            ep.name: ep.value
            for ep in (
                *self.get_ep_namespace(ep, "spacy_factories"),
                *self.get_ep_namespace(ep, "edsnlp_factories"),
            )
        }

        def replace_component(match):
            # Replace an `eds.xxx` occurrence with a link to its reference,
            # unless it appears right after "DEFAULT:" (e.g. parameter tables).
            full_group = match.group(0)
            name = "eds." + match.group(1)
            ep = spacy_factories_entry_points.get(name)
            preceding = output[match.start(0) - 50 : match.start(0)]
            if ep is not None and "DEFAULT:" not in preceding:
                try:
                    url = autorefs.get_item_url(ep.replace(":", "."))
                except KeyError:
                    pass
                else:
                    # Quote the attribute value: unquoted hrefs break as soon
                    # as the URL contains a space or '>'.
                    return f'<a href="{url}">{name}</a>'
            return full_group

        def replace_link(match):
            # Groups 1/2/3 come from HREF_REGEX: double-quoted, single-quoted
            # or unquoted attribute value.
            relative_url = url = match.group(1) or match.group(2) or match.group(3)
            if url.startswith("/"):
                relative_url = os.path.relpath(url, page_url)
            return f'"{relative_url}"'

        output = regex.sub(PIPE_REGEX, replace_component, output)
        output = regex.sub(HTML_PIPE_REGEX, replace_component, output)

        # Concatenate every multi-line code snippet of the page into one
        # virtual module so jedi can resolve names across snippets.
        all_snippets = ""
        all_offsets = []
        all_nodes = []
        soups = []
        for match in regex.finditer("<code>.*?</code>", output, flags=regex.DOTALL):
            node = match.group(0)
            if "\n" in node:
                soup, snippet, python_offsets, html_nodes = self.convert_html_to_code(
                    node
                )
                size = len(all_snippets)
                all_snippets += snippet + "\n"
                all_offsets.extend([size + i for i in python_offsets])
                all_nodes.extend(html_nodes)
                soups.append((soup, match.start(0), match.end(0)))

        interpreter = jedi.Interpreter(all_snippets, [{}])

        # Cumulative start offset of each line, to convert a (line, col)
        # position into a flat offset in `all_snippets`.
        line_lengths = [0]
        for line in all_snippets.split("\n"):
            line_lengths.append(len(line) + line_lengths[-1] + 1)
        line_lengths[-1] -= 1

        for name in self.iter_names(interpreter._module_node):
            try:
                line, col = name.start_pos
                offset = line_lengths[line - 1] + col
                # Find the HTML text node that contains this offset.
                node_idx = bisect_right(all_offsets, offset) - 1
                node = all_nodes[node_idx]
                gotos = interpreter.goto(line, col, follow_imports=True)
                # Keep only definitions that live inside edsnlp (not modules).
                gotos = [
                    goto
                    for goto in gotos
                    if (
                        goto
                        and goto.full_name
                        and goto.full_name.startswith("edsnlp")
                        and goto.type != "module"
                    )
                ]
                goto = gotos[0] if gotos else None
                if goto:
                    url = autorefs.get_item_url(goto.full_name)
                    # Check if node has no link in its upstream ancestors
                    if not node.find_parents("a"):
                        node.replace_with(
                            BeautifulSoup(
                                f'<a class="discrete-link" href="{url}">{node}</a>',
                                "html5lib",
                            )
                        )
            except Exception:
                # Best-effort linking: a failure on one name must not break
                # the page rendering.
                pass

        # Re-insert the soups into the output, last-to-first so earlier
        # match offsets stay valid.
        for soup, start, end in reversed(soups):
            output = output[:start] + str(soup.find("code")) + output[end:]

        # Finally, relativize absolute hrefs/srcs.
        output = regex.sub(HREF_REGEX, replace_link, output)
        return output

    @classmethod
    def iter_names(cls, root):
        """Yield every parso ``Name`` node in the tree rooted at *root*."""
        if isinstance(root, parso.python.tree.Name):
            yield root
        for child in getattr(root, "children", ()):
            yield from cls.iter_names(child)

    @classmethod
    def convert_html_to_code(
        cls, html_content: str
    ) -> Tuple[BeautifulSoup, str, list, list]:
        """
        Extract the Python source contained in a rendered ``<code>`` snippet.

        Parameters
        ----------
        html_content : str
            The ``<code>...</code>`` HTML fragment.

        Returns
        -------
        Tuple[BeautifulSoup, str, list, list]
            The parsed soup, the concatenated Python code, the offset of each
            text fragment inside that code, and the matching HTML text nodes.
        """
        pre_html_content = "<pre>" + html_content + "</pre>"
        soup = list(BeautifulSoup(pre_html_content, "html5lib").children)[0]
        code_element = soup.find("code")

        python_code = ""
        code_offsets = []
        html_nodes = []
        code_offset = 0

        def extract_text_with_offsets(el):
            # Walk the element tree, collecting raw text nodes and the code
            # offset at which each one starts.
            nonlocal python_code, code_offset
            for content in el.contents:
                if isinstance(content, str):
                    python_code += content
                    code_offsets.append(code_offset)
                    code_offset += len(content)
                    html_nodes.append(content)
                    continue
                # Skip mkdocs-material annotation bubbles: they are rendered
                # inside the code block but are not part of the code.
                if "md-annotation" not in content.get("class", ""):
                    extract_text_with_offsets(content)

        extract_text_with_offsets(code_element)
        return soup, python_code, code_offsets, html_nodes