|
a |
|
b/docs/scripts/clickable_snippets.py |
|
|
1 |
# Based on https://github.com/darwindarak/mdx_bib |
|
|
2 |
import os |
|
|
3 |
import re |
|
|
4 |
from bisect import bisect_right |
|
|
5 |
from typing import Tuple |
|
|
6 |
|
|
|
7 |
import jedi |
|
|
8 |
import mkdocs.structure.pages |
|
|
9 |
import parso |
|
|
10 |
import regex |
|
|
11 |
from mkdocs.config.config_options import Type as MkType |
|
|
12 |
from mkdocs.config.defaults import MkDocsConfig |
|
|
13 |
from mkdocs.plugins import BasePlugin |
|
|
14 |
|
|
|
15 |
from docs.scripts.autorefs.plugin import AutorefsPlugin |
|
|
16 |
|
|
|
17 |
try: |
|
|
18 |
from importlib.metadata import entry_points |
|
|
19 |
except ImportError: |
|
|
20 |
from importlib_metadata import entry_points |
|
|
21 |
|
|
|
22 |
|
|
|
23 |
from bs4 import BeautifulSoup |
|
|
24 |
|
|
|
25 |
# Citation / bibliography patterns. NOTE(review): presumably inherited from
# mdx_bib (see the "Based on" comment at the top of the file); none of them is
# referenced in the visible part of this module — confirm before removing.
BRACKET_RE = re.compile(r"\[([^\[]+)\]")
CITE_RE = re.compile(r"@([\w_:-]+)")
DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
# NOTE(review): the alternation binds as `\A\t` OR ` {4}(.*)`; if the intent
# was "a tab or four spaces at the start", it needs a group — confirm.
INDENT_RE = re.compile(r"\A\t| {4}(.*)")

# Matches the value of an `href` (inside <a ...>) or `src` (inside <img ...>)
# attribute. The variable-width lookbehind requires the third-party `regex`
# module; the stdlib `re` module rejects it.
#   group 1: double-quoted value (quotes excluded)
#   group 2: single-quoted value (quotes excluded)
#   group 3: unquoted value
HREF_REGEX = (
    r"(?<=<\s*(?:a[^>]*href|img[^>]*src)=)"
    # The single-quoted branch must consume its closing quote, otherwise the
    # replacement leaves a dangling `'` behind the rewritten value.
    r'(?:"([^"]*)"|\'([^\']*)\'|[ ]*([^ =>]*)(?![a-z]+=))'
)
# Maybe find something less specific ?
# Plain-text component name such as `eds.sentences`; group 1 is the part
# after the `eds.` prefix.
PIPE_REGEX = r"(?<![a-zA-Z0-9._-])eds[.]([a-zA-Z0-9._-]*)(?![a-zA-Z0-9._-])"

# Same component name once it has been split into three consecutive <span>
# elements (`eds`, `.`, name). Verbose mode: whitespace in the pattern is
# ignored.
HTML_PIPE_REGEX = r"""(?x)
(?<![a-zA-Z0-9._-])
<span[^>]*>eds<\/span>
<span[^>]*>[.]<\/span>
<span[^>]*>([a-zA-Z0-9._-]*)<\/span>
(?![a-zA-Z0-9._-])
"""

# One bracketed citation group such as `[@key]` or `[@key1, @key2]`.
CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"
|
|
46 |
|
|
|
47 |
|
|
|
48 |
class ClickableSnippetsPlugin(BasePlugin):
    """
    MkDocs plugin that post-processes rendered pages.

    It turns `eds.*` component names and identifiers found in multi-line
    code snippets into links to their reference entries (resolved through
    the `autorefs` plugin), and rewrites absolute `href`/`src` targets as
    paths relative to the rendered page.
    """

    config_scheme: Tuple[Tuple[str, MkType]] = ()

    @mkdocs.plugins.event_priority(1000)
    def on_config(self, config: MkDocsConfig):
        """
        Swap the configured `autorefs` plugin for the project's `AutorefsPlugin`.

        Parameters
        ----------
        config
            The MkDocs configuration; its plugin collection is modified in place.
        """
        # Unregister every event handler whose repr mentions "autorefs".
        # Iterate over a copy since handlers are removed from the live list.
        for events in config.plugins.events.values():
            for event in list(events):
                if "autorefs" in str(event):
                    events.remove(event)

        old_plugin = config["plugins"]["autorefs"]
        plugin_config = dict(old_plugin.config)
        plugin = AutorefsPlugin()
        # NOTE(review): presumably assigning into `config.plugins` registers
        # the new plugin's event handlers — confirm against MkDocs.
        config.plugins["autorefs"] = plugin
        config["plugins"]["autorefs"] = plugin
        # Carry the previous plugin's options over to the replacement.
        plugin.load_config(plugin_config)

    @classmethod
    def get_ep_namespace(cls, ep, namespace):
        """
        Return the entry points registered under `namespace`.

        Parameters
        ----------
        ep
            The object returned by `entry_points()`: either one exposing
            `select(group=...)` or a dict-like mapping of group name to
            entry points.
        namespace
            The entry point group name.

        Returns
        -------
        The entry points of that group (an empty list when `ep` is dict-like
        and the group is missing).
        """
        if hasattr(ep, "select"):
            return ep.select(group=namespace)
        else:  # dict
            return ep.get(namespace, [])

    @mkdocs.plugins.event_priority(-1000)
    def on_post_page(
        self,
        output: str,
        page: mkdocs.structure.pages.Page,
        config: mkdocs.config.Config,
    ):
        """
        1. Replace component names with links to the component reference
        2. Link names found in multi-line code snippets to their reference
        3. Replace absolute paths with path relative to the rendered page
           This must be performed after all other plugins have run.

        Parameters
        ----------
        output
            The rendered HTML of the page.
        page
            The page being rendered; `page.file.url` is used as the base for
            relative paths.
        config
            The MkDocs configuration, used to reach the `autorefs` plugin.

        Returns
        -------
        str
            The transformed HTML.
        """

        autorefs: AutorefsPlugin = config["plugins"]["autorefs"]
        all_entry_points = entry_points()
        page_url = os.path.join("/", page.file.url)
        # Map each registered factory name to its "module:attr" target.
        spacy_factories_entry_points = {
            entry_point.name: entry_point.value
            for entry_point in (
                *self.get_ep_namespace(all_entry_points, "spacy_factories"),
                *self.get_ep_namespace(all_entry_points, "edsnlp_factories"),
            )
        }

        def replace_component(match):
            full_group = match.group(0)
            name = "eds." + match.group(1)
            target = spacy_factories_entry_points.get(name)
            # Clamp the window start: a negative start index would wrap
            # around to the end of the page instead of its beginning.
            start = match.start(0)
            preceding = output[max(0, start - 50) : start]
            if target is not None and "DEFAULT:" not in preceding:
                try:
                    url = autorefs.get_item_url(target.replace(":", "."))
                except KeyError:
                    # No reference entry for this component: keep the text.
                    pass
                else:
                    return f"<a href={url}>{name}</a>"
            return full_group

        def replace_link(match):
            # Take the first group that took part in the match. An `or`
            # chain would skip an empty quoted value and yield None.
            url = next((g for g in match.groups() if g is not None), "")
            relative_url = url
            if url.startswith("/"):
                relative_url = os.path.relpath(url, page_url)
            return f'"{relative_url}"'

        output = regex.sub(PIPE_REGEX, replace_component, output)
        output = regex.sub(HTML_PIPE_REGEX, replace_component, output)

        # Concatenation of every multi-line snippet, analysed in one pass.
        all_snippets = ""
        # all_offsets[i] is where all_nodes[i]'s text starts in all_snippets.
        all_offsets = []
        all_nodes = []

        soups = []

        # Collect the text of every multi-line <code> element
        for match in regex.finditer("<code>.*?</code>", output, flags=regex.DOTALL):
            node = match.group(0)
            if "\n" in node:
                soup, snippet, python_offsets, html_nodes = self.convert_html_to_code(
                    node
                )
                size = len(all_snippets)
                all_snippets += snippet + "\n"
                all_offsets.extend([size + i for i in python_offsets])
                all_nodes.extend(html_nodes)
                soups.append((soup, match.start(0), match.end(0)))

        interpreter = jedi.Interpreter(all_snippets, [{}])
        # line_starts[i] is the character offset at which line i + 1 starts.
        line_starts = [0]
        for line in all_snippets.split("\n"):
            line_starts.append(len(line) + line_starts[-1] + 1)
        line_starts[-1] -= 1

        for name in self.iter_names(interpreter._module_node):
            # Best effort: any failure while resolving or linking a name is
            # ignored and that name is simply left as is.
            try:
                line, col = name.start_pos
                offset = line_starts[line - 1] + col
                # Last text node starting at or before the name.
                node_idx = bisect_right(all_offsets, offset) - 1

                node = all_nodes[node_idx]
                gotos = interpreter.goto(line, col, follow_imports=True)
                gotos = [
                    goto
                    for goto in gotos
                    if (
                        goto
                        and goto.full_name
                        and goto.full_name.startswith("edsnlp")
                        and goto.type != "module"
                    )
                ]
                goto = gotos[0] if gotos else None
                if goto:
                    url = autorefs.get_item_url(goto.full_name)
                    # Check if node has no link in its upstream ancestors
                    if not node.find_parents("a"):
                        node.replace_with(
                            BeautifulSoup(
                                f'<a class="discrete-link" href="{url}">{node}</a>',
                                "html5lib",
                            )
                        )
            except Exception:
                pass

        # Re-insert soups into the output, last first so that the recorded
        # start/end positions of earlier snippets remain valid.
        for soup, start, end in reversed(soups):
            output = output[:start] + str(soup.find("code")) + output[end:]

        # Replace absolute paths with path relative to the rendered page
        output = regex.sub(HREF_REGEX, replace_link, output)

        return output

    @classmethod
    def iter_names(cls, root):
        """
        Yield every `parso.python.tree.Name` found in the tree under `root`.

        Parameters
        ----------
        root
            A parso node; nodes without a `children` attribute are leaves.
        """
        if isinstance(root, parso.python.tree.Name):
            yield root
        for child in getattr(root, "children", ()):
            yield from cls.iter_names(child)

    @classmethod
    def convert_html_to_code(
        cls, html_content: str
    ) -> Tuple[BeautifulSoup, str, list, list]:
        """
        Extract the text of a `<code>` element along with its text nodes.

        Parameters
        ----------
        html_content
            The HTML of a `<code>...</code>` element.

        Returns
        -------
        Tuple[BeautifulSoup, str, list, list]
            - the parsed tree containing the `<code>` element
            - the concatenated text of the element
            - the start offset of each text node in that text
            - the text nodes themselves, aligned with the offsets
        """
        # NOTE(review): presumably wrapped in <pre> so that the parser keeps
        # the snippet's whitespace untouched — confirm.
        pre_html_content = "<pre>" + html_content + "</pre>"
        soup = list(BeautifulSoup(pre_html_content, "html5lib").children)[0]
        code_element = soup.find("code")

        code_parts = []
        code_offsets = []
        html_nodes = []
        code_offset = 0

        def extract_text_with_offsets(el):
            nonlocal code_offset
            for content in el.contents:
                if isinstance(content, str):
                    # Text node: record where it starts in the extracted code
                    code_parts.append(content)
                    code_offsets.append(code_offset)
                    code_offset += len(content)
                    html_nodes.append(content)
                    continue
                # Recursively process child elements, except those carrying
                # the md-annotation class
                if "md-annotation" not in content.get("class", ""):
                    extract_text_with_offsets(content)

        extract_text_with_offsets(code_element)

        return soup, "".join(code_parts), code_offsets, html_nodes