Switch to unified view

a b/docs/scripts/clickable_snippets.py
1
# Based on https://github.com/darwindarak/mdx_bib
2
import os
3
import re
4
from bisect import bisect_right
5
from typing import Tuple
6
7
import jedi
8
import mkdocs.structure.pages
9
import parso
10
import regex
11
from mkdocs.config.config_options import Type as MkType
12
from mkdocs.config.defaults import MkDocsConfig
13
from mkdocs.plugins import BasePlugin
14
15
from docs.scripts.autorefs.plugin import AutorefsPlugin
16
17
try:
18
    from importlib.metadata import entry_points
19
except ImportError:
20
    from importlib_metadata import entry_points
21
22
23
from bs4 import BeautifulSoup
24
25
# Citation-related patterns (this module is based on mdx_bib).
BRACKET_RE = re.compile(r"\[([^\[]+)\]")
CITE_RE = re.compile(r"@([\w_:-]+)")
DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
# NOTE(review): as written, the alternation reads "tab at start of string" OR
# "four spaces followed by a captured rest"; `\A(?:\t| {4})(.*)` may have been
# intended. Left unchanged since nothing in this module uses it.
INDENT_RE = re.compile(r"\A\t| {4}(.*)")

# Matches the value of an `href` attribute of an `<a>` tag or the `src`
# attribute of an `<img>` tag. The variable-length lookbehind is not supported
# by the standard `re` module, so this pattern must be used with `regex`.
# Exactly one of the three groups participates in a match:
#   1. double-quoted value (quotes consumed, not captured)
#   2. single-quoted value (quotes consumed, not captured)
#   3. unquoted value
HREF_REGEX = (
    r"(?<=<\s*(?:a[^>]*href|img[^>]*src)=)"
    # The single-quoted alternative must also consume the closing quote,
    # otherwise a replacement of the match leaves a dangling `'` behind.
    r'(?:"([^"]*)"|\'([^\']*)\'|[ ]*([^ =>]*)(?![a-z]+=))'
)

# Matches a plain-text component name such as `eds.sentences`, capturing the
# part after `eds.`.
# Maybe find something less specific?
PIPE_REGEX = r"(?<![a-zA-Z0-9._-])eds[.]([a-zA-Z0-9._-]*)(?![a-zA-Z0-9._-])"

# Same as PIPE_REGEX, but for a name that has been split into three
# consecutive `<span>` elements (`eds`, `.`, and the rest of the name).
HTML_PIPE_REGEX = r"""(?x)
(?<![a-zA-Z0-9._-])
<span[^>]*>eds<\/span>
<span[^>]*>[.]<\/span>
<span[^>]*>([a-zA-Z0-9._-]*)<\/span>
(?![a-zA-Z0-9._-])
"""

# Matches a bracketed list of citations, e.g. `[@key1, @key2]`.
CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"
46
47
48
class ClickableSnippetsPlugin(BasePlugin):
49
    config_scheme: Tuple[Tuple[str, MkType]] = ()
50
51
    @mkdocs.plugins.event_priority(1000)
    def on_config(self, config: MkDocsConfig):
        """
        Swap the registered `autorefs` plugin for this project's
        `AutorefsPlugin`.

        Every event handler whose string representation mentions "autorefs"
        is unregistered, then a fresh `AutorefsPlugin`, loaded with the
        configuration of the plugin it replaces, is stored under the
        "autorefs" key.

        Parameters
        ----------
        config: MkDocsConfig
            The MkDocs configuration, modified in place
        """
        # Iterate over a snapshot of each handler list since it is mutated
        # while being scanned.
        for handlers in config.plugins.events.values():
            for handler in tuple(handlers):
                if "autorefs" not in str(handler):
                    continue
                handlers.remove(handler)

        previous_settings = dict(config["plugins"]["autorefs"].config)
        replacement = AutorefsPlugin()
        config.plugins["autorefs"] = replacement
        config["plugins"]["autorefs"] = replacement
        replacement.load_config(previous_settings)
63
64
    @classmethod
65
    def get_ep_namespace(cls, ep, namespace):
66
        if hasattr(ep, "select"):
67
            return ep.select(group=namespace)
68
        else:  # dict
69
            return ep.get(namespace, [])
70
71
    @mkdocs.plugins.event_priority(-1000)
    def on_post_page(
        self,
        output: str,
        page: mkdocs.structure.pages.Page,
        config: mkdocs.config.Config,
    ):
        """
        Post-process the rendered HTML of a page.

        1. Replace component names (`eds.xxx`) with links to the component
           reference
        2. Wrap names found in multi-line `<code>` blocks that jedi resolves
           to `edsnlp` objects in links to their reference
        3. Replace absolute paths with path relative to the rendered page
           This must be performed after all other plugins have run.

        Parameters
        ----------
        output: str
            The rendered HTML of the page
        page: mkdocs.structure.pages.Page
            The page being rendered; `page.file.url` is the base used to
            compute relative links
        config: mkdocs.config.Config
            The MkDocs configuration; its "autorefs" plugin is used to
            resolve identifiers to urls

        Returns
        -------
        str
            The post-processed HTML
        """

        autorefs: AutorefsPlugin = config["plugins"]["autorefs"]
        page_url = os.path.join("/", page.file.url)

        # Map each registered factory name to the value of its entry point
        all_entry_points = entry_points()
        factories = {
            entry.name: entry.value
            for entry in (
                *self.get_ep_namespace(all_entry_points, "spacy_factories"),
                *self.get_ep_namespace(all_entry_points, "edsnlp_factories"),
            )
        }

        def replace_component(match):
            name = "eds." + match.group(1)
            target = factories.get(name)
            # Inspect (at most) the 50 characters preceding the match. The
            # slice start is clamped to 0: a negative start would wrap around
            # to the end of the string and yield the wrong context.
            start = match.start(0)
            preceding = match.string[max(0, start - 50) : start]
            if target is not None and "DEFAULT:" not in preceding:
                try:
                    url = autorefs.get_item_url(target.replace(":", "."))
                except KeyError:
                    pass
                else:
                    return f"<a href={url}>{name}</a>"
            return match.group(0)

        def replace_link(match):
            # Exactly one group participates in a match, and it may be the
            # empty string (e.g. `href=""`), so test against None rather
            # than relying on truthiness.
            url = next((g for g in match.groups() if g is not None), "")
            relative_url = url
            if url.startswith("/"):
                relative_url = os.path.relpath(url, page_url)
            return f'"{relative_url}"'

        output = regex.sub(PIPE_REGEX, replace_component, output)
        output = regex.sub(HTML_PIPE_REGEX, replace_component, output)

        # Text of all multi-line code blocks, concatenated into one source
        all_snippets = ""
        # Start offset in `all_snippets` of each text node, and the nodes
        all_offsets = []
        all_nodes = []
        # (parsed block, start, end) for each code block taken from `output`
        soups = []

        # Collect the text of every multi-line <code> block
        for match in regex.finditer("<code>.*?</code>", output, flags=regex.DOTALL):
            node = match.group(0)
            if "\n" in node:
                soup, snippet, python_offsets, html_nodes = self.convert_html_to_code(
                    node
                )
                size = len(all_snippets)
                all_snippets += snippet + "\n"
                all_offsets.extend([size + i for i in python_offsets])
                all_nodes.extend(html_nodes)
                soups.append((soup, match.start(0), match.end(0)))

        interpreter = jedi.Interpreter(all_snippets, [{}])

        # Offset in `all_snippets` of the first character of each line
        line_starts = [0]
        for line in all_snippets.split("\n"):
            line_starts.append(len(line) + line_starts[-1] + 1)
        line_starts[-1] -= 1

        for name in self.iter_names(interpreter._module_node):
            try:
                line, col = name.start_pos
                # `line` is used as a 1-based index (hence `line - 1`)
                offset = line_starts[line - 1] + col
                # Last text node starting at or before the name
                node = all_nodes[bisect_right(all_offsets, offset) - 1]
                gotos = [
                    goto
                    for goto in interpreter.goto(line, col, follow_imports=True)
                    if (
                        goto
                        and goto.full_name
                        and goto.full_name.startswith("edsnlp")
                        and goto.type != "module"
                    )
                ]
                if not gotos:
                    continue
                url = autorefs.get_item_url(gotos[0].full_name)
                # Check if node has no link in its upstream ancestors
                if not node.find_parents("a"):
                    node.replace_with(
                        BeautifulSoup(
                            f'<a class="discrete-link" href="{url}">{node}</a>',
                            "html5lib",
                        )
                    )
            except Exception:
                # Linking names is best effort: a name that cannot be
                # resolved or linked is left as is.
                pass

        # Re-insert soups into the output, last block first so that the
        # offsets of the blocks still to be processed stay valid
        for soup, start, end in reversed(soups):
            output = output[:start] + str(soup.find("code")) + output[end:]

        # Replace absolute paths with path relative to the rendered page
        output = regex.sub(HREF_REGEX, replace_link, output)

        return output
192
193
    @classmethod
    def iter_names(cls, root):
        """
        Yield every `parso.python.tree.Name` node of the tree rooted at
        `root`, in pre-order (a node before its children, children from
        first to last).

        Parameters
        ----------
        root
            The root node; nodes without a `children` attribute are
            treated as leaves
        """
        pending = [root]
        while pending:
            current = pending.pop()
            if isinstance(current, parso.python.tree.Name):
                yield current
            # Push children in reverse so that the first child is popped
            # (and therefore visited) first.
            pending.extend(reversed(getattr(current, "children", ())))
199
200
    @classmethod
    def convert_html_to_code(
        cls, html_content: str
    ) -> Tuple[BeautifulSoup, str, list, list]:
        """
        Extract the text of a `<code>` element along with the text nodes
        it is made of.

        The content is wrapped in a `<pre>` element before being parsed
        with html5lib. Elements whose class contains "md-annotation" are
        skipped, together with everything they contain.

        Parameters
        ----------
        html_content: str
            The HTML of the `<code>` element

        Returns
        -------
        Tuple[BeautifulSoup, str, list, list]
            - the first child of the parsed document
            - the concatenated text of the kept text nodes
            - the start offset of each text node in that text
            - the text nodes themselves, in document order
        """
        pre_html_content = "<pre>" + html_content + "</pre>"
        soup = list(BeautifulSoup(pre_html_content, "html5lib").children)[0]
        code_element = soup.find("code")

        def iter_text_nodes(el):
            for content in el.contents:
                if isinstance(content, str):
                    yield content
                # check not class md-annotation, then recursively process
                # child elements
                elif "md-annotation" not in content.get("class", ""):
                    yield from iter_text_nodes(content)

        html_nodes = list(iter_text_nodes(code_element))

        # Offset of each text node in the concatenated text
        code_offsets = []
        code_offset = 0
        for text in html_nodes:
            code_offsets.append(code_offset)
            code_offset += len(text)

        python_code = "".join(html_nodes)

        return soup, python_code, code_offsets, html_nodes