👌 IMPROVE: Allow setting {#id} on headings (#706)

chrisjsewell · web-flow · commit c8b6fc34cc49 · 2023-02-20T07:59:13.000+01:00
For use with the `attrs_block` extension.
This just moves around the logic for implicit heading anchors a bit (without changing anything user facing), to allow distinguishing between explicit and implicit heading ids.
diff --git a/myst_parser/cli.py b/myst_parser/cli.py
@@ -2,6 +2,8 @@
 import sys
 
 from markdown_it.renderer import RendererHTML
+from markdown_it.rules_core import StateCore
+from mdit_py_plugins.anchors import anchors_plugin
 
 from myst_parser.config.main import MdParserConfig
 from myst_parser.parsers.mdit import create_md_parser
@@ -28,9 +30,10 @@ def print_anchors(args=None):
         "-l", "--level", type=int, default=2, help="Maximum heading level."
     )
     args = arg_parser.parse_args(args)
-    parser = create_md_parser(MdParserConfig(heading_anchors=args.level), RendererHTML)
+    parser = create_md_parser(MdParserConfig(), RendererHTML)
+    parser.use(anchors_plugin, max_level=args.level)
 
-    def _filter_plugin(state):
+    def _filter_plugin(state: StateCore) -> None:
         state.tokens = [
             t
             for t in state.tokens
diff --git a/myst_parser/config/main.py b/myst_parser/config/main.py
@@ -267,10 +267,10 @@ def __repr__(self) -> str:
         },
     )
 
-    heading_anchors: Optional[int] = dc.field(
-        default=None,
+    heading_anchors: int = dc.field(
+        default=0,
         metadata={
-            "validator": optional(in_([1, 2, 3, 4, 5, 6, 7])),
+            "validator": optional(in_([0, 1, 2, 3, 4, 5, 6, 7])),
             "help": "Heading level depth to assign HTML anchors",
         },
     )
diff --git a/myst_parser/mdit_to_docutils/base.py b/myst_parser/mdit_to_docutils/base.py
@@ -14,6 +14,7 @@
     TYPE_CHECKING,
     Any,
     Callable,
+    Iterable,
     Iterator,
     MutableMapping,
     Sequence,
@@ -790,18 +791,24 @@ def blocks_mathjax_processing(self) -> bool:
     def render_heading(self, token: SyntaxTreeNode) -> None:
         """Render a heading, e.g. `# Heading`."""
 
-        if self.md_env.get("match_titles", None) is False:
-            # this can occur if a nested parse is performed by a directive
-            # (such as an admonition) which contains a header.
-            # this would break the document structure
-            self.create_warning(
-                "Disallowed nested header found, converting to rubric",
-                MystWarnings.MD_HEADING_NESTED,
-                line=token_line(token, default=0),
-                append_to=self.current_node,
-            )
+        if (
+            token.attrs.get("toc", None) == "false"
+            or self.md_env.get("match_titles", None) is False
+        ):
+            if self.md_env.get("match_titles", None) is False:
+                # this can occur if a nested parse is performed by a directive
+                # (such as an admonition) which contains a header.
+                # this would break the document structure
+                self.create_warning(
+                    "Disallowed nested header found, converting to rubric",
+                    MystWarnings.MD_HEADING_NESTED,
+                    line=token_line(token, default=0),
+                    append_to=self.current_node,
+                )
+
             rubric = nodes.rubric(token.content, "")
             self.add_line_and_source_path(rubric, token)
+            self.copy_attributes(token, rubric, ("class", "id"))
             with self.current_node_context(rubric, append=True):
                 self.render_children(token)
             return
@@ -811,6 +818,7 @@ def render_heading(self, token: SyntaxTreeNode) -> None:
         # create the section node
         new_section = nodes.section()
         self.add_line_and_source_path(new_section, token)
+        self.copy_attributes(token, new_section, ("class", "id"))
         # if a top level section,
         # then add classes to set default mathjax processing to false
         # we then turn it back on, on a per-node basis
@@ -830,28 +838,36 @@ def render_heading(self, token: SyntaxTreeNode) -> None:
 
         # create a target reference for the section, based on the heading text.
         # Note, this is an implicit target, meaning that it is not prioritised,
-        # and is not stored by sphinx for ref resolution
+        # during ref resolution, and is not stored in the document.
+        # TODO this is purely to mimic docutils, but maybe we don't need it?
+        # (since we have the slugify logic below)
         name = nodes.fully_normalize_name(title_node.astext())
         new_section["names"].append(name)
         self.document.note_implicit_target(new_section, new_section)
 
-        # add possible reference slug, this may be different to the standard name above,
-        # and does not have to be normalised, so we treat it separately
-        # TODO this id can now come from attributes, which we actually want to be explicit
-        # I think rather than using the mdit anchors_plugin,
-        # we should just compute them here (with the same logic)
-        if "id" in token.attrs:
-            slug = str(token.attrs["id"])
-            new_section["slug"] = slug
-            if slug in self._slug_to_section:
-                other_node = self._slug_to_section[slug]
+        if level <= self.md_config.heading_anchors:
+
+            # Create an implicit reference slug.
+            # The problem with this reference slug,
+            # is that it might not be in the "normalised" format required by docutils,
+            # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#normalized-reference-names
+            # so we store it separately, and have separate logic than docutils
+            # TODO maybe revisit this assumption, or improve the logic
+            try:
+                slug = compute_unique_slug(
+                    token,
+                    self._slug_to_section,
+                    self.md_config.heading_slug_func,
+                )
+            except Exception as error:
                 self.create_warning(
-                    f"duplicate heading slug {slug!r}, other at line {other_node.line}",
-                    MystWarnings.ANCHOR_DUPE,
-                    line=new_section.line,
+                    str(error),
+                    MystWarnings.HEADING_SLUG,
+                    line=token_line(token, default=0),
+                    append_to=self.current_node,
                 )
             else:
-                # we store this for later processing on finalise
+                new_section["slug"] = slug
                 self._slug_to_section[slug] = new_section
 
         # set the section as the current node for subsequent rendering
@@ -1842,3 +1858,42 @@ def clean_astext(node: nodes.Element) -> str:
     for raw in list(findall(node)(nodes.raw)):
         raw.parent.remove(raw)
     return node.astext()
+
+
+_SLUGIFY_CLEAN_REGEX = re.compile(r"[^\w\u4e00-\u9fff\- ]")
+
+
+def default_slugify(title: str) -> str:
+    """Slugify a heading title, aiming to mimic the GitHub Markdown format.
+
+    Strip, lower-case, hyphenate spaces, then drop what the clean regex matches; see:
+    - https://github.com/jch/html-pipeline/blob/master/lib/html/pipeline/toc_filter.rb
+    - https://gist.github.com/asabaylus/3071099
+    """
+    hyphenated = title.strip().lower().replace(" ", "-")
+    return _SLUGIFY_CLEAN_REGEX.sub("", hyphenated)
+
+
+def compute_unique_slug(
+    token_tree: SyntaxTreeNode,
+    slugs: Iterable[str],
+    slug_func: None | Callable[[str], str] = None,
+) -> str:
+    """Return a slug for the heading ``token_tree`` that is not yet in ``slugs``.
+
+    Mirrors `mdit_py_plugins.anchors_plugin`: duplicates become `slug-1`, `slug-2`, ...
+    """
+    slug_func = default_slugify if slug_func is None else slug_func
+    inline_token = token_tree.to_tokens()[1]  # presumably the heading's inline token
+    title = "".join(
+        child.content
+        for child in (inline_token.children or [])
+        if child.type in ["text", "code_inline"]
+    )
+    base = slug_func(title)
+    slug = base
+    i = 1
+    while slug in slugs:
+        slug = f"{base}-{i}"  # suffix the base slug, not the previous candidate
+        i += 1
+    return slug
diff --git a/myst_parser/parsers/mdit.py b/myst_parser/parsers/mdit.py
@@ -8,7 +8,6 @@
 from markdown_it import MarkdownIt
 from markdown_it.renderer import RendererProtocol
 from mdit_py_plugins.amsmath import amsmath_plugin
-from mdit_py_plugins.anchors import anchors_plugin
 from mdit_py_plugins.attrs import attrs_block_plugin, attrs_plugin
 from mdit_py_plugins.colon_fence import colon_fence_plugin
 from mdit_py_plugins.deflist import deflist_plugin
@@ -113,12 +112,6 @@ def create_md_parser(
         md.use(attrs_plugin, after=("image",))
     if "attrs_block" in config.enable_extensions:
         md.use(attrs_block_plugin)
-    if config.heading_anchors is not None:
-        md.use(
-            anchors_plugin,
-            max_level=config.heading_anchors,
-            slug_func=config.heading_slug_func,
-        )
     for name in config.disable_syntax:
         md.disable(name, True)
 
diff --git a/myst_parser/warnings_.py b/myst_parser/warnings_.py
@@ -51,8 +51,8 @@ class MystWarnings(Enum):
     """A legacy domain found, which does not support `resolve_any_xref`."""
 
     # extensions
-    ANCHOR_DUPE = "anchor_dupe"
-    """Duplicate heading anchors generated in same document."""
+    HEADING_SLUG = "heading_slug"
+    """An error occurred computing a heading slug."""
     STRIKETHROUGH = "strikethrough"
     """Strikethrough warning, since only implemented in HTML."""
     HTML_PARSE = "html"
diff --git a/tests/test_renderers/fixtures/myst-config.txt b/tests/test_renderers/fixtures/myst-config.txt
@@ -433,16 +433,38 @@ My paragraph
 <string>:3: (WARNING/2) Multiple matches for '*:*:*:*index': key:std:label:genindex, key:std:label:modindex, key:std:label:py-modindex, ... [myst.iref_ambiguous]
 .
 
-[heading_slug_func] --myst-heading-anchors=1 --myst-heading-slug-func=myst_parser.config.main._test_slug_func
+[heading_slug_func] --myst-heading-anchors=2 --myst-heading-slug-func=myst_parser.config.main._test_slug_func
 .
 # title
 
+## title
+
+## title a b c
+
+## title *nested syntax*
+
+### other
+
 [reversed](#eltit)
 .
-<document ids="title" names="title" slug="eltit" source="<string>" title="title">
+<document dupnames="title" ids="title" slug="eltit" source="<string>" title="title">
     <title>
         title
-    <paragraph>
-        <reference id_link="True" refid="title">
-            reversed
+    <section dupnames="title" ids="title-1" slug="eltit-1">
+        <title>
+            title
+    <section ids="title-a-b-c" names="title\ a\ b\ c" slug="c b a eltit">
+        <title>
+            title a b c
+    <section ids="title-nested-syntax" names="title\ nested\ syntax" slug="xatnys detsen eltit">
+        <title>
+            title
+            <emphasis>
+                nested syntax
+        <section ids="other" names="other">
+            <title>
+                other
+            <paragraph>
+                <reference id_link="True" refid="title">
+                    reversed
 .
diff --git a/tests/test_renderers/test_myst_config.py b/tests/test_renderers/test_myst_config.py
@@ -19,6 +19,8 @@ def test_cmdline(file_params: ParamTestData):
     """The description is parsed as a docutils commandline"""
     if "url_schemes_list" in file_params.title and __version_info__ < (0, 18):
         pytest.skip("problematic node ids changed in docutils 0.18")
+    if "heading_slug_func" in file_params.title and __version_info__ < (0, 18):
+        pytest.skip("dupnames ids changed in docutils 0.18")
     pub = Publisher(parser=Parser())
     option_parser = pub.setup_option_parser()
     try:

Original file line number	Diff line number	Diff line change
`@@ -267,10 +267,10 @@ def __repr__(self) -> str:`
`267`	`267`	`},`
`268`	`268`	`)`
`269`	`269`
`270`		`- heading_anchors: Optional[int] = dc.field(`
`271`		`- default=None,`
	`270`	`+ heading_anchors: int = dc.field(`
	`271`	`+ default=0,`
`272`	`272`	`metadata={`
`273`		`- "validator": optional(in_([1, 2, 3, 4, 5, 6, 7])),`
	`273`	`+ "validator": optional(in_([0, 1, 2, 3, 4, 5, 6, 7])),`
`274`	`274`	`"help": "Heading level depth to assign HTML anchors",`
`275`	`275`	`},`
`276`	`276`	`)`