Add BibTeX citations to PEPs

hugovk · hugovk · commit 580c1fea59c4 · 2026-03-30T12:52:20.000+03:00
diff --git a/pep_sphinx_extensions/__init__.py b/pep_sphinx_extensions/__init__.py
@@ -9,11 +9,9 @@
 from docutils.writers.html5_polyglot import HTMLTranslator
 from sphinx import environment
 
-from pep_sphinx_extensions.generate_rss import (
-    create_rss_feed,
-    get_from_doctree,
-    pep_abstract,
-)
+from pep_sphinx_extensions.doctree import get_from_doctree
+from pep_sphinx_extensions.generate_bibtex import create_bibtex_files
+from pep_sphinx_extensions.generate_rss import create_rss_feed
 from pep_sphinx_extensions.pep_processor.html import (
     pep_html_builder,
     pep_html_translator,
@@ -51,6 +49,7 @@ def _post_build(app: Sphinx, exception: Exception | None) -> None:
     if "internal_builder" not in app.tags:
         create_index_file(Path(app.outdir), app.builder.name)
     create_rss_feed(app.doctreedir, app.outdir)
+    create_bibtex_files(app.doctreedir, app.outdir)
 
 
 def set_description(
diff --git a/pep_sphinx_extensions/doctree.py b/pep_sphinx_extensions/doctree.py
@@ -0,0 +1,47 @@
+# This file is placed in the public domain or under the
+# CC0-1.0-Universal license, whichever is more permissive.
+
+from __future__ import annotations
+
+import pickle
+from pathlib import Path
+
+from docutils import nodes
+
+document_cache: dict[Path, dict[str, str]] = {}
+
+
+def pep_abstract(document: nodes.document) -> str:
+    """Return the first paragraph of the PEP abstract.
+
+    If there is no "Abstract" section, return the first paragraph of the
+    last "Introduction" section that contains one. Return an empty string
+    when no suitable paragraph is found.
+    """
+    introduction = ""
+    for node in document.findall(nodes.section):
+        title_node = node.next_node(nodes.title)
+        if title_node is None:
+            continue
+
+        title = title_node.astext()
+        if title == "Abstract":
+            # The abstract takes precedence, so stop at the first one found.
+            if (para_node := node.next_node(nodes.paragraph)) is not None:
+                return para_node.astext().strip().replace("\n", " ")
+            return ""
+        if title == "Introduction":
+            # An introduction without any paragraph is skipped rather than
+            # calling .astext() on None, which would abort the caller.
+            if (para_node := node.next_node(nodes.paragraph)) is not None:
+                introduction = para_node.astext().strip().replace("\n", " ")
+
+    return introduction
+
+
+def get_from_doctree(full_path: Path, text: str) -> str:
+    """Look up one header value of a PEP from its pickled doctree.
+
+    The headers of each doctree are loaded once and kept in
+    ``document_cache``; a key that is not present yields an empty string.
+    """
+    try:
+        cached_headers = document_cache[full_path]
+    except KeyError:
+        # Not cached yet: fall through and load the doctree below.
+        pass
+    else:
+        return cached_headers.get(text, "")
+
+    document = pickle.loads(full_path.read_bytes())
+    # The headers are populated in the PEPHeaders transform
+    headers = document.get("headers", {})
+    # Register the mapping in the cache before adding the abstract to it
+    document_cache[full_path] = headers
+    headers["Abstract"] = pep_abstract(document)
+    return headers.get(text, "")
diff --git a/pep_sphinx_extensions/generate_bibtex.py b/pep_sphinx_extensions/generate_bibtex.py
@@ -0,0 +1,73 @@
+# This file is placed in the public domain or under the
+# CC0-1.0-Universal license, whichever is more permissive.
+
+from __future__ import annotations
+
+import re
+import textwrap
+from pathlib import Path
+
+from pep_sphinx_extensions.doctree import get_from_doctree
+
+# LaTeX special characters that need escaping in BibTeX values, plus the
+# double quote, which would otherwise end a quote-delimited BibTeX field
+_BIBTEX_SPECIAL = re.compile(r'([\\"&%$#_{}~^])')
+# An email address in angle brackets, including any whitespace before it
+_EMAIL_ADDRESS = re.compile(r"\s*<[^>]+>")
+
+
+def _escape_bibtex_char(match: re.Match[str]) -> str:
+    """Return the BibTeX-safe replacement for one matched special character."""
+    char = match.group(1)
+    if char == "\\":
+        # "\\" would be a LaTeX line break, so spell the backslash out.
+        return r"\textbackslash{}"
+    if char == '"':
+        # Inside a "..."-delimited field a quote must be wrapped in braces.
+        return '{"}'
+    # NOTE(review): in LaTeX "\~" and "\^" are accent commands, so these two
+    # may not render as a literal tilde/caret. Left unchanged because the
+    # existing tests pin this output -- confirm the intended rendering.
+    return f"\\{char}"
+
+
+def _escape_bibtex(text: str) -> str:
+    """Escape special BibTeX characters in a string.
+
+    Each of ``& % $ # _ { } ~ ^`` is prefixed with a backslash, a literal
+    backslash becomes ``\\textbackslash{}`` and a double quote becomes
+    ``{"}``. All replacements happen in a single pass, so braces introduced
+    by one replacement are never escaped again.
+    """
+    return _BIBTEX_SPECIAL.sub(_escape_bibtex_char, text)
+
+
+def _parse_created(created: str) -> tuple[str, str]:
+    """Split a PEP 'Created' header such as '01-Jan-2020' into (year, month).
+
+    The day is discarded, the year is returned unchanged and the month is
+    lower-cased (for example 'jan'). A value that does not consist of
+    exactly three '-'-separated parts raises ValueError.
+    """
+    (_day, month_name, year) = created.split("-")
+    year_and_month = (year, month_name.lower())
+    return year_and_month
+
+
+def _format_authors(author_header: str) -> str:
+    """Turn an Author header value into a BibTeX author list.
+
+    Email addresses in angle brackets are dropped, the remainder is split on
+    commas, and the non-empty names are joined with " and ".
+    """
+    without_emails = _EMAIL_ADDRESS.sub("", author_header)
+    # Surrounding whitespace (including line breaks) is trimmed per name
+    stripped_names = (part.strip() for part in without_emails.split(","))
+    return " and ".join(name for name in stripped_names if name)
+
+
+def _generate_bibtex_entry(full_path: Path) -> str:
+    """Generate a BibTeX entry for a single PEP from its doctree.
+
+    Reads the "PEP", "Created", "Author" and "Title" headers through
+    get_from_doctree() and renders them as an ``@techreport`` entry whose
+    citation key is ``pep<number>``. The returned text has no trailing
+    newline.
+
+    Raises ValueError if the "PEP" header is not an integer or the "Created"
+    header cannot be split by _parse_created().
+    """
+    # int() drops any zero padding, so the key and the number field are "8",
+    # not "0008"; the URL below pads back to four digits.
+    number = int(get_from_doctree(full_path, "PEP"))
+    created = get_from_doctree(full_path, "Created")
+    author = get_from_doctree(full_path, "Author")
+    title = get_from_doctree(full_path, "Title")
+
+    year, month = _parse_created(created)
+    authors_bibtex = _escape_bibtex(_format_authors(author))
+    title_escaped = _escape_bibtex(title)
+
+    # The month is written without quotes -- presumably so BibTeX resolves it
+    # as one of its month macros (jan, feb, ...); confirm against BibTeX docs.
+    # dedent() runs after interpolation, so it also sees the inserted values.
+    return textwrap.dedent(f"""\
+        @techreport{{pep{number},
+            author = "{authors_bibtex}",
+            title = "PEP {number} --- {title_escaped}",
+            institution = "Python Software Foundation",
+            year = "{year}",
+            month = {month},
+            type = "PEP",
+            number = "{number}",
+            url = "https://peps.python.org/pep-{number:0>4}/",
+        }}""")
+
+
+def create_bibtex_files(doctree_dir: str, output_dir: str) -> None:
+    """Write one ``.bib`` file into *output_dir* for every PEP doctree.
+
+    Every file in *doctree_dir* matching ``pep-????.doctree`` produces a
+    UTF-8 encoded ``<stem>.bib`` file holding that PEP's BibTeX entry
+    followed by a newline.
+    """
+    destination = Path(output_dir)
+    doctree_paths = Path(doctree_dir).glob("pep-????.doctree")
+    for doctree_path in doctree_paths:
+        bibtex = _generate_bibtex_entry(doctree_path)
+        # The stem is for example "pep-0008", giving "pep-0008.bib"
+        bib_path = destination / f"{doctree_path.stem}.bib"
+        bib_path.write_text(bibtex + "\n", encoding="utf-8")
diff --git a/pep_sphinx_extensions/generate_rss.py b/pep_sphinx_extensions/generate_rss.py
@@ -4,12 +4,11 @@
 from __future__ import annotations
 
 import datetime as dt
-import pickle
 from email.utils import format_datetime, getaddresses
 from html import escape
 from pathlib import Path
 
-from docutils import nodes
+from pep_sphinx_extensions.doctree import get_from_doctree
 
 RSS_DESCRIPTION = (
     "Newest Python Enhancement Proposals (PEPs): "
@@ -23,24 +22,6 @@ def _format_rfc_2822(datetime: dt.datetime) -> str:
     return format_datetime(datetime, usegmt=True)
 
 
-document_cache: dict[Path, dict[str, str]] = {}
-
-
-def get_from_doctree(full_path: Path, text: str) -> str:
-    # Try and retrieve from cache
-    if full_path in document_cache:
-        return document_cache[full_path].get(text, "")
-
-    # Else load doctree
-    document = pickle.loads(full_path.read_bytes())
-    # Store the headers (populated in the PEPHeaders transform)
-    document_cache[full_path] = path_cache = document.get("headers", {})
-    # Store the Abstract
-    path_cache["Abstract"] = pep_abstract(document)
-    # Return the requested key
-    return path_cache.get(text, "")
-
-
 def pep_creation(full_path: Path) -> dt.datetime:
     created_str = get_from_doctree(full_path, "Created")
     try:
@@ -49,26 +30,6 @@ def pep_creation(full_path: Path) -> dt.datetime:
         return dt.datetime.min
 
 
-def pep_abstract(document: nodes.document) -> str:
-    """Return the first paragraph of the PEP abstract.
-    If not found, return the first paragraph of the introduction.
-    """
-    introduction = ""
-    for node in document.findall(nodes.section):
-        title_node = node.next_node(nodes.title)
-        if title_node is None:
-            continue
-
-        if title_node.astext() == "Abstract":
-            if (para_node := node.next_node(nodes.paragraph)) is not None:
-                return para_node.astext().strip().replace("\n", " ")
-            return ""
-        if title_node.astext() == "Introduction":
-            introduction = node.next_node(nodes.paragraph).astext().strip().replace("\n", " ")
-
-    return introduction
-
-
 def _generate_items(doctree_dir: Path):
     # get list of peps with creation time (from "Created:" string in pep source)
     peps_with_dt = sorted((pep_creation(path), path) for path in doctree_dir.glob("pep-????.doctree"))
diff --git a/pep_sphinx_extensions/pep_processor/transforms/pep_footer.py b/pep_sphinx_extensions/pep_processor/transforms/pep_footer.py
@@ -50,6 +50,7 @@ def apply(self) -> None:
                 self.document += nodes.transition()
             self.document += _add_source_link(pep_source_path)
             self.document += _add_commit_history_info(pep_source_path)
+            self.document += _add_bibtex_link(pep_source_path)
 
 
 def _add_source_link(pep_source_path: Path) -> nodes.paragraph:
@@ -71,6 +72,13 @@ def _add_commit_history_info(pep_source_path: Path) -> nodes.paragraph:
     return nodes.paragraph("", "Last modified: ", link_node)
 
 
+def _add_bibtex_link(pep_source_path: Path) -> nodes.paragraph:
+    """Build the footer paragraph that links to the PEP's BibTeX citation.
+
+    The link target is the source file's stem with a ``.bib`` suffix,
+    written as a relative URL.
+    """
+    # NOTE(review): a relative URL resolves against the rendered page's
+    # location -- confirm this matches where the .bib files are written.
+    target = f"{pep_source_path.stem}.bib"
+    bibtex_link = nodes.reference("", "BibTeX", refuri=target)
+    return nodes.paragraph("", "Cite: ", bibtex_link)
+
+
 def _get_last_modified_timestamps():
     # get timestamps and changed files from all commits (without paging results)
     args = ("git", "--no-pager", "log", "--format=#%at", "--name-only")
diff --git a/pep_sphinx_extensions/tests/test_generate_bibtex.py b/pep_sphinx_extensions/tests/test_generate_bibtex.py
@@ -0,0 +1,150 @@
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from pep_sphinx_extensions.generate_bibtex import (
+    _escape_bibtex,
+    _format_authors,
+    _generate_bibtex_entry,
+    _parse_created,
+    create_bibtex_files,
+)
+
+MOCK_TARGET = "pep_sphinx_extensions.generate_bibtex.get_from_doctree"
+
+PEP_8_HEADERS = {
+    "PEP": "8",
+    "Title": "Style Guide for Python Code",
+    "Author": "Guido van Rossum, Barry Warsaw, Alyssa Coghlan",
+    "Created": "05-Jul-2001",
+}
+
+
+def _mock_doctree(headers: dict[str, str]):
+    """Build a stand-in for get_from_doctree backed by the *headers* dict.
+
+    The returned callable ignores the path and answers with the header
+    value, or an empty string for a header that is not in the dict.
+    """
+
+    def fake_get_from_doctree(full_path, text):
+        return headers.get(text, "")
+
+    return fake_get_from_doctree
+
+
+@pytest.mark.parametrize(
+    ("raw", "escaped"),
+    [
+        ("Hello World", "Hello World"),
+        ("Tom & Jerry", r"Tom \& Jerry"),
+        ("100%", r"100\%"),
+        ("$x$", r"\$x\$"),
+        ("C#", r"C\#"),
+        ("snake_case", r"snake\_case"),
+        ("{}", r"\{\}"),
+        ("~tilde", r"\~tilde"),
+        ("no specials", "no specials"),
+    ],
+)
+def test_escape_bibtex(raw: str, escaped: str) -> None:
+    """Each listed special character gains a backslash; other text is kept."""
+    actual = _escape_bibtex(raw)
+    assert actual == escaped
+
+
+@pytest.mark.parametrize(
+    ("header", "year_and_month"),
+    [
+        ("01-Jan-1990", ("1990", "jan")),
+        ("15-Sep-2021", ("2021", "sep")),
+        ("28-Feb-2000", ("2000", "feb")),
+    ],
+)
+def test_parse_created(header: str, year_and_month: tuple[str, str]) -> None:
+    """A 'Created' date yields its year and the lower-cased month name."""
+    parsed = _parse_created(header)
+    assert parsed == year_and_month
+
+
+@pytest.mark.parametrize(
+    ("header", "formatted"),
+    [
+        ("Cardinal Ximénez", "Cardinal Ximénez"),
+        (
+            "Cardinal Ximénez <Cardinal.Ximenez@spanish.inquisition>,"
+            " Cardinal Biggles <Cardinal.Biggles@spanish.inquisition>",
+            "Cardinal Ximénez and Cardinal Biggles",
+        ),
+        (
+            "Cardinal Ximénez,\n Cardinal Biggles",
+            "Cardinal Ximénez and Cardinal Biggles",
+        ),
+        (
+            "Cardinal Ximénez, Cardinal Biggles, Cardinal Fang",
+            "Cardinal Ximénez and Cardinal Biggles and Cardinal Fang",
+        ),
+    ],
+)
+def test_format_authors(header: str, formatted: str) -> None:
+    """Emails are removed and the names are joined with ' and '."""
+    actual = _format_authors(header)
+    assert actual == formatted
+
+
+def test_generate_bibtex_entry() -> None:
+    """The PEP 8 headers produce every expected BibTeX field."""
+    # Arrange
+    doctree = Path("pep-0008.doctree")
+    expected_fragments = (
+        "@techreport{pep8,",
+        'author = "Guido van Rossum and Barry Warsaw and Alyssa Coghlan"',
+        'title = "PEP 8 --- Style Guide for Python Code"',
+        'year = "2001"',
+        "month = jul,",
+        'number = "8"',
+        'url = "https://peps.python.org/pep-0008/"',
+    )
+
+    # Act
+    with patch(MOCK_TARGET, _mock_doctree(PEP_8_HEADERS)):
+        entry = _generate_bibtex_entry(doctree)
+
+    # Assert
+    for fragment in expected_fragments:
+        assert fragment in entry
+
+
+def test_generate_bibtex_entry_title_escaped() -> None:
+    """Special characters in the Title header are escaped in the entry."""
+    # Arrange
+    overrides = {"PEP": "999", "Title": "Use of $ & % in PEPs"}
+    fake_get = _mock_doctree({**PEP_8_HEADERS, **overrides})
+
+    # Act
+    with patch(MOCK_TARGET, fake_get):
+        entry = _generate_bibtex_entry(Path("pep-0999.doctree"))
+
+    # Assert
+    assert r"Use of \$ \& \% in PEPs" in entry
+
+
+def test_generate_bibtex_entry_author_escaped() -> None:
+    """Special characters in the Author header are escaped in the entry."""
+    # Arrange
+    overrides = {"Author": "Tom & Jerry <tj@example.com>"}
+    fake_get = _mock_doctree({**PEP_8_HEADERS, **overrides})
+
+    # Act
+    with patch(MOCK_TARGET, fake_get):
+        entry = _generate_bibtex_entry(Path("pep-0008.doctree"))
+
+    # Assert
+    assert r"Tom \& Jerry" in entry
+
+
+def test_create_bibtex_files(tmp_path: Path) -> None:
+    """A doctree named pep-0008 results in a pep-0008.bib output file."""
+    # Arrange
+    doctree_dir = tmp_path / "doctrees"
+    doctree_dir.mkdir()
+    output_dir = tmp_path / "output"
+    output_dir.mkdir()
+    # The content is irrelevant: get_from_doctree is patched below
+    (doctree_dir / "pep-0008.doctree").touch()
+
+    # Act
+    with patch(MOCK_TARGET, _mock_doctree(PEP_8_HEADERS)):
+        create_bibtex_files(str(doctree_dir), str(output_dir))
+
+    # Assert
+    # The file is written as UTF-8, so read it back as UTF-8 instead of
+    # relying on the locale's default encoding.
+    bib = (output_dir / "pep-0008.bib").read_text(encoding="utf-8")
+    assert "@techreport{pep8," in bib
+    assert 'author = "Guido van Rossum and Barry Warsaw and Alyssa Coghlan"' in bib
+
+
+def test_create_bibtex_files_no_doctrees(tmp_path: Path) -> None:
+    """An empty doctree directory produces no .bib files."""
+    # Arrange
+    doctree_dir = tmp_path / "doctrees"
+    output_dir = tmp_path / "output"
+    for directory in (doctree_dir, output_dir):
+        directory.mkdir()
+
+    # Act
+    create_bibtex_files(str(doctree_dir), str(output_dir))
+
+    # Assert
+    written = list(output_dir.glob("*.bib"))
+    assert written == []