From 7ae646b174bb1869c8e17cf36f80602084b98dc2 Mon Sep 17 00:00:00 2001 From: Keith Cirkel Date: Thu, 2 Oct 2025 11:50:41 +0100 Subject: [PATCH] implement preds.py as a Rust wheel with PyO3 & Maturin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change is a proof of concept for how possible it would be to take individual python modules and re-implement them in Rust, in a "Ship of Theseus" style Rust rewrite. This takes some of the simplest parts of the codebase - a list of predicate free functions - and turns them into Rust checks. These functions become quite a bit more trivial in Rust as Rust implements many of these functions on `char`, so we simply delegate to those methods. These functions were chosen not because they're a slow path, but more because they are straightforward functions that return booleans and demonstrate a good proof of concept. They certainly don't make things _slower_, however. The next steps would be to take some of the bigger functions in preds.py that do full string comparison (such as isXMLishTagname), and port those. This was avoided for now as these might be slightly more controversial, as we might want to include dependencies for fast string matching. I'm not very well versed in Python, this was mostly cobbled together using https://medium.com/@MatthieuL49/a-mixed-rust-python-project-24491e2af424 and https://colliery.io/blog/rust-python-pattern/ as guides. Some completely non-empirical evaluation of timings, I ran the tests with/without Rust bindings: ```sh $ time BIKESHED_USE_RUST=0 ./bikeshed.py test --no-update Running tests |████████████████████| 502/502 [100%] in 3:04.5 (2.72/s) ✔ All tests passed. 
________________________________________________________ Executed in 184.82 secs fish external usr time 176.82 secs 717.00 micros 176.82 secs sys time 1.99 secs 0.00 micros 1.99 secs $ time BIKESHED_USE_RUST=1 ./bikeshed.py test --no-update Running tests [R] |████████████████████| 502/502 [100%] in 3:04.4 (2.72/s) ✔ All tests passed. ________________________________________________________ Executed in 184.72 secs fish external usr time 177.16 secs 481.00 micros 177.16 secs sys time 1.96 secs 172.00 micros 1.96 secs ``` As expected both take essentially the same time. This may prove 1 of 2 things: - I'm an idiot and haven't done this right. - The FFI boundary between Python/Rust isn't costing us much (at least for simple checks like these). --- .github/workflows/ci.yml | 15 ++- .gitignore | 3 +- README.md | 7 ++ bikeshed/h/parser/parser.py | 2 +- bikeshed/h/parser/preds_wrapper.py | 82 +++++++++++++++ bikeshed/test.py | 17 ++- pyproject.toml | 14 +++ rust/Cargo.lock | 163 +++++++++++++++++++++++++++++ rust/Cargo.toml | 16 +++ rust/README.md | 28 +++++ rust/src/lib.rs | 20 ++++ rust/src/preds.rs | 131 +++++++++++++++++++++++ 12 files changed, 493 insertions(+), 5 deletions(-) create mode 100644 bikeshed/h/parser/preds_wrapper.py create mode 100644 rust/Cargo.lock create mode 100644 rust/Cargo.toml create mode 100644 rust/README.md create mode 100644 rust/src/lib.rs create mode 100644 rust/src/preds.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f563ca80d..ae0096c87e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,12 +36,25 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable - name: Install dependencies run: | pip install --upgrade pip wheel pip install --editable . 
- - name: Test with bikeshed + - name: Build Rust extension + run: | + pip install maturin + maturin build --release + pip install --force-reinstall --find-links rust/target/wheels bikeshed_rust + - name: Test with bikeshed (Python mode) + run: bikeshed --no-update test + env: + BIKESHED_USE_RUST: '0' + - name: Test with bikeshed (Rust mode) run: bikeshed --no-update test + env: + BIKESHED_USE_RUST: '1' lint: diff --git a/.gitignore b/.gitignore index bf5e33fec4..11a0409361 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,5 @@ node_modules/ /playwright-report/ /playwright/.cache/ /env -/docs/*.html \ No newline at end of file +/docs/*.html +/rust/target/ diff --git a/README.md b/README.md index 1207b48305..e0474fa0c0 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,13 @@ though most such specs have switched their source file extensions to `.bs` now. Using `.src.html` in most text editors will display the file with HTML source formatting, which isn't generally what you want.) +Rust +----------- + +Bikeshed includes optional Rust extensions, in an effort to port some or all of the code into Rust. + +To enable: `export BIKESHED_USE_RUST=1` + License ------- diff --git a/bikeshed/h/parser/parser.py b/bikeshed/h/parser/parser.py index 80d3e27442..0ac9808dbd 100644 --- a/bikeshed/h/parser/parser.py +++ b/bikeshed/h/parser/parser.py @@ -6,7 +6,7 @@ from ... import config, constants, t from ... import messages as m -from . import preds +from . import preds_wrapper as preds from .nodes import ( Comment, Doctype, diff --git a/bikeshed/h/parser/preds_wrapper.py b/bikeshed/h/parser/preds_wrapper.py new file mode 100644 index 0000000000..6623d30efa --- /dev/null +++ b/bikeshed/h/parser/preds_wrapper.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import os + +_USE_RUST = os.environ.get("BIKESHED_USE_RUST", "").lower() in ("1", "true") + +if _USE_RUST: + try: + import bikeshed_rust + from . 
import preds as _preds + + isASCII = bikeshed_rust.is_ascii + isASCIIAlpha = bikeshed_rust.is_ascii_alpha + isASCIIAlphanum = bikeshed_rust.is_ascii_alphanum + isASCIILowerAlpha = bikeshed_rust.is_ascii_lower_alpha + isASCIIUpperAlpha = bikeshed_rust.is_ascii_upper_alpha + isAttrNameChar = bikeshed_rust.is_attr_name_char + isControl = bikeshed_rust.is_control + isDigit = bikeshed_rust.is_digit + isHexDigit = bikeshed_rust.is_hex_digit + isNoncharacter = bikeshed_rust.is_noncharacter + isTagnameChar = bikeshed_rust.is_tagname_char + isWhitespace = bikeshed_rust.is_whitespace + + charRefs = _preds.charRefs + xmlishTagnames = _preds.xmlishTagnames + isXMLishTagname = _preds.isXMLishTagname + except ImportError: + from . import preds as _preds + + charRefs = _preds.charRefs + xmlishTagnames = _preds.xmlishTagnames + isASCII = _preds.isASCII + isASCIIAlpha = _preds.isASCIIAlpha + isASCIIAlphanum = _preds.isASCIIAlphanum + isASCIILowerAlpha = _preds.isASCIILowerAlpha + isASCIIUpperAlpha = _preds.isASCIIUpperAlpha + isAttrNameChar = _preds.isAttrNameChar + isControl = _preds.isControl + isDigit = _preds.isDigit + isHexDigit = _preds.isHexDigit + isNoncharacter = _preds.isNoncharacter + isTagnameChar = _preds.isTagnameChar + isWhitespace = _preds.isWhitespace + isXMLishTagname = _preds.isXMLishTagname + +else: + from . 
import preds as _preds + + charRefs = _preds.charRefs + xmlishTagnames = _preds.xmlishTagnames + isASCII = _preds.isASCII + isASCIIAlpha = _preds.isASCIIAlpha + isASCIIAlphanum = _preds.isASCIIAlphanum + isASCIILowerAlpha = _preds.isASCIILowerAlpha + isASCIIUpperAlpha = _preds.isASCIIUpperAlpha + isAttrNameChar = _preds.isAttrNameChar + isControl = _preds.isControl + isDigit = _preds.isDigit + isHexDigit = _preds.isHexDigit + isNoncharacter = _preds.isNoncharacter + isTagnameChar = _preds.isTagnameChar + isWhitespace = _preds.isWhitespace + isXMLishTagname = _preds.isXMLishTagname + +__all__ = [ + "charRefs", + "xmlishTagnames", + "isASCII", + "isASCIIAlpha", + "isASCIIAlphanum", + "isASCIILowerAlpha", + "isASCIIUpperAlpha", + "isAttrNameChar", + "isControl", + "isDigit", + "isHexDigit", + "isNoncharacter", + "isTagnameChar", + "isWhitespace", + "isXMLishTagname", +] diff --git a/bikeshed/test.py b/bikeshed/test.py index 1267e1a5d0..56a02ea4fe 100644 --- a/bikeshed/test.py +++ b/bikeshed/test.py @@ -12,6 +12,19 @@ from . 
import messages as m from .Spec import Spec + +def _getTestTitle() -> str: + try: + from .h.parser import preds_wrapper + # Check if we're using the Rust implementation + if hasattr(preds_wrapper, 'isASCII'): + module = getattr(preds_wrapper.isASCII, '__module__', '') + if 'bikeshed_rust' in module: + return "Running tests [R]" + except Exception: + pass + return "Running tests" + if t.TYPE_CHECKING: import argparse @@ -101,7 +114,7 @@ def run( numPassed = 0 total = 0 fails = [] - pathProgress = alive_it(paths, dual_line=True, length=20) + pathProgress = alive_it(paths, dual_line=True, length=20, title=_getTestTitle()) try: for path in pathProgress: testName = testNameForPath(path) @@ -149,7 +162,7 @@ def rebase( if len(paths) == 0: m.p("No tests were found.") return True - pathProgress = alive_it(paths, dual_line=True, length=20) + pathProgress = alive_it(paths, dual_line=True, length=20, title=_getTestTitle()) try: for path in pathProgress: testName = testNameForPath(path) diff --git a/pyproject.toml b/pyproject.toml index 06566708c0..7c16ced8c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,17 @@ +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[project] +name = "bikeshed" +version = "0.1.0" +requires-python = ">=3.9" + +[tool.maturin] +manifest-path = "rust/Cargo.toml" +module-name = "bikeshed_rust" +python-source = "bikeshed" + [tool.black] line-length = 120 diff --git a/rust/Cargo.lock b/rust/Cargo.lock new file mode 100644 index 0000000000..831c6f2fe7 --- /dev/null +++ b/rust/Cargo.lock @@ -0,0 +1,163 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bikeshed_rust" +version = "0.1.0" +dependencies = [ + "pyo3", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" + +[[package]] +name = "libc" +version = "0.2.176" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" +dependencies = [ + "indoc", + "libc", 
+ "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" +dependencies = [ + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" + +[[package]] +name = "unicode-ident" +version = "1.0.19" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000000..d811381053 --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "bikeshed_rust" +version = "0.1.0" +edition = "2024" + +[lib] +name = "bikeshed_rust" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.26", features = ["extension-module"] } + +[profile.release] +lto = true +codegen-units = 1 +strip = true diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000000..66ac4b9548 --- /dev/null +++ b/rust/README.md @@ -0,0 +1,28 @@ +# Bikeshed Rust Extensions + +This directory contains Rust implementations of Bikeshed modules, compiled to Python extensions using PyO3. 
+ +## Building + +```bash +# Install maturin (build tool) +cargo install maturin + +# Build release wheel +cd rust +maturin build --release +``` + +## Testing + +## Using Rust Extensions + +```bash +# Use Rust implementation +export BIKESHED_USE_RUST=1 +bikeshed spec input.bs output.html + +# Use Python implementation (default) +export BIKESHED_USE_RUST=0 +bikeshed spec input.bs output.html +``` diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000000..0a6847259a --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,20 @@ +use pyo3::prelude::*; + +mod preds; + +#[pymodule] +fn bikeshed_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(preds::is_whitespace, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_digit, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_hex_digit, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_ascii_lower_alpha, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_ascii_upper_alpha, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_ascii_alpha, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_ascii_alphanum, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_ascii, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_control, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_noncharacter, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_attr_name_char, m)?)?; + m.add_function(wrap_pyfunction!(preds::is_tagname_char, m)?)?; + Ok(()) +} diff --git a/rust/src/preds.rs b/rust/src/preds.rs new file mode 100644 index 0000000000..606859faf3 --- /dev/null +++ b/rust/src/preds.rs @@ -0,0 +1,131 @@ +use pyo3::prelude::*; +use pyo3::types::PyString; + +/// Helper to extract a single character from either a one-character str or an int code point; yields None for any other input +fn get_codepoint(obj: &Bound<'_, PyAny>) -> PyResult<Option<char>> { + if let Ok(s) = obj.downcast::<PyString>() { + let s = s.to_str()?; + Ok(get_char(s)) + } else if let Ok(i) = obj.extract::<i64>() { + if i < 0 || i > 0x10FFFF { + return Ok(None); + } + Ok(char::from_u32(i as u32)) + } else { + Ok(None) + } +} 
+ +/// Converts a 1 character str to the char, returns None if the string is not exactly 1 character long (including the empty string) +fn get_char(s: &str) -> Option<char> { + if s.chars().count() != 1 { + None + } else { + s.chars().next() + } +} + +fn is_whitespace_char(ch: char) -> bool { + matches!(ch as u32, 0x9 | 0xA | 0xC | 0x20) +} + +/// Check if a character is whitespace (tab, newline, form feed, or space) +#[pyfunction] +pub fn is_whitespace(ch: &Bound<'_, PyAny>) -> PyResult<Option<bool>> { + Ok(get_codepoint(ch)?.map(is_whitespace_char)) +} + +/// Check if a character is a digit (0-9) +#[pyfunction] +pub fn is_digit(ch: &str) -> Option<bool> { + get_char(ch).map(|c| c.is_ascii_digit()) +} + +/// Check if a character is a hexadecimal digit +#[pyfunction] +pub fn is_hex_digit(ch: &str) -> Option<bool> { + get_char(ch).map(|c| c.is_ascii_hexdigit()) +} + +/// Check if a character is ASCII lowercase alpha +#[pyfunction] +pub fn is_ascii_lower_alpha(ch: &str) -> Option<bool> { + get_char(ch).map(|c| c.is_ascii_lowercase()) +} + +/// Check if a character is ASCII uppercase alpha +#[pyfunction] +pub fn is_ascii_upper_alpha(ch: &str) -> Option<bool> { + get_char(ch).map(|c| c.is_ascii_uppercase()) +} + +/// Check if a character is ASCII alpha (upper or lower) +#[pyfunction] +pub fn is_ascii_alpha(ch: &str) -> Option<bool> { + get_char(ch).map(|c| c.is_ascii_alphabetic()) +} + +/// Check if a character is ASCII alphanumeric +#[pyfunction] +pub fn is_ascii_alphanum(ch: &str) -> Option<bool> { + get_char(ch).map(|c| c.is_ascii_alphanumeric()) +} + +/// Check if a character is ASCII (code point <= 127) +#[pyfunction] +pub fn is_ascii(ch: &str) -> Option<bool> { + get_char(ch).map(|c| c.is_ascii()) +} + +/// Check if a character is a control character +#[pyfunction] +pub fn is_control(ch: &Bound<'_, PyAny>) -> PyResult<Option<bool>> { + Ok(get_codepoint(ch)?.map(|c| { + let cp = c as u32; + (cp <= 0x08) || (cp == 0x0B) || (0x0D..=0x1F).contains(&cp) || (0x7F..=0x9F).contains(&cp) + })) +} + +/// Check if a character is a noncharacter +#[pyfunction] +pub fn is_noncharacter(ch: &Bound<'_, 
PyAny>) -> PyResult<Option<bool>> { + Ok(get_codepoint(ch)?.map(|c| { + let cp = c as u32; + (0xFDD0..=0xFDEF).contains(&cp) || (cp & 0xFFFE == 0xFFFE && cp <= 0x10FFFF) + })) +} + +/// Check if a character is valid for an attribute name +#[pyfunction] +pub fn is_attr_name_char(ch: &str) -> Option<bool> { + get_char(ch).map(|c| { + if is_whitespace_char(c) { + return false; + } + !matches!(c, '/' | '<' | '>' | '=' | '"' | '\'' | '\0') + }) +} + +/// Check if a character is valid for a tag name +#[pyfunction] +pub fn is_tagname_char(ch: &str) -> Option<bool> { + get_char(ch).map(|c| { + if matches!(c, '-' | '.' | '_') || c.is_ascii_alphanumeric() { + return true; + } + + let cp = c as u32; + match cp { + 0xB7 => true, + 0xC0..=0x1FFF => !matches!(cp, 0xD7 | 0xF7 | 0x37E), + 0x200C | 0x200D | 0x203F | 0x2040 => true, + 0x2070..=0x218F => true, + 0x2C00..=0x2FEF => true, + 0x3001..=0xD7FF => true, + 0xF900..=0xFDCF => true, + 0xFDF0..=0xFFFD => true, + 0x10000..=0xEFFFF => true, + _ => false, + } + }) +}