wip: add docstrings to the benchmark aggregation and comparison scripts

wakonig · wakonig · commit 447162a713ce · 2026-04-17T13:01:31.000+02:00
diff --git a/.github/scripts/aggregate_benchmarks.py b/.github/scripts/aggregate_benchmarks.py
@@ -1,5 +1,11 @@
 #!/usr/bin/env python3
-"""Aggregate benchmark JSON files by taking the median across runner attempts."""
+"""Aggregate benchmark JSON files by taking the median across runner attempts.
+
+The workflow runs the same benchmark suite on multiple independent runners.
+This script reads every JSON file produced by those attempts, normalizes the
+benchmark values they contain, and writes a compact mapping JSON file in which
+each benchmark's value is the median across attempts.
+"""
 
 from __future__ import annotations
 
@@ -12,6 +18,16 @@
 
 
 def collect_benchmarks(paths: list[Path]) -> dict[str, list[Benchmark]]:
+    """Collect benchmarks from multiple JSON files.
+
+    Args:
+        paths (list[Path]): Paths to hyperfine, pytest-benchmark, or compact
+            mapping JSON files.
+
+    Returns:
+        dict[str, list[Benchmark]]: Benchmarks grouped by benchmark name.
+    """
+
     collected: dict[str, list[Benchmark]] = {}
     for path in paths:
         for name, benchmark in extract_benchmarks(path).items():
@@ -20,6 +36,18 @@ def collect_benchmarks(paths: list[Path]) -> dict[str, list[Benchmark]]:
 
 
 def aggregate(collected: dict[str, list[Benchmark]]) -> dict[str, dict[str, object]]:
+    """Aggregate grouped benchmarks using the median value.
+
+    Args:
+        collected (dict[str, list[Benchmark]]): Benchmarks grouped by benchmark
+            name.
+
+    Returns:
+        dict[str, dict[str, object]]: Compact mapping JSON data. Each benchmark
+        contains ``value``, ``unit``, ``metric``, ``attempts``, and
+        ``attempt_values``.
+    """
+
     aggregated: dict[str, dict[str, object]] = {}
     for name, benchmarks in sorted(collected.items()):
         values = [benchmark.value for benchmark in benchmarks]
@@ -36,6 +64,19 @@ def aggregate(collected: dict[str, list[Benchmark]]) -> dict[str, dict[str, obje
 
 
 def main_from_paths(input_dir: Path, output: Path) -> int:
+    """Aggregate all JSON files in a directory and write the result.
+
+    Args:
+        input_dir (Path): Directory searched recursively for benchmark JSON files.
+        output (Path): Path where the aggregate JSON should be written.
+
+    Returns:
+        int: Always ``0`` on success.
+
+    Raises:
+        ValueError: If no JSON files are found in ``input_dir``.
+    """
+
     paths = sorted(input_dir.rglob("*.json"))
     if not paths:
         raise ValueError(f"No benchmark JSON files found in {input_dir}")
@@ -49,6 +90,12 @@ def main_from_paths(input_dir: Path, output: Path) -> int:
 
 
 def main() -> int:
+    """Run the benchmark aggregation command line interface.
+
+    Returns:
+        int: Always ``0`` on success.
+    """
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--input-dir", required=True, type=Path)
     parser.add_argument("--output", required=True, type=Path)
diff --git a/.github/scripts/compare_benchmarks.py b/.github/scripts/compare_benchmarks.py
@@ -1,5 +1,11 @@
 #!/usr/bin/env python3
-"""Compare benchmark JSON files and write a GitHub Actions summary."""
+"""Compare benchmark JSON files and write a GitHub Actions summary.
+
+The script supports JSON emitted by hyperfine, JSON emitted by pytest-benchmark,
+and a compact mapping format generated by ``aggregate_benchmarks.py``. For
+timing formats, the median is used when present; otherwise the mean is used as
+a fallback.
+"""
 
 from __future__ import annotations
 
@@ -13,6 +19,15 @@
 
 @dataclass(frozen=True)
 class Benchmark:
+    """Normalized benchmark result.
+
+    Attributes:
+        name (str): Stable benchmark name used to match baseline and current results.
+        value (float): Numeric benchmark value used for comparison.
+        unit (str): Display unit for the value, for example ``"s"``.
+        metric (str): Source metric name, for example ``"median"`` or ``"mean"``.
+    """
+
     name: str
     value: float
     unit: str
@@ -21,6 +36,18 @@ class Benchmark:
 
 @dataclass(frozen=True)
 class Comparison:
+    """Comparison between one baseline benchmark and one current benchmark.
+
+    Attributes:
+        name (str): Benchmark name.
+        baseline (float): Baseline benchmark value.
+        current (float): Current benchmark value.
+        delta_percent (float): Percent change from baseline to current.
+        unit (str): Display unit for both values.
+        metric (str): Current result metric used for comparison.
+        regressed (bool): Whether the change exceeds the configured threshold.
+    """
+
     name: str
     baseline: float
     current: float
@@ -31,11 +58,29 @@ class Comparison:
 
 
 def _read_json(path: Path) -> Any:
+    """Read JSON data from a file.
+
+    Args:
+        path (Path): Path to the JSON file.
+
+    Returns:
+        Any: Parsed JSON value.
+    """
+
     with path.open("r", encoding="utf-8") as stream:
         return json.load(stream)
 
 
 def _as_float(value: Any) -> float | None:
+    """Convert a value to a finite float.
+
+    Args:
+        value (Any): Value to convert.
+
+    Returns:
+        float | None: Converted finite float, or ``None`` if conversion fails.
+    """
+
     try:
         result = float(value)
     except (TypeError, ValueError):
@@ -46,6 +91,15 @@ def _as_float(value: Any) -> float | None:
 
 
 def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from hyperfine JSON.
+
+    Args:
+        data (dict[str, Any]): Parsed hyperfine JSON object.
+
+    Returns:
+        dict[str, Benchmark]: Benchmarks keyed by command name.
+    """
+
     benchmarks: dict[str, Benchmark] = {}
     for result in data.get("results", []):
         if not isinstance(result, dict):
@@ -62,6 +116,15 @@ def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]:
 
 
 def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from pytest-benchmark JSON.
+
+    Args:
+        data (dict[str, Any]): Parsed pytest-benchmark JSON object.
+
+    Returns:
+        dict[str, Benchmark]: Benchmarks keyed by full benchmark name.
+    """
+
     benchmarks: dict[str, Benchmark] = {}
     for benchmark in data.get("benchmarks", []):
         if not isinstance(benchmark, dict):
@@ -82,6 +145,16 @@ def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]:
 
 
 def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from a compact mapping JSON object.
+
+    Args:
+        data (dict[str, Any]): Parsed mapping where each benchmark is either a
+            raw number or an object containing ``value``, ``unit``, and ``metric``.
+
+    Returns:
+        dict[str, Benchmark]: Benchmarks keyed by mapping key.
+    """
+
     benchmarks: dict[str, Benchmark] = {}
 
     for name, raw_value in data.items():
@@ -103,6 +176,20 @@ def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]:
 
 
 def extract_benchmarks(path: Path) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from a supported JSON file.
+
+    Args:
+        path (Path): Path to a hyperfine, pytest-benchmark, or compact mapping
+            JSON file.
+
+    Returns:
+        dict[str, Benchmark]: Normalized benchmarks keyed by name.
+
+    Raises:
+        ValueError: If the JSON root is not an object or no supported benchmark
+            entries can be extracted.
+    """
+
     data = _read_json(path)
     if not isinstance(data, dict):
         raise ValueError(f"{path} must contain a JSON object")
@@ -122,6 +209,22 @@ def compare_benchmarks(
     threshold_percent: float,
     higher_is_better: bool,
 ) -> tuple[list[Comparison], list[str], list[str]]:
+    """Compare baseline benchmarks with current benchmarks.
+
+    Args:
+        baseline (dict[str, Benchmark]): Baseline benchmarks keyed by name.
+        current (dict[str, Benchmark]): Current benchmarks keyed by name.
+        threshold_percent (float): Regression threshold in percent.
+        higher_is_better (bool): If ``True``, lower current values are treated as
+            regressions. If ``False``, higher current values are treated as
+            regressions.
+
+    Returns:
+        tuple[list[Comparison], list[str], list[str]]: Comparisons for common
+        benchmark names, names missing from current results, and names newly
+        present in current results.
+    """
+
     comparisons: list[Comparison] = []
     missing_in_current: list[str] = []
     new_in_current: list[str] = []
@@ -165,6 +268,16 @@ def compare_benchmarks(
 
 
 def _format_value(value: float, unit: str) -> str:
+    """Format a benchmark value for Markdown output.
+
+    Args:
+        value (float): Numeric benchmark value.
+        unit (str): Display unit.
+
+    Returns:
+        str: Formatted value with optional unit suffix.
+    """
+
     suffix = f" {unit}" if unit else ""
     return f"{value:.6g}{suffix}"
 
@@ -177,6 +290,20 @@ def write_summary(
     threshold_percent: float,
     higher_is_better: bool,
 ) -> None:
+    """Write a Markdown benchmark comparison summary.
+
+    Args:
+        path (Path): Path where the summary should be written.
+        comparisons (list[Comparison]): Comparison rows for matching benchmarks.
+        missing_in_current (list[str]): Baseline benchmark names missing from the
+            current result.
+        new_in_current (list[str]): Current benchmark names not present in the
+            baseline result.
+        threshold_percent (float): Regression threshold in percent.
+        higher_is_better (bool): Whether higher benchmark values are considered
+            better.
+    """
+
     regressions = [comparison for comparison in comparisons if comparison.regressed]
     direction = "higher is better" if higher_is_better else "lower is better"
     sorted_comparisons = sorted(comparisons, key=lambda comparison: comparison.name)
@@ -245,6 +372,12 @@ def write_summary(
 
 
 def main() -> int:
+    """Run the benchmark comparison command line interface.
+
+    Returns:
+        int: ``1`` when a regression exceeds the threshold, otherwise ``0``.
+    """
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--baseline", required=True, type=Path)
     parser.add_argument("--current", required=True, type=Path)