lorenarosati · lorenarosati · Mar 28, 2026 · Mar 28, 2026 · Mar 18, 2026 · Mar 24, 2026
diff --git a/.github/scripts/run-benchmarks.sh b/.github/scripts/run-benchmarks.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# Benchmark comparison script for pull requests.
+#
+# Called by .github/workflows/benchmark.yml (run-benchmark job) after the repo
+# has been checked out at the PR head. Writes the formatted markdown comparison
+# to /tmp/bench-comment.md; the post-comment job picks it up and posts it.
+#
+# Expects the following environment variables:
+#
+#   COMMENT    - the /bench PR comment body
+#   BASE_REF   - base branch ref (e.g. "main")
+#   HEAD_SHA   - full SHA of the PR head commit
+
+set -euo pipefail
+shopt -s extglob
+
+# ---------------------------------------------------------------------------
+# 1. Parse the /bench comment
+#    Syntax: /bench [--tags <csv>] [--filter <regex>]
+#      --tags    sets BENCH_TAGS (comma-separated tag list); defaults to "base"
+#                when the comment is just /bench
+#      --filter  Criterion name regex passed as a positional arg to cargo bench
+# ---------------------------------------------------------------------------
+
+ARGS="${COMMENT#/bench}"
+ARGS="${ARGS##+( )}"
+
+TAGS=""
+FILTER=""
+
+if [[ -z "$ARGS" ]]; then
+  # Bare /bench with no args: default to the "base" tag
+  TAGS="base"
+else
+  # Normalize: strip /bench prefix, collapse all whitespace (including newlines)
+  # to spaces, then strip to a safe allowlist before parsing
+  ARGS=$(printf '%s' "$ARGS" | tr '\n\r\t' ' ' | tr -s ' ' | tr -cd 'a-zA-Z0-9,_./|*+?()[]^$ -')
+  ARGS="${ARGS## }"   # strip leading space
+  ARGS="${ARGS%% }"   # strip trailing space
+
+  read -ra TOKENS <<< "$ARGS"
+  i=0
+  while [[ $i -lt ${#TOKENS[@]} ]]; do
+    case "${TOKENS[$i]}" in
+      --tags)   i=$((i + 1)); TAGS="${TOKENS[$i]:-}"   ;;
+      --filter) i=$((i + 1)); FILTER="${TOKENS[$i]:-}" ;;
+      *)        echo "Unknown token: '${TOKENS[$i]}'" >&2; exit 1 ;;
+    esac
+    i=$((i + 1))
+  done
+fi
+
+# Default: if nothing was parsed, run with BENCH_TAGS=base
+if [[ -z "$TAGS" && -z "$FILTER" ]]; then
+  TAGS="base"
+fi
+
+echo "Parsed tags:   ${TAGS:-<none>}"
+echo "Parsed filter: ${FILTER:-<none>}"
+
+[[ -n "$TAGS" ]] && export BENCH_TAGS="$TAGS"
+
+# ---------------------------------------------------------------------------
+# 2. Benchmark the PR branch (already checked out by the workflow)
+# ---------------------------------------------------------------------------
+(cd benchmarks && cargo bench --locked --bench workload_bench -- --save-baseline changes "$FILTER")
+
+# ---------------------------------------------------------------------------
+# 3. Switch to the base branch and benchmark it
+#    The benchmarks/target/ directory is not tracked by git, so the
+#    "changes" baseline files are preserved across the branch switch.
+# ---------------------------------------------------------------------------
+git fetch origin -- "$BASE_REF"
+git checkout FETCH_HEAD
+(cd benchmarks && cargo bench --locked --bench workload_bench -- --save-baseline base "$FILTER")
+
+# ---------------------------------------------------------------------------
+# 4. Compare baselines with critcmp and format as a markdown table.
+#      - Parses actual duration values (not rank factors) for the % column
+#      - Bolds the faster duration and % cell when the difference is
+#        statistically significant (error bounds do not overlap)
+# ---------------------------------------------------------------------------
+# Use `critcmp` to compare the criterion output for `base` and `changes`. We use `critcmp` instead of manually
+# parsing criterion outputs because criterion may update its output format. By using `critcmp`, we inherit all
+# updated criterion output parsing.
+COMPARISON=$((cd benchmarks && critcmp base changes) | python3 benchmarks/ci/parse_critcmp.py)
+
+# ---------------------------------------------------------------------------
+# 5. Write results to /tmp/bench-comment.md
+#    The post-comment job in benchmark.yml downloads this file as an artifact
+#    and posts it as a PR comment using a step that holds GH_TOKEN.
+# ---------------------------------------------------------------------------
+SHORT_SHA="${HEAD_SHA:0:7}"
+
+SUMMARY=""
+[[ -n "$TAGS" ]]   && SUMMARY="tags: \`${TAGS}\`"
+[[ -n "$FILTER" ]] && SUMMARY+="${SUMMARY:+ | }filter: \`${FILTER}\`"
+
+{
+  echo "## Benchmark for ${SHORT_SHA}"
+  echo "<details>"
+  echo "<summary>${SUMMARY}</summary>"
+  echo ""
+  echo "$COMPARISON"
+  echo ""
+  echo "</details>"
+} > /tmp/bench-comment.md
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -6,35 +6,39 @@ on:
     types: [created, edited]
 name: Benchmarking PR performance
 jobs:
-  runBenchmark:
+  run-benchmark:
     name: Run benchmarks
     if: >
       github.event.issue.pull_request &&
       (github.event.comment.body == '/bench' || startsWith(github.event.comment.body, '/bench '))
     runs-on: ubuntu-latest
     permissions:
-      pull-requests: write
+      contents: read
+    outputs:
+      pr_number: ${{ steps.pr.outputs.pr_number }}
     steps:
-      - name: Parse benchmark tags
-        env:
-          COMMENT: ${{ github.event.comment.body }}
-        run: |
-          if [[ "$COMMENT" == "/bench" ]]; then
-            TAGS="base"
-          else
-            TAGS="${COMMENT#/bench }"
-            TAGS=$(echo "$TAGS" | tr -d '[:space:]')
-          fi
-          echo "BENCH_TAGS=$TAGS" >> "$GITHUB_ENV"
-          echo "Parsed tags: $TAGS"
-      - name: Get PR HEAD sha
+      - name: Get PR metadata
         id: pr
-        run: |
-          PR_DATA=$(gh api repos/${{ github.repository }}/pulls/${{ github.event.issue.number }})
-          echo "head_sha=$(echo "$PR_DATA" | jq -r .head.sha)" >> "$GITHUB_OUTPUT"
-          echo "base_ref=$(echo "$PR_DATA" | jq -r .base.ref)" >> "$GITHUB_OUTPUT"
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          PR_NUMBER: ${{ github.event.issue.number }}
+        run: |
+          PR_DATA=$(gh api "repos/$REPO/pulls/$PR_NUMBER")
+          HEAD_SHA=$(echo "$PR_DATA" | jq -r .head.sha)
+          BASE_REF=$(echo "$PR_DATA" | jq -r .base.ref)
+          [[ "$HEAD_SHA" == *$'\n'* || "$BASE_REF" == *$'\n'* ]] && { echo "Unexpected newline in API response" >&2; exit 1; }
+          [[ "$BASE_REF" =~ ^[a-zA-Z0-9/_.-]+$ ]] || { echo "Invalid BASE_REF: $BASE_REF" >&2; exit 1; }
+          printf 'head_sha=%s\n' "$HEAD_SHA" >> "$GITHUB_OUTPUT"
+          printf 'base_ref=%s\n'  "$BASE_REF"  >> "$GITHUB_OUTPUT"
+          printf 'pr_number=%s\n' "$PR_NUMBER"  >> "$GITHUB_OUTPUT"
+      - name: Install critcmp
+        # Installed before checkout so the PR's .cargo/config.toml cannot
+        # redirect the registry to a malicious source. The runner's
+        # pre-installed Rust is sufficient -- no toolchain setup needed here.
+        # --locked is omitted for cargo install (same exemption as cargo miri
+        # setup); --version pins the top-level crate.
+        run: cargo install critcmp --version 0.1.8
       - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
         with:
           ref: ${{ steps.pr.outputs.head_sha }}
@@ -45,13 +49,34 @@ jobs:
       - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
         with:
           save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
-      # TODO: This action internally runs `cargo bench` without --locked, bypassing our
-      #       supply chain lockfile policy (see build.yml top-level comment). Replace with
-      #       manual cargo bench --locked + critcmp + gh pr comment steps.
-      # - uses: boa-dev/criterion-compare-action@adfd3a94634fe2041ce5613eb7df09d247555b87 # v3.2.4
-      #   with:
-      #     token: ${{ secrets.GITHUB_TOKEN }}
-      #     branchName: ${{ steps.pr.outputs.base_ref }}
-      #     cwd: benchmarks
-      #     benchName: workload_bench
-      - run: echo "Benchmarking is temporarily disabled. See TODO above."
+      - name: Run benchmarks
+        # The comment is posted in the post-comment job after this job completes.
+        env:
+          COMMENT:  ${{ github.event.comment.body }}
+          BASE_REF: ${{ steps.pr.outputs.base_ref }}
+          HEAD_SHA: ${{ steps.pr.outputs.head_sha }}
+        run: bash .github/scripts/run-benchmarks.sh
+      - name: Upload benchmark comment
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: bench-comment
+          path: /tmp/bench-comment.md
+
+  post-comment:
+    name: Post benchmark results
+    needs: run-benchmark
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Download benchmark comment
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          name: bench-comment
+          path: /tmp/
+      - name: Post results as PR comment
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ needs.run-benchmark.outputs.pr_number }}
+          REPO: ${{ github.repository }}
+        run: gh pr comment "$PR_NUMBER" --repo "$REPO" --body-file /tmp/bench-comment.md
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -85,12 +85,28 @@ BENCH_TAGS=base,my-feature cargo bench -p delta_kernel_benchmarks
 
 ### Running benchmarking on a PR
 
-To trigger benchmarks on a pull request, post a comment on the PR with one of the following:
+To trigger benchmarks on a pull request, post a comment using the following syntax:
 
-- `/bench` — runs benchmarks with `BENCH_TAGS=base`
-- `/bench <tags>` — runs benchmarks with `BENCH_TAGS=<tags>` (e.g. `bench base,tag1`)
+```
+/bench [--tags <comma separated list of tags>] [--filter <regex>]
+```
+
+- `--tags` sets `BENCH_TAGS` (comma-separated), controlling which table groupings run.
+- `--filter` is a single-token Criterion regex matched against benchmark names.
+- Both flags are optional and independent; they can be given in any order.
+- When both are specified, they apply as AND: only benchmarks from tables that match the tag filter AND whose name matches the regex are run.
+- Running just `/bench` (with no flags) defaults to `BENCH_TAGS=base`. If neither flag is parsed, the same default applies.
+
+Examples:
+```
+/bench                                                  # BENCH_TAGS=base, all benchmark names
+/bench --tags base,my-tag                               # BENCH_TAGS=base,my-tag, all benchmark names
+/bench --filter snapshotConstruction                    # no BENCH_TAGS set, only snapshotConstruction benchmarks
+/bench --tags base --filter 101kAdds.*snapshotConstruction  # only snapshotConstruction benchmarks from tables tagged "base"
+/bench --filter 101kAdds|10kAdds                        # no BENCH_TAGS set, OR two table names
+```
 
-See [By tag (`BENCH_TAGS`)](#by-tag-bench_tags) for details on how tags work. Results are posted automatically as a PR comment, comparing the PR branch against the base branch.
+See [By tag (`BENCH_TAGS`)](#by-tag-bench_tags) for how tags work and [By benchmark name](#by-benchmark-name) for regex pattern examples. Results are posted automatically as a PR comment, comparing the PR branch against the base branch.
 
 ## Workload data layout
 

diff --git a/benchmarks/ci/parse_critcmp.py b/benchmarks/ci/parse_critcmp.py
@@ -0,0 +1,68 @@
+import sys, re
+
+def to_seconds(value, units):
+    u = units.strip()
+    if u == 's':   return value
+    if u == 'ms':  return value / 1e3
+    if u in ('µs', 'us', 'μs'): return value / 1e6
+    if u == 'ns':  return value / 1e9
+    return value
+
+def is_significant(chg_dur, chg_err, base_dur, base_err):
+    if chg_dur < base_dur:
+        return chg_dur + chg_err < base_dur or base_dur - base_err > chg_dur
+    else:
+        return chg_dur - chg_err > base_dur or base_dur + base_err < chg_dur
+
+def parse_duration(s):
+    m = re.match(r'([0-9.]+)±([0-9.]+)(.+)', s.strip())
+    if not m:
+        return None
+    return float(m.group(1)), float(m.group(2)), m.group(3).strip()
+
+lines = sys.stdin.read().splitlines()
+print("| Test | Base         | PR               | % |")
+print("|------|--------------|------------------|---|")
+
+for line in lines[2:]:  # skip critcmp header rows
+    if not line.strip():
+        continue
+    # critcmp columns (split on 2+ spaces):
+    #   with throughput:    name, baseFactor, baseDuration, baseBandwidth, changesFactor, changesDuration, changesBandwidth
+    #   without throughput: name, baseFactor, baseDuration, changesFactor, changesDuration
+    # Locate duration fields by the presence of "±" rather than hardcoding indices,
+    # so the script works correctly regardless of whether bandwidth columns are present.
+    fields = re.split(r'  +', line)
+    name = fields[0].strip().replace('|', r'\|') if fields else ''
+    dur_fields = [f.strip() for f in fields[1:] if '±' in f]
+    base_dur_str = dur_fields[0] if len(dur_fields) > 0 else None
+    chg_dur_str  = dur_fields[1] if len(dur_fields) > 1 else None
+
+    if not name and not base_dur_str and not chg_dur_str:
+        continue
+
+    base_display = base_dur_str or 'N/A'
+    chg_display  = chg_dur_str  or 'N/A'
+    difference   = 'N/A'
+
+    if base_dur_str and chg_dur_str:
+        base_p = parse_duration(base_dur_str)
+        chg_p  = parse_duration(chg_dur_str)
+        if base_p and chg_p:
+            base_secs     = to_seconds(base_p[0], base_p[2])
+            base_err_secs = to_seconds(base_p[1], base_p[2])
+            chg_secs      = to_seconds(chg_p[0],  chg_p[2])
+            chg_err_secs  = to_seconds(chg_p[1],  chg_p[2])
+
+            pct    = -(1 - chg_secs / base_secs) * 100
+            prefix = '' if chg_secs <= base_secs else '+'
+            difference = f'{prefix}{pct:.2f}%'
+
+            if is_significant(chg_secs, chg_err_secs, base_secs, base_err_secs):
+                if chg_secs < base_secs:
+                    chg_display = f'**{chg_dur_str}**'
+                elif chg_secs > base_secs:
+                    base_display = f'**{base_dur_str}**'
+                difference = f'**{difference}**'
+
+    print(f'| {name} | {base_display} | {chg_display} | {difference} |')
diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
@@ -298,20 +298,38 @@ impl LogSegment {
             .as_ref()
             .and_then(|hint| hint.checkpoint_schema.clone());
 
-        let listed_files = match (checkpoint_hint, time_travel_version) {
-            (Some(cp), None) => {
-                LogSegmentFiles::list_with_checkpoint_hint(&cp, storage, &log_root, log_tail, None)?
-            }
-            (Some(cp), Some(end_version)) if cp.version <= end_version => {
-                LogSegmentFiles::list_with_checkpoint_hint(
-                    &cp,
-                    storage,
-                    &log_root,
-                    log_tail,
-                    Some(end_version),
-                )?
-            }
-            _ => LogSegmentFiles::list(storage, &log_root, log_tail, None, time_travel_version)?,
+        // The end_version is the time_travel_version, if present
+        // TODO: When max catalog version is implemented, we would use that as end_version if
+        // time_travel_version is not present
+        let end_version = time_travel_version;
+
+        // Keep the hint only if it points at or before end_version, or if there is no end_version bound
+        let usable_hint = checkpoint_hint.filter(|cp| end_version.is_none_or(|v| cp.version <= v));
+
+        // Cases:
+        //
+        // 1. usable_hint present, end_version is Some  --> list_with_checkpoint_hint from hint.version TO end_version
+        // 2. usable_hint present, end_version is None  --> list_with_checkpoint_hint from hint.version unbounded
+        // 3. no usable_hint,      end_version is Some  --> backward-scan for checkpoint before end_version,
+        //                                                  list from that checkpoint TO end_version
+        //                                                  (falls back to v0 if no checkpoint found)
+        // 4. no usable_hint,      end_version is None  --> list from v0 unbounded
+
+        let listed_files = match (usable_hint, end_version) {
+            // Cases 1 and 2
+            (Some(cp), end_version) => LogSegmentFiles::list_with_checkpoint_hint(
+                &cp,
+                storage,
+                &log_root,
+                log_tail,
+                end_version,
+            )?,
+            // Case 3
+            (None, Some(end)) => LogSegmentFiles::list_with_backward_checkpoint_scan(
+                storage, &log_root, log_tail, end,
+            )?,
+            // Case 4
+            (None, None) => LogSegmentFiles::list(storage, &log_root, log_tail, None, None)?,
         };
 
         LogSegment::try_new(