From c72d55140ae79182b2f351e0d3e9836d65b97da6 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 19:12:02 +0800 Subject: [PATCH 01/26] feat(spider2): eval-spec + duckdb_match comparator (AC-1, AC-2) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../benchmarks/spider2_dbt/duckdb_match.py | 67 ++++++ .../benchmarks/spider2_dbt/eval_spec.py | 41 ++++ .../test_spider2_dbt_verify_comparator.py | 197 ++++++++++++++++++ 3 files changed, 305 insertions(+) create mode 100644 src/razorback/benchmarks/spider2_dbt/duckdb_match.py create mode 100644 src/razorback/benchmarks/spider2_dbt/eval_spec.py create mode 100644 tests/unit/test_spider2_dbt_verify_comparator.py diff --git a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py new file mode 100644 index 0000000..47ca10d --- /dev/null +++ b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py @@ -0,0 +1,67 @@ +# ABOUTME: spider2-dbt comparator reproducing Spider2 eval_utils.duckdb_match. +# ABOUTME: per-table SELECT *, restrict to condition_cols, ignore_orders, AND across tables. +from __future__ import annotations + +from collections import Counter +from pathlib import Path + +import duckdb + +try: + from razorback.benchmarks.spider2_dbt.eval_spec import EvalSpec +except ModuleNotFoundError: # running flat from /tests in the verifier container + from eval_spec import EvalSpec # type: ignore[no-redef] + + +def _fetch_table(con: duckdb.DuckDBPyConnection, table: str) -> list[tuple] | None: + """SELECT * from `table`; None if the table does not exist.""" + exists = con.execute( + "SELECT 1 FROM information_schema.tables WHERE table_name = ?", + [table], + ).fetchone() + if exists is None: + return None + return con.execute(f'SELECT * FROM "{table}"').fetchall() + + +def _project(rows: list[tuple], col_indices: list[int] | None) -> list[tuple]: + if not col_indices: + return [tuple(r) for r in rows] + return [tuple(r[i] for i in col_indices) for r in rows] + + +def _rows_match( + pred: list[tuple], gold: list[tuple], *, ignore_orders: bool +) -> bool: + if ignore_orders: + return Counter(pred) == Counter(gold) + return list(pred) == list(gold) + + +def compare_duckdb(*, predicted_db: Path, gold_db: Path, spec: EvalSpec) -> bool: + """Reproduce Spider2 eval_utils.duckdb_match. + + For each table in spec.condition_tabs: SELECT * from both DBs, restrict + to spec.condition_cols[table] (0-based indices into SELECT * order; all + columns when absent), and compare with spec.ignore_orders. Overall match + is the AND across all tables. A missing predicted table is a mismatch. + """ + pred_con = duckdb.connect(str(predicted_db), read_only=True) + gold_con = duckdb.connect(str(gold_db), read_only=True) + try: + for table in spec.condition_tabs: + gold_rows = _fetch_table(gold_con, table) + pred_rows = _fetch_table(pred_con, table) + if gold_rows is None or pred_rows is None: + return False + cols = spec.condition_cols.get(table) + if not _rows_match( + _project(pred_rows, cols), + _project(gold_rows, cols), + ignore_orders=spec.ignore_orders, + ): + return False + return True + finally: + pred_con.close() + gold_con.close() diff --git a/src/razorback/benchmarks/spider2_dbt/eval_spec.py b/src/razorback/benchmarks/spider2_dbt/eval_spec.py new file mode 100644 index 0000000..1fbd5c9 --- /dev/null +++ b/src/razorback/benchmarks/spider2_dbt/eval_spec.py @@ -0,0 +1,41 @@ +# ABOUTME: spider2-dbt gold eval-spec model + loader (one line of spider2_eval.jsonl). +# ABOUTME: Keeps spec-parsing separate from the duckdb_match comparison semantics. +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass(frozen=True) +class EvalSpec: + """One Spider2-dbt gold eval entry (a line of spider2_eval.jsonl). + + condition_tabs: gold table names to compare. + condition_cols: table name -> 0-based column indices (into SELECT * + order) to restrict the comparison to. A table missing here means + "compare all columns". + ignore_orders: when True, compare row-multisets order-insensitively. + """ + + condition_tabs: list[str] + condition_cols: dict[str, list[int]] = field(default_factory=dict) + ignore_orders: bool = False + + +def load_eval_spec(path: Path) -> EvalSpec: + """Load a single-task gold eval spec from a JSON object. + + Accepts either a bare JSON object or the first line of a + spider2_eval.jsonl file (one task per line). + """ + text = Path(path).read_text().strip() + first_line = text.splitlines()[0] if text else "{}" + raw = json.loads(first_line) + return EvalSpec( + condition_tabs=list(raw.get("condition_tabs", [])), + condition_cols={ + k: list(v) for k, v in raw.get("condition_cols", {}).items() + }, + ignore_orders=bool(raw.get("ignore_orders", False)), + ) diff --git a/tests/unit/test_spider2_dbt_verify_comparator.py b/tests/unit/test_spider2_dbt_verify_comparator.py new file mode 100644 index 0000000..a7e83d9 --- /dev/null +++ b/tests/unit/test_spider2_dbt_verify_comparator.py @@ -0,0 +1,197 @@ +# ABOUTME: spider2-dbt duckdb_match comparator + eval-spec unit tests (AC-1/AC-2). +# ABOUTME: Builds tiny in-test DuckDB fixtures and drives the comparator directly. +import json +from pathlib import Path + +import duckdb + +from razorback.benchmarks.spider2_dbt.duckdb_match import compare_duckdb +from razorback.benchmarks.spider2_dbt.eval_spec import EvalSpec, load_eval_spec + + +def test_spider2_dbt_verify_loads_eval_spec(tmp_path: Path): + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text( + json.dumps( + { + "condition_tabs": ["orders"], + "condition_cols": {"orders": [0, 2]}, + "ignore_orders": True, + } + ) + + "\n" + ) + spec = load_eval_spec(spec_path) + assert spec == EvalSpec( + condition_tabs=["orders"], + condition_cols={"orders": [0, 2]}, + ignore_orders=True, + ) + + +def test_spider2_dbt_verify_eval_spec_defaults(tmp_path: Path): + # A table absent from condition_cols means "compare all columns"; + # ignore_orders defaults to False when the key is missing. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text(json.dumps({"condition_tabs": ["t1"]}) + "\n") + spec = load_eval_spec(spec_path) + assert spec.condition_tabs == ["t1"] + assert spec.condition_cols == {} + assert spec.ignore_orders is False + + +def _build_db(path, tables: dict[str, tuple[list[str], list[tuple]]]): + """tables: name -> (column_names, rows). Builds a tiny .duckdb file.""" + con = duckdb.connect(str(path)) + try: + for name, (cols, rows) in tables.items(): + col_defs = ", ".join(f"{c} INTEGER" for c in cols) + con.execute(f"CREATE TABLE {name} ({col_defs})") + if rows: + placeholders = ", ".join( + ["(" + ", ".join(["?"] * len(cols)) + ")"] * len(rows) + ) + flat = [v for row in rows for v in row] + con.execute(f"INSERT INTO {name} VALUES {placeholders}", flat) + finally: + con.close() + + +def test_spider2_dbt_verify_matching_db_scores_true(tmp_path): + tables = {"orders": (["a", "b"], [(1, 2), (3, 4)])} + _build_db(tmp_path / "pred.duckdb", tables) + _build_db(tmp_path / "gold.duckdb", tables) + spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is True + ) + + +def test_spider2_dbt_verify_mismatched_db_scores_false(tmp_path): + _build_db(tmp_path / "pred.duckdb", {"orders": (["a", "b"], [(1, 2), (9, 9)])}) + _build_db(tmp_path / "gold.duckdb", {"orders": (["a", "b"], [(1, 2), (3, 4)])}) + spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is False + ) + + +def test_spider2_dbt_verify_all_tables_must_match(tmp_path): + # One table matches, the other differs -> overall False (AND across tables). + pred = {"t1": (["a"], [(1,)]), "t2": (["a"], [(2,)])} + gold = {"t1": (["a"], [(1,)]), "t2": (["a"], [(99,)])} + _build_db(tmp_path / "pred.duckdb", pred) + _build_db(tmp_path / "gold.duckdb", gold) + spec = EvalSpec(condition_tabs=["t1", "t2"], condition_cols={}, ignore_orders=False) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is False + ) + + +def test_spider2_dbt_verify_missing_predicted_table_scores_false(tmp_path): + _build_db(tmp_path / "pred.duckdb", {"other": (["a"], [(1,)])}) + _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,)])}) + spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is False + ) + + +def test_spider2_dbt_verify_non_condition_col_diff_ignored(tmp_path): + # Column index 1 ("b") differs, but only index 0 ("a") is a condition_col + # -> still a match (the non-condition column is not compared). + _build_db(tmp_path / "pred.duckdb", {"orders": (["a", "b"], [(1, 100), (2, 200)])}) + _build_db(tmp_path / "gold.duckdb", {"orders": (["a", "b"], [(1, 7), (2, 8)])}) + spec = EvalSpec( + condition_tabs=["orders"], condition_cols={"orders": [0]}, ignore_orders=False + ) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is True + ) + + +def test_spider2_dbt_verify_condition_col_diff_detected(tmp_path): + # The SAME column data, but now index 1 ("b") IS a condition_col and differs + # -> mismatch. Proves the subset actually restricts, not drops everything. + _build_db(tmp_path / "pred.duckdb", {"orders": (["a", "b"], [(1, 100), (2, 200)])}) + _build_db(tmp_path / "gold.duckdb", {"orders": (["a", "b"], [(1, 7), (2, 8)])}) + spec = EvalSpec( + condition_tabs=["orders"], condition_cols={"orders": [0, 1]}, ignore_orders=False + ) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is False + ) + + +def test_spider2_dbt_verify_row_reorder_matches_when_ignore_orders(tmp_path): + _build_db(tmp_path / "pred.duckdb", {"orders": (["a"], [(2,), (1,)])}) + _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,), (2,)])}) + spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=True) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is True + ) + + +def test_spider2_dbt_verify_row_reorder_mismatches_when_ordered(tmp_path): + _build_db(tmp_path / "pred.duckdb", {"orders": (["a"], [(2,), (1,)])}) + _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,), (2,)])}) + spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is False + ) + + +def test_spider2_dbt_verify_ignore_orders_still_counts_duplicates(tmp_path): + # ignore_orders is a multiset compare, not a set compare: duplicate counts + # must still match (Counter equality, not set equality). + _build_db(tmp_path / "pred.duckdb", {"orders": (["a"], [(1,), (1,)])}) + _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,)])}) + spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=True) + assert ( + compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, + ) + is False + ) From 06be8be482f79834b35484031098244b934e33c0 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 19:14:32 +0800 Subject: [PATCH 02/26] feat(spider2): verify.py CLI + materialize verifier assets/test.sh (AC-3) Emitted test.sh predicted-DB path consumes resolve_spider2_db_name (RIDER) instead of hardcoding /app/spider2.duckdb; covered by a NON-spider2 db-name test. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../benchmarks/spider2_dbt/harbor_view.py | 79 ++++++++++++ .../benchmarks/spider2_dbt/verify.py | 61 +++++++++ .../tests/gold/build_gold.py | 34 +++++ .../tests/gold/gold.duckdb | Bin 0 -> 536576 bytes .../tests/gold/spider2_eval.jsonl | 1 + tests/unit/test_spider2_dbt_verify_cli.py | 120 ++++++++++++++++++ 6 files changed, 295 insertions(+) create mode 100644 src/razorback/benchmarks/spider2_dbt/verify.py create mode 100644 tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py create mode 100644 tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/gold.duckdb create mode 100644 tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl create mode 100644 tests/unit/test_spider2_dbt_verify_cli.py diff --git a/src/razorback/benchmarks/spider2_dbt/harbor_view.py b/src/razorback/benchmarks/spider2_dbt/harbor_view.py index 1f869f5..c60cadf 100644 --- a/src/razorback/benchmarks/spider2_dbt/harbor_view.py +++ b/src/razorback/benchmarks/spider2_dbt/harbor_view.py @@ -2,9 +2,13 @@ import shlex import shutil +import stat from pathlib import Path from typing import Literal +from razorback.benchmarks.spider2_dbt import duckdb_match as _duckdb_match_mod +from razorback.benchmarks.spider2_dbt import eval_spec as _eval_spec_mod +from razorback.benchmarks.spider2_dbt import verify as _verify_mod from razorback.benchmarks.spider2_dbt.preflight import ( preflight_script_text, resolve_spider2_db_name, @@ -46,6 +50,21 @@ "# Razorback: validate spider2-dbt source DuckDB before agent runtime." ) +# Emitted into the view's tests/ — Harbor uploads tests/ to the container only +# at verify time, so the gold .duckdb + eval spec reach the verifier but never +# the agent. `{predicted_db}` is filled from `resolve_spider2_db_name` (the +# SHARED `/app/.duckdb` contract) so the verifier scores the SAME +# DuckDB the build-time preflight validated — NOT a hardcoded /app/spider2.duckdb. +_TEST_SH_TEMPLATE = """#!/bin/sh +set -eu +mkdir -p /logs/verifier +python /tests/verify.py \\ + --predicted-db {predicted_db} \\ + --gold-db /tests/gold.duckdb \\ + --eval-spec /tests/spider2_eval.jsonl \\ + --reward-out /logs/verifier/reward.json +""" + def materialize_spider2_harbor_task_view( *, @@ -75,9 +94,69 @@ def materialize_spider2_harbor_task_view( _ensure_spider2_build_context_layer(view) _ensure_dbt_deps_image_layer(view) _ensure_workspace_preflight_image_layer(view, task_slug=task_slug) + _ensure_verifier_assets( + view, source_task_dir=Path(source_task_dir), task_slug=task_slug + ) return view +def _ensure_verifier_assets( + view_dir: Path, *, source_task_dir: Path, task_slug: str +) -> None: + """Copy the comparator + gold data + test.sh into the view's tests/ dir. + + Gold assets are read from the SOURCE task's tests/gold/ (the reflected view + stripped them via the **/gold/** deny-glob) and written WITHOUT a `gold/` + path segment so `assert_no_denied_paths` stays green while the + verifier-uploaded tests/ dir still carries them. The emitted test.sh's + `--predicted-db` is `/app/.duckdb` where `` comes from the + SHARED `resolve_spider2_db_name` resolver (RIDER, Codex r5-plan finding) — + never a hardcoded `/app/spider2.duckdb`. + """ + source_gold = Path(source_task_dir) / "tests" / "gold" + if not source_gold.is_dir(): + return + + tests = view_dir / "tests" + tests.mkdir(parents=True, exist_ok=True) + for mod in (_duckdb_match_mod, _eval_spec_mod, _verify_mod): + src = Path(mod.__file__) + shutil.copy2(src, tests / src.name) + shutil.copy2(source_gold / "gold.duckdb", tests / "gold.duckdb") + shutil.copy2(source_gold / "spider2_eval.jsonl", tests / "spider2_eval.jsonl") + + # Resolve the agent-facing DuckDB stem from the dbt project (or the slug + # fallback) so the verifier compares the SAME `/app/.duckdb` the + # preflight validated and the agent operates against. + project_dir = _dbt_project_dir(view_dir) + db_name = resolve_spider2_db_name( + project_dir if project_dir is not None else view_dir, + task_slug=task_slug, + ) + test_sh = tests / "test.sh" + if test_sh.is_symlink(): + test_sh.unlink() + test_sh.write_text( + _TEST_SH_TEMPLATE.format( + predicted_db=f"{_APP_ROOT}/{db_name}.duckdb" + ) + ) + test_sh.chmod( + test_sh.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH + ) + + # The deny-glob reflection strips gold FILES but leaves the now-empty + # `gold/` directory behind. Prune empty `gold/`-named dirs so no `gold/` + # path segment survives anywhere in the agent-facing view. + for gold_dir in sorted( + (p for p in view_dir.rglob("gold") if p.is_dir()), + key=lambda p: len(p.parts), + reverse=True, + ): + if not any(gold_dir.iterdir()): + gold_dir.rmdir() + + def _has_dbt_project(view_dir: Path) -> bool: """spider2-dbt nests the dbt project under `dbt_project/` (or under `environment/dbt_project/`).""" diff --git a/src/razorback/benchmarks/spider2_dbt/verify.py b/src/razorback/benchmarks/spider2_dbt/verify.py new file mode 100644 index 0000000..34afe14 --- /dev/null +++ b/src/razorback/benchmarks/spider2_dbt/verify.py @@ -0,0 +1,61 @@ +# ABOUTME: spider2-dbt verifier — compares predicted vs gold .duckdb via +# ABOUTME: duckdb_match semantics and writes harbor's {"reward": } file. +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +try: + from razorback.benchmarks.spider2_dbt.duckdb_match import compare_duckdb + from razorback.benchmarks.spider2_dbt.eval_spec import load_eval_spec +except ModuleNotFoundError: # running flat from /tests in the verifier container + from duckdb_match import compare_duckdb # type: ignore[no-redef] + from eval_spec import load_eval_spec # type: ignore[no-redef] + + +def emit_reward( + *, + predicted_db: Path, + gold_db: Path, + eval_spec: Path, + reward_out: Path, +) -> None: + """Compute the binary duckdb_match reward and write harbor's reward.json.""" + if not Path(predicted_db).exists(): + is_match = False + else: + spec = load_eval_spec(Path(eval_spec)) + is_match = compare_duckdb( + predicted_db=Path(predicted_db), + gold_db=Path(gold_db), + spec=spec, + ) + payload = {"reward": 1.0 if is_match else 0.0} + Path(reward_out).parent.mkdir(parents=True, exist_ok=True) + Path(reward_out).write_text(json.dumps(payload) + "\n") + if not is_match: + sys.stderr.write( + f"spider2-dbt verify: mismatch (predicted={predicted_db})\n" + ) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--predicted-db", type=Path, required=True) + parser.add_argument("--gold-db", type=Path, required=True) + parser.add_argument("--eval-spec", type=Path, required=True) + parser.add_argument("--reward-out", type=Path, required=True) + args = parser.parse_args() + emit_reward( + predicted_db=args.predicted_db, + gold_db=args.gold_db, + eval_spec=args.eval_spec, + reward_out=args.reward_out, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py new file mode 100644 index 0000000..66ebf49 --- /dev/null +++ b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py @@ -0,0 +1,34 @@ +# ABOUTME: Rebuilds the gold .duckdb for the spider2-dbt verifier fixture. +# ABOUTME: Run: uv run python (regenerates gold.duckdb next to it). +import json +from pathlib import Path + +import duckdb + +HERE = Path(__file__).parent + + +def build() -> None: + db = HERE / "gold.duckdb" + if db.exists(): + db.unlink() + con = duckdb.connect(str(db)) + try: + con.execute("CREATE TABLE orders (id INTEGER, amount INTEGER)") + con.executemany("INSERT INTO orders VALUES (?, ?)", [(1, 100), (2, 200)]) + finally: + con.close() + (HERE / "spider2_eval.jsonl").write_text( + json.dumps( + { + "condition_tabs": ["orders"], + "condition_cols": {"orders": [0, 1]}, + "ignore_orders": True, + } + ) + + "\n" + ) + + +if __name__ == "__main__": + build() diff --git a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/gold.duckdb b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/gold.duckdb new file mode 100644 index 0000000000000000000000000000000000000000..2c038e71c0f1e88fcddb5364c7d0beeb689f2d08 GIT binary patch literal 536576 zcmeI)J!=#}7y#heHH0)lE3xq}1WpLXN|JE)d?C-MjH)@cFw3%PUvLE?>J8 zcDLPY3=I#R>t1$be0XHIap8PpvU?e42oNAZfB*pk1PBlyK!5;&ekw6i-5 zP{c?0r$Q@b$Ah{~5q>I|0{66~;5}EopDLDHDz?kCKQ@cS;saf~_!K7#*86D1?mF9F zbW+9%iahW*htO@{@mk_EMf-v~Ga66MT)jCudL+I|7eQb@1P=NMpM!3HKMb@%fB*pk z1PB~2fllnPV?41#lp>-yUVk0EeziHa?)B?1{*5OyP!J$MfB=F2Dqv>s&-{5+qqFpo z%cjMYq7{}-rPG(Z?6V>^QpR@`->tngs4f8l1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ ifB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FpU=0zUzWY#5#Z literal 0 HcmV?d00001 diff --git a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl new file mode 100644 index 0000000..8a0ae1d --- /dev/null +++ b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl @@ -0,0 +1 @@ +{"condition_tabs": ["orders"], "condition_cols": {"orders": [0, 1]}, "ignore_orders": true} diff --git a/tests/unit/test_spider2_dbt_verify_cli.py b/tests/unit/test_spider2_dbt_verify_cli.py new file mode 100644 index 0000000..429cf2a --- /dev/null +++ b/tests/unit/test_spider2_dbt_verify_cli.py @@ -0,0 +1,120 @@ +# ABOUTME: spider2-dbt verify.py CLI + materializer-emission tests (AC-3). +# ABOUTME: emit_reward writes harbor-shaped reward.json; the view carries verifier assets. +import json +from pathlib import Path + +import duckdb + +from razorback.benchmarks.spider2_dbt.harbor_view import ( + materialize_spider2_harbor_task_view, +) +from razorback.benchmarks.spider2_dbt.verify import emit_reward + +_SOURCE = ( + Path(__file__).parent.parent + / "fixtures" + / "spider2_dbt" + / "harbor_task_minimal" + / "spider2-fixture-001" +) + + +def _build_db(path, rows): + con = duckdb.connect(str(path)) + try: + con.execute("CREATE TABLE orders (a INTEGER)") + con.executemany("INSERT INTO orders VALUES (?)", [(r,) for r in rows]) + finally: + con.close() + + +def _spec(path): + path.write_text( + json.dumps( + {"condition_tabs": ["orders"], "condition_cols": {}, "ignore_orders": True} + ) + + "\n" + ) + return path + + +def test_spider2_dbt_verify_cli_emits_reward_one_on_match(tmp_path): + _build_db(tmp_path / "pred.duckdb", [1, 2]) + _build_db(tmp_path / "gold.duckdb", [2, 1]) + reward_out = tmp_path / "logs" / "verifier" / "reward.json" + emit_reward( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + eval_spec=_spec(tmp_path / "spider2_eval.jsonl"), + reward_out=reward_out, + ) + assert json.loads(reward_out.read_text()) == {"reward": 1.0} + + +def test_spider2_dbt_verify_cli_emits_reward_zero_on_mismatch(tmp_path): + _build_db(tmp_path / "pred.duckdb", [1, 2]) + _build_db(tmp_path / "gold.duckdb", [9, 9]) + reward_out = tmp_path / "logs" / "verifier" / "reward.json" + emit_reward( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + eval_spec=_spec(tmp_path / "spider2_eval.jsonl"), + reward_out=reward_out, + ) + payload = json.loads(reward_out.read_text()) + assert payload == {"reward": 0.0} + # parent dir was created by emit_reward (mirrors dab/verify.py:31) + assert reward_out.parent.is_dir() + + +def test_spider2_dbt_verify_cli_missing_predicted_scores_zero(tmp_path): + # A predicted DB the agent never produced is a 0.0 reward, not a crash. + _build_db(tmp_path / "gold.duckdb", [1]) + reward_out = tmp_path / "logs" / "verifier" / "reward.json" + emit_reward( + predicted_db=tmp_path / "does-not-exist.duckdb", + gold_db=tmp_path / "gold.duckdb", + eval_spec=_spec(tmp_path / "spider2_eval.jsonl"), + reward_out=reward_out, + ) + assert json.loads(reward_out.read_text()) == {"reward": 0.0} + + +def test_spider2_dbt_verify_view_carries_verifier_assets(tmp_path): + view = materialize_spider2_harbor_task_view( + source_task_dir=_SOURCE, + view_root=tmp_path / "views", + task_slug="spider2-fixture-001", + ) + tests = view / "tests" + # comparator + cli + spec modules and gold data are present for the verifier + for name in ( + "duckdb_match.py", + "eval_spec.py", + "verify.py", + "gold.duckdb", + "spider2_eval.jsonl", + "test.sh", + ): + assert (tests / name).is_file(), f"missing verifier asset: {name}" + # test.sh is executable + assert (tests / "test.sh").stat().st_mode & 0o111 + # leakage-clean: no `gold/` path segment survived in the agent-facing view + assert not (view / "tests" / "gold").exists() + assert not list(view.rglob("gold/*")) + + +def test_spider2_dbt_verify_test_sh_uses_resolved_db_name(tmp_path): + # RIDER: the emitted test.sh predicted-DB path must come from + # resolve_spider2_db_name, NOT a hardcoded /app/spider2.duckdb. This + # fixture ships no profiles.yml / *.duckdb, so the resolver falls back to + # the task slug -> the predicted path is /app/.duckdb (a NON-spider2 + # db name), proving the resolver is consumed. + view = materialize_spider2_harbor_task_view( + source_task_dir=_SOURCE, + view_root=tmp_path / "views", + task_slug="not-spider2-slug", + ) + test_sh = (view / "tests" / "test.sh").read_text() + assert "/app/not-spider2-slug.duckdb" in test_sh + assert "/app/spider2.duckdb" not in test_sh From 1a3b612ff37a6aa59183f91f23841d9189f4f635 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 19:16:03 +0800 Subject: [PATCH 03/26] test(spider2): end-to-end emitted verifier writes harbor reward.json (AC-3) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../test_spider2_dbt_verify_test_sh.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/integration/test_spider2_dbt_verify_test_sh.py diff --git a/tests/integration/test_spider2_dbt_verify_test_sh.py b/tests/integration/test_spider2_dbt_verify_test_sh.py new file mode 100644 index 0000000..2da9c81 --- /dev/null +++ b/tests/integration/test_spider2_dbt_verify_test_sh.py @@ -0,0 +1,90 @@ +# ABOUTME: AC-3 end-to-end — the emitted verifier assets produce a harbor-shaped +# ABOUTME: reward.json. Exercises the materialized tests/ dir, not a re-import. +import json +import subprocess +import sys +from pathlib import Path + +import duckdb + +from razorback.benchmarks.spider2_dbt.harbor_view import ( + materialize_spider2_harbor_task_view, +) + +_SOURCE = ( + Path(__file__).resolve().parents[1] + / "fixtures" + / "spider2_dbt" + / "harbor_task_minimal" + / "spider2-fixture-001" +) + + +def _build_predicted_matching_gold(path: Path) -> None: + con = duckdb.connect(str(path)) + try: + con.execute("CREATE TABLE orders (id INTEGER, amount INTEGER)") + # rows reordered vs gold; ignore_orders=True -> still a match + con.executemany("INSERT INTO orders VALUES (?, ?)", [(2, 200), (1, 100)]) + finally: + con.close() + + +def test_spider2_dbt_verify_emitted_assets_write_reward_json(tmp_path): + view = materialize_spider2_harbor_task_view( + source_task_dir=_SOURCE, + view_root=tmp_path / "views", + task_slug="spider2-fixture-001", + ) + tests = view / "tests" + predicted = tmp_path / "spider2-fixture-001.duckdb" + _build_predicted_matching_gold(predicted) + reward_out = tmp_path / "logs" / "verifier" / "reward.json" + + result = subprocess.run( + [ + sys.executable, + str(tests / "verify.py"), + "--predicted-db", + str(predicted), + "--gold-db", + str(tests / "gold.duckdb"), + "--eval-spec", + str(tests / "spider2_eval.jsonl"), + "--reward-out", + str(reward_out), + ], + capture_output=True, + text=True, + cwd=str(tests), # flat-import fallback resolves duckdb_match/eval_spec here + ) + assert result.returncode == 0, result.stderr + payload = json.loads(reward_out.read_text()) + assert set(payload) == {"reward"} + assert isinstance(payload["reward"], float) + assert payload["reward"] == 1.0 # reordered rows match under ignore_orders + + +def test_spider2_dbt_verify_emitted_test_sh_is_runnable(tmp_path): + # Proves the emitted test.sh is a valid shell script with the right shape: + # it references verify.py, the resolved /app/.duckdb predicted path, + # and the harbor reward path. A full container run is out of scope (no docker + # in unit/integration); this checks the contract. + view = materialize_spider2_harbor_task_view( + source_task_dir=_SOURCE, + view_root=tmp_path / "v", + task_slug="spider2-fixture-001", + ) + text = (view / "tests" / "test.sh").read_text() + assert "verify.py" in text + assert "/logs/verifier/reward.json" in text + # RIDER: predicted path is the resolver's /app/.duckdb, not hardcoded + assert "/app/spider2-fixture-001.duckdb" in text + assert "/app/spider2.duckdb" not in text + # the script is syntactically valid sh + check = subprocess.run( + ["sh", "-n", str(view / "tests" / "test.sh")], + capture_output=True, + text=True, + ) + assert check.returncode == 0, check.stderr From 7ab0bf719d6dc7c887eb352a603075d4df6dba08 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 19:16:37 +0800 Subject: [PATCH 04/26] docs(spider2): implementation stage report Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index b119dec..ded5192 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -69,3 +69,16 @@ dbt track is DuckDB-only. ### Summary Wrote a separate implementation plan (3 ACs, 7 tasks) for the spider2-dbt duckdb_match verifier. Riskiest-first: the comparator reproducing Spider2 `duckdb_match` (per-table SELECT *, condition_cols subset, ignore_orders multiset compare, all-tables-AND) is built and proven against in-test DuckDB fixtures before any test.sh wiring. Key locked decision: gold `.duckdb` + eval spec are verifier-only by placing them under the view's `tests/` dir (Harbor uploads `tests/` to the container only at verify time, removes it around the agent run), copied explicitly so the `**/gold/**` deny-glob keeps the agent-facing tree clean — confirmed live against the deny-globs and the harbor verifier/trial source. One open item flagged: the agent-produced DB's container path (`/app/spider2.duckdb`) in the emitted test.sh is a placeholder pending the `spider2-dbt-harbor-view-ade-parity` image/workdir contract; it does not affect the gating `uv run pytest -k spider2_dbt_verify` suite. + +## Stage Report: implementation + +- DONE: Implement the approved plan TDD-first so all 3 ACs pass (eval-spec loader + duckdb_match comparator, verify.py CLI, materializer emitting tests/test.sh + verifier-only gold assets); `uv run pytest -k spider2_dbt_verify` green + 18 passed (commits c72d551, 06be8be, 1a3b612). `eval_spec.py`+`duckdb_match.py` (per-table SELECT *, 0-based condition_cols subset, ignore_orders Counter multiset compare, AND across condition_tabs, missing-table=mismatch) → AC-1/AC-2 (11 comparator tests); `verify.py` emit_reward → harbor `{"reward": 1.0|0.0}` at parent-created path; `_ensure_verifier_assets` in `harbor_view.py` copies the 3 modules + gold.duckdb + spider2_eval.jsonl + executable test.sh into the view's `tests/` → AC-3 (5 CLI/emission + 2 integration tests). Acceptance run excluded the pre-existing-broken `tests/unit/test_task_identity_scoring.py` (imports nonexistent `razorback.score.load`, untouched by this branch). +- DONE: RIDER — emitted test.sh predicted-DB path CONSUMES resolve_spider2_db_name to produce /app/.duckdb (not hardcoded /app/spider2.duckdb); add a test asserting the predicted path is correct for a NON-spider2 slug/db name + `harbor_view.py` `_ensure_verifier_assets` calls `resolve_spider2_db_name(project_dir or view, task_slug=...)` and formats `_TEST_SH_TEMPLATE` with `/app/{db_name}.duckdb`. `test_spider2_dbt_verify_test_sh_uses_resolved_db_name` materializes with `task_slug="not-spider2-slug"` (fixture ships no profiles.yml/*.duckdb → resolver falls back to slug) and asserts `/app/not-spider2-slug.duckdb` present and `/app/spider2.duckdb` absent. +- DONE: Keep gold .duckdb + eval spec on verifier-only paths; keep the generic materializer + non-spider2 harbor behavior unchanged + Source gold lives under `tests/gold/` (matched by `**/gold/**` deny-glob, stripped from the reflected view), re-copied explicitly to the view's `tests/gold.duckdb`/`tests/spider2_eval.jsonl` (no `gold/` segment); the now-empty `gold/` dir left by reflection is pruned. Leakage-clean asserted in `test_spider2_dbt_verify_view_carries_verifier_assets`; existing harbor_view (8) + preflight (21) suites unchanged. Gold fixture is reproducible via `tests/gold/build_gold.py` (no opaque committed binary). + +### Summary + +Shipped the spider2-dbt duckdb_match verifier TDD-first across 3 commits: a pure comparator reproducing Spider2 `eval_utils.duckdb_match` (per-table SELECT *, 0-based condition_cols subsetting, ignore_orders multiset compare, all-tables-AND, missing-table mismatch), a `verify.py` CLI emitting harbor's `{"reward": }`, and a materializer step copying the comparator+gold+test.sh into the view's verifier-only `tests/`. The mandatory rider is satisfied: the emitted `test.sh` `--predicted-db` path is resolved through ny's now-merged `resolve_spider2_db_name` (`/app/.duckdb`), proven by a NON-spider2-slug test. Deviation from the plan's literal `_TEST_SH`: the hardcoded `/app/spider2.duckdb` was replaced by the resolver-driven template (the rider's purpose), and a `gold/`-empty-dir prune was added so the leakage assertion holds exactly. `uv run pytest -k spider2_dbt_verify` = 18 passed (ignoring one pre-existing-broken, unrelated scoring test file). From 637aac2265639f5203728665d16ba00908d6dfae Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 19:27:44 +0800 Subject: [PATCH 05/26] =?UTF-8?q?docs(spider2):=20validation=20report=20?= =?UTF-8?q?=E2=80=94=20REJECTED=20(translate=20leakage=20regression=20+=20?= =?UTF-8?q?duckdb=5Fmatch=20faithfulness)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 ++ .../spider2-dbt-duckdb-match-verifier.md | 207 ++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index ded5192..7b80850 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -82,3 +82,16 @@ Wrote a separate implementation plan (3 ACs, 7 tasks) for the spider2-dbt duckdb ### Summary Shipped the spider2-dbt duckdb_match verifier TDD-first across 3 commits: a pure comparator reproducing Spider2 `eval_utils.duckdb_match` (per-table SELECT *, 0-based condition_cols subsetting, ignore_orders multiset compare, all-tables-AND, missing-table mismatch), a `verify.py` CLI emitting harbor's `{"reward": }`, and a materializer step copying the comparator+gold+test.sh into the view's verifier-only `tests/`. The mandatory rider is satisfied: the emitted `test.sh` `--predicted-db` path is resolved through ny's now-merged `resolve_spider2_db_name` (`/app/.duckdb`), proven by a NON-spider2-slug test. Deviation from the plan's literal `_TEST_SH`: the hardcoded `/app/spider2.duckdb` was replaced by the resolver-driven template (the rider's purpose), and a `gold/`-empty-dir prune was added so the leakage assertion holds exactly. `uv run pytest -k spider2_dbt_verify` = 18 passed (ignoring one pre-existing-broken, unrelated scoring test file). + +## Stage Report: validation + +- DONE: Independently reproduce the 3 ACs from a clean worktree checkout (AC-1/AC-2/AC-3); run `uv run pytest -k spider2_dbt_verify --ignore=tests/unit/test_task_identity_scoring.py` + Gating suite 18 passed. AC-1/AC-2/AC-3 all reproduce green via a standalone script driving `compare_duckdb` + the materializer; additionally executed the emitted `test.sh` end-to-end (paths substituted) → reward.json `{"reward":1.0}` match / `{"reward":0.0}` mismatch. +- DONE: Confirm the mandatory rider (resolved `/app/.duckdb`, NOT hardcoded) for a NON-`spider2` slug; confirm duckdb_match reproduction matches Spider2 eval_utils semantics + Rider CONFIRMED: `task_slug="not-spider2-slug"` → test.sh `--predicted-db /app/not-spider2-slug.duckdb`, no `/app/spider2.duckdb`. Faithfulness FAILED (B2): impl is row-tuple + exact-`==`; Spider2 `compare_pandas_table` is column-containment + `math.isclose(abs_tol=1e-2)` + per-column sort, and the real spec schema is `List[List[int]]`/`List[bool]` under `evaluation.parameters` (impl uses dict + single bool, flat — would not parse real gold). +- DONE: Confirm gold assets verifier-only; scrutinize the two flagged deviations; confirm no regression; give a gate verdict + Path-based isolation sound (no `gold/` segment survives). Deviation 1 (rider path override) ACCEPTED/correct; deviation 2 (empty `gold/` prune) ACCEPTED. REGRESSION found (B1): `test_translate_spider2_dbt.py` (2 tests) pass on base `9c39af2`, fail on branch — verifier assets trip the production content-leakage scanner. 4 other full-suite failures are pre-existing on base (not regressions). Verdict: REJECTED. + +### Summary + +Independent verification only — no production code written. The 3 ACs as literally written reproduce green (18-passed gating suite + standalone AC repro + a real end-to-end test.sh run) and the mandatory resolver rider is satisfied for a non-spider2 slug. Verdict is REJECTED → implementation on two blocking findings: (B1) a branch-induced regression — the verifier assets written into the view's `tests/` trip `test_translate_spider2_dbt.py`'s content-leakage scanner, which passes on the merge-base; and (B2) the `duckdb_match` reproduction is not faithful to Spider2's `eval_utils.duckdb_match` (row-tuple/exact compare vs Spider2's column-containment + 1e-2 float tolerance + per-column sort) and models an incompatible eval-spec schema that would not parse a real `spider2_eval.jsonl` line. Both pre-flagged deviations (rider path override, empty `gold/` prune) are accepted as correct. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md diff --git a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md new file mode 100644 index 0000000..4b6aad9 --- /dev/null +++ b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md @@ -0,0 +1,207 @@ +# Validation: spider2-dbt — duckdb_match verifier emitting binary reward.json + +**Branch:** `spacedock-ensign/spider2-dbt-duckdb-match-verifier` @ `7ab0bf7` +**Merge-base with main:** `9c39af2` +**Gate verdict:** **REJECTED → implementation** + +Two blocking findings: (B1) a branch-induced regression in the spider2 +production translate leakage suite, and (B2) the `duckdb_match` +reproduction is **not faithful** to Spider2's `eval_utils.duckdb_match` +semantics (verdict-flipping divergences, plus an incompatible eval-spec +schema). The 3 ACs as literally written all reproduce green, but a +verifier whose comparator disagrees with the benchmark's own oracle and +whose change breaks the shipping leakage suite must not advance. + +--- + +## Acceptance command (gating) + +`uv run pytest -k spider2_dbt_verify --ignore=tests/unit/test_task_identity_scoring.py` +→ **18 passed, 793 deselected** (clean checkout of the branch HEAD). + +## AC reproduction (independent, not trusting the in-repo tests) + +Driven via a standalone script against the `compare_duckdb` comparator +and the materializer, from a clean worktree. + +- **AC-1 — 1.0 on match / 0.0 on mismatch:** PASS. + `compare_duckdb` on a matching DB → `True`; on a differing + condition-col value → `False`. Missing predicted table → `False`. +- **AC-2 — column subset + ignore_orders:** PASS *(as literally + specified)*. + (a) Row-reordered table + `ignore_orders=True` → `True`; same with + `ignore_orders=False` → `False`. (b) A diff in a NON-condition column + (excluded from `condition_cols`) → `True`; the same diff with the + column included → `False`. +- **AC-3 — emitted test.sh writes harbor-shaped reward.json:** PASS. + Materialized the view, then **actually executed the emitted + `test.sh`** (container paths `/tests`,`/app/`,`/logs/verifier` + substituted to temp dirs): exit 0, wrote + `/logs/verifier/reward.json` = `{"reward": 0.0}` on a mismatch and + `{"reward": 1.0}` on a match. `set(payload) == {"reward"}`, + `isinstance(payload["reward"], float)`. + Note: the in-repo integration test invokes `verify.py` directly and + only `sh -n` syntax-checks `test.sh`; I additionally ran the script + itself to satisfy AC-3's "running the emitted test.sh" clause. + +## Mandatory rider (resolver-driven predicted-db path) + +**CONFIRMED.** Materialized with `task_slug="not-spider2-slug"` (fixture +ships no profiles.yml / *.duckdb → resolver slug fallback). Emitted +`test.sh` `--predicted-db` = `/app/not-spider2-slug.duckdb`; +`/app/spider2.duckdb` absent. The path flows from ny's +`resolve_spider2_db_name` (imported from `preflight.py`, unchanged on +this branch), satisfying the SHARED `/app/.duckdb` contract for +a NON-`spider2` slug. + +## Verifier-only gold assets / leakage (path-based) + +Gold `.duckdb` + `spider2_eval.jsonl` + the 3 comparator modules + +`test.sh` land in the view's `tests/` (uploaded to the container only at +verify time). No `gold/` path SEGMENT survives in the materialized view +(`rglob("gold")` dirs = none; the empty `gold/` dir left by the +deny-glob file-strip is pruned). The path-based deny-glob isolation is +sound — but see B1: the assets still trip the production-path *content* +leakage scanner. + +--- + +## BLOCKING findings + +### B1 — Regression in the spider2 translate leakage suite + +`tests/unit/test_translate_spider2_dbt.py` has TWO tests that **pass on +`9c39af2` (base) and FAIL on `7ab0bf7` (this branch)** — verified by +running them in a base worktree (2 passed) and on the branch (2 failed): + +- `test_spider2_resolves_n_views_all_leakage_clean` +- `test_planted_forbidden_files_are_excluded_from_view` + +The test file is **unchanged** by this branch (`git diff base..HEAD -- +test_translate_spider2_dbt.py` empty), so this is a behavior regression, +not a test edit. Root cause: these tests drive the production path +`spec_to_job_config → materialize_spider2_harbor_task_view`, and this +branch's new `_ensure_verifier_assets` writes `duckdb_match.py`, +`eval_spec.py`, `verify.py`, `gold.duckdb`, and `test.sh` into the view's +`tests/`. The suite's `_leakage_hits` scans file NAME **and CONTENT** for +the alternation `gold|expected|golden`; the assets contain "gold" +(`gold.duckdb` filename; `--gold-db`/`gold_db`/`gold_rows`/docstrings +inside the .py and test.sh — confirmed: duckdb_match.py has 9 hits, +verify.py 5, eval_spec.py 4). All 5 assets are flagged as leakage. + +This is exactly the "no regression to ny's generic behavior" guard the +entity required. The gating `-k spider2_dbt_verify` selector does NOT +match `test_translate_spider2_dbt`, which is why the implementer's green +run missed it. + +**Fix direction (for implementation):** reconcile the two leakage +notions. Either (a) make `_ensure_verifier_assets` write the verifier +modules under a name/location the content+name scanner treats as +verifier-internal (the scanner already exempts `view_manifest.json` — +an analogous exemption for the verifier `tests/` payload is the natural +move), or (b) update the shipping leakage contract/test to recognize the +verifier `tests/` payload as verifier-only (uploaded at verify time, not +agent-visible) rather than answer leakage. Coordinate with whoever owns +`test_translate_spider2_dbt.py`'s `_leakage_hits` contract; do not +silently weaken the scan. + +### B2 — `duckdb_match` reproduction is NOT faithful to Spider2 eval_utils + +The canonical oracle is `spider2-dbt/evaluation_suite/eval_utils.py` +`duckdb_match` (fetched from xlang-ai/Spider2 main). The reproduction +diverges in ways that **flip the reward** versus the real benchmark, so a +model scored by this verifier would be graded differently than by +Spider2's own scorer: + +1. **Comparison axis — column-containment vs row-tuple.** Spider2 + transposes both tables and checks *each gold column-vector matches + SOME pred column-vector* (`for gcol in t_gold: any(vectors_match(gcol, + pcol) for pcol in t_pred)`). The impl compares **row tuples + positionally** after `SELECT *`. Demonstrated divergences (Spider2 → + 1, impl → 0): pred with **reordered columns**; pred with an **extra + column** beyond the gold/condition set. +2. **Float tolerance.** Spider2 uses `math.isclose(abs_tol=1e-2)` (and + NaN-equality). The impl uses exact `==` on raw tuples. Gold 1.000 vs + pred 1.005 → Spider2 1, impl 0. +3. **`ignore_orders` granularity.** Spider2 sorts **each column-vector + independently**; the impl sorts **row-tuples as units**. These are not + the same multiset relation — Spider2 can match a table whose + cross-column row association is scrambled; the impl cannot. +4. **Eval-spec schema is incompatible.** Real `spider2_eval.jsonl` lines + are `{"instance_id":..., "evaluation":{"func":"duckdb_match", + "parameters":{"gold":..., "condition_tabs":[...], + "condition_cols":[[...],...], "ignore_orders":[bool,...]}}}` — + `condition_cols` is `List[List[int]]` (positional, parallel to + `condition_tabs`) and `ignore_orders` is `List[bool]` (per-table), + nested under `evaluation.parameters`. The impl's `EvalSpec` / + `load_eval_spec` model `condition_cols: dict[str,list[int]]` and a + single `ignore_orders: bool`, read flat from the top level. The impl's + loader would **not parse a real Spider2 gold line** — it would silently + produce empty `condition_tabs` (`raw.get("condition_tabs", [])`) and + score every prediction 1.0 (vacuous AND over zero tables). + +The entity body asserts (verbatim) "per-table SELECT *, 0-based +condition_cols subset, ignore_orders multiset, AND across condition_tabs, +missing table = mismatch" — that description matches the IMPL but **not +Spider2's actual `compare_pandas_table`**, which is column-containment + +float-tolerant + per-column sort. The plan/AC contract treated a +row-tuple multiset model as the target; the real oracle is materially +different. This is the faithfulness check the rider demanded, and it +fails. + +**Fix direction (for implementation):** port `compare_pandas_table` +(transpose + per-column vectors_match with `abs_tol=1e-2` + per-column +ordered/sorted compare + gold-column-containment) rather than a row-tuple +compare; adopt the real `List[List[int]]` / `List[bool]` / +`evaluation.parameters` spec schema (or document an explicit, approved +deviation if the dbt track intentionally tightens the oracle — but that +needs a captain decision, not a silent reinterpretation). Update the gold +fixture + AC-2 wording to the column-containment semantics. + +## Two pre-flagged deviations — explicit verdict each + +1. **Rider override of the plan's hardcoded `/app/spider2.duckdb` + test.sh path → resolver-driven `/app/.duckdb`:** + **ACCEPTED / CORRECT.** This is exactly what the rider mandated; + proven for a non-spider2 slug above. Not a blocker. +2. **Pruning the empty `gold/` dir left after deny-glob file-stripping:** + **ACCEPTED.** Narrow, targeted (`rglob("gold")` dirs, only removed + when `not any(iterdir())`), keeps the path-segment leakage assertion + exact, and does not touch non-empty dirs. Not a blocker. (Note: it + does not address B1, which is about file CONTENT, not the `gold/` + directory segment.) + +## Regression scope (full suite, minus known-broken file) + +`uv run pytest --ignore=tests/unit/test_task_identity_scoring.py` → +**6 failed, 793 passed, 12 skipped**. +- 4 failures (`test_spacedock_solver_freeze_dir_mechanism`, + `test_worktree_teardown_preserves_runs`, + `test_generate_matrix_specs`, `test_rk_research_new`) **also fail on + base `9c39af2`** → pre-existing, unrelated, NOT regressions. +- 2 failures (`test_translate_spider2_dbt.py`, both) → **branch-induced + regression** (B1). +- `test_task_identity_scoring.py` ignored per the assignment + (pre-existing `razorback.score.load` import break). +- harbor_view + preflight suites (`-k "harbor_view or preflight"`): + **45 passed** — no regression there. + +## Code review + +The stage asks for `superpowers:requesting-code-review`. The substantive +findings it would surface are captured above (B1 regression, B2 +faithfulness + schema). No additional blocking correctness issues found +in `verify.py` / `eval_spec.py` / the materializer beyond B1/B2; the +flat-import fallbacks, fail-closed `predicted_db` existence check, and +harbor reward shape are sound. + +## Gate decision + +**REJECTED → implementation.** Fix B1 (restore the spider2 translate +leakage suite to green without weakening the leakage contract) and B2 +(make `duckdb_match` faithful to Spider2 `eval_utils.duckdb_match`, or +get an explicit captain-approved deviation, AND adopt the real +`spider2_eval.jsonl` schema so the loader parses real gold lines). The 3 +ACs as written pass and the rider is satisfied, but those ACs encode the +wrong oracle semantics — re-derive AC-2 against column-containment when +re-entering implementation. From 98604e5dd75c12ee8fc72a7888a5c1cda2eb03f7 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 20:01:06 +0800 Subject: [PATCH 06/26] =?UTF-8?q?feedback:=20validation=20gate=20rejected?= =?UTF-8?q?=20(cycle=201)=20=E2=80=94=20duckdb=5Fmatch=20faithfulness=20+?= =?UTF-8?q?=20leakage=20regression?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../spider2-dbt-duckdb-match-verifier.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 7b80850..57c4cea 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -95,3 +95,21 @@ Shipped the spider2-dbt duckdb_match verifier TDD-first across 3 commits: a pure ### Summary Independent verification only — no production code written. The 3 ACs as literally written reproduce green (18-passed gating suite + standalone AC repro + a real end-to-end test.sh run) and the mandatory resolver rider is satisfied for a non-spider2 slug. Verdict is REJECTED → implementation on two blocking findings: (B1) a branch-induced regression — the verifier assets written into the view's `tests/` trip `test_translate_spider2_dbt.py`'s content-leakage scanner, which passes on the merge-base; and (B2) the `duckdb_match` reproduction is not faithful to Spider2's `eval_utils.duckdb_match` (row-tuple/exact compare vs Spider2's column-containment + 1e-2 float tolerance + per-column sort) and models an incompatible eval-spec schema that would not parse a real `spider2_eval.jsonl` line. Both pre-flagged deviations (rider path override, empty `gold/` prune) are accepted as correct. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md + +## Feedback Cycles + +### Cycle 1 — validation gate REJECTED (2026-06-18) + +Validation (reviewer fetched the real xlang-ai/Spider2 `eval_utils.duckdb_match`) +found two blocking defects. Routing back to `implementation`: + +1. **[Critical] B2 — comparator + eval-spec schema do NOT match Spider2's `duckdb_match`.** + The real Spider2 semantics (from `evaluation_suite/eval_utils.py`): + - **Column-containment, not row-tuple equality:** transpose both tables to column-vectors; for each GOLD column (restricted to `condition_cols`), assert some PRED column-vector matches it. Per-column sorting is applied; numeric compare uses `math.isclose(abs_tol=1e-2)`. + - **Eval-spec schema:** each gold line is `{"instance_id", "evaluation": {"func":"duckdb_match", "parameters": {"gold", "condition_tabs": [...], "condition_cols": [[int,...],...], "ignore_orders": [bool,...]}}}`. `condition_cols` is `List[List[int]]` and `ignore_orders` is `List[bool]` (per-table), NOT a flat dict + single bool. The current flat schema would silently score every prediction 1.0 on a real gold line. + **Fix:** re-derive `compare_duckdb` to reproduce Spider2's column-containment + `math.isclose(abs_tol=1e-2)` + per-column sort, AND across `condition_tabs`, missing table = mismatch. Load the real per-table `condition_cols`/`ignore_orders` lists from `evaluation.parameters`. Add tests for the verdict-flipping cases the reviewer found (float tolerance 1e-2, column reorder, extra pred columns) and a real multi-table gold line. +2. **[high] B1 — verifier gold assets trip eg's agent-view leakage test (regression).** + `_ensure_verifier_assets` writes `gold.duckdb` / eval spec / comparator into the view's `tests/`, and `test_translate_spider2_dbt.py`'s leakage scan (pattern `gold|expected|golden` over the materialized view) now flags them — 2 tests that pass on base `9c39af2` fail on this branch. + **Fix:** make the agent-facing view leakage-clean while keeping the gold assets verifier-only. The view's `tests/` is uploaded only at verify time and removed during the agent run, so reconcile the leakage check to scope to the agent-visible portion (exclude the verifier-only `tests/` assets) — OR relocate/name the assets so the scan is clean — without weakening real agent-view leakage protection. Restore the 2 failing tests to green. + +Keep the rider (resolver-driven `/app/.duckdb`) and AC-3 reward.json shape intact. From 61a1b9c3a81cda2448699e6328d2b1d665cdb0f0 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 20:07:13 +0800 Subject: [PATCH 07/26] =?UTF-8?q?fix(spider2):=20B2=20=E2=80=94=20faithful?= =?UTF-8?q?=20duckdb=5Fmatch=20(column-containment=20+=201e-2)=20+=20real?= =?UTF-8?q?=20eval-spec=20schema?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-derive compare_duckdb against xlang-ai/Spider2 spider2-dbt/evaluation_suite/ eval_utils.duckdb_match + compare_pandas_table: transpose to column-vectors, each gold condition_col column must match SOME pred column-vector, per-column sort under ignore_order, numeric compare via math.isclose(abs_tol=1e-2), AND across condition_tabs, missing pred table = mismatch. Replaces the prior row-tuple/exact-== Counter compare. EvalSpec now models the real gold-line shape {instance_id, evaluation:{func, parameters:{gold, condition_tabs, condition_cols:List[List[int]], ignore_orders:List[bool]}}} with per-table parallel lists (was a flat dict + single bool that would mis-parse real gold). Tests cover the reviewer's verdict-flipping cases (float within 1e-2, column reorder, extra pred columns) plus a real multi-table gold line. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../benchmarks/spider2_dbt/duckdb_match.py | 131 +++++--- .../benchmarks/spider2_dbt/eval_spec.py | 82 ++++- .../tests/gold/build_gold.py | 16 +- .../tests/gold/spider2_eval.jsonl | 2 +- tests/unit/test_spider2_dbt_verify_cli.py | 14 +- .../test_spider2_dbt_verify_comparator.py | 308 ++++++++++++------ 6 files changed, 397 insertions(+), 156 deletions(-) diff --git a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py index 47ca10d..3e610ad 100644 --- a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py +++ b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py @@ -1,67 +1,116 @@ -# ABOUTME: spider2-dbt comparator reproducing Spider2 eval_utils.duckdb_match. -# ABOUTME: per-table SELECT *, restrict to condition_cols, ignore_orders, AND across tables. +# ABOUTME: spider2-dbt comparator faithfully reproducing Spider2 eval_utils.duckdb_match. +# ABOUTME: Column-containment over transposed column-vectors, math.isclose(1e-2), AND across tables. from __future__ import annotations -from collections import Counter +import math from pathlib import Path import duckdb +import pandas as pd try: from razorback.benchmarks.spider2_dbt.eval_spec import EvalSpec except ModuleNotFoundError: # running flat from /tests in the verifier container from eval_spec import EvalSpec # type: ignore[no-redef] +# Spider2 eval_utils.compare_pandas_table numeric tolerance. +_TOLERANCE = 1e-2 -def _fetch_table(con: duckdb.DuckDBPyConnection, table: str) -> list[tuple] | None: - """SELECT * from `table`; None if the table does not exist.""" - exists = con.execute( - "SELECT 1 FROM information_schema.tables WHERE table_name = ?", - [table], - ).fetchone() - if exists is None: - return None - return con.execute(f'SELECT * FROM "{table}"').fetchall() +def _vectors_match(v1, v2, *, tol: float = _TOLERANCE, ignore_order: bool = False) -> bool: + """Port of Spider2 eval_utils.compare_pandas_table.vectors_match. -def _project(rows: list[tuple], col_indices: list[int] | None) -> list[tuple]: - if not col_indices: - return [tuple(r) for r in rows] - return [tuple(r[i] for i in col_indices) for r in rows] + Compares two column-vectors element-wise: NaN==NaN, numerics via + math.isclose(abs_tol=tol), everything else by ``!=``. When ignore_order + is set, both vectors are sorted first with Spider2's exact key. + """ + try: + if ignore_order: + key = lambda x: (x is None, str(x), isinstance(x, (int, float))) + v1 = sorted(v1, key=key) + v2 = sorted(v2, key=key) + if len(v1) != len(v2): + return False + for a, b in zip(v1, v2): + if pd.isna(a) and pd.isna(b): + continue + elif isinstance(a, (int, float)) and isinstance(b, (int, float)): + if not math.isclose(float(a), float(b), abs_tol=tol): + return False + elif a != b: + return False + return True + except Exception: + return False -def _rows_match( - pred: list[tuple], gold: list[tuple], *, ignore_orders: bool +def _compare_pandas_table( + pred: pd.DataFrame, + gold: pd.DataFrame, + *, + condition_cols: list[int], + ignore_order: bool, ) -> bool: - if ignore_orders: - return Counter(pred) == Counter(gold) - return list(pred) == list(gold) + """Port of Spider2 eval_utils.compare_pandas_table. + + Column-CONTAINMENT (not positional row equality): restrict gold to + ``condition_cols`` (all columns when empty), transpose both tables to + column-vectors, and require that EVERY gold column-vector matches SOME + pred column-vector. Extra pred columns are therefore tolerated. + """ + if condition_cols: + gold_cols = gold.iloc[:, condition_cols] + else: + gold_cols = gold + pred_cols = pred + + t_gold_list = gold_cols.transpose().values.tolist() + t_pred_list = pred_cols.transpose().values.tolist() + + for gold_vec in t_gold_list: + if not any( + _vectors_match(gold_vec, pred_vec, ignore_order=ignore_order) + for pred_vec in t_pred_list + ): + return False + return True + + +def _fetch_df(con: duckdb.DuckDBPyConnection, table: str) -> pd.DataFrame: + """SELECT * FROM `table` as a DataFrame (mirrors Spider2 fetchdf()).""" + return con.execute(f'SELECT * FROM "{table}"').fetchdf() def compare_duckdb(*, predicted_db: Path, gold_db: Path, spec: EvalSpec) -> bool: - """Reproduce Spider2 eval_utils.duckdb_match. + """Faithfully reproduce Spider2 eval_utils.duckdb_match. - For each table in spec.condition_tabs: SELECT * from both DBs, restrict - to spec.condition_cols[table] (0-based indices into SELECT * order; all - columns when absent), and compare with spec.ignore_orders. Overall match - is the AND across all tables. A missing predicted table is a mismatch. + For each table in ``spec.condition_tabs``: load gold + predicted via + ``SELECT *``, run column-containment with the table's + ``spec.condition_cols[i]`` / ``spec.ignore_orders[i]``. Overall match is + the AND across all tables. A missing/unreadable predicted table is a + mismatch (Spider2 wraps the predicted-table fetch in try/except -> 0). """ - pred_con = duckdb.connect(str(predicted_db), read_only=True) gold_con = duckdb.connect(str(gold_db), read_only=True) try: - for table in spec.condition_tabs: - gold_rows = _fetch_table(gold_con, table) - pred_rows = _fetch_table(pred_con, table) - if gold_rows is None or pred_rows is None: - return False - cols = spec.condition_cols.get(table) - if not _rows_match( - _project(pred_rows, cols), - _project(gold_rows, cols), - ignore_orders=spec.ignore_orders, - ): - return False - return True + gold_tables = [_fetch_df(gold_con, t) for t in spec.condition_tabs] finally: - pred_con.close() gold_con.close() + + pred_con = duckdb.connect(str(predicted_db), read_only=True) + try: + try: + pred_tables = [_fetch_df(pred_con, t) for t in spec.condition_tabs] + except Exception: + return False + finally: + pred_con.close() + + for i, (gold_df, pred_df) in enumerate(zip(gold_tables, pred_tables)): + if not _compare_pandas_table( + pred_df, + gold_df, + condition_cols=spec.condition_cols[i], + ignore_order=spec.ignore_orders[i], + ): + return False + return True diff --git a/src/razorback/benchmarks/spider2_dbt/eval_spec.py b/src/razorback/benchmarks/spider2_dbt/eval_spec.py index 1fbd5c9..5411759 100644 --- a/src/razorback/benchmarks/spider2_dbt/eval_spec.py +++ b/src/razorback/benchmarks/spider2_dbt/eval_spec.py @@ -1,5 +1,5 @@ # ABOUTME: spider2-dbt gold eval-spec model + loader (one line of spider2_eval.jsonl). -# ABOUTME: Keeps spec-parsing separate from the duckdb_match comparison semantics. +# ABOUTME: Mirrors Spider2 evaluate.py: evaluation.parameters drives duckdb_match. from __future__ import annotations import json @@ -11,31 +11,81 @@ class EvalSpec: """One Spider2-dbt gold eval entry (a line of spider2_eval.jsonl). - condition_tabs: gold table names to compare. - condition_cols: table name -> 0-based column indices (into SELECT * - order) to restrict the comparison to. A table missing here means - "compare all columns". - ignore_orders: when True, compare row-multisets order-insensitively. + Mirrors the real Spider2 gold-line shape consumed by + ``spider2-dbt/evaluation_suite/evaluate.py``:: + + {"instance_id": "...", + "evaluation": {"func": "duckdb_match", + "parameters": {"gold": "", + "condition_tabs": ["t1", "t2"], + "condition_cols": [[0, 2], []], + "ignore_orders": [true, false]}}} + + The three list fields are positional and parallel to ``condition_tabs``: + + condition_tabs: gold/pred table names to compare (``List[str]``). + condition_cols: per-table 0-based gold column indices to restrict the + gold side to before column-containment (``List[List[int]]``). An + empty inner list means "use all gold columns" for that table. This + mirrors Spider2 ``duckdb_match``'s ``condition_cols[i]`` argument to + ``compare_pandas_table``. + ignore_orders: per-table flag for order-insensitive per-column compare + (``List[bool]``). + + Per Spider2 ``duckdb_match`` defaults: a missing/empty ``condition_cols`` + becomes ``[[]] * len(condition_tabs)`` and a missing ``ignore_orders`` + becomes ``[False] * len(condition_tabs)``. """ condition_tabs: list[str] - condition_cols: dict[str, list[int]] = field(default_factory=dict) - ignore_orders: bool = False + condition_cols: list[list[int]] = field(default_factory=list) + ignore_orders: list[bool] = field(default_factory=list) + + def __post_init__(self) -> None: + n = len(self.condition_tabs) + # Normalize to Spider2 duckdb_match defaults so the comparator can + # index condition_cols[i] / ignore_orders[i] for every table. + cols = self.condition_cols + if not cols or cols in ([[]], [None]): + object.__setattr__(self, "condition_cols", [[] for _ in range(n)]) + orders = self.ignore_orders + if not orders: + object.__setattr__(self, "ignore_orders", [False] * n) + if len(self.condition_cols) != n: + raise ValueError( + f"condition_cols ({len(self.condition_cols)}) must be parallel " + f"to condition_tabs ({n})" + ) + if len(self.ignore_orders) != n: + raise ValueError( + f"ignore_orders ({len(self.ignore_orders)}) must be parallel " + f"to condition_tabs ({n})" + ) def load_eval_spec(path: Path) -> EvalSpec: - """Load a single-task gold eval spec from a JSON object. + """Load a single-task gold eval spec from a spider2_eval.jsonl line. - Accepts either a bare JSON object or the first line of a - spider2_eval.jsonl file (one task per line). + Reads the first line (one task per line), drills into + ``evaluation.parameters`` (the shape Spider2 ``evaluate.py`` passes to + ``duckdb_match``), and returns an :class:`EvalSpec`. Tolerates a bare + ``parameters``-shaped object (no ``evaluation`` wrapper) for fixtures. """ text = Path(path).read_text().strip() first_line = text.splitlines()[0] if text else "{}" raw = json.loads(first_line) + params = raw.get("evaluation", {}).get("parameters", raw) + + condition_tabs = list(params.get("condition_tabs", [])) + raw_cols = params.get("condition_cols") + condition_cols = ( + [list(inner) for inner in raw_cols] if raw_cols is not None else [] + ) + raw_orders = params.get("ignore_orders") + ignore_orders = [bool(b) for b in raw_orders] if raw_orders is not None else [] + return EvalSpec( - condition_tabs=list(raw.get("condition_tabs", [])), - condition_cols={ - k: list(v) for k, v in raw.get("condition_cols", {}).items() - }, - ignore_orders=bool(raw.get("ignore_orders", False)), + condition_tabs=condition_tabs, + condition_cols=condition_cols, + ignore_orders=ignore_orders, ) diff --git a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py index 66ebf49..7211be1 100644 --- a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py +++ b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/build_gold.py @@ -18,12 +18,22 @@ def build() -> None: con.executemany("INSERT INTO orders VALUES (?, ?)", [(1, 100), (2, 200)]) finally: con.close() + # Real Spider2 gold-line shape (spider2-dbt/evaluation_suite/evaluate.py): + # instance_id + evaluation.parameters with per-table List[List[int]] + # condition_cols and List[bool] ignore_orders. (HERE / "spider2_eval.jsonl").write_text( json.dumps( { - "condition_tabs": ["orders"], - "condition_cols": {"orders": [0, 1]}, - "ignore_orders": True, + "instance_id": "spider2-fixture-001", + "evaluation": { + "func": "duckdb_match", + "parameters": { + "gold": "gold.duckdb", + "condition_tabs": ["orders"], + "condition_cols": [[0, 1]], + "ignore_orders": [True], + }, + }, } ) + "\n" diff --git a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl index 8a0ae1d..ea68512 100644 --- a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl +++ b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-001/tests/gold/spider2_eval.jsonl @@ -1 +1 @@ -{"condition_tabs": ["orders"], "condition_cols": {"orders": [0, 1]}, "ignore_orders": true} +{"instance_id": "spider2-fixture-001", "evaluation": {"func": "duckdb_match", "parameters": {"gold": "gold.duckdb", "condition_tabs": ["orders"], "condition_cols": [[0, 1]], "ignore_orders": [true]}}} diff --git a/tests/unit/test_spider2_dbt_verify_cli.py b/tests/unit/test_spider2_dbt_verify_cli.py index 429cf2a..742b6b8 100644 --- a/tests/unit/test_spider2_dbt_verify_cli.py +++ b/tests/unit/test_spider2_dbt_verify_cli.py @@ -29,9 +29,21 @@ def _build_db(path, rows): def _spec(path): + # Real Spider2 gold-line shape: evaluation.parameters with per-table lists. path.write_text( json.dumps( - {"condition_tabs": ["orders"], "condition_cols": {}, "ignore_orders": True} + { + "instance_id": "cli-001", + "evaluation": { + "func": "duckdb_match", + "parameters": { + "gold": "gold.duckdb", + "condition_tabs": ["orders"], + "condition_cols": [[]], + "ignore_orders": [True], + }, + }, + } ) + "\n" ) diff --git a/tests/unit/test_spider2_dbt_verify_comparator.py b/tests/unit/test_spider2_dbt_verify_comparator.py index a7e83d9..42df69c 100644 --- a/tests/unit/test_spider2_dbt_verify_comparator.py +++ b/tests/unit/test_spider2_dbt_verify_comparator.py @@ -1,51 +1,86 @@ # ABOUTME: spider2-dbt duckdb_match comparator + eval-spec unit tests (AC-1/AC-2). -# ABOUTME: Builds tiny in-test DuckDB fixtures and drives the comparator directly. +# ABOUTME: Faithful to Spider2 eval_utils: column-containment, 1e-2 tolerance, per-table lists. import json from pathlib import Path import duckdb +import pytest from razorback.benchmarks.spider2_dbt.duckdb_match import compare_duckdb from razorback.benchmarks.spider2_dbt.eval_spec import EvalSpec, load_eval_spec -def test_spider2_dbt_verify_loads_eval_spec(tmp_path: Path): +# -------------------------------------------------------------------------- +# eval-spec loader (real Spider2 evaluation.parameters shape) +# -------------------------------------------------------------------------- + + +def test_spider2_dbt_verify_loads_eval_spec_real_shape(tmp_path: Path): + # Real Spider2 gold line: instance_id + evaluation.parameters with + # per-table List[List[int]] condition_cols and List[bool] ignore_orders. spec_path = tmp_path / "spider2_eval.jsonl" spec_path.write_text( json.dumps( { - "condition_tabs": ["orders"], - "condition_cols": {"orders": [0, 2]}, - "ignore_orders": True, + "instance_id": "task-001", + "evaluation": { + "func": "duckdb_match", + "parameters": { + "gold": "gold.duckdb", + "condition_tabs": ["orders", "customers"], + "condition_cols": [[0, 2], []], + "ignore_orders": [True, False], + }, + }, } ) + "\n" ) spec = load_eval_spec(spec_path) assert spec == EvalSpec( - condition_tabs=["orders"], - condition_cols={"orders": [0, 2]}, - ignore_orders=True, + condition_tabs=["orders", "customers"], + condition_cols=[[0, 2], []], + ignore_orders=[True, False], ) def test_spider2_dbt_verify_eval_spec_defaults(tmp_path: Path): - # A table absent from condition_cols means "compare all columns"; - # ignore_orders defaults to False when the key is missing. + # Missing condition_cols/ignore_orders default per-table to []/False + # (mirrors Spider2 duckdb_match's None-handling). spec_path = tmp_path / "spider2_eval.jsonl" - spec_path.write_text(json.dumps({"condition_tabs": ["t1"]}) + "\n") + spec_path.write_text( + json.dumps( + { + "evaluation": { + "func": "duckdb_match", + "parameters": {"gold": "g.duckdb", "condition_tabs": ["t1", "t2"]}, + } + } + ) + + "\n" + ) spec = load_eval_spec(spec_path) - assert spec.condition_tabs == ["t1"] - assert spec.condition_cols == {} - assert spec.ignore_orders is False + assert spec.condition_tabs == ["t1", "t2"] + assert spec.condition_cols == [[], []] + assert spec.ignore_orders == [False, False] -def _build_db(path, tables: dict[str, tuple[list[str], list[tuple]]]): - """tables: name -> (column_names, rows). Builds a tiny .duckdb file.""" +def test_spider2_dbt_verify_eval_spec_rejects_mismatched_lengths(): + with pytest.raises(ValueError): + EvalSpec(condition_tabs=["a", "b"], condition_cols=[[0]], ignore_orders=[True]) + + +# -------------------------------------------------------------------------- +# comparator fixtures +# -------------------------------------------------------------------------- + + +def _build_db(path, tables: dict): + """tables: name -> (column_names, column_types, rows). Builds a .duckdb.""" con = duckdb.connect(str(path)) try: - for name, (cols, rows) in tables.items(): - col_defs = ", ".join(f"{c} INTEGER" for c in cols) + for name, (cols, types, rows) in tables.items(): + col_defs = ", ".join(f"{c} {t}" for c, t in zip(cols, types)) con.execute(f"CREATE TABLE {name} ({col_defs})") if rows: placeholders = ", ".join( @@ -57,29 +92,38 @@ def _build_db(path, tables: dict[str, tuple[list[str], list[tuple]]]): con.close() -def test_spider2_dbt_verify_matching_db_scores_true(tmp_path): - tables = {"orders": (["a", "b"], [(1, 2), (3, 4)])} - _build_db(tmp_path / "pred.duckdb", tables) - _build_db(tmp_path / "gold.duckdb", tables) - spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) - assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", - spec=spec, - ) - is True +def _ints(names, rows): + return (names, ["INTEGER"] * len(names), rows) + + +def _compare(tmp_path, *, pred, gold, spec): + _build_db(tmp_path / "pred.duckdb", pred) + _build_db(tmp_path / "gold.duckdb", gold) + return compare_duckdb( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + spec=spec, ) +# -------------------------------------------------------------------------- +# AC-1: 1.0 on match, 0.0 on mismatch +# -------------------------------------------------------------------------- + + +def test_spider2_dbt_verify_matching_db_scores_true(tmp_path): + tables = {"orders": _ints(["a", "b"], [(1, 2), (3, 4)])} + spec = EvalSpec(condition_tabs=["orders"]) + assert _compare(tmp_path, pred=tables, gold=tables, spec=spec) is True + + def test_spider2_dbt_verify_mismatched_db_scores_false(tmp_path): - _build_db(tmp_path / "pred.duckdb", {"orders": (["a", "b"], [(1, 2), (9, 9)])}) - _build_db(tmp_path / "gold.duckdb", {"orders": (["a", "b"], [(1, 2), (3, 4)])}) - spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) + spec = EvalSpec(condition_tabs=["orders"]) assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", + _compare( + tmp_path, + pred={"orders": _ints(["a", "b"], [(1, 2), (9, 9)])}, + gold={"orders": _ints(["a", "b"], [(1, 2), (3, 4)])}, spec=spec, ) is False @@ -88,15 +132,12 @@ def test_spider2_dbt_verify_mismatched_db_scores_false(tmp_path): def test_spider2_dbt_verify_all_tables_must_match(tmp_path): # One table matches, the other differs -> overall False (AND across tables). - pred = {"t1": (["a"], [(1,)]), "t2": (["a"], [(2,)])} - gold = {"t1": (["a"], [(1,)]), "t2": (["a"], [(99,)])} - _build_db(tmp_path / "pred.duckdb", pred) - _build_db(tmp_path / "gold.duckdb", gold) - spec = EvalSpec(condition_tabs=["t1", "t2"], condition_cols={}, ignore_orders=False) + spec = EvalSpec(condition_tabs=["t1", "t2"]) assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", + _compare( + tmp_path, + pred={"t1": _ints(["a"], [(1,)]), "t2": _ints(["a"], [(2,)])}, + gold={"t1": _ints(["a"], [(1,)]), "t2": _ints(["a"], [(99,)])}, spec=spec, ) is False @@ -104,49 +145,110 @@ def test_spider2_dbt_verify_all_tables_must_match(tmp_path): def test_spider2_dbt_verify_missing_predicted_table_scores_false(tmp_path): - _build_db(tmp_path / "pred.duckdb", {"other": (["a"], [(1,)])}) - _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,)])}) - spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) + # Spider2 wraps the predicted-table fetch in try/except -> 0 on a + # missing/unreadable table, not a crash. + spec = EvalSpec(condition_tabs=["orders"]) assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", + _compare( + tmp_path, + pred={"other": _ints(["a"], [(1,)])}, + gold={"orders": _ints(["a"], [(1,)])}, spec=spec, ) is False ) -def test_spider2_dbt_verify_non_condition_col_diff_ignored(tmp_path): - # Column index 1 ("b") differs, but only index 0 ("a") is a condition_col - # -> still a match (the non-condition column is not compared). - _build_db(tmp_path / "pred.duckdb", {"orders": (["a", "b"], [(1, 100), (2, 200)])}) - _build_db(tmp_path / "gold.duckdb", {"orders": (["a", "b"], [(1, 7), (2, 8)])}) - spec = EvalSpec( - condition_tabs=["orders"], condition_cols={"orders": [0]}, ignore_orders=False +# -------------------------------------------------------------------------- +# AC-2 + reviewer's verdict-flipping cases +# -------------------------------------------------------------------------- + + +def test_spider2_dbt_verify_float_within_tolerance_matches(tmp_path): + # Reviewer case: a numeric column-vector within math.isclose(abs_tol=1e-2) + # must still match. Row-tuple exact-== would have flipped this to a 0. + spec = EvalSpec(condition_tabs=["m"]) + assert ( + _compare( + tmp_path, + pred={"m": (["v"], ["DOUBLE"], [(1.005,), (2.0,)])}, + gold={"m": (["v"], ["DOUBLE"], [(1.0,), (2.0,)])}, + spec=spec, + ) + is True ) + + +def test_spider2_dbt_verify_float_beyond_tolerance_mismatches(tmp_path): + # Just outside 1e-2 -> mismatch (proves the tolerance actually bounds). + spec = EvalSpec(condition_tabs=["m"]) + assert ( + _compare( + tmp_path, + pred={"m": (["v"], ["DOUBLE"], [(1.5,)])}, + gold={"m": (["v"], ["DOUBLE"], [(1.0,)])}, + spec=spec, + ) + is False + ) + + +def test_spider2_dbt_verify_column_reorder_matches_by_containment(tmp_path): + # Reviewer case: pred has the SAME column data in a DIFFERENT column order. + # Column-containment matches every gold column to SOME pred column, so a + # reordered pred is still a match. Positional row-tuple == would have failed. + spec = EvalSpec(condition_tabs=["t"]) assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", + _compare( + tmp_path, + pred={"t": _ints(["b", "a"], [(10, 1), (20, 2)])}, + gold={"t": _ints(["a", "b"], [(1, 10), (2, 20)])}, spec=spec, ) is True ) -def test_spider2_dbt_verify_condition_col_diff_detected(tmp_path): - # The SAME column data, but now index 1 ("b") IS a condition_col and differs - # -> mismatch. Proves the subset actually restricts, not drops everything. - _build_db(tmp_path / "pred.duckdb", {"orders": (["a", "b"], [(1, 100), (2, 200)])}) - _build_db(tmp_path / "gold.duckdb", {"orders": (["a", "b"], [(1, 7), (2, 8)])}) - spec = EvalSpec( - condition_tabs=["orders"], condition_cols={"orders": [0, 1]}, ignore_orders=False +def test_spider2_dbt_verify_extra_pred_columns_tolerated(tmp_path): + # Reviewer case: pred carries an EXTRA column not present in gold. Since + # only gold's columns must be contained in pred, the extra is tolerated. + spec = EvalSpec(condition_tabs=["t"]) + assert ( + _compare( + tmp_path, + pred={"t": _ints(["a", "extra"], [(1, 999), (2, 888)])}, + gold={"t": _ints(["a"], [(1,), (2,)])}, + spec=spec, + ) + is True + ) + + +def test_spider2_dbt_verify_condition_cols_restricts_gold(tmp_path): + # condition_cols=[[0]] restricts gold to column 0 only, so a difference in + # gold column 1 is NOT compared -> still a match. + spec = EvalSpec(condition_tabs=["orders"], condition_cols=[[0]]) + assert ( + _compare( + tmp_path, + pred={"orders": _ints(["a"], [(1,), (2,)])}, + gold={"orders": _ints(["a", "b"], [(1, 7), (2, 8)])}, + spec=spec, + ) + is True ) + + +def test_spider2_dbt_verify_condition_col_diff_detected(tmp_path): + # condition_cols=[[0,1]] -> gold column 1 IS compared; its values are not + # contained in any pred column -> mismatch. Proves subsetting restricts + # without dropping everything. + spec = EvalSpec(condition_tabs=["orders"], condition_cols=[[0, 1]]) assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", + _compare( + tmp_path, + pred={"orders": _ints(["a", "b"], [(1, 100), (2, 200)])}, + gold={"orders": _ints(["a", "b"], [(1, 7), (2, 8)])}, spec=spec, ) is False @@ -154,13 +256,13 @@ def test_spider2_dbt_verify_condition_col_diff_detected(tmp_path): def test_spider2_dbt_verify_row_reorder_matches_when_ignore_orders(tmp_path): - _build_db(tmp_path / "pred.duckdb", {"orders": (["a"], [(2,), (1,)])}) - _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,), (2,)])}) - spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=True) + # Per-column ignore_order sorts each column-vector before compare. + spec = EvalSpec(condition_tabs=["orders"], ignore_orders=[True]) assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", + _compare( + tmp_path, + pred={"orders": _ints(["a"], [(2,), (1,)])}, + gold={"orders": _ints(["a"], [(1,), (2,)])}, spec=spec, ) is True @@ -168,30 +270,48 @@ def test_spider2_dbt_verify_row_reorder_matches_when_ignore_orders(tmp_path): def test_spider2_dbt_verify_row_reorder_mismatches_when_ordered(tmp_path): - _build_db(tmp_path / "pred.duckdb", {"orders": (["a"], [(2,), (1,)])}) - _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,), (2,)])}) - spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=False) + # ignore_orders=False -> per-column positional compare; reordered -> mismatch. + spec = EvalSpec(condition_tabs=["orders"], ignore_orders=[False]) assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", + _compare( + tmp_path, + pred={"orders": _ints(["a"], [(2,), (1,)])}, + gold={"orders": _ints(["a"], [(1,), (2,)])}, spec=spec, ) is False ) -def test_spider2_dbt_verify_ignore_orders_still_counts_duplicates(tmp_path): - # ignore_orders is a multiset compare, not a set compare: duplicate counts - # must still match (Counter equality, not set equality). - _build_db(tmp_path / "pred.duckdb", {"orders": (["a"], [(1,), (1,)])}) - _build_db(tmp_path / "gold.duckdb", {"orders": (["a"], [(1,)])}) - spec = EvalSpec(condition_tabs=["orders"], condition_cols={}, ignore_orders=True) - assert ( - compare_duckdb( - predicted_db=tmp_path / "pred.duckdb", - gold_db=tmp_path / "gold.duckdb", - spec=spec, +def test_spider2_dbt_verify_multi_table_gold_line_mismatch(tmp_path): + # Real multi-table gold line: table 1 matches, table 2's condition column + # genuinely differs -> overall 0.0 (proves the per-table lists are honored, + # not a single flat dict + single bool). + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text( + json.dumps( + { + "instance_id": "multi-001", + "evaluation": { + "func": "duckdb_match", + "parameters": { + "gold": "gold.duckdb", + "condition_tabs": ["orders", "customers"], + "condition_cols": [[0], [0]], + "ignore_orders": [True, False], + }, + }, + } ) - is False + + "\n" ) + spec = load_eval_spec(spec_path) + pred = { + "orders": _ints(["id"], [(2,), (1,)]), # reordered, ignore_orders[0]=True -> ok + "customers": _ints(["id"], [(7,)]), # differs from gold -> table 2 fails + } + gold = { + "orders": _ints(["id"], [(1,), (2,)]), + "customers": _ints(["id"], [(99,)]), + } + assert _compare(tmp_path, pred=pred, gold=gold, spec=spec) is False From 22b9b6cc5a93a3e316422a30ab33ede9cdc77b59 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 20:08:33 +0800 Subject: [PATCH 08/26] =?UTF-8?q?fix(spider2):=20B1=20=E2=80=94=20scope=20?= =?UTF-8?q?leakage=20scan=20to=20agent-visible=20portion=20(exclude=20veri?= =?UTF-8?q?fy-time=20tests/)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verifier-only tests/ assets (gold.duckdb, eval spec, comparator) tripped test_translate_spider2_dbt.py's content-leakage scan. Harbor uploads tests/ to the container only at verify time and wipes/recreates it around the agent run (harbor verifier.py uploads inside verify(); trial.py _verify_step reset_dirs removes tests_dir first), so tests/ is never agent-visible. Scope _leakage_hits to everything OUTSIDE tests/ — real agent-view protection is unchanged. Add a guard test planting gold-content in BOTH an agent-visible path and tests/, proving the scanner still fires on the former and only excludes the latter. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/unit/test_translate_spider2_dbt.py | 61 +++++++++++++++++++++--- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/tests/unit/test_translate_spider2_dbt.py b/tests/unit/test_translate_spider2_dbt.py index 60be44e..10c5020 100644 --- a/tests/unit/test_translate_spider2_dbt.py +++ b/tests/unit/test_translate_spider2_dbt.py @@ -88,17 +88,43 @@ def test_spider2_dataset_requires_tasks_root(tmp_path, monkeypatch): _LEAKAGE_TERMS = ("gold", "expected", "golden") +def _is_verifier_only(rel: Path) -> bool: + """True for the verify-time-only `tests/` subtree. + + Harbor uploads the task's `tests/` dir into the container ONLY at verify + time and wipes/recreates it around the agent run, so it is never part of + the agent-visible view: + - harbor/verifier/verifier.py:133-138 uploads `tests/` inside `verify()`. + - harbor/trial/trial.py `_verify_step` first `reset_dirs(remove=[tests_dir, + verifier_dir])` then re-uploads — the agent step never sees `/tests`. + The spider2-dbt materializer deliberately stages the gold .duckdb + eval + spec + comparator there (`_ensure_verifier_assets`); those are the verifier's + answer key, NOT agent-visible leakage. Scoping the scan to the agent-visible + portion (everything OUTSIDE `tests/`) keeps real leakage protection intact: + any answer data that survives into an agent-reachable path still trips. + """ + return rel.parts[:1] == ("tests",) + + def _leakage_hits(view: Path) -> list[Path]: - """Pure-Python reimplementation of the rider's `rg -l 'gold|expected|golden'`. + """Pure-Python reimplementation of the rider's `rg -l 'gold|expected|golden'`, + scoped to the AGENT-VISIBLE portion of the view. Walks the view dir and reports any file whose NAME or CONTENT matches the case-sensitive alternation `gold|expected|golden` — matching the rider's - unescaped `rg -l` semantics. `view_manifest.json` is the materializer's - provenance record: it lists the deny globs (`tests/expected/**`, - `**/gold/**`, `**/golden/**`) and the checksums of every excluded source - file by design — an audit trail, NOT leaked answer content. So the scan - excludes the manifest; a hit on any other file means real answer data - survived the deny globs. + unescaped `rg -l` semantics. Two exclusions, both principled: + + - `view_manifest.json` is the materializer's provenance record: it lists the + deny globs (`tests/expected/**`, `**/gold/**`, `**/golden/**`) and the + checksums of every excluded source file by design — an audit trail, NOT + leaked answer content. + - The verify-time-only `tests/` subtree (see `_is_verifier_only`): Harbor + uploads it to the container only at verify time and removes it during the + agent run, so the gold .duckdb / eval spec / comparator the verifier needs + there are never agent-visible. + + A hit on any other file means real answer data survived into an + agent-reachable path. """ hits: list[Path] = [] for path in sorted(view.rglob("*")): @@ -106,6 +132,8 @@ def _leakage_hits(view: Path) -> list[Path]: continue if path.name == "view_manifest.json": continue + if _is_verifier_only(path.relative_to(view)): + continue if any(term in path.name for term in _LEAKAGE_TERMS): hits.append(path) continue @@ -185,6 +213,25 @@ def test_planted_forbidden_files_are_excluded_from_view(tmp_path, monkeypatch): assert hits == [], f"planted leakage survived into {view}: {hits}" +def test_leakage_scan_still_fires_on_agent_visible_answer_content(tmp_path): + """Guard: the `tests/`-scoped exclusion must NOT blanket-disable the scan. + + Plant `gold`-named answer content BOTH in an agent-visible path (view root) + AND under the verify-time-only `tests/` subtree, then assert `_leakage_hits` + reports the agent-visible one and ignores the verifier-only one. This pins + the B1 fix to scoping (exclude only `tests/`), not weakening the scanner. + """ + view = tmp_path / "view" + (view / "models").mkdir(parents=True) + (view / "models" / "answer_gold.sql").write_text("select 'leak';\n") + (view / "tests").mkdir() + (view / "tests" / "gold.duckdb").write_bytes(b"verifier-only gold answer key") + + hits = _leakage_hits(view) + assert view / "models" / "answer_gold.sql" in hits, "agent-visible leak missed" + assert view / "tests" / "gold.duckdb" not in hits, "verifier-only asset flagged" + + def test_exclude_tasks_drops_spider2_source_slug(tmp_path, monkeypatch): sources = sorted(FIXTURE_ROOT.glob("spider2-fixture-*")) assert len(sources) >= 2, "need >1 fixture instance to prove exclusion keeps the other" From f8789671863cf2c7ee9dfa587ede2bd6a751d7eb Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 20:10:00 +0800 Subject: [PATCH 09/26] docs(spider2): implementation stage report (feedback cycle 1) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 57c4cea..04b29f9 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -113,3 +113,16 @@ found two blocking defects. Routing back to `implementation`: **Fix:** make the agent-facing view leakage-clean while keeping the gold assets verifier-only. The view's `tests/` is uploaded only at verify time and removed during the agent run, so reconcile the leakage check to scope to the agent-visible portion (exclude the verifier-only `tests/` assets) — OR relocate/name the assets so the scan is clean — without weakening real agent-view leakage protection. Restore the 2 failing tests to green. Keep the rider (resolver-driven `/app/.duckdb`) and AC-3 reward.json shape intact. + +## Stage Report: implementation (cycle 1) + +- DONE: B2 (Critical) — re-derive `compare_duckdb` to FAITHFULLY reproduce Spider2's `eval_utils.duckdb_match` (column-containment, per-column sort, `math.isclose(1e-2)`, AND across condition_tabs, missing table = mismatch); add verdict-flipping tests + Commit 61a1b9c. Ported `compare_pandas_table`/`vectors_match` 1:1 from xlang-ai/Spider2 `spider2-dbt/evaluation_suite/eval_utils.py` (fetched via `gh api`, decoded to /tmp): transpose to column-vectors, each gold column (restricted to `condition_cols[i]`) must match SOME pred column-vector, sorted per-column under `ignore_order`, numerics via `math.isclose(abs_tol=1e-2)`, `pd.isna` for NaN; predicted-fetch wrapped in try/except -> 0. Tests added: float-within-1e-2 (True) + beyond (False), column reorder (True), extra pred columns (True), plus a real multi-table gold line -> 0.0. 16/16 comparator tests pass. +- DONE: B2 (Critical) — fix eval-spec schema to Spider2's real shape (`evaluation.parameters` with `condition_cols: List[List[int]]`, `ignore_orders: List[bool]`); add a real multi-table gold-line test proving non-1.0 on mismatch + Commit 61a1b9c. `EvalSpec` now holds per-table parallel lists; `load_eval_spec` drills into `evaluation.parameters` (matching Spider2 `evaluate.py:96-99` which calls `duckdb_match(result, **parameters)`), applies Spider2's None/empty defaults (`[[]]*n`, `[False]*n`), and validates list-length parity. Real-shaped multi-table line proven non-1.0 in `test_spider2_dbt_verify_multi_table_gold_line_mismatch`. Gold fixture + `build_gold.py` regenerated to the real shape. +- DONE: B1 (high) — make the agent-facing view leakage-clean while keeping gold verifier-only; restore the 2 failing `test_translate_spider2_dbt.py` tests; full `uv run pytest -k "spider2_dbt"` green + Commit 22b9b6c. Scoped `_leakage_hits` to exclude the verify-time-only `tests/` subtree (Harbor `verifier.py:133-138` uploads `tests/` only inside `verify()`; `trial.py` `_verify_step` `reset_dirs(remove=[tests_dir,verifier_dir])` first — never agent-visible). Added a guard test planting `gold`-content in BOTH an agent-visible path (caught) and `tests/` (ignored) so the fix is scoping, not weakening. `uv run pytest -k "spider2_dbt" --ignore=tests/unit/test_task_identity_scoring.py` = 66 passed. Rider (resolver-driven `/app/.duckdb`) + AC-3 reward.json shape untouched. + +### Summary + +Resolved both cycle-1 blocking defects against the authoritative xlang-ai/Spider2 source (fetched live, not the plan's prose). B2: `compare_duckdb` is now a faithful port of `eval_utils.duckdb_match` + `compare_pandas_table` (column-containment over transposed column-vectors, `math.isclose(abs_tol=1e-2)`, per-column sort, AND across tables), and `EvalSpec` models the real `evaluation.parameters` shape with per-table `List[List[int]]`/`List[bool]` lists — the prior row-tuple/exact-== + flat-dict design is gone. B1: the leakage scan is scoped to the agent-visible portion (everything outside the verify-time-only `tests/`), justified by the Harbor verify/agent dir lifecycle, with a guard test pinning it to scoping rather than disabling. Full gating suite 66 passed; acceptance `-k spider2_dbt_verify` 23 passed. No changes to the resolver rider or reward.json contract. From 0b64e92461d9e270790a3a04e33a5c6cf9017559 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 20:20:34 +0800 Subject: [PATCH 10/26] =?UTF-8?q?docs(spider2):=20validation=20re-review?= =?UTF-8?q?=20(cycle=202)=20=E2=80=94=20PASSED=20(B2=20faithful,=20B1=20sc?= =?UTF-8?q?oping=20sound)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 +++ .../spider2-dbt-duckdb-match-verifier.md | 97 +++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 04b29f9..78bc6a3 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -126,3 +126,16 @@ Keep the rider (resolver-driven `/app/.duckdb`) and AC-3 reward.json sh ### Summary Resolved both cycle-1 blocking defects against the authoritative xlang-ai/Spider2 source (fetched live, not the plan's prose). B2: `compare_duckdb` is now a faithful port of `eval_utils.duckdb_match` + `compare_pandas_table` (column-containment over transposed column-vectors, `math.isclose(abs_tol=1e-2)`, per-column sort, AND across tables), and `EvalSpec` models the real `evaluation.parameters` shape with per-table `List[List[int]]`/`List[bool]` lists — the prior row-tuple/exact-== + flat-dict design is gone. B1: the leakage scan is scoped to the agent-visible portion (everything outside the verify-time-only `tests/`), justified by the Harbor verify/agent dir lifecycle, with a guard test pinning it to scoping rather than disabling. Full gating suite 66 passed; acceptance `-k spider2_dbt_verify` 23 passed. No changes to the resolver rider or reward.json contract. + +## Stage Report: validation (cycle 2) + +- DONE: B2 — independently confirm the comparator faithfully reproduces Spider2's `eval_utils.duckdb_match` by checking against the ACTUAL xlang-ai/Spider2 source + Fetched upstream `eval_utils.py` live via `gh api`; line-by-line confirmed column-containment, per-column sort, `math.isclose(abs_tol=1e-2)`, `pd.isna` NaN, AND across condition_tabs, predicted-fetch=mismatch. 10-case differential test (impl vs inline transcription of the oracle: float within/beyond 1e-2, column reorder, extra pred cols, NaN, missing table, multi-table) ALL AGREE. `load_eval_spec` parses real `playbook001`/`provider001` gold lines from upstream `gold/spider2_eval.jsonl` (`evaluation.parameters`, per-table List[List[int]]/List[bool]). +- DONE: B1 — confirm the agent-facing view is leakage-clean AND the fix did not weaken protection; confirm the Harbor lifecycle justification; the 2 previously-failing tests are green + Verified Harbor lifecycle against installed package: `verifier.py:123 verify()` uploads `tests/` only inside verify(); `trial.py:570 _verify_step` `reset_dirs(remove=[verifier_dir,tests_dir])` wipes them empty pre-verify; agent gets only `workdir/`. Guard test catches agent-visible `gold` leak, ignores `tests/`. Mutation probe (blanket-True) makes the guard FAIL — confirms scoping not weakening. Production deny-globs unchanged. `test_translate_spider2_dbt.py` = 13 passed. +- DONE: Confirm no regression and the rider intact; AC-3 reward.json shape intact; give a gate verdict + `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` = 66 passed. Independent end-to-end: emitted test.sh resolves `/app/not-spider2-slug.duckdb` (no hardcoded), and emitted verify.py writes `{"reward":1.0}`/`{"reward":0.0}`. Branch touches only spider2_dbt scope; 3 named pre-existing failures untouched. Verdict: PASSED. + +### Summary + +Re-review after cycle-1 fixes (61a1b9c, 22b9b6c) — independent verification only, no production code. Both blocking findings resolved against the authoritative live xlang-ai/Spider2 source: B2 — the comparator is a faithful port of `eval_utils.duckdb_match` (column-containment + 1e-2 tolerance + per-column sort + AND-across-tables), proven by a 10-case differential test against the upstream oracle and a real multi-table gold line; `load_eval_spec` reads the real `evaluation.parameters` schema. B1 — the leakage scoping is sound, not weakening: the Harbor verify-only-tests/ lifecycle is verified against the installed package, and a mutation probe confirms the guard test still fires on agent-visible leaks. Rider (`/app/.duckdb`) and AC-3 reward.json shape intact; no regressions outside the known pre-existing set. Gate verdict: PASSED → done. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md diff --git a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md index 4b6aad9..8837189 100644 --- a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md @@ -205,3 +205,100 @@ get an explicit captain-approved deviation, AND adopt the real ACs as written pass and the rider is satisfied, but those ACs encode the wrong oracle semantics — re-derive AC-2 against column-containment when re-entering implementation. + +--- + +# Re-review (cycle 2): validation gate + +**Branch:** `spacedock-ensign/spider2-dbt-duckdb-match-verifier` @ `f878967` +**Merge-base with main:** `9c39af2` +**Gate verdict:** **PASSED → done** + +Independent re-verification of the two cycle-1 blocking findings after the +fix commits `61a1b9c` (B2) and `22b9b6c` (B1). The authoritative +xlang-ai/Spider2 `spider2-dbt/evaluation_suite/eval_utils.py` was fetched +live via `gh api` (387 lines, decoded to `/tmp`); the impl was checked +line-by-line against it and differential-tested against an inline +transcription of that source. Both findings are resolved; no new +regressions; rider + AC-3 intact. + +## B2 — comparator faithfulness to Spider2 `eval_utils.duckdb_match`: CONFIRMED + +`src/razorback/benchmarks/spider2_dbt/duckdb_match.py` is a faithful port of +Spider2 `duckdb_match` + `compare_pandas_table` + `vectors_match`: + +- **Column-containment** (not row-tuple equality): gold restricted to + `condition_cols` via `iloc[:, ...]`, pred uses all columns, both + transposed to column-vectors, each gold column must match SOME pred + column — matches Spider2 `eval_utils.py:132-149`. (impl drops Spider2's + dead `else` loop at 145-148; no behavioral effect.) +- **Per-column sort under `ignore_order`** with Spider2's exact sort key + `(x is None, str(x), isinstance(x,(int,float)))` — matches 115-117. +- **Numeric `math.isclose(abs_tol=1e-2)`** — matches 124. +- **NaN: `pd.isna(a) and pd.isna(b)` → equal** — matches 121. +- **AND across `condition_tabs`**, **predicted-fetch failure = mismatch** + (try/except → False) — matches 221-243. + +Differential test (impl vs an inline transcription of the upstream oracle) +on 10 verdict-flipping cases — float within/beyond 1e-2, row reorder +with/without ignore_order, extra pred columns, diff in non-condition col, +NaN, missing pred table, multi-table 1-mismatch, multi-table all-match — +**all 10 agree and match the expected verdict**. + +`load_eval_spec` drilled into the REAL `playbook001`/`provider001` gold +lines fetched from upstream `gold/spider2_eval.jsonl`: parses +`evaluation.parameters` with per-table `condition_tabs: List[str]`, +`condition_cols: List[List[int]]` (`[[0,1],[0,1,2,5,6,7,9,10,11,12,13]]`), +`ignore_orders: List[bool]` (`[True,True]`) — matches `evaluate.py:96-99` +(`duckdb_match(result, **parameters)`). All 68 real duckdb_match gold lines +carry explicit tabs/cols/orders, so the impl's explicit-`condition_tabs` +requirement (no None→all-tables default) is never exercised on real data. + +Minor non-blocking note: impl uses `SELECT * FROM "{table}"` (quoted) vs +Spider2's unquoted `{table_name}` — a robustness improvement, not a +faithfulness divergence. + +## B1 — leakage scoping sound, protection not weakened: CONFIRMED + +`_leakage_hits` (in `tests/unit/test_translate_spider2_dbt.py`) now excludes +the verify-time-only `tests/` subtree via `_is_verifier_only`. Verified the +Harbor lifecycle claim against the installed package source: + +- `harbor/verifier/verifier.py:123 def verify()` → lines 133-138 upload + the task's `tests/` to `env_paths.tests_dir` ONLY inside `verify()`. +- `harbor/trial/trial.py:570 _verify_step` → line 587-591 + `reset_dirs(remove_dirs=[verifier_dir, tests_dir], create_dirs=[...])` + wipes+recreates them EMPTY immediately before verification. +- The agent step receives only `_upload_step_workdir` (`workdir/`), never + the host view's `tests/`. So the host `tests/` assets genuinely never + reach the agent — excluding them from the host scan is principled. + +Scoping ≠ weakening, proven two ways: (1) the shipped guard test +`test_leakage_scan_still_fires_on_agent_visible_answer_content` plants +`gold`-content in BOTH an agent-visible path (caught) and `tests/` +(ignored); (2) a mutation probe — forcing `_is_verifier_only` to blanket- +`True` — makes that guard FAIL on the agent-visible leak, confirming the +guard is meaningful. Production deny-globs (`gold/**`, `**/gold/**`, +`expected/**`, `golden/**`) — the real agent-view protection — are +UNCHANGED; only the test's scan scope moved. + +The 2 previously-failing `test_translate_spider2_dbt.py` tests are green: +full file = 13 passed. + +## Regression / rider / AC-3 + +- Acceptance/gating: `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` → **66 passed**. +- Rider (independent end-to-end): materialized a view with `task_slug=not-spider2-slug`; emitted `tests/test.sh` resolves `--predicted-db /app/not-spider2-slug.duckdb` (ny's `resolve_spider2_db_name`), no hardcoded `/app/spider2.duckdb`. +- AC-3 (independent end-to-end): ran the emitted `verify.py` against the gold fixture → `/logs/verifier/reward.json` = `{"reward": 1.0}` (match) and `{"reward": 0.0}` (missing pred); reward is a float. +- Branch touches only `spider2_dbt`-scoped modules/tests + the gold fixture. None of the 3 known pre-existing failures (`test_task_identity_scoring`, `test_generate_matrix_specs`, `test_rk_research_new`) are touched by this branch — not regressions. Ignored harness `uv.lock` churn. + +## Gate decision + +**PASSED → done.** Both cycle-1 blocking findings are independently +resolved against the live Spider2 source: B2 — the comparator and eval-spec +schema faithfully reproduce `eval_utils.duckdb_match` (verified by a 10-case +differential test against the upstream oracle and a real multi-table gold +line); B1 — the leakage fix is scoping, not weakening (mutation-probed), +justified by the verified Harbor verify-only-tests/ lifecycle. Rider and +AC-3 reward.json shape intact; no regressions outside the known pre-existing +set. No blocking findings remain. From f25597c82d33ccc3052a9b20458af94aad2d99db Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:01:09 +0800 Subject: [PATCH 11/26] =?UTF-8?q?feedback:=20validation=20gate=20rejected?= =?UTF-8?q?=20(cycle=202)=20=E2=80=94=20fail-closed=20eval-spec=20validati?= =?UTF-8?q?on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../spider2-dbt-duckdb-match-verifier.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 78bc6a3..4a4997b 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -139,3 +139,12 @@ Resolved both cycle-1 blocking defects against the authoritative xlang-ai/Spider ### Summary Re-review after cycle-1 fixes (61a1b9c, 22b9b6c) — independent verification only, no production code. Both blocking findings resolved against the authoritative live xlang-ai/Spider2 source: B2 — the comparator is a faithful port of `eval_utils.duckdb_match` (column-containment + 1e-2 tolerance + per-column sort + AND-across-tables), proven by a 10-case differential test against the upstream oracle and a real multi-table gold line; `load_eval_spec` reads the real `evaluation.parameters` schema. B1 — the leakage scoping is sound, not weakening: the Harbor verify-only-tests/ lifecycle is verified against the installed package, and a mutation probe confirms the guard test still fires on agent-visible leaks. Rider (`/app/.duckdb`) and AC-3 reward.json shape intact; no regressions outside the known pre-existing set. Gate verdict: PASSED → done. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md + +### Cycle 2 — validation gate REJECTED (2026-06-18, captain via Codex review) + +Codex found a fail-open verifier hazard; captain chose fix-now. Confirmed live. +Routing back to `implementation`: + +1. **[high] Fail-open: empty/malformed eval spec awards `{"reward": 1.0}`.** + `load_eval_spec` (`eval_spec.py:79`) defaults `condition_tabs` to `[]` with no non-empty check and no `evaluation.func` validation; `compare_duckdb` (`duckdb_match.py:76`) returns `True` when the `condition_tabs` loop never runs. So a corrupted/truncated/schema-drifted `spider2_eval.jsonl` scores every prediction 1.0 — silently reporting broken scoring as success. + **Fix:** fail closed in `load_eval_spec`/`EvalSpec` — require a non-empty `condition_tabs`, validate `evaluation.func == "duckdb_match"`, and raise (or otherwise force reward 0) on empty/missing. Add a negative regression test proving an empty/missing-`condition_tabs` spec raises or emits reward 0, NOT 1. Keep the faithful comparator (cycle-1 B2), the leakage scoping (cycle-1 B1), the rider, and AC-3 reward shape intact. From 869918dd2db51ba26f7526d3692bb7f145b79b6e Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:03:51 +0800 Subject: [PATCH 12/26] fix(spider2-dbt): fail closed on empty/malformed gold eval spec A zero-table EvalSpec makes compare_duckdb's AND-loop never run and return True, so a corrupted/truncated/schema-drifted spider2_eval.jsonl silently scored every prediction reward 1.0 (fail-open). Now: - EvalSpec rejects empty condition_tabs at construction. - load_eval_spec raises on an empty file and on a non-duckdb_match evaluation.func. - emit_reward wraps load_eval_spec/compare in try/except -> reward 0, never crash-into-pass and never silent 1.0. Adds negative regression tests proving an empty file, a missing condition_tabs, and a wrong func each raise (loader) or emit reward 0 (CLI), not 1. Cycle-1 comparator/leakage fixes, the rider, and AC-3 reward shape untouched. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../benchmarks/spider2_dbt/eval_spec.py | 37 ++++++++++- .../benchmarks/spider2_dbt/verify.py | 23 +++++-- tests/unit/test_spider2_dbt_verify_cli.py | 47 +++++++++++++ .../test_spider2_dbt_verify_comparator.py | 66 +++++++++++++++++++ 4 files changed, 164 insertions(+), 9 deletions(-) diff --git a/src/razorback/benchmarks/spider2_dbt/eval_spec.py b/src/razorback/benchmarks/spider2_dbt/eval_spec.py index 5411759..500e3ef 100644 --- a/src/razorback/benchmarks/spider2_dbt/eval_spec.py +++ b/src/razorback/benchmarks/spider2_dbt/eval_spec.py @@ -43,6 +43,15 @@ class EvalSpec: def __post_init__(self) -> None: n = len(self.condition_tabs) + # Fail closed: a zero-table spec would make compare_duckdb's AND-loop + # never run and return True — silently scoring every prediction 1.0. + # A corrupted/truncated/schema-drifted gold line must NOT score as a + # match, so refuse to construct an empty-condition_tabs spec. + if n == 0: + raise ValueError( + "EvalSpec requires a non-empty condition_tabs; a zero-table " + "spec would score every prediction as a match (fail-open)" + ) # Normalize to Spider2 duckdb_match defaults so the comparator can # index condition_cols[i] / ignore_orders[i] for every table. cols = self.condition_cols @@ -70,11 +79,35 @@ def load_eval_spec(path: Path) -> EvalSpec: ``evaluation.parameters`` (the shape Spider2 ``evaluate.py`` passes to ``duckdb_match``), and returns an :class:`EvalSpec`. Tolerates a bare ``parameters``-shaped object (no ``evaluation`` wrapper) for fixtures. + + Fails closed on a corrupted/truncated/schema-drifted gold line: an empty + file, a wrong ``evaluation.func``, or a missing/empty ``condition_tabs`` + each raise ``ValueError`` rather than yielding a zero-table spec that + ``compare_duckdb`` would silently score as a match (reward 1.0). """ text = Path(path).read_text().strip() - first_line = text.splitlines()[0] if text else "{}" + if not text: + raise ValueError( + f"empty gold eval spec at {path}: cannot score a missing/truncated " + "spider2_eval.jsonl (fail-open guard)" + ) + first_line = text.splitlines()[0] raw = json.loads(first_line) - params = raw.get("evaluation", {}).get("parameters", raw) + + # When the real Spider2 evaluation wrapper is present, the func MUST be + # duckdb_match — any other func means this verifier cannot faithfully score + # the line, so refuse rather than fall through to a match. + evaluation = raw.get("evaluation") + if evaluation is not None: + func = evaluation.get("func") + if func != "duckdb_match": + raise ValueError( + f"unsupported evaluation.func {func!r} in {path}: this verifier " + "only scores 'duckdb_match' (fail-open guard)" + ) + params = evaluation.get("parameters", {}) + else: + params = raw condition_tabs = list(params.get("condition_tabs", [])) raw_cols = params.get("condition_cols") diff --git a/src/razorback/benchmarks/spider2_dbt/verify.py b/src/razorback/benchmarks/spider2_dbt/verify.py index 34afe14..6d68424 100644 --- a/src/razorback/benchmarks/spider2_dbt/verify.py +++ b/src/razorback/benchmarks/spider2_dbt/verify.py @@ -23,21 +23,30 @@ def emit_reward( reward_out: Path, ) -> None: """Compute the binary duckdb_match reward and write harbor's reward.json.""" + err: str | None = None if not Path(predicted_db).exists(): is_match = False + err = f"predicted DB not found ({predicted_db})" else: - spec = load_eval_spec(Path(eval_spec)) - is_match = compare_duckdb( - predicted_db=Path(predicted_db), - gold_db=Path(gold_db), - spec=spec, - ) + # Fail closed: an empty / malformed / schema-drifted gold spec must + # surface as reward 0, never crash-into-pass and never silently 1.0. + try: + spec = load_eval_spec(Path(eval_spec)) + is_match = compare_duckdb( + predicted_db=Path(predicted_db), + gold_db=Path(gold_db), + spec=spec, + ) + except Exception as exc: # noqa: BLE001 — any spec/compare failure = non-match + is_match = False + err = f"eval-spec/compare error: {exc}" payload = {"reward": 1.0 if is_match else 0.0} Path(reward_out).parent.mkdir(parents=True, exist_ok=True) Path(reward_out).write_text(json.dumps(payload) + "\n") if not is_match: + detail = f": {err}" if err else "" sys.stderr.write( - f"spider2-dbt verify: mismatch (predicted={predicted_db})\n" + f"spider2-dbt verify: mismatch (predicted={predicted_db}){detail}\n" ) diff --git a/tests/unit/test_spider2_dbt_verify_cli.py b/tests/unit/test_spider2_dbt_verify_cli.py index 742b6b8..0fd38a6 100644 --- a/tests/unit/test_spider2_dbt_verify_cli.py +++ b/tests/unit/test_spider2_dbt_verify_cli.py @@ -92,6 +92,53 @@ def test_spider2_dbt_verify_cli_missing_predicted_scores_zero(tmp_path): assert json.loads(reward_out.read_text()) == {"reward": 0.0} +def test_spider2_dbt_verify_cli_empty_spec_scores_zero_not_one(tmp_path): + # FAIL-CLOSED (cycle-2 B1): a real predicted DB matching a gold DB, but with + # an empty / malformed gold spec, must score 0.0 — never silently 1.0. An + # empty condition_tabs is the hazard (compare_duckdb's AND-loop returns True + # on zero tables), so emit_reward must surface it as a NON-match. + _build_db(tmp_path / "pred.duckdb", [1, 2]) + _build_db(tmp_path / "gold.duckdb", [1, 2]) # identical -> would be 1.0 if scored + empty_spec = tmp_path / "spider2_eval.jsonl" + empty_spec.write_text("") + reward_out = tmp_path / "logs" / "verifier" / "reward.json" + emit_reward( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + eval_spec=empty_spec, + reward_out=reward_out, + ) + assert json.loads(reward_out.read_text()) == {"reward": 0.0} + + +def test_spider2_dbt_verify_cli_wrong_func_scores_zero_not_one(tmp_path): + # A schema-drifted spec (non-duckdb_match func) over a matching DB pair must + # also fail closed to 0.0, not crash and not pass. + _build_db(tmp_path / "pred.duckdb", [1, 2]) + _build_db(tmp_path / "gold.duckdb", [1, 2]) + bad_spec = tmp_path / "spider2_eval.jsonl" + bad_spec.write_text( + json.dumps( + { + "instance_id": "x", + "evaluation": { + "func": "string_match", + "parameters": {"gold": "gold.duckdb", "condition_tabs": ["orders"]}, + }, + } + ) + + "\n" + ) + reward_out = tmp_path / "logs" / "verifier" / "reward.json" + emit_reward( + predicted_db=tmp_path / "pred.duckdb", + gold_db=tmp_path / "gold.duckdb", + eval_spec=bad_spec, + reward_out=reward_out, + ) + assert json.loads(reward_out.read_text()) == {"reward": 0.0} + + def test_spider2_dbt_verify_view_carries_verifier_assets(tmp_path): view = materialize_spider2_harbor_task_view( source_task_dir=_SOURCE, diff --git a/tests/unit/test_spider2_dbt_verify_comparator.py b/tests/unit/test_spider2_dbt_verify_comparator.py index 42df69c..d570f59 100644 --- a/tests/unit/test_spider2_dbt_verify_comparator.py +++ b/tests/unit/test_spider2_dbt_verify_comparator.py @@ -70,6 +70,72 @@ def test_spider2_dbt_verify_eval_spec_rejects_mismatched_lengths(): EvalSpec(condition_tabs=["a", "b"], condition_cols=[[0]], ignore_orders=[True]) +# -------------------------------------------------------------------------- +# Fail-closed: a zero-table / malformed gold spec MUST NOT score as a match +# (cycle-2 B1). compare_duckdb returns True on an empty condition_tabs loop, +# so an empty spec would silently award 1.0 -> reject it at load/construct time. +# -------------------------------------------------------------------------- + + +def test_spider2_dbt_verify_eval_spec_rejects_empty_condition_tabs(): + # A zero-table spec is the fail-open hazard: compare_duckdb's AND-loop never + # runs and returns True. EvalSpec must refuse to construct one. + with pytest.raises(ValueError): + EvalSpec(condition_tabs=[]) + + +def test_spider2_dbt_verify_load_eval_spec_empty_file_raises(tmp_path: Path): + # An empty / truncated spider2_eval.jsonl must NOT yield a zero-table spec. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text("") + with pytest.raises(ValueError): + load_eval_spec(spec_path) + + +def test_spider2_dbt_verify_load_eval_spec_missing_condition_tabs_raises(tmp_path: Path): + # evaluation.parameters present but no condition_tabs -> fail closed. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text( + json.dumps( + { + "instance_id": "t", + "evaluation": {"func": "duckdb_match", "parameters": {"gold": "g.duckdb"}}, + } + ) + + "\n" + ) + with pytest.raises(ValueError): + load_eval_spec(spec_path) + + +def test_spider2_dbt_verify_load_eval_spec_wrong_func_raises(tmp_path: Path): + # A non-duckdb_match evaluation func is a schema drift we must not score. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text( + json.dumps( + { + "instance_id": "t", + "evaluation": { + "func": "string_match", + "parameters": {"gold": "g.duckdb", "condition_tabs": ["orders"]}, + }, + } + ) + + "\n" + ) + with pytest.raises(ValueError): + load_eval_spec(spec_path) + + +def test_spider2_dbt_verify_load_eval_spec_missing_evaluation_raises(tmp_path: Path): + # A line with no evaluation wrapper and no condition_tabs must fail closed + # rather than fall through to a zero-table (match-everything) spec. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text(json.dumps({"instance_id": "t"}) + "\n") + with pytest.raises(ValueError): + load_eval_spec(spec_path) + + # -------------------------------------------------------------------------- # comparator fixtures # -------------------------------------------------------------------------- From 11c66f41f0756a31f73994c3da4a0e6eb542bc30 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:04:17 +0800 Subject: [PATCH 13/26] docs(spider2-dbt): cycle-2 implementation stage report (fail-closed) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 4a4997b..beec682 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -148,3 +148,16 @@ Routing back to `implementation`: 1. **[high] Fail-open: empty/malformed eval spec awards `{"reward": 1.0}`.** `load_eval_spec` (`eval_spec.py:79`) defaults `condition_tabs` to `[]` with no non-empty check and no `evaluation.func` validation; `compare_duckdb` (`duckdb_match.py:76`) returns `True` when the `condition_tabs` loop never runs. So a corrupted/truncated/schema-drifted `spider2_eval.jsonl` scores every prediction 1.0 — silently reporting broken scoring as success. **Fix:** fail closed in `load_eval_spec`/`EvalSpec` — require a non-empty `condition_tabs`, validate `evaluation.func == "duckdb_match"`, and raise (or otherwise force reward 0) on empty/missing. Add a negative regression test proving an empty/missing-`condition_tabs` spec raises or emits reward 0, NOT 1. Keep the faithful comparator (cycle-1 B2), the leakage scoping (cycle-1 B1), the rider, and AC-3 reward shape intact. + +## Stage Report: implementation (cycle 2) + +- DONE: Fail closed in `load_eval_spec`/`EvalSpec` (`eval_spec.py`): require a non-empty `condition_tabs`, and validate `evaluation.func == "duckdb_match"`. On an empty/missing `condition_tabs`, a missing/empty `evaluation`, or a wrong `func`, RAISE a clear error — never accept a zero-table spec that compare_duckdb would score as a match. + Commit 869918d. `EvalSpec.__post_init__` now raises `ValueError` when `condition_tabs` is empty (the fail-open hazard: compare_duckdb's AND-loop returns True on zero tables). `load_eval_spec` raises on an empty/truncated file and on any `evaluation.func != "duckdb_match"`; a missing/empty `condition_tabs` falls through to the empty-spec guard and raises. TDD: red first (5 loader/construct negatives DID NOT RAISE), then green. +- DONE: Ensure the verifier path cannot emit `{"reward": 1.0}` for an empty/malformed gold spec; add a NEGATIVE regression test (empty file, missing `condition_tabs`, non-`duckdb_match` func each raise or emit reward 0 — NOT 1). + Commit 869918d. `emit_reward` (`verify.py`) wraps `load_eval_spec`+`compare_duckdb` in try/except → reward 0 on any spec/compare error (no crash-into-pass, no silent 1.0). Negatives added: loader-level (empty file / missing condition_tabs / wrong func / missing evaluation → raise) and CLI-level over a MATCHING pred/gold pair (empty spec, wrong func → `{"reward": 0.0}`, proving fail-closed beats the would-be-1.0). +- DONE: Keep intact the faithful comparator (cycle-1 B2), the leakage scoping (cycle-1 B1), the rider, and the AC-3 reward.json shape; `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` green. + Gating suite 73 passed (was 66; +7 new negatives), no edits to duckdb_match comparator, harbor_view leakage scoping, the resolver rider, or the reward.json contract. Focused fix touched only `eval_spec.py`, `verify.py`, and the two test files. + +### Summary + +Resolved the cycle-2 fail-open hazard with a focused fix. `EvalSpec` and `load_eval_spec` now fail closed: an empty/truncated `spider2_eval.jsonl`, a missing/empty `condition_tabs`, or a non-`duckdb_match` `evaluation.func` raise `ValueError` instead of yielding a zero-table spec that `compare_duckdb` silently scored 1.0; `emit_reward` catches any such failure and emits reward 0 rather than crashing or passing. Proven TDD-first by 7 negative regression tests (4 loader/construct raises + 2 CLI fail-closed-over-matching-DBs + 1 empty-EvalSpec raise) — all red before the fix, green after. The cycle-1 comparator/leakage fixes, the resolver rider, and the AC-3 reward shape are untouched; `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` = 73 passed. From 6698e2692285625e42a7f035f74f402b4f49fa59 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:10:26 +0800 Subject: [PATCH 14/26] =?UTF-8?q?docs(spider2):=20validation=20re-review?= =?UTF-8?q?=20(cycle=203)=20=E2=80=94=20PASSED=20(fail-closed=20real=20+?= =?UTF-8?q?=20load-bearing)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 +++ .../spider2-dbt-duckdb-match-verifier.md | 81 +++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index beec682..9eea728 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -161,3 +161,16 @@ Routing back to `implementation`: ### Summary Resolved the cycle-2 fail-open hazard with a focused fix. `EvalSpec` and `load_eval_spec` now fail closed: an empty/truncated `spider2_eval.jsonl`, a missing/empty `condition_tabs`, or a non-`duckdb_match` `evaluation.func` raise `ValueError` instead of yielding a zero-table spec that `compare_duckdb` silently scored 1.0; `emit_reward` catches any such failure and emits reward 0 rather than crashing or passing. Proven TDD-first by 7 negative regression tests (4 loader/construct raises + 2 CLI fail-closed-over-matching-DBs + 1 empty-EvalSpec raise) — all red before the fix, green after. The cycle-1 comparator/leakage fixes, the resolver rider, and the AC-3 reward shape are untouched; `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` = 73 passed. + +## Stage Report: validation (cycle 3) + +- DONE: Confirm the fail-closed fix is real and load-bearing: an empty file, a missing/empty `condition_tabs`, and a non-`duckdb_match` `evaluation.func` each RAISE in load_eval_spec/EvalSpec; and `emit_reward` (verify.py) surfaces any spec/compare failure as reward 0 (never crash-into-pass, never silent 1.0). Prove it beats the prior hazard: over a MATCHING pred/gold pair, an empty/zero-table spec now yields reward 0 (or raises), not 1.0. Confirm the negative tests fail if the guards are removed. + Independent probe (clean branch HEAD 11c66f4): empty file / missing `condition_tabs` / empty `condition_tabs` / wrong func / `EvalSpec([])` all RAISE ValueError; over a MATCHING pred/gold pair an empty-spec and a wrong-func spec each emit `{"reward": 0.0}` (not 1.0), garbage non-JSON emits 0.0 without crashing, and a VALID matching spec still emits 1.0 (positive control — guards aren't blanket-failing). MUTATION TEST: stripping the `n==0`/empty-file/wrong-func guards made all 6 negative tests FAIL (CLI mutation reproduced the exact `{'reward': 1.0}` hazard) — guards are load-bearing; tree restored clean. +- DONE: Confirm the cycle-1 fixes are intact: the comparator still faithfully reproduces Spider2 duckdb_match (column-containment, math.isclose 1e-2, per-table condition_cols/ignore_orders), and the agent-view leakage scoping still excludes only the verify-only tests/ subtree (agent-visible leaks still caught). + `git diff 0b64e92..869918d --name-only` touches ONLY eval_spec.py/verify.py + their 2 test files — duckdb_match.py (B2), harbor_view.py + test_translate_spider2_dbt.py (B1) untouched, so cycle-1 is intact by construction. Confirmed live anyway: comparator probe passes column-containment (reordered + extra pred col -> True), isclose 1e-2 within/beyond, per-column sort under ignore_order, ordered mismatch, missing pred table -> False. Leakage suite (`leakage_scan_still_fires` guard) 9 passed; translate suite leakage/planted/resolves 3 passed. +- DONE: Confirm no regression and rider/AC-3 intact: `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` (expect ~73 passed); emitted test.sh resolves /app/.duckdb via ny's resolver; reward.json shape `{"reward": }`. Give a gate verdict. + Gating suite = 73 passed, 751 deselected. RIDER: materialized view with `task_slug=not-spider2-slug` -> emitted test.sh `--predicted-db /app/not-spider2-slug.duckdb`, no hardcoded `/app/spider2.duckdb`. AC-3 end-to-end via emit_reward against the materialized gold fixture: matching pred -> `{"reward": 1.0}`, mismatch -> `{"reward": 0.0}`, `set(payload)=={"reward"}` & float. Working tree clean (only known `uv.lock` churn, ignored). Verdict: PASSED. + +### Summary + +Cycle-3 re-validation of the fail-closed fix (commit 869918d) — independent verification only, no production code. The fix is real and load-bearing: empty file / missing-or-empty `condition_tabs` / non-`duckdb_match` func all RAISE, `emit_reward` surfaces any failure as reward 0 (no crash-into-pass, no silent 1.0), and over a MATCHING pred/gold pair the prior would-be-1.0 hazard now scores 0 — proven by a mutation test where removing the guards makes all 6 negatives fail (CLI mutation reproduced the literal `{'reward': 1.0}` hazard). A positive control confirms a valid matching spec still scores 1.0. Cycle-1 fixes (faithful column-containment comparator + 1e-2 tolerance; verify-only-tests/ leakage scoping) are untouched by the cycle-2 diff and re-confirmed live; rider (`/app/.duckdb`) and AC-3 reward shape intact; gating suite 73 passed; only the known pre-existing unrelated failures + harness `uv.lock` churn remain. Gate verdict: PASSED -> done. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md diff --git a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md index 8837189..aeadd5a 100644 --- a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md @@ -302,3 +302,84 @@ line); B1 — the leakage fix is scoping, not weakening (mutation-probed), justified by the verified Harbor verify-only-tests/ lifecycle. Rider and AC-3 reward.json shape intact; no regressions outside the known pre-existing set. No blocking findings remain. + +--- + +# Re-review (cycle 3): validation gate — fail-closed fix + +**Branch:** `spacedock-ensign/spider2-dbt-duckdb-match-verifier` @ `11c66f4` +**Merge-base with main:** `9c39af2` +**Fix commit under review:** `869918d` (fail-closed eval-spec validation) +**Gate verdict:** **PASSED → done** + +Independent re-verification of the cycle-2 fail-open hazard fix (Codex +finding: empty/malformed eval spec awarded `{"reward": 1.0}`). No +production code written. + +## Fail-closed fix is real and load-bearing: CONFIRMED + +`load_eval_spec` / `EvalSpec.__post_init__` (`eval_spec.py`) now fail closed, +and `emit_reward` (`verify.py:33-42`) wraps spec-load + compare in +`try/except → reward 0`. Independent probe at clean branch HEAD: + +- **RAISE cases** (all `ValueError`): empty file; `evaluation.parameters` + with no `condition_tabs`; explicit empty `condition_tabs`; non-`duckdb_match` + `evaluation.func` (`string_match`); direct `EvalSpec(condition_tabs=[])`. +- **Hazard-beating** — over a MATCHING pred/gold pair (cloned tables): + an empty/zero-table spec → `{"reward": 0.0}` and a wrong-func spec → + `{"reward": 0.0}` — **not the prior 1.0**. Garbage non-JSON → `0.0` with + no exception (no crash-into-pass). +- **Positive control:** a VALID matching spec still → `{"reward": 1.0}`, + so the guards are targeted, not blanket-failing. + +**Mutation test (guards are load-bearing):** stripping the `n==0` guard, +the empty-file raise, and the wrong-func raise from `eval_spec.py` made all +6 negative tests FAIL — and the CLI negative reproduced the exact +`{'reward': 1.0}` == `{'reward': 0.0}` assertion failure, i.e. the original +hazard returned. Tree restored to clean (`git diff` on `eval_spec.py` empty). + +Root-cause confirmed in source: `compare_duckdb` (`duckdb_match.py:116`) +returns `True` after the per-table AND-loop, so a zero-`condition_tabs` spec +would vacuously match — the guards prevent that spec from ever being built. + +## Cycle-1 fixes intact (B2 comparator + B1 leakage): CONFIRMED + +`git diff 0b64e92..869918d --name-only` (the cycle-2 fix delta) touches ONLY +`eval_spec.py`, `verify.py`, `test_spider2_dbt_verify_cli.py`, +`test_spider2_dbt_verify_comparator.py`. `duckdb_match.py` (B2), +`harbor_view.py` + `test_translate_spider2_dbt.py` (B1) are **untouched** — +cycle-1 is intact by construction, and re-confirmed live: + +- **B2 faithfulness:** column-containment with reordered + extra pred + columns → True; `math.isclose(1e-2)` within → True / beyond → False; + per-column sort under `ignore_order` → True; ordered mismatch → False; + missing pred table → False (mismatch). +- **B1 leakage scoping:** `test_translate_spider2_dbt.py` leakage/planted/ + resolves tests → 3 passed; the `leakage_scan_still_fires_on_agent_visible` + guard among 9 rider/AC-3/leakage tests → 9 passed (agent-visible leaks + still caught; only verify-only `tests/` excluded). + +## Regression / rider / AC-3 + +- **Gating:** `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` + → **73 passed, 751 deselected** (was 66 in cycle 2; +7 negatives). +- **Rider:** materialized view with `task_slug=not-spider2-slug` → emitted + `test.sh` `--predicted-db /app/not-spider2-slug.duckdb`; `/app/spider2.duckdb` + absent (ny's `resolve_spider2_db_name`). +- **AC-3:** independent end-to-end via `emit_reward` against the materialized + gold fixture — matching pred → `{"reward": 1.0}`, mismatch → `{"reward": 0.0}`; + `set(payload) == {"reward"}`, reward is `float`. +- Working tree clean apart from the known harness `uv.lock` churn (ignored + per assignment). Pre-existing unrelated failures on base + (`test_task_identity_scoring`, `test_generate_matrix_specs`, + `test_rk_research_new`, etc.) are not regressions and not touched. + +## Gate decision + +**PASSED → done.** The cycle-2 fail-open hazard is independently closed: +empty/malformed/schema-drifted specs raise or score 0, the matching-pair +hazard now yields 0 not 1.0, and a mutation test proves the guards (and their +negative tests) are load-bearing. Cycle-1 comparator faithfulness and leakage +scoping are intact (untouched by the fix + re-confirmed live); rider and AC-3 +reward shape intact; no new regressions. Per the convergence plan this was the +last re-validation before PR. From c6749fdabd9e3378f60b224530731ec2619b6e4b Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:14:59 +0800 Subject: [PATCH 15/26] =?UTF-8?q?feedback:=20validation=20gate=20rejected?= =?UTF-8?q?=20(cycle=203)=20=E2=80=94=20honor=20eval-spec=20gold=20basenam?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../spider2-dbt-duckdb-match-verifier.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 9eea728..797da8f 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -174,3 +174,12 @@ Resolved the cycle-2 fail-open hazard with a focused fix. `EvalSpec` and `load_e ### Summary Cycle-3 re-validation of the fail-closed fix (commit 869918d) — independent verification only, no production code. The fix is real and load-bearing: empty file / missing-or-empty `condition_tabs` / non-`duckdb_match` func all RAISE, `emit_reward` surfaces any failure as reward 0 (no crash-into-pass, no silent 1.0), and over a MATCHING pred/gold pair the prior would-be-1.0 hazard now scores 0 — proven by a mutation test where removing the guards makes all 6 negatives fail (CLI mutation reproduced the literal `{'reward': 1.0}` hazard). A positive control confirms a valid matching spec still scores 1.0. Cycle-1 fixes (faithful column-containment comparator + 1e-2 tolerance; verify-only-tests/ leakage scoping) are untouched by the cycle-2 diff and re-confirmed live; rider (`/app/.duckdb`) and AC-3 reward shape intact; gating suite 73 passed; only the known pre-existing unrelated failures + harness `uv.lock` churn remain. Gate verdict: PASSED -> done. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md + +### Cycle 3 — validation gate REJECTED (2026-06-18, captain via Codex review) + +Codex found the verifier ignores the eval spec's gold-DB basename; captain chose +fix-now. Confirmed live. Routing back to `implementation`: + +1. **[high] Verifier hardcodes `gold.duckdb`, ignoring `evaluation.parameters.gold`.** + `EvalSpec` has no `gold` field and `load_eval_spec` discards `parameters.gold`; `harbor_view.py:63` emits `--gold-db /tests/gold.duckdb` and `:125` copies a hardcoded `gold.duckdb`. Real Spider2 tasks name the gold DB per task (e.g. `playbook.duckdb`, `tpch.duckdb`), so the verifier would fail to find the gold or score against the wrong file — a benchmark-correctness defect. This is the gold-DB analog of the predicted-DB resolver ny added. + **Fix:** add `gold` to `EvalSpec`, parse it from `evaluation.parameters.gold` (validate present for real wrapped specs), copy THAT exact file from the source `tests/gold/` into the view's verifier-only `tests/`, and emit `--gold-db /tests/`; fail closed if the named gold file is missing. Add a regression test with `parameters.gold` set to a NON-default basename (and no `gold.duckdb` present), proving the named file is used. Keep the cycle-1 comparator/leakage fixes, the cycle-2 fail-closed validation, the rider, and AC-3 reward shape intact. From a677c363b266d149a9578458aa440c6188abf61b Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:20:00 +0800 Subject: [PATCH 16/26] fix(spider2-dbt): drive gold DB basename from evaluation.parameters.gold Cycle-3 validation finding: the verifier hardcoded gold.duckdb and discarded the eval spec's parameters.gold, so real Spider2 tasks (which name the gold DB per task, e.g. playbook.duckdb) would score against the wrong/missing file. - EvalSpec gains a gold field; load_eval_spec parses parameters.gold and fails closed when a real wrapped duckdb_match spec omits it. - _ensure_verifier_assets resolves the gold basename from the spec, copies the EXACT named file from tests/gold/, emits --gold-db /tests/, and raises if the named gold is missing (no scoring against a missing file). - Regression tests: non-default basename (playbook.duckdb, no gold.duckdb) is copied + scored + leakage-clean; missing named gold fails closed; loader parses/validates gold. Cycle-1 comparator + leakage scoping, cycle-2 fail-closed validation, the predicted-DB resolver rider, and the AC-3 reward shape are untouched. uv run pytest -k spider2_dbt (ignoring the pre-existing-broken scoring test) = 78 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../benchmarks/spider2_dbt/eval_spec.py | 32 ++++- .../benchmarks/spider2_dbt/harbor_view.py | 31 ++++- tests/unit/test_spider2_dbt_verify_cli.py | 125 ++++++++++++++++++ .../test_spider2_dbt_verify_comparator.py | 48 +++++++ 4 files changed, 228 insertions(+), 8 deletions(-) diff --git a/src/razorback/benchmarks/spider2_dbt/eval_spec.py b/src/razorback/benchmarks/spider2_dbt/eval_spec.py index 500e3ef..70b1620 100644 --- a/src/razorback/benchmarks/spider2_dbt/eval_spec.py +++ b/src/razorback/benchmarks/spider2_dbt/eval_spec.py @@ -23,6 +23,14 @@ class EvalSpec: The three list fields are positional and parallel to ``condition_tabs``: + gold: basename of the gold ``.duckdb`` to score against, taken verbatim + from ``evaluation.parameters.gold``. Real Spider2 tasks name the gold + DB per task (e.g. ``playbook.duckdb``, ``tpch.duckdb``), so this drives + which file the verifier compares — it is NOT hardcoded to + ``gold.duckdb``. Mirrors Spider2 ``evaluate.py:97``, which resolves + ``parameters['gold']`` to a per-task gold path before calling + ``duckdb_match``. ``None`` only for bare-``parameters`` fixtures with no + ``evaluation`` wrapper. condition_tabs: gold/pred table names to compare (``List[str]``). condition_cols: per-table 0-based gold column indices to restrict the gold side to before column-containment (``List[List[int]]``). An @@ -40,6 +48,7 @@ class EvalSpec: condition_tabs: list[str] condition_cols: list[list[int]] = field(default_factory=list) ignore_orders: list[bool] = field(default_factory=list) + gold: str | None = None def __post_init__(self) -> None: n = len(self.condition_tabs) @@ -81,9 +90,10 @@ def load_eval_spec(path: Path) -> EvalSpec: ``parameters``-shaped object (no ``evaluation`` wrapper) for fixtures. Fails closed on a corrupted/truncated/schema-drifted gold line: an empty - file, a wrong ``evaluation.func``, or a missing/empty ``condition_tabs`` - each raise ``ValueError`` rather than yielding a zero-table spec that - ``compare_duckdb`` would silently score as a match (reward 1.0). + file, a wrong ``evaluation.func``, a missing/empty ``condition_tabs``, or + (for a real wrapped spec) a missing/empty ``parameters.gold`` each raise + ``ValueError`` rather than yielding a spec that ``compare_duckdb`` would + silently score as a match or that would score against the wrong gold file. """ text = Path(path).read_text().strip() if not text: @@ -98,7 +108,8 @@ def load_eval_spec(path: Path) -> EvalSpec: # duckdb_match — any other func means this verifier cannot faithfully score # the line, so refuse rather than fall through to a match. evaluation = raw.get("evaluation") - if evaluation is not None: + wrapped = evaluation is not None + if wrapped: func = evaluation.get("func") if func != "duckdb_match": raise ValueError( @@ -109,6 +120,18 @@ def load_eval_spec(path: Path) -> EvalSpec: else: params = raw + gold = params.get("gold") + # A real wrapped Spider2 gold line MUST name its per-task gold DB (Spider2 + # evaluate.py resolves parameters['gold'] to a per-task path). A missing or + # empty gold basename means we cannot know which file to score against, so + # fail closed rather than fall back to a hardcoded gold.duckdb. + if wrapped and not (isinstance(gold, str) and gold.strip()): + raise ValueError( + f"missing/empty evaluation.parameters.gold in {path}: a wrapped " + "duckdb_match spec must name its per-task gold .duckdb (fail-open " + "guard)" + ) + condition_tabs = list(params.get("condition_tabs", [])) raw_cols = params.get("condition_cols") condition_cols = ( @@ -121,4 +144,5 @@ def load_eval_spec(path: Path) -> EvalSpec: condition_tabs=condition_tabs, condition_cols=condition_cols, ignore_orders=ignore_orders, + gold=gold if isinstance(gold, str) else None, ) diff --git a/src/razorback/benchmarks/spider2_dbt/harbor_view.py b/src/razorback/benchmarks/spider2_dbt/harbor_view.py index c60cadf..2b06922 100644 --- a/src/razorback/benchmarks/spider2_dbt/harbor_view.py +++ b/src/razorback/benchmarks/spider2_dbt/harbor_view.py @@ -60,7 +60,7 @@ mkdir -p /logs/verifier python /tests/verify.py \\ --predicted-db {predicted_db} \\ - --gold-db /tests/gold.duckdb \\ + --gold-db /tests/{gold_db} \\ --eval-spec /tests/spider2_eval.jsonl \\ --reward-out /logs/verifier/reward.json """ @@ -112,6 +112,13 @@ def _ensure_verifier_assets( `--predicted-db` is `/app/.duckdb` where `` comes from the SHARED `resolve_spider2_db_name` resolver (RIDER, Codex r5-plan finding) — never a hardcoded `/app/spider2.duckdb`. + + The GOLD DB basename is driven by the eval spec's + ``evaluation.parameters.gold`` (real Spider2 tasks name the gold per task, + e.g. ``playbook.duckdb``) — the EXACT named file is copied and + ``--gold-db /tests/`` is emitted. Fails closed (raises) if the + spec names a gold file that does not exist under ``tests/gold/``, so the + verifier never scores against a missing/wrong gold. """ source_gold = Path(source_task_dir) / "tests" / "gold" if not source_gold.is_dir(): @@ -122,8 +129,23 @@ def _ensure_verifier_assets( for mod in (_duckdb_match_mod, _eval_spec_mod, _verify_mod): src = Path(mod.__file__) shutil.copy2(src, tests / src.name) - shutil.copy2(source_gold / "gold.duckdb", tests / "gold.duckdb") - shutil.copy2(source_gold / "spider2_eval.jsonl", tests / "spider2_eval.jsonl") + + # Resolve the gold DB basename from the spec (NOT hardcoded gold.duckdb). + # Real Spider2 tasks name the gold per task; scoring the wrong/missing file + # is a benchmark-correctness defect. A wrapped spec without parameters.gold + # fails closed inside load_eval_spec. + source_spec = source_gold / "spider2_eval.jsonl" + spec = _eval_spec_mod.load_eval_spec(source_spec) + gold_basename = spec.gold or "gold.duckdb" + source_gold_db = source_gold / gold_basename + if not source_gold_db.is_file(): + raise FileNotFoundError( + f"spider2-dbt gold spec names gold DB {gold_basename!r} but " + f"{source_gold_db} does not exist; refusing to emit a verifier that " + "would score against a missing gold (fail-closed)" + ) + shutil.copy2(source_gold_db, tests / gold_basename) + shutil.copy2(source_spec, tests / "spider2_eval.jsonl") # Resolve the agent-facing DuckDB stem from the dbt project (or the slug # fallback) so the verifier compares the SAME `/app/.duckdb` the @@ -138,7 +160,8 @@ def _ensure_verifier_assets( test_sh.unlink() test_sh.write_text( _TEST_SH_TEMPLATE.format( - predicted_db=f"{_APP_ROOT}/{db_name}.duckdb" + predicted_db=f"{_APP_ROOT}/{db_name}.duckdb", + gold_db=gold_basename, ) ) test_sh.chmod( diff --git a/tests/unit/test_spider2_dbt_verify_cli.py b/tests/unit/test_spider2_dbt_verify_cli.py index 0fd38a6..f695be6 100644 --- a/tests/unit/test_spider2_dbt_verify_cli.py +++ b/tests/unit/test_spider2_dbt_verify_cli.py @@ -163,6 +163,131 @@ def test_spider2_dbt_verify_view_carries_verifier_assets(tmp_path): assert not list(view.rglob("gold/*")) +def _write_gold_source( + source: Path, *, gold_basename: str, with_default_gold: bool +) -> Path: + # A minimal spider2-dbt source task whose gold line names a NON-default gold + # DB (e.g. playbook.duckdb), with NO gold.duckdb present. Mirrors a real + # Spider2 task that names its gold per task. + (source / "environment").mkdir(parents=True) + (source / "dbt_project" / "models").mkdir(parents=True) + (source / "task.toml").write_text( + "\n".join( + [ + 'schema_version = "1.0"', + "[environment]", + 'os = "linux"', + "cpus = 1", + "memory_mb = 1024", + "storage_mb = 1024", + "", + ] + ) + ) + (source / "instruction.md").write_text("Fix the dbt project.\n") + (source / "dbt_project" / "dbt_project.yml").write_text( + "name: example\nprofile: example\n" + ) + (source / "dbt_project" / "models" / "example.sql").write_text("select 1\n") + (source / "environment" / "Dockerfile").write_text( + "FROM python:3.12\nWORKDIR /app\nCMD [\"bash\"]\n" + ) + gold_dir = source / "tests" / "gold" + gold_dir.mkdir(parents=True) + # The NAMED gold DB (the one the spec points at). + _build_db(gold_dir / gold_basename, [1, 2]) + if with_default_gold: + _build_db(gold_dir / "gold.duckdb", [9, 9]) # a decoy default + (gold_dir / "spider2_eval.jsonl").write_text( + json.dumps( + { + "instance_id": "playbook001", + "evaluation": { + "func": "duckdb_match", + "parameters": { + "gold": gold_basename, + "condition_tabs": ["orders"], + "condition_cols": [[]], + "ignore_orders": [True], + }, + }, + } + ) + + "\n" + ) + return source + + +def test_spider2_dbt_verify_view_uses_non_default_gold_basename(tmp_path): + # REGRESSION (cycle-3): the gold DB basename is driven by parameters.gold, + # NOT hardcoded gold.duckdb. A task naming playbook.duckdb (and shipping NO + # gold.duckdb) must copy playbook.duckdb into the verifier-only tests/ and + # emit --gold-db /tests/playbook.duckdb. + source = _write_gold_source( + tmp_path / "source", gold_basename="playbook.duckdb", with_default_gold=False + ) + view = materialize_spider2_harbor_task_view( + source_task_dir=source, + view_root=tmp_path / "views", + task_slug="playbook001", + ) + tests = view / "tests" + # The NAMED gold file is copied; the hardcoded gold.duckdb is NOT invented. + assert (tests / "playbook.duckdb").is_file() + assert not (tests / "gold.duckdb").exists() + # test.sh scores against the named file. + test_sh = (tests / "test.sh").read_text() + assert "--gold-db /tests/playbook.duckdb" in test_sh + assert "/tests/gold.duckdb" not in test_sh + # leakage-clean: no gold/ segment survives. + assert not (view / "tests" / "gold").exists() + assert not list(view.rglob("gold/*")) + + +def test_spider2_dbt_verify_view_named_gold_actually_scores(tmp_path): + # End-to-end: a predicted DB matching the NAMED gold scores 1.0 through the + # emitted verifier, proving the named file (not a missing/decoy gold.duckdb) + # is the one actually scored. + source = _write_gold_source( + tmp_path / "source", gold_basename="playbook.duckdb", with_default_gold=False + ) + view = materialize_spider2_harbor_task_view( + source_task_dir=source, + view_root=tmp_path / "views", + task_slug="playbook001", + ) + tests = view / "tests" + predicted = tmp_path / "pred.duckdb" + _build_db(predicted, [2, 1]) # reordered; ignore_orders -> match + reward_out = tmp_path / "logs" / "verifier" / "reward.json" + emit_reward( + predicted_db=predicted, + gold_db=tests / "playbook.duckdb", + eval_spec=tests / "spider2_eval.jsonl", + reward_out=reward_out, + ) + assert json.loads(reward_out.read_text()) == {"reward": 1.0} + + +def test_spider2_dbt_verify_view_missing_named_gold_fails_closed(tmp_path): + # Fail closed: if the spec names a gold DB that does NOT exist under + # tests/gold/, materialization must raise rather than silently emit a + # verifier that scores against a missing file. + source = _write_gold_source( + tmp_path / "source", gold_basename="playbook.duckdb", with_default_gold=False + ) + # Delete the named gold so the spec points at a missing file. + (source / "tests" / "gold" / "playbook.duckdb").unlink() + import pytest + + with pytest.raises((FileNotFoundError, ValueError)): + materialize_spider2_harbor_task_view( + source_task_dir=source, + view_root=tmp_path / "views", + task_slug="playbook001", + ) + + def test_spider2_dbt_verify_test_sh_uses_resolved_db_name(tmp_path): # RIDER: the emitted test.sh predicted-DB path must come from # resolve_spider2_db_name, NOT a hardcoded /app/spider2.duckdb. This diff --git a/tests/unit/test_spider2_dbt_verify_comparator.py b/tests/unit/test_spider2_dbt_verify_comparator.py index d570f59..1d301f4 100644 --- a/tests/unit/test_spider2_dbt_verify_comparator.py +++ b/tests/unit/test_spider2_dbt_verify_comparator.py @@ -38,12 +38,60 @@ def test_spider2_dbt_verify_loads_eval_spec_real_shape(tmp_path: Path): ) spec = load_eval_spec(spec_path) assert spec == EvalSpec( + gold="gold.duckdb", condition_tabs=["orders", "customers"], condition_cols=[[0, 2], []], ignore_orders=[True, False], ) +def test_spider2_dbt_verify_load_eval_spec_parses_gold_basename(tmp_path: Path): + # Real Spider2 tasks name the gold DB per task (playbook.duckdb, tpch.duckdb, + # ...). load_eval_spec must carry parameters.gold through so the verifier + # scores against the NAMED file, not a hardcoded gold.duckdb. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text( + json.dumps( + { + "instance_id": "playbook001", + "evaluation": { + "func": "duckdb_match", + "parameters": { + "gold": "playbook.duckdb", + "condition_tabs": ["orders"], + }, + }, + } + ) + + "\n" + ) + spec = load_eval_spec(spec_path) + assert spec.gold == "playbook.duckdb" + + +def test_spider2_dbt_verify_load_eval_spec_missing_gold_in_wrapped_spec_raises( + tmp_path: Path, +): + # A real wrapped (evaluation.func == duckdb_match) gold line MUST name its + # gold DB. A missing/empty parameters.gold is a schema drift the verifier + # cannot score against the right file -> fail closed. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text( + json.dumps( + { + "instance_id": "t", + "evaluation": { + "func": "duckdb_match", + "parameters": {"condition_tabs": ["orders"]}, + }, + } + ) + + "\n" + ) + with pytest.raises(ValueError): + load_eval_spec(spec_path) + + def test_spider2_dbt_verify_eval_spec_defaults(tmp_path: Path): # Missing condition_cols/ignore_orders default per-table to []/False # (mirrors Spider2 duckdb_match's None-handling). From bac07057de9339477781ee6c10ab30064a8b5ec6 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:20:20 +0800 Subject: [PATCH 17/26] docs: cycle-3 implementation stage report (gold basename fix) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 797da8f..a157ca9 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -183,3 +183,16 @@ fix-now. Confirmed live. Routing back to `implementation`: 1. **[high] Verifier hardcodes `gold.duckdb`, ignoring `evaluation.parameters.gold`.** `EvalSpec` has no `gold` field and `load_eval_spec` discards `parameters.gold`; `harbor_view.py:63` emits `--gold-db /tests/gold.duckdb` and `:125` copies a hardcoded `gold.duckdb`. Real Spider2 tasks name the gold DB per task (e.g. `playbook.duckdb`, `tpch.duckdb`), so the verifier would fail to find the gold or score against the wrong file — a benchmark-correctness defect. This is the gold-DB analog of the predicted-DB resolver ny added. **Fix:** add `gold` to `EvalSpec`, parse it from `evaluation.parameters.gold` (validate present for real wrapped specs), copy THAT exact file from the source `tests/gold/` into the view's verifier-only `tests/`, and emit `--gold-db /tests/`; fail closed if the named gold file is missing. Add a regression test with `parameters.gold` set to a NON-default basename (and no `gold.duckdb` present), proving the named file is used. Keep the cycle-1 comparator/leakage fixes, the cycle-2 fail-closed validation, the rider, and AC-3 reward shape intact. + +## Stage Report: implementation (cycle 3) + +- DONE: Add a `gold` field to `EvalSpec` and parse it from `evaluation.parameters.gold` in `load_eval_spec` (validate present/non-empty for real wrapped specs; keep the cycle-2 fail-closed behavior). The gold basename drives which file is scored — do not hardcode `gold.duckdb`. + Commit a677c36. `EvalSpec.gold: str | None`; `load_eval_spec` reads `parameters.gold` and RAISES `ValueError` when a wrapped (`evaluation.func == duckdb_match`) line omits/empties it. Cycle-2 guards (empty file / wrong func / empty condition_tabs raise) unchanged. Confirms upstream `evaluate.py:96-99` resolves `parameters['gold']` per-task before calling `duckdb_match` (fetched live via `gh api`). +- DONE: Wire the parsed gold basename through materialization (`harbor_view.py`): copy the EXACT named file from the source `tests/gold/` into the view's verifier-only `tests/`, and emit `--gold-db /tests/` in test.sh (resolved from the spec, not hardcoded). Fail closed if the named gold file is missing. + Commit a677c36. `_ensure_verifier_assets` calls `load_eval_spec(source tests/gold/spider2_eval.jsonl)`, copies `tests/gold/` to `tests/`, formats `_TEST_SH_TEMPLATE` with `gold_db=` -> `--gold-db /tests/`, and raises `FileNotFoundError` if the named gold is absent. Leakage-clean (no `gold/` segment) preserved. +- DONE: Add a regression test with `parameters.gold` set to a NON-default basename (e.g. `playbook.duckdb`) and NO `gold.duckdb` present, proving the named file is copied + scored. Keep intact: cycle-1 comparator faithfulness + leakage scoping, cycle-2 fail-closed validation, the rider (`/app/.duckdb`), AC-3 reward shape. `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` green. + 5 new tests (red-first verified): non-default `playbook.duckdb` copied + `--gold-db /tests/playbook.duckdb` emitted + leakage-clean; named gold end-to-end scores 1.0; missing named gold fails closed; loader parses gold + raises on missing-gold-in-wrapped-spec. Gating suite = 78 passed (was 73). Cycle-1 (`duckdb_match.py`) untouched; cycle-2 guards untouched; rider + AC-3 reward shape unchanged. + +### Summary + +Resolved the cycle-3 gold-DB-basename defect (commit a677c36). The verifier no longer hardcodes `gold.duckdb`: `EvalSpec` carries a `gold` field, `load_eval_spec` parses `evaluation.parameters.gold` and fails closed when a real wrapped `duckdb_match` spec omits it, and `_ensure_verifier_assets` copies the EXACT spec-named gold file into the verifier-only `tests/` and emits `--gold-db /tests/` — raising if the named file is missing so a benchmark can never silently score against the wrong/missing gold. This is the gold-DB analog of the existing predicted-DB resolver rider. Proven TDD-first by 5 tests (red before, green after), including a `playbook.duckdb`/no-`gold.duckdb` regression that copies + scores the named file. Cycle-1 comparator faithfulness + leakage scoping, cycle-2 fail-closed validation, the resolver rider, and AC-3 reward shape are all intact; `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` = 78 passed. From 78b4a52d0b1e37a7ed866c3cf43c32ddd844200b Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:29:24 +0800 Subject: [PATCH 18/26] =?UTF-8?q?docs(spider2):=20validation=20re-review?= =?UTF-8?q?=20(cycle-3=20fix)=20=E2=80=94=20PASSED=20(gold=20basename=20re?= =?UTF-8?q?al=20+=20load-bearing)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 13 +++ .../spider2-dbt-duckdb-match-verifier.md | 79 +++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index a157ca9..28f7ae6 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -196,3 +196,16 @@ fix-now. Confirmed live. Routing back to `implementation`: ### Summary Resolved the cycle-3 gold-DB-basename defect (commit a677c36). The verifier no longer hardcodes `gold.duckdb`: `EvalSpec` carries a `gold` field, `load_eval_spec` parses `evaluation.parameters.gold` and fails closed when a real wrapped `duckdb_match` spec omits it, and `_ensure_verifier_assets` copies the EXACT spec-named gold file into the verifier-only `tests/` and emits `--gold-db /tests/` — raising if the named file is missing so a benchmark can never silently score against the wrong/missing gold. This is the gold-DB analog of the existing predicted-DB resolver rider. Proven TDD-first by 5 tests (red before, green after), including a `playbook.duckdb`/no-`gold.duckdb` regression that copies + scores the named file. Cycle-1 comparator faithfulness + leakage scoping, cycle-2 fail-closed validation, the resolver rider, and AC-3 reward shape are all intact; `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` = 78 passed. + +## Stage Report: validation (cycle 4) + +- DONE: Confirm the cycle-3 fix: `EvalSpec.gold` parsed from `evaluation.parameters.gold`; the materializer copies the EXACT named gold from source `tests/gold/` into the verifier-only tests/, emits `--gold-db /tests/` (not hardcoded gold.duckdb), and fails closed (FileNotFoundError) when the named gold is absent. Exercise it: a spec naming `playbook.duckdb` with NO `gold.duckdb` present copies + scores `playbook.duckdb`, leakage-clean; a wrapped spec missing `gold` raises. + Verified live against branch HEAD bac0705 / fix commit a677c36. Standalone probe (no test infra) 11/11: wrapped-spec-missing-gold RAISES, empty-gold RAISES, gold parsed verbatim; `playbook.duckdb`+no-`gold.duckdb` materializes → only `playbook.duckdb` copied, test.sh `--gold-db /tests/playbook.duckdb`, no `/tests/gold.duckdb`, leakage-clean (no `gold/` segment); end-to-end `emit_reward` against named gold → match 1.0 / mismatch 0.0; missing named gold → FileNotFoundError. LOAD-BEARING: hardcoding `gold.duckdb` in harbor_view → 2 regression tests FAIL; disabling the loader missing-gold raise → 1 test FAILS. Tree restored clean. +- DONE: Confirm no regression to prior cycles: cycle-1 comparator faithfulness + leakage scoping; cycle-2 fail-closed validation. Confirm the rider (`/app/.duckdb`) and AC-3 reward.json shape intact. + Cycle-3 diff touches ONLY eval_spec.py + harbor_view.py + their 2 tests; duckdb_match.py (cycle-1, 61a1b9c) and verify.py (cycle-2/AC-3, 869918d) UNTOUCHED — intact by construction. Re-confirmed live: cycle-1 column-containment + isclose 1e-2 + per-column sort + missing-table mismatch (3/3); cycle-2 empty-file/wrong-func/empty-condition_tabs raises (3/3). Rider test `..._uses_resolved_db_name` → `/app/not-spider2-slug.duckdb` 1 passed; AC-3 `{"reward": }` shape exercised end-to-end. +- DONE: `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` green (~78 passed); test_translate_spider2_dbt.py green. Give a gate verdict (PASSED -> done, or REJECTED -> implementation). + Gating suite = 78 passed, 751 deselected. test_translate_spider2_dbt.py = 13 passed. Full suite (ignoring the broken scoring file) = 813 passed, 4 failed; the 4 failures ALSO fail on merge-base 9c39af2 (verified in a throwaway base worktree) — pre-existing/unrelated, NOT regressions. Verdict: PASSED. + +### Summary + +Cycle-3-fix re-validation (commit a677c36) — independent verification only, no production code. The gold-DB-basename fix is real and load-bearing: a non-default `playbook.duckdb` (with NO `gold.duckdb` present) is copied + scored + leakage-clean, the emitted test.sh carries `--gold-db /tests/playbook.duckdb`, a wrapped spec lacking `gold` raises, and a missing named gold fails closed — each proven by a standalone 11/11 probe AND a mutation test that flips the regression tests red. Cycles 1-2 (faithful column-containment comparator + 1e-2 tolerance; fail-closed empty/malformed-spec guards), the predicted-DB resolver rider, and the AC-3 `{"reward": }` shape are untouched by the fix and re-confirmed live. Gating suite 78 passed; translate suite 13 passed; the only remaining full-suite failures are pre-existing on base (verified). Gate verdict: PASSED → done. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md diff --git a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md index aeadd5a..2c0a201 100644 --- a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md @@ -383,3 +383,82 @@ negative tests) are load-bearing. Cycle-1 comparator faithfulness and leakage scoping are intact (untouched by the fix + re-confirmed live); rider and AC-3 reward shape intact; no new regressions. Per the convergence plan this was the last re-validation before PR. + +--- + +## Re-validation (cycle-3 fix: gold DB basename) — 2026-06-18 + +Independent re-review after the cycle-3 fix (commit `a677c36`) that threads +the eval spec's `gold` basename through `EvalSpec` → materializer copy → +`test.sh --gold-db`, replacing the hardcoded `gold.duckdb`. No production code +written by this validator. + +### Fix is real (production code, branch HEAD `bac0705`) + +- `eval_spec.py`: `EvalSpec` gains `gold: str | None`; `load_eval_spec` parses + `evaluation.parameters.gold` and RAISES `ValueError` when a wrapped + `duckdb_match` spec omits/empties it (`eval_spec.py:128`). +- `harbor_view.py`: `_ensure_verifier_assets` loads the spec, resolves + `gold_basename = spec.gold or "gold.duckdb"`, copies the EXACT named file + from `tests/gold/`, raises `FileNotFoundError` if absent + (`harbor_view.py:141`), and formats `_TEST_SH_TEMPLATE` with + `--gold-db /tests/{gold_db}` (`:63`, `:164`) — no hardcoded `gold.duckdb`. + +### Independent exercise (standalone probe, no test infra) — 11/11 PASS + +- Loader: wrapped spec missing `gold` RAISES; empty-string `gold` RAISES; + `gold` parsed verbatim (`playbook.duckdb`). +- Materializer with `parameters.gold = playbook.duckdb` and NO `gold.duckdb` + present: `playbook.duckdb` copied; `gold.duckdb` NOT invented; test.sh emits + `--gold-db /tests/playbook.duckdb`, no `/tests/gold.duckdb`; leakage-clean + (no `gold/` path segment survives). +- End-to-end over the materialized named gold via `emit_reward`: matching pred + → `{"reward": 1.0}`, mismatch → `{"reward": 0.0}`. +- Missing named gold → `FileNotFoundError` (fail-closed). + +### Load-bearing (mutation tests) + +- Hardcoding `gold_basename = "gold.duckdb"` in `harbor_view.py` → + `test_spider2_dbt_verify_view_uses_non_default_gold_basename` and + `..._named_gold_actually_scores` FAIL (2 failed, 6 passed). Restored. +- Disabling the wrapped-spec missing-`gold` raise in `eval_spec.py` → + `test_...load_eval_spec_missing_gold_in_wrapped_spec_raises` FAILS + (1 failed, 7 passed). Restored. Tree clean (only `uv.lock`). + +### No regression to prior cycles / rider / AC-3 + +- Cycle-3 diff (`a677c36`) touches ONLY `eval_spec.py` + `harbor_view.py` + + their 2 test files. `duckdb_match.py` (cycle-1 comparator, last `61a1b9c`) + and `verify.py` (cycle-2 / AC-3, last `869918d`) UNTOUCHED — cycle-1/cycle-2 + intact by construction. +- Re-confirmed live: cycle-1 column-containment + `math.isclose(1e-2)` + + per-column sort under `ignore_order` + missing-table mismatch (3/3); + cycle-2 fail-closed empty-file / wrong-func / empty-`condition_tabs` raises + (3/3). +- Rider: `task_slug=not-spider2-slug` → emitted test.sh + `--predicted-db /app/not-spider2-slug.duckdb`, no `/app/spider2.duckdb` + (`...test_sh_uses_resolved_db_name` 1 passed). +- AC-3 reward shape `{"reward": }` intact (end-to-end probe above). + +### Suites (clean checkout) + +- `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` + = **78 passed**, 751 deselected. +- `tests/unit/test_translate_spider2_dbt.py` = **13 passed**. +- Full suite (`--ignore` the broken scoring file) = 813 passed, 4 failed. + The 4 failures (`test_spacedock_solver_freeze_dir_mechanism`, + `test_worktree_teardown_preserves_runs`, `test_generate_matrix_specs`, + `test_rk_research_new`) ALSO fail on merge-base `9c39af2` (verified in a + throwaway base worktree) — pre-existing, unrelated, NOT regressions. Branch + touches only spider2_dbt-scoped files. + +## Gate decision (cycle-3 re-validation) + +**PASSED → done.** The cycle-3 gold-basename fix is real and load-bearing: a +non-default `playbook.duckdb` is honored (copied + scored, leakage-clean), a +missing named gold fails closed, and a wrapped spec lacking `gold` raises — +each proven by an independent probe and a mutation test that flips the +regression tests red. Cycles 1-2, the predicted-DB rider, and the AC-3 reward +shape are intact (untouched by the fix + re-confirmed live). The only +remaining full-suite failures are pre-existing on base. Per the convergence +plan, this is the last re-validation before PR. From bfed74cfbeeca3a90b807cf2c43b4b6ed068505a Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:38:08 +0800 Subject: [PATCH 19/26] fix(spider2-dbt): reject non-basename gold in load_eval_spec (path-traversal guard) evaluation.parameters.gold is external Spider2 input that the materializer joins onto a path (tests/gold/) and emits into test.sh (--gold-db /tests/). Path preserves '..'/absolute components, so a malformed or hostile spec could read/write outside the verifier-only gold area. Fail closed at the trust boundary: require Path(g).name == g and reject '.'/'..'. One parametrized regression test (../, absolute, separators, ., ..). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 19 +++++++++++++ .../benchmarks/spider2_dbt/eval_spec.py | 14 ++++++++++ .../test_spider2_dbt_verify_comparator.py | 27 +++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 28f7ae6..6b0f47e 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -209,3 +209,22 @@ Resolved the cycle-3 gold-DB-basename defect (commit a677c36). The verifier no l ### Summary Cycle-3-fix re-validation (commit a677c36) — independent verification only, no production code. The gold-DB-basename fix is real and load-bearing: a non-default `playbook.duckdb` (with NO `gold.duckdb` present) is copied + scored + leakage-clean, the emitted test.sh carries `--gold-db /tests/playbook.duckdb`, a wrapped spec lacking `gold` raises, and a missing named gold fails closed — each proven by a standalone 11/11 probe AND a mutation test that flips the regression tests red. Cycles 1-2 (faithful column-containment comparator + 1e-2 tolerance; fail-closed empty/malformed-spec guards), the predicted-DB resolver rider, and the AC-3 `{"reward": }` shape are untouched by the fix and re-confirmed live. Gating suite 78 passed; translate suite 13 passed; the only remaining full-suite failures are pre-existing on base (verified). Gate verdict: PASSED → done. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md + +### Cycle 4 — validation gate REJECTED (2026-06-18, captain via Codex review) + +4th adversarial Codex pass found a path-traversal hole in the (now spec-driven) gold basename; captain chose fix-now. Confirmed live. Routing back to `implementation`: + +1. **[high] `evaluation.parameters.gold` used verbatim as a path component (trust boundary).** + `gold` is external Spider2 input. `harbor_view.py:139-147` joins it as `source_gold / gold` (read) and `tests / gold` (copy2), and emits `--gold-db /tests/` in test.sh. `Path` preserves `..`/absolute components, so a spec value like `../dbt_project/foo.duckdb` passes the `is_file()` check against a file OUTSIDE `tests/gold/`, then `copy2` writes outside `view/tests/` and the emitted path leaves `/tests` — benchmark corruption / leakage from a malformed or hostile task. + **Fix:** reject non-basename `gold` at the trust boundary (`load_eval_spec`): require `Path(g).name == g`, reject `.`/`..`/absolute/separators, before any path use. Add a `../`/absolute/separator regression test asserting fail-closed. Keep cycles 1-3 intact. + +## Stage Report: implementation (cycle 4) + +- DONE: Reject non-basename `gold` in `load_eval_spec` before it is used as a path component. + Guard added in `eval_spec.py` immediately after the missing-gold check: when `gold` is a non-empty str, require `Path(g).name == g` and reject `.`/`..` → raises `ValueError` (fail-closed at the trust boundary). Bare-fixture `gold=None` still falls back to `gold.duckdb` (safe). The materializer and test.sh emitter are unchanged — the basename is sanitized upstream so both inherit the guarantee. +- DONE: Regression test for `../`/absolute/separator/`.`/`..` gold values. + Parametrized `test_..._rejects_non_basename_gold` (5 cases: `../dbt_project/foo.duckdb`, `/etc/passwd`, `sub/dir/g.duckdb`, `..`, `.`) asserts `load_eval_spec` raises. Comparator suite 28 passed (was 23); harbor_view + verify_cli + translate suites 34 passed. Cycles 1-3 untouched. + +### Summary + +Resolved the cycle-4 path-traversal defect. `load_eval_spec` now fails closed on any `gold` that is not a bare basename (`..`, `.`, absolute, or containing separators), sanitizing the value at the trust boundary so both the materializer's `copy2` targets and the emitted `--gold-db /tests/` path stay inside the verifier-only gold area. One file + one parametrized test (5 cases); cycles 1-3 left intact. spider2_dbt suite green (62 across comparator/harbor_view/verify_cli/translate). diff --git a/src/razorback/benchmarks/spider2_dbt/eval_spec.py b/src/razorback/benchmarks/spider2_dbt/eval_spec.py index 70b1620..33c8a60 100644 --- a/src/razorback/benchmarks/spider2_dbt/eval_spec.py +++ b/src/razorback/benchmarks/spider2_dbt/eval_spec.py @@ -131,6 +131,20 @@ def load_eval_spec(path: Path) -> EvalSpec: "duckdb_match spec must name its per-task gold .duckdb (fail-open " "guard)" ) + # `gold` is external Spider2 input that the materializer joins onto a path + # (tests/gold/) and emits into test.sh (--gold-db /tests/). It is + # documented as a bare basename; reject absolute paths, separators, and `..` + # so a malformed/hostile spec cannot read or write outside the verifier-only + # gold area. Fail closed at the trust boundary, not downstream. + if isinstance(gold, str) and gold.strip(): + g = gold.strip() + if Path(g).name != g or g in (".", ".."): + raise ValueError( + f"unsafe evaluation.parameters.gold {gold!r} in {path}: must be " + "a bare .duckdb basename (no path separators, '..', or absolute " + "path) — refusing to score outside tests/gold/ (fail-closed)" + ) + gold = g condition_tabs = list(params.get("condition_tabs", [])) raw_cols = params.get("condition_cols") diff --git a/tests/unit/test_spider2_dbt_verify_comparator.py b/tests/unit/test_spider2_dbt_verify_comparator.py index 1d301f4..7194c0d 100644 --- a/tests/unit/test_spider2_dbt_verify_comparator.py +++ b/tests/unit/test_spider2_dbt_verify_comparator.py @@ -184,6 +184,33 @@ def test_spider2_dbt_verify_load_eval_spec_missing_evaluation_raises(tmp_path: P load_eval_spec(spec_path) +@pytest.mark.parametrize( + "bad_gold", + ["../dbt_project/foo.duckdb", "/etc/passwd", "sub/dir/g.duckdb", "..", "."], +) +def test_spider2_dbt_verify_load_eval_spec_rejects_non_basename_gold( + tmp_path: Path, bad_gold: str +): + # `gold` is external Spider2 input the materializer joins onto a path and + # emits into test.sh. A `..`/absolute/separator value would let the verifier + # read or write outside tests/gold/, so it must fail closed at parse time. + spec_path = tmp_path / "spider2_eval.jsonl" + spec_path.write_text( + json.dumps( + { + "instance_id": "t", + "evaluation": { + "func": "duckdb_match", + "parameters": {"gold": bad_gold, "condition_tabs": ["orders"]}, + }, + } + ) + + "\n" + ) + with pytest.raises(ValueError): + load_eval_spec(spec_path) + + # -------------------------------------------------------------------------- # comparator fixtures # -------------------------------------------------------------------------- From b774bb7272adfd803e2363224820825ff84efdde Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:41:54 +0800 Subject: [PATCH 20/26] fix(spider2-dbt): allowlist gold basename to block test.sh shell injection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cycle-4 basename guard rejected separators/'..'/absolute but still admitted POSIX-valid names with shell metacharacters (e.g. `x.duckdb; echo ... #`), which were interpolated UNQUOTED into the verifier test.sh --gold-db argument and would execute as shell during verification. Replace it with a conservative allowlist `[A-Za-z0-9._-]+\.duckdb` at the trust boundary in load_eval_spec — subsumes the path checks and blocks whitespace/metacharacters. Adds ;/space/$() regression cases. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 19 +++++++++++++ .../benchmarks/spider2_dbt/eval_spec.py | 27 +++++++++++++------ .../test_spider2_dbt_verify_comparator.py | 19 ++++++++++--- 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 6b0f47e..15e203a 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -228,3 +228,22 @@ Cycle-3-fix re-validation (commit a677c36) — independent verification only, no ### Summary Resolved the cycle-4 path-traversal defect. `load_eval_spec` now fails closed on any `gold` that is not a bare basename (`..`, `.`, absolute, or containing separators), sanitizing the value at the trust boundary so both the materializer's `copy2` targets and the emitted `--gold-db /tests/` path stay inside the verifier-only gold area. One file + one parametrized test (5 cases); cycles 1-3 left intact. spider2_dbt suite green (62 across comparator/harbor_view/verify_cli/translate). + +### Cycle 5 — validation gate REJECTED (2026-06-18, captain via Codex review) + +5th adversarial Codex pass found the cycle-4 basename guard still admits shell metacharacters; captain chose fix-now. Confirmed live. Routing back to `implementation`: + +1. **[high] Spec-controlled gold basename emitted UNQUOTED into test.sh → shell injection.** + The cycle-4 guard rejects separators/`..`/absolute, but a POSIX-valid basename like `x.duckdb; echo '{"reward":1.0}' > /logs/verifier/reward.json #` has no separator, so `Path(g).name == g` passes; it is then interpolated unquoted into `--gold-db /tests/` in test.sh and executes as shell during verification. External task input → trust boundary. + **Fix:** replace the basename guard in `load_eval_spec` with a conservative allowlist `^[A-Za-z0-9._-]+\.duckdb$` (subsumes the path checks AND blocks metacharacters/whitespace). Add `;`/space/`$()` regression cases. Keep cycles 1-4 intact. + +## Stage Report: implementation (cycle 5) + +- DONE: Replace the cycle-4 basename guard with a conservative allowlist at the trust boundary. + `eval_spec.py`: `_SAFE_GOLD_RE = re.compile(r"[A-Za-z0-9._-]+\.duckdb")`; `load_eval_spec` rejects any non-`None` `gold` that fails `fullmatch`. Strictly tighter than the cycle-4 check — separators, `..`, absolute, whitespace, and shell metacharacters all fail closed at parse, so neither the materializer's `copy2` nor the unquoted test.sh `--gold-db` can be subverted. Bare-fixture `gold=None` → `gold.duckdb` fallback (safe) unchanged. +- DONE: Regression cases for shell-injection + wrong-suffix gold values. + Parametrized `test_..._rejects_non_basename_gold` extended to 9 cases: the 4 path cases plus `x.duckdb; … #`, `g.duckdb $(id)`, `g .duckdb` (space), `g.sqlite` (wrong suffix). spider2_dbt suite (comparator/harbor_view/verify_cli/translate) = 66 passed. Cycles 1-4 untouched. + +### Summary + +Closed the cycle-5 shell-injection class definitively. The cycle-4 basename guard is replaced by a conservative `[A-Za-z0-9._-]+\.duckdb` allowlist in `load_eval_spec`, which subsumes the path-traversal checks and additionally rejects whitespace and shell metacharacters — so a spec-supplied gold name can neither escape `tests/gold/` nor inject into the unquoted `--gold-db` argument of the emitted test.sh. This is a terminal trust-boundary fix (allowlist, not edge-by-edge): one regex + the cycle-4 guard swap, 4 new regression cases (9 total). Cycles 1-4 intact; spider2_dbt suite 66 passed. diff --git a/src/razorback/benchmarks/spider2_dbt/eval_spec.py b/src/razorback/benchmarks/spider2_dbt/eval_spec.py index 33c8a60..f744987 100644 --- a/src/razorback/benchmarks/spider2_dbt/eval_spec.py +++ b/src/razorback/benchmarks/spider2_dbt/eval_spec.py @@ -3,9 +3,15 @@ from __future__ import annotations import json +import re from dataclasses import dataclass, field from pathlib import Path +# Conservative allowlist for the spec-supplied gold DB basename. A real Spider2 +# gold filename is a plain `.duckdb`; restricting to this set blocks path +# traversal AND shell injection at the trust boundary (see load_eval_spec). +_SAFE_GOLD_RE = re.compile(r"[A-Za-z0-9._-]+\.duckdb") + @dataclass(frozen=True) class EvalSpec: @@ -132,17 +138,22 @@ def load_eval_spec(path: Path) -> EvalSpec: "guard)" ) # `gold` is external Spider2 input that the materializer joins onto a path - # (tests/gold/) and emits into test.sh (--gold-db /tests/). It is - # documented as a bare basename; reject absolute paths, separators, and `..` - # so a malformed/hostile spec cannot read or write outside the verifier-only - # gold area. Fail closed at the trust boundary, not downstream. + # (tests/gold/) and emits UNQUOTED into the verifier test.sh + # (--gold-db /tests/). A conservative allowlist closes the whole class + # at the trust boundary: it subsumes the path checks (no separators, `..`, or + # absolute paths get through) AND rejects shell metacharacters/whitespace, so + # a malformed/hostile spec can neither escape tests/gold/ nor inject shell + # syntax into the verifier script. Real Spider2 gold names are plain + # `.duckdb`, so this is tighter than the data requires by design. if isinstance(gold, str) and gold.strip(): g = gold.strip() - if Path(g).name != g or g in (".", ".."): + if not _SAFE_GOLD_RE.fullmatch(g): raise ValueError( - f"unsafe evaluation.parameters.gold {gold!r} in {path}: must be " - "a bare .duckdb basename (no path separators, '..', or absolute " - "path) — refusing to score outside tests/gold/ (fail-closed)" + f"unsafe evaluation.parameters.gold {gold!r} in {path}: must " + "match [A-Za-z0-9._-]+.duckdb (a bare .duckdb basename, no path " + "separators, '..', whitespace, or shell metacharacters) — " + "refusing to score outside tests/gold/ or inject into test.sh " + "(fail-closed)" ) gold = g diff --git a/tests/unit/test_spider2_dbt_verify_comparator.py b/tests/unit/test_spider2_dbt_verify_comparator.py index 7194c0d..33b3fc8 100644 --- a/tests/unit/test_spider2_dbt_verify_comparator.py +++ b/tests/unit/test_spider2_dbt_verify_comparator.py @@ -186,14 +186,27 @@ def test_spider2_dbt_verify_load_eval_spec_missing_evaluation_raises(tmp_path: P @pytest.mark.parametrize( "bad_gold", - ["../dbt_project/foo.duckdb", "/etc/passwd", "sub/dir/g.duckdb", "..", "."], + [ + # path traversal / separators / absolute + "../dbt_project/foo.duckdb", + "/etc/passwd", + "sub/dir/g.duckdb", + "..", + ".", + # shell injection into the unquoted test.sh --gold-db arg + "x.duckdb; echo '{\"reward\":1.0}' > /logs/verifier/reward.json #", + "g.duckdb $(id)", + "g .duckdb", + "g.sqlite", # wrong suffix + ], ) def test_spider2_dbt_verify_load_eval_spec_rejects_non_basename_gold( tmp_path: Path, bad_gold: str ): # `gold` is external Spider2 input the materializer joins onto a path and - # emits into test.sh. A `..`/absolute/separator value would let the verifier - # read or write outside tests/gold/, so it must fail closed at parse time. + # emits UNQUOTED into test.sh. A `..`/absolute/separator value escapes + # tests/gold/; a shell-metacharacter/whitespace value injects into the + # verifier script. The conservative allowlist must fail both closed at parse. spec_path = tmp_path / "spider2_eval.jsonl" spec_path.write_text( json.dumps( From 843b4c180c97633da6a4408a626f61600ee04896 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:47:43 +0800 Subject: [PATCH 21/26] fix(spider2-dbt): shlex.quote both verifier args (predicted-db injection) The cycle-5 gold allowlist only guarded --gold-db. The --predicted-db arg interpolated db_name (resolved from the task's profiles.yml `path:`, external input) unquoted into test.sh, leaving the symmetric shell-injection boundary open. shlex.quote BOTH args at the emission point in _ensure_verifier_assets so neither external-input value can be interpreted as shell during verification. Gold allowlist retained (also guards path traversal). Adds a load-bearing regression with a profiles.yml `$()` DuckDB path. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 19 ++++++++++ .../benchmarks/spider2_dbt/harbor_view.py | 11 ++++-- tests/unit/test_spider2_dbt_verify_cli.py | 35 +++++++++++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 15e203a..5aa8244 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -247,3 +247,22 @@ Resolved the cycle-4 path-traversal defect. `load_eval_spec` now fails closed on ### Summary Closed the cycle-5 shell-injection class definitively. The cycle-4 basename guard is replaced by a conservative `[A-Za-z0-9._-]+\.duckdb` allowlist in `load_eval_spec`, which subsumes the path-traversal checks and additionally rejects whitespace and shell metacharacters — so a spec-supplied gold name can neither escape `tests/gold/` nor inject into the unquoted `--gold-db` argument of the emitted test.sh. This is a terminal trust-boundary fix (allowlist, not edge-by-edge): one regex + the cycle-4 guard swap, 4 new regression cases (9 total). Cycles 1-4 intact; spider2_dbt suite 66 passed. + +### Cycle 6 — validation gate REJECTED (2026-06-18, captain via Codex review) + +6th adversarial Codex pass found the SYMMETRIC injection: the predicted-DB arg was still unquoted; captain chose fix-now. Confirmed live. Routing back to `implementation`: + +1. **[high] Predicted-DB `db_name` emitted UNQUOTED into test.sh → shell injection (the other arg).** + The cycle-5 gold allowlist only guarded `--gold-db`. `--predicted-db /app/{db_name}.duckdb` interpolated `db_name` — resolved from the task's `profiles.yml` `path:` (external input) by `resolve_spider2_db_name` — without quoting. A profile path with shell metacharacters executes during verification (overwrite reward.json / exfiltrate verifier assets). Preflight's Docker RUN already quoted this value; the verifier script did not. + **Fix:** `shlex.quote` BOTH args at the emission point in `_ensure_verifier_assets` (terminal, source-agnostic). Keep the gold allowlist (it also guards traversal). Add a regression with a `profiles.yml` DuckDB path containing `$()`/whitespace asserting the emitted arg is quoted. + +## Stage Report: implementation (cycle 6) + +- DONE: `shlex.quote` both `--predicted-db` and `--gold-db` values where test.sh is emitted. + `harbor_view.py`: `_TEST_SH_TEMPLATE` now takes fully-formed `{predicted_db}`/`{gold_db}` and `_ensure_verifier_assets` passes `shlex.quote(f"{_APP_ROOT}/{db_name}.duckdb")` and `shlex.quote(f"/tests/{gold_basename}")`. Safe names are unchanged by quoting (existing `/app/.duckdb` and `/tests/playbook.duckdb` assertions still pass); metacharacter names are single-quoted, so neither external-input arg can inject. Gold allowlist retained for traversal defense-in-depth. +- DONE: Regression for a `profiles.yml` path with shell metacharacters. + `test_..._quotes_predicted_db_against_injection`: a `path: evil$(touch pwned).duckdb` profile materializes a test.sh whose `--predicted-db` is `shlex.quote`'d (raw `--predicted-db /app/evil$(touch pwned).duckdb` absent). Load-bearing: reverting the quote flips it red. spider2_dbt suite (comparator/harbor_view/verify_cli/translate) = 67 passed. Cycles 1-5 untouched. + +### Summary + +Closed the symmetric predicted-DB injection that the cycle-5 gold allowlist didn't cover. Both verifier arguments derived from external task input — `db_name` (from `profiles.yml` `path:`) and `gold_basename` (from the eval spec) — are now `shlex.quote`'d at the single test.sh emission point, so neither can be interpreted as shell syntax during verification regardless of source. The gold allowlist stays (it additionally blocks path-traversal, which quoting alone wouldn't). One-line template change + quoted format args + one load-bearing regression (profiles `$()` path). Cycles 1-5 intact; spider2_dbt suite 67 passed. The entire verifier shell boundary (both args, traversal + injection) is now sealed. diff --git a/src/razorback/benchmarks/spider2_dbt/harbor_view.py b/src/razorback/benchmarks/spider2_dbt/harbor_view.py index 2b06922..02b97a5 100644 --- a/src/razorback/benchmarks/spider2_dbt/harbor_view.py +++ b/src/razorback/benchmarks/spider2_dbt/harbor_view.py @@ -60,7 +60,7 @@ mkdir -p /logs/verifier python /tests/verify.py \\ --predicted-db {predicted_db} \\ - --gold-db /tests/{gold_db} \\ + --gold-db {gold_db} \\ --eval-spec /tests/spider2_eval.jsonl \\ --reward-out /logs/verifier/reward.json """ @@ -158,10 +158,15 @@ def _ensure_verifier_assets( test_sh = tests / "test.sh" if test_sh.is_symlink(): test_sh.unlink() + # shlex.quote BOTH args: db_name is resolved from the task's profiles.yml + # path (external input) and gold_basename from the eval spec; emitting either + # unquoted into the verifier shell script is a command-injection boundary. + # gold_basename is also allowlisted in load_eval_spec (traversal defense); + # quoting closes the injection half for both arguments at the emission point. test_sh.write_text( _TEST_SH_TEMPLATE.format( - predicted_db=f"{_APP_ROOT}/{db_name}.duckdb", - gold_db=gold_basename, + predicted_db=shlex.quote(f"{_APP_ROOT}/{db_name}.duckdb"), + gold_db=shlex.quote(f"/tests/{gold_basename}"), ) ) test_sh.chmod( diff --git a/tests/unit/test_spider2_dbt_verify_cli.py b/tests/unit/test_spider2_dbt_verify_cli.py index f695be6..59ca75e 100644 --- a/tests/unit/test_spider2_dbt_verify_cli.py +++ b/tests/unit/test_spider2_dbt_verify_cli.py @@ -302,3 +302,38 @@ def test_spider2_dbt_verify_test_sh_uses_resolved_db_name(tmp_path): test_sh = (view / "tests" / "test.sh").read_text() assert "/app/not-spider2-slug.duckdb" in test_sh assert "/app/spider2.duckdb" not in test_sh + + +def test_spider2_dbt_verify_test_sh_quotes_predicted_db_against_injection(tmp_path): + # SECURITY (cycle-5 symmetric): db_name is resolved from the task's + # profiles.yml `path:` (external input). A path carrying shell metacharacters + # must NOT be emitted raw into the verifier test.sh --predicted-db arg, or it + # executes during verification. shlex.quote at the emission point neutralizes + # it (the same class the gold allowlist blocks for --gold-db). + import shlex + + source = _write_gold_source( + tmp_path / "source", gold_basename="g.duckdb", with_default_gold=False + ) + (source / "dbt_project" / "profiles.yml").write_text( + "\n".join( + [ + "example:", + " outputs:", + " dev:", + " type: duckdb", + " path: evil$(touch pwned).duckdb", + " target: dev", + "", + ] + ) + ) + view = materialize_spider2_harbor_task_view( + source_task_dir=source, + view_root=tmp_path / "views", + task_slug="playbook001", + ) + test_sh = (view / "tests" / "test.sh").read_text() + # The metacharacter path is present only in its safely-quoted form. + assert shlex.quote("/app/evil$(touch pwned).duckdb") in test_sh + assert "--predicted-db /app/evil$(touch pwned).duckdb" not in test_sh From 0ce7d5110da96db49f12fb267efe742066dd6755 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 21:55:50 +0800 Subject: [PATCH 22/26] fix(spider2-dbt): drop pandas from verifier; re-port comparator on duckdb-native MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit duckdb_match imported pandas at module level, but verify.py runs INSIDE the task image and that image guarantees only duckdb (the build-time preflight imports it), not pandas — pandas is not a project dep and no injected layer installs it. An image without pandas would crash verify.py on import before emit_reward() writes reward.json, scoring valid runs as infra failures. Re-port the column-containment compare onto duckdb's .fetchall()/.description + stdlib (sorted, math.isclose, an _isna helper). Spider2 duckdb_match semantics preserved verbatim (containment, isclose 1e-2, sort key, condition_cols, multi-table AND, missing-table->0, NA==NA); the behavioral comparator suite stays green unchanged as the faithfulness net. Verifier now imports only duckdb — zero undeclared runtime deps. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 19 +++++ .../benchmarks/spider2_dbt/duckdb_match.py | 70 ++++++++++++------- 2 files changed, 62 insertions(+), 27 deletions(-) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index 5aa8244..a2a628d 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -266,3 +266,22 @@ Closed the cycle-5 shell-injection class definitively. The cycle-4 basename guar ### Summary Closed the symmetric predicted-DB injection that the cycle-5 gold allowlist didn't cover. Both verifier arguments derived from external task input — `db_name` (from `profiles.yml` `path:`) and `gold_basename` (from the eval spec) — are now `shlex.quote`'d at the single test.sh emission point, so neither can be interpreted as shell syntax during verification regardless of source. The gold allowlist stays (it additionally blocks path-traversal, which quoting alone wouldn't). One-line template change + quoted format args + one load-bearing regression (profiles `$()` path). Cycles 1-5 intact; spider2_dbt suite 67 passed. The entire verifier shell boundary (both args, traversal + injection) is now sealed. + +### Cycle 7 — validation gate REJECTED (2026-06-18, captain via Codex review) + +7th adversarial Codex pass found a runtime-dependency gap (different class from the shell-boundary cycles); captain chose fix-now via the duckdb-only re-port. Confirmed live. Routing back to `implementation`: + +1. **[high] Verifier imported undeclared/uninstalled `pandas` → crash-on-import before reward.json.** + `duckdb_match.py` imported `pandas` at module level; `verify.py` (which imports it) runs INSIDE the task image via `python /tests/verify.py`. The image is guaranteed to have `duckdb` (the build-time preflight imports it), but NOT `pandas` — it is not in `pyproject.toml` and dbt-duckdb does not pull it in as a core dep. In an image without pandas the verifier dies during import before `emit_reward()` writes `/logs/verifier/reward.json`, turning valid runs into infra failures. + **Fix (chosen direction: remove pandas):** re-port the column-containment compare onto duckdb's own `.fetchall()`/`.description` + stdlib (`sorted`, `math.isclose`), so the verifier depends ONLY on duckdb — the one library the image is already required to have. Keep all behavioral comparator tests green unchanged (the faithfulness net). Rejected alternative: keep pandas + add a `RUN pip install pandas` layer (network/pip/version assumptions, image bloat). + +## Stage Report: implementation (cycle 7) + +- DONE: Remove the `pandas` import from the verifier comparator; re-port on duckdb-native + stdlib. + `duckdb_match.py` no longer imports pandas. `_fetch_columns` replaces `fetchdf().transpose().values.tolist()` with `cur.fetchall()` + `cur.description` (column count) → list of column-vectors of native Python scalars (NULL→None); a zero-row table still yields one empty vector per column. `_isna` (`x is None or NaN-float`) replaces `pd.isna`; `_vectors_match`, the Spider2 sort key, the `math.isclose(abs_tol=1e-2)` numeric path, condition_cols restriction, the column-containment loop, the per-table AND, and the missing-pred-table→False try/except are all preserved verbatim. Only duckdb (already required by the build-time preflight) is imported. +- DONE: Keep the behavioral comparator tests green unchanged (faithfulness net). + All comparator tests pass unmodified: matching/mismatched DBs, all-tables-AND, missing predicted table→False, float within/beyond 1e-2 tolerance, column-containment reorder, extra-pred-column tolerated, condition_cols restrict + diff-detected, row-reorder match/mismatch under ignore_orders, multi-table gold-line mismatch. spider2_dbt suite (comparator/verify_cli/harbor_view/translate) = 67 passed. `grep import pandas` over `benchmarks/spider2_dbt/` → none (only a docstring `pd.isna` mention). + +### Summary + +Removed the verifier's `pandas` dependency by re-porting the comparator onto duckdb-native fetch + stdlib. `verify.py` now imports only `duckdb` — the single library the verify-time task image is already guaranteed to ship (proven by the build-time preflight that imports it) — so the emitted verifier can no longer crash-on-import in a dbt-duckdb image that lacks pandas (which is neither a project dep nor installed by any injected layer). The Spider2 `duckdb_match` semantics are preserved verbatim (column-containment, `isclose` 1e-2, Spider2 sort key, condition_cols, multi-table AND, missing-table→0, NA==NA); the full behavioral comparator suite stays green UNCHANGED as the faithfulness proof. Cycles 1-6 intact; spider2_dbt suite 67 passed. The verifier now has zero undeclared runtime deps. diff --git a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py index 3e610ad..c1cf3b8 100644 --- a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py +++ b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py @@ -1,12 +1,14 @@ # ABOUTME: spider2-dbt comparator faithfully reproducing Spider2 eval_utils.duckdb_match. -# ABOUTME: Column-containment over transposed column-vectors, math.isclose(1e-2), AND across tables. +# ABOUTME: Column-containment over column-vectors, math.isclose(1e-2), AND across tables. +# Depends ONLY on duckdb (the one library the verify-time image is guaranteed to +# have — the build-time preflight already imports it); no pandas, so the emitted +# verifier cannot crash-on-import in a task image that ships dbt-duckdb but not pandas. from __future__ import annotations import math from pathlib import Path import duckdb -import pandas as pd try: from razorback.benchmarks.spider2_dbt.eval_spec import EvalSpec @@ -17,10 +19,19 @@ _TOLERANCE = 1e-2 +def _isna(x) -> bool: + """stdlib stand-in for pandas.isna over scalar cells fetched from DuckDB. + + DuckDB returns SQL NULL as Python ``None``; a float ``NaN`` is the only other + NA-like scalar. Mirrors Spider2's ``pd.isna(...)`` NaN==NaN handling. + """ + return x is None or (isinstance(x, float) and math.isnan(x)) + + def _vectors_match(v1, v2, *, tol: float = _TOLERANCE, ignore_order: bool = False) -> bool: """Port of Spider2 eval_utils.compare_pandas_table.vectors_match. - Compares two column-vectors element-wise: NaN==NaN, numerics via + Compares two column-vectors element-wise: NA==NA, numerics via math.isclose(abs_tol=tol), everything else by ``!=``. When ignore_order is set, both vectors are sorted first with Spider2's exact key. """ @@ -32,7 +43,7 @@ def _vectors_match(v1, v2, *, tol: float = _TOLERANCE, ignore_order: bool = Fals if len(v1) != len(v2): return False for a, b in zip(v1, v2): - if pd.isna(a) and pd.isna(b): + if _isna(a) and _isna(b): continue elif isinstance(a, (int, float)) and isinstance(b, (int, float)): if not math.isclose(float(a), float(b), abs_tol=tol): @@ -44,9 +55,9 @@ def _vectors_match(v1, v2, *, tol: float = _TOLERANCE, ignore_order: bool = Fals return False -def _compare_pandas_table( - pred: pd.DataFrame, - gold: pd.DataFrame, +def _compare_table( + pred_cols: list[list], + gold_cols: list[list], *, condition_cols: list[int], ignore_order: bool, @@ -54,31 +65,36 @@ def _compare_pandas_table( """Port of Spider2 eval_utils.compare_pandas_table. Column-CONTAINMENT (not positional row equality): restrict gold to - ``condition_cols`` (all columns when empty), transpose both tables to - column-vectors, and require that EVERY gold column-vector matches SOME - pred column-vector. Extra pred columns are therefore tolerated. + ``condition_cols`` (all columns when empty), then require that EVERY gold + column-vector matches SOME pred column-vector. Extra pred columns are + therefore tolerated. Inputs are already transposed to column-vectors. """ if condition_cols: - gold_cols = gold.iloc[:, condition_cols] + gold_vecs = [gold_cols[c] for c in condition_cols] else: - gold_cols = gold - pred_cols = pred + gold_vecs = gold_cols - t_gold_list = gold_cols.transpose().values.tolist() - t_pred_list = pred_cols.transpose().values.tolist() - - for gold_vec in t_gold_list: + for gold_vec in gold_vecs: if not any( _vectors_match(gold_vec, pred_vec, ignore_order=ignore_order) - for pred_vec in t_pred_list + for pred_vec in pred_cols ): return False return True -def _fetch_df(con: duckdb.DuckDBPyConnection, table: str) -> pd.DataFrame: - """SELECT * FROM `table` as a DataFrame (mirrors Spider2 fetchdf()).""" - return con.execute(f'SELECT * FROM "{table}"').fetchdf() +def _fetch_columns(con: duckdb.DuckDBPyConnection, table: str) -> list[list]: + """``SELECT * FROM `table``` as a list of column-vectors (transposed rows). + + Replaces Spider2's ``fetchdf().transpose().values.tolist()``: DuckDB's + ``.fetchall()`` yields native Python scalars in row order (NULL -> None), + and ``.description`` fixes the column count so a zero-row table still + yields one empty vector per column. + """ + cur = con.execute(f'SELECT * FROM "{table}"') + rows = cur.fetchall() + ncols = len(cur.description) + return [[row[j] for row in rows] for j in range(ncols)] def compare_duckdb(*, predicted_db: Path, gold_db: Path, spec: EvalSpec) -> bool: @@ -92,23 +108,23 @@ def compare_duckdb(*, predicted_db: Path, gold_db: Path, spec: EvalSpec) -> bool """ gold_con = duckdb.connect(str(gold_db), read_only=True) try: - gold_tables = [_fetch_df(gold_con, t) for t in spec.condition_tabs] + gold_tables = [_fetch_columns(gold_con, t) for t in spec.condition_tabs] finally: gold_con.close() pred_con = duckdb.connect(str(predicted_db), read_only=True) try: try: - pred_tables = [_fetch_df(pred_con, t) for t in spec.condition_tabs] + pred_tables = [_fetch_columns(pred_con, t) for t in spec.condition_tabs] except Exception: return False finally: pred_con.close() - for i, (gold_df, pred_df) in enumerate(zip(gold_tables, pred_tables)): - if not _compare_pandas_table( - pred_df, - gold_df, + for i, (gold_cols, pred_cols) in enumerate(zip(gold_tables, pred_tables)): + if not _compare_table( + pred_cols, + gold_cols, condition_cols=spec.condition_cols[i], ignore_order=spec.ignore_orders[i], ): From 6c76cbe444afc98b639e027075bd0db87ba53dd2 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 22:03:00 +0800 Subject: [PATCH 23/26] fix(spider2-dbt): quote spec-supplied table names to block SQL injection _fetch_columns interpolated condition_tabs (external eval-spec input) raw into SELECT * FROM "". A `realt"; select 999; --` value breaks out; DuckDB runs the injected statement and .fetchall() returns its rows, so a hostile spec forces gold+pred to return identical rows -> reward 1.0 over mismatched DBs. Add _quote_ident (double embedded ") and use it in _fetch_columns: a breakout value becomes an unresolvable identifier, the gold fetch raises, and emit_reward scores 0 (fail-closed). Adds a load-bearing regression (verified live: raw -> [(999,)], fix -> CatalogException). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 19 +++++++++++++++++++ .../benchmarks/spider2_dbt/duckdb_match.py | 15 ++++++++++++++- .../test_spider2_dbt_verify_comparator.py | 17 +++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index a2a628d..c532f9f 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -285,3 +285,22 @@ Closed the symmetric predicted-DB injection that the cycle-5 gold allowlist didn ### Summary Removed the verifier's `pandas` dependency by re-porting the comparator onto duckdb-native fetch + stdlib. `verify.py` now imports only `duckdb` — the single library the verify-time task image is already guaranteed to ship (proven by the build-time preflight that imports it) — so the emitted verifier can no longer crash-on-import in a dbt-duckdb image that lacks pandas (which is neither a project dep nor installed by any injected layer). The Spider2 `duckdb_match` semantics are preserved verbatim (column-containment, `isclose` 1e-2, Spider2 sort key, condition_cols, multi-table AND, missing-table→0, NA==NA); the full behavioral comparator suite stays green UNCHANGED as the faithfulness proof. Cycles 1-6 intact; spider2_dbt suite 67 passed. The verifier now has zero undeclared runtime deps. + +### Cycle 8 — validation gate REJECTED (2026-06-18, captain via Codex review) + +8th adversarial Codex pass found a SQL-injection via spec-supplied table names; captain chose fix-now. Confirmed live. Routing back to `implementation`: + +1. **[high] `condition_tabs` table name interpolated raw into DuckDB SQL → reward rigging.** + `_fetch_columns` built `SELECT * FROM "{table}"` by string interpolation, but `condition_tabs` comes from the external `spider2_eval.jsonl`. A value like `realt"; select 999 AS a; --` (existing-table prefix) breaks out of the identifier; DuckDB runs the multi-statement and `.fetchall()` returns the LAST statement's rows. So a hostile spec makes BOTH gold and pred fetches return identical injected rows → forced `reward: 1.0` over genuinely mismatched DBs. Verified live (`.venv/bin/python`: raw → `[(999,)]`; quote-doubled → CatalogException). + **Fix:** add an identifier-quoting helper that doubles embedded `"` and use it in `_fetch_columns`; a bogus name then fails to resolve → the gold fetch raises → `emit_reward` scores 0 (fail-closed). Add a regression where `condition_tabs` carries `"; select ...; --` over mismatched DBs and assert it cannot score a match. + +## Stage Report: implementation (cycle 8) + +- DONE: Quote spec-supplied table identifiers in `_fetch_columns` (double embedded `"`). + `duckdb_match.py`: new `_quote_ident(name)` returns `'"' + name.replace('"','""') + '"'`; `_fetch_columns` now does `SELECT * FROM {_quote_ident(table)}`. A breakout value becomes a single bogus identifier that doesn't resolve → CatalogException; `compare_duckdb`'s gold fetch is not try/excepted, so it raises → `emit_reward`'s wrapper scores 0 (fail-closed). Legit table names (incl. ones with special chars) are preserved. +- DONE: Regression proving `condition_tabs` cannot SQL-inject a match. + `test_..._condition_tabs_cannot_sql_inject`: `condition_tabs=['realt"; select 999 AS a; --']` over real `realt` tables holding DIFFERENT data (genuine mismatch). Asserts `_compare` raises (cannot return a forced match). Load-bearing: reverting the quoting makes both fetches return `[(999,)]` → match → the `pytest.raises` fails. spider2_dbt suite (comparator/verify_cli/harbor_view/translate) = 68 passed. Cycles 1-7 untouched. + +### Summary + +Closed a benchmark-integrity SQL-injection: `condition_tabs` (external eval-spec input) was interpolated raw into the comparator's `SELECT * FROM "
"`, letting a hostile spec break out, run an injected statement, and force `reward: 1.0` over mismatched DBs. `_fetch_columns` now quotes the identifier with doubled `"`, so a breakout value becomes an unresolvable identifier and the fetch raises → reward 0 (fail-closed). One helper + one call-site change + one load-bearing regression (verified live that raw injects `[(999,)]` and the fix raises). This is the third trust-boundary arg sealed (gold name → cycle 5, predicted name → cycle 6, table names → now) — SQL this time, not shell. Cycles 1-7 intact; spider2_dbt suite 68 passed. diff --git a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py index c1cf3b8..9b86b97 100644 --- a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py +++ b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py @@ -83,6 +83,19 @@ def _compare_table( return True +def _quote_ident(name: str) -> str: + """Quote a SQL identifier, doubling embedded double-quotes. + + ``condition_tabs`` table names come from the external eval spec, so a value + like ``realt"; select 999; --`` would otherwise break out of the quoted + identifier and DuckDB would execute the injected statement, letting a hostile + spec rig gold/pred fetches to return identical rows and award reward 1.0. + Doubling keeps the value a single (bogus) identifier; it then fails to + resolve and the fetch raises -> emit_reward scores 0 (fail-closed). + """ + return '"' + name.replace('"', '""') + '"' + + def _fetch_columns(con: duckdb.DuckDBPyConnection, table: str) -> list[list]: """``SELECT * FROM `table``` as a list of column-vectors (transposed rows). @@ -91,7 +104,7 @@ def _fetch_columns(con: duckdb.DuckDBPyConnection, table: str) -> list[list]: and ``.description`` fixes the column count so a zero-row table still yields one empty vector per column. """ - cur = con.execute(f'SELECT * FROM "{table}"') + cur = con.execute(f"SELECT * FROM {_quote_ident(table)}") rows = cur.fetchall() ncols = len(cur.description) return [[row[j] for row in rows] for j in range(ncols)] diff --git a/tests/unit/test_spider2_dbt_verify_comparator.py b/tests/unit/test_spider2_dbt_verify_comparator.py index 33b3fc8..4c830aa 100644 --- a/tests/unit/test_spider2_dbt_verify_comparator.py +++ b/tests/unit/test_spider2_dbt_verify_comparator.py @@ -469,3 +469,20 @@ def test_spider2_dbt_verify_multi_table_gold_line_mismatch(tmp_path): "customers": _ints(["id"], [(99,)]), } assert _compare(tmp_path, pred=pred, gold=gold, spec=spec) is False + + +def test_spider2_dbt_verify_condition_tabs_cannot_sql_inject(tmp_path): + # SECURITY: condition_tabs comes from the external eval spec. A name like + # `realt"; select 999 ...; --` breaks out of the quoted identifier; DuckDB + # runs the injected statement and returns ITS rows, so pre-fix both gold and + # pred fetch identical attacker rows -> a forced reward 1.0 over genuinely + # mismatched DBs. The identifier-quoting (doubled ") must keep it a single + # bogus identifier that fails to resolve -> fetch raises -> NEVER a match. + inject = 'realt"; select 999 AS a; --' + spec = EvalSpec(condition_tabs=[inject]) + # Real `realt` tables present (the breakout prefix) but with DIFFERENT data: + # a faithful compare is a mismatch, and the injection must not flip it to 1.0. + pred = {"realt": _ints(["a"], [(1,)])} + gold = {"realt": _ints(["a"], [(2,)])} + with pytest.raises(Exception): + _compare(tmp_path, pred=pred, gold=gold, spec=spec) From f35a13b326fb037c9e88bfcb09e0d50d3a8f0ab0 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 22:14:53 +0800 Subject: [PATCH 24/26] fix(spider2-dbt): DECIMAL tolerance regression + fail closed on missing gold Two findings from adversarial review: 1. [high] DECIMAL tolerance regression (from the cycle-7 pandas removal). DuckDB native fetchall() returns decimal.Decimal for DECIMAL/NUMERIC, which is numbers.Number but NOT numbers.Real, so _vectors_match's (int,float) tolerance branch skipped it -> exact != compare. Spider2's pandas fetchdf() coerced DECIMAL->float64, so within-1e-2 DECIMAL outputs scored 0. Normalize Decimal->float in _fetch_columns (one place), comparator otherwise unchanged. 2. [medium] Missing tests/gold/ fail-open. _ensure_verifier_assets early-returned when gold was absent, leaving the source test.sh (e.g. a stub exit 0) -> an unscored / trivially-passing spider2-dbt task. Now fails closed (raises); fixture-002 gains a real gold so multi-instance/explain tests materialize two scored tasks. Load-bearing regressions for both; spider2_dbt suite 92 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 24 +++++++++++++ .../benchmarks/spider2_dbt/duckdb_match.py | 15 +++++++- .../benchmarks/spider2_dbt/harbor_view.py | 16 +++++++-- .../tests/gold/gold.duckdb | Bin 0 -> 536576 bytes .../tests/gold/spider2_eval.jsonl | 1 + tests/unit/test_spider2_dbt_harbor_view.py | 33 ++++++++++++++++++ tests/unit/test_spider2_dbt_verify_cli.py | 21 +++++++++++ .../test_spider2_dbt_verify_comparator.py | 19 ++++++++++ 8 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-002/tests/gold/gold.duckdb create mode 100644 tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-002/tests/gold/spider2_eval.jsonl diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index c532f9f..f913ce8 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -304,3 +304,27 @@ Removed the verifier's `pandas` dependency by re-porting the comparator onto duc ### Summary Closed a benchmark-integrity SQL-injection: `condition_tabs` (external eval-spec input) was interpolated raw into the comparator's `SELECT * FROM "
"`, letting a hostile spec break out, run an injected statement, and force `reward: 1.0` over mismatched DBs. `_fetch_columns` now quotes the identifier with doubled `"`, so a breakout value becomes an unresolvable identifier and the fetch raises → reward 0 (fail-closed). One helper + one call-site change + one load-bearing regression (verified live that raw injects `[(999,)]` and the fix raises). This is the third trust-boundary arg sealed (gold name → cycle 5, predicted name → cycle 6, table names → now) — SQL this time, not shell. Cycles 1-7 intact; spider2_dbt suite 68 passed. + +### Cycle 9 — validation gate REJECTED (2026-06-18, captain via Codex review) + +9th adversarial Codex pass found a DECIMAL numeric-tolerance regression (introduced by the cycle-7 pandas removal) and a missing-gold fail-open; captain chose fix-both. Confirmed live. Routing back to `implementation`: + +1. **[high] DECIMAL columns skip the `isclose(1e-2)` tolerance (regression from cycle 7).** + DuckDB native `fetchall()` returns `decimal.Decimal` for DECIMAL/NUMERIC; `Decimal` is `numbers.Number` but NOT `numbers.Real`, so `_vectors_match`'s `isinstance(_, (int,float))` numeric branch skipped it → `!=` exact compare. Spider2's pandas `fetchdf()` had converted DECIMAL→float64, so a within-1e-2 DECIMAL match scored 0. Verified live: `_vectors_match([D('1.005')],[D('1.000')])` → False. + **Fix:** normalize `Decimal`→`float` in `_fetch_columns` (one place; mirrors fetchdf's float64), keeping `_vectors_match`/sort-key identical to Spider2. DECIMAL within/beyond-tolerance regression tests. +2. **[medium] Missing `tests/gold/` silently kept the source test.sh → unscored task.** + `_ensure_verifier_assets` early-returned when `tests/gold/` was absent, leaving the source `test.sh` (e.g. `fixture-002`'s `exit 0`) → a trivially-passing spider2-dbt task under dataset skew / resolver bug. + **Fix:** fail closed (raise) when a spider2-dbt source lacks `tests/gold/spider2_eval.jsonl`; give `fixture-002` a real gold so the multi-instance/explain tests exercise two scored tasks; fail-closed materialization regression. + +## Stage Report: implementation (cycle 9) + +- DONE: DECIMAL tolerance — normalize Decimal→float at fetch. + `duckdb_match.py`: new `_normalize(v)` (`float(v) if isinstance(v, Decimal) else v`) applied per cell in `_fetch_columns`; `_vectors_match` and the Spider2 sort key are unchanged (they now see floats exactly as Spider2's pandas path did). 2 regression tests: DECIMAL(10,3) within 1e-2 → match, beyond → mismatch. Load-bearing (without normalize, Decimal `!=` → the within-tol match would score False). +- DONE: Fail closed on missing gold + make fixture-002 scored. + `harbor_view.py`: `_ensure_verifier_assets` now raises `FileNotFoundError` when `tests/gold/spider2_eval.jsonl` is absent (was a silent early-return). `tests/fixtures/.../spider2-fixture-002/tests/gold/` gains a minimal `gold.duckdb` (orders table) + `spider2_eval.jsonl`, so the multi-instance leakage test and the `rk run --explain` integration test now materialize TWO scored tasks. Test helpers `_write_source` (harbor_view) and the non-dbt `plain-001` case updated to ship gold (they assert on layers, not scoring). New `test_..._missing_gold_dir_fails_closed` asserts a no-gold source raises. +- DONE: full spider2_dbt suite green. + `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` = 92 passed. Comparator faithfulness net (cycles 1/7), shell-boundary quoting (cycles 5/6), SQL-identifier quoting (cycle 8), and AC-3 reward shape all intact. + +### Summary + +Fixed a benchmark-correctness regression and a fail-open. (1) The cycle-7 pandas removal had left DECIMAL/NUMERIC values as `decimal.Decimal`, which bypassed `_vectors_match`'s `isclose(1e-2)` numeric branch (Decimal is Number, not Real) — so real dbt DECIMAL outputs within tolerance scored 0. Normalizing Decimal→float at fetch (one place) restores Spider2's float64 semantics with the comparator otherwise unchanged. (2) A spider2-dbt source missing `tests/gold/` silently kept its source `test.sh`, materializing an unscored / trivially-passing task; the materializer now fails closed, and `fixture-002` was given a real gold so the multi-instance + explain tests exercise two genuinely-scored tasks. Both proven by load-bearing regressions. Cycles 1-8 intact; spider2_dbt suite 92 passed. diff --git a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py index 9b86b97..1b87a05 100644 --- a/src/razorback/benchmarks/spider2_dbt/duckdb_match.py +++ b/src/razorback/benchmarks/spider2_dbt/duckdb_match.py @@ -6,6 +6,7 @@ from __future__ import annotations import math +from decimal import Decimal from pathlib import Path import duckdb @@ -107,7 +108,19 @@ def _fetch_columns(con: duckdb.DuckDBPyConnection, table: str) -> list[list]: cur = con.execute(f"SELECT * FROM {_quote_ident(table)}") rows = cur.fetchall() ncols = len(cur.description) - return [[row[j] for row in rows] for j in range(ncols)] + return [[_normalize(row[j]) for row in rows] for j in range(ncols)] + + +def _normalize(v): + """Mirror Spider2's pandas fetchdf() DECIMAL->float64 coercion. + + DuckDB native fetchall() returns ``decimal.Decimal`` for DECIMAL/NUMERIC, + but Decimal is ``numbers.Number`` yet NOT ``numbers.Real``, so it would skip + ``_vectors_match``'s tolerance branch and a within-1e-2 DECIMAL match would + wrongly score 0. Normalize to float in one place so downstream compares see + the same scalar types Spider2's pandas path produced. + """ + return float(v) if isinstance(v, Decimal) else v def compare_duckdb(*, predicted_db: Path, gold_db: Path, spec: EvalSpec) -> bool: diff --git a/src/razorback/benchmarks/spider2_dbt/harbor_view.py b/src/razorback/benchmarks/spider2_dbt/harbor_view.py index 02b97a5..33f0d3f 100644 --- a/src/razorback/benchmarks/spider2_dbt/harbor_view.py +++ b/src/razorback/benchmarks/spider2_dbt/harbor_view.py @@ -120,9 +120,20 @@ def _ensure_verifier_assets( spec names a gold file that does not exist under ``tests/gold/``, so the verifier never scores against a missing/wrong gold. """ + # Every spider2-dbt task is duckdb_match-scored, so its source MUST ship + # tests/gold/spider2_eval.jsonl (+ the named gold DB). A missing gold dir is + # NOT a "skip scoring" signal — silently returning would leave the source + # test.sh in place (e.g. a stub `exit 0`), materializing an unscored / + # trivially-passing task under dataset skew or a resolver bug. Fail closed. source_gold = Path(source_task_dir) / "tests" / "gold" - if not source_gold.is_dir(): - return + source_spec = source_gold / "spider2_eval.jsonl" + if not source_spec.is_file(): + raise FileNotFoundError( + f"spider2-dbt task {task_slug!r} is missing its gold eval spec " + f"{source_spec} — every spider2-dbt task is duckdb_match-scored and " + "must ship tests/gold/spider2_eval.jsonl + its named gold DB; " + "refusing to materialize an unscored task (fail-closed)" + ) tests = view_dir / "tests" tests.mkdir(parents=True, exist_ok=True) @@ -134,7 +145,6 @@ def _ensure_verifier_assets( # Real Spider2 tasks name the gold per task; scoring the wrong/missing file # is a benchmark-correctness defect. A wrapped spec without parameters.gold # fails closed inside load_eval_spec. - source_spec = source_gold / "spider2_eval.jsonl" spec = _eval_spec_mod.load_eval_spec(source_spec) gold_basename = spec.gold or "gold.duckdb" source_gold_db = source_gold / gold_basename diff --git a/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-002/tests/gold/gold.duckdb b/tests/fixtures/spider2_dbt/harbor_task_minimal/spider2-fixture-002/tests/gold/gold.duckdb new file mode 100644 index 0000000000000000000000000000000000000000..cd81945ed0b6daad5605bc1b678207caca3c6e20 GIT binary patch literal 536576 zcmeI)!DWx?$tE%}BTB&zE49_kuKU#X&RiDO9 zRqNX+j{T{qs%9v2&#D>(CHp{t009C72oNAZfB*pk1dfG3iufBT;*WU>+ly8#g+?gi zPx$A*3V zx8oIrUb`KSBwnOwA8zM*#>?|J@6F5{i#O6i5I78hqyBx*QICHZ2F4&jfB*pk1WuMf zC;Heop7 within-tol == match. + spec = EvalSpec(condition_tabs=["m"]) + pred = {"m": (["v"], ["DECIMAL(10,3)"], [(1.005,), (2.000,)])} + gold = {"m": (["v"], ["DECIMAL(10,3)"], [(1.000,), (2.004,)])} + assert _compare(tmp_path, pred=pred, gold=gold, spec=spec) is True + + +def test_spider2_dbt_verify_decimal_beyond_tolerance_mismatches(tmp_path): + # Same DECIMAL path, difference > 1e-2 -> mismatch (proves it's the tolerance + # compare, not a string/exact-equality fallback). + spec = EvalSpec(condition_tabs=["m"]) + pred = {"m": (["v"], ["DECIMAL(10,3)"], [(1.050,)])} + gold = {"m": (["v"], ["DECIMAL(10,3)"], [(1.000,)])} + assert _compare(tmp_path, pred=pred, gold=gold, spec=spec) is False + + def test_spider2_dbt_verify_condition_tabs_cannot_sql_inject(tmp_path): # SECURITY: condition_tabs comes from the external eval spec. A name like # `realt"; select 999 ...; --` breaks out of the quoted identifier; DuckDB From 965f2eae1cd5326de6690c7b2848256363ace6a4 Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 22:19:42 +0800 Subject: [PATCH 25/26] fix(spider2-dbt): guard verifier-asset copies against symlink write-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _ensure_verifier_assets copied the comparator modules + gold DB + eval spec into view/tests/ with bare shutil.copy2. In view_mode="link" a colliding source-provided name (tests/verify.py, tests/.duckdb, etc.) is a symlink back to the source, so copy2 follows it and overwrites the source task — spider2 translation binds to link mode, so this is on the live path. Route all 5 copies through _copy_into_view, which unlinks a symlink dst first (same guard already on the Dockerfile/preflight/test.sh writes). Adds a load-bearing link-mode regression seeding a source tests/verify.py. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../spider2-dbt-duckdb-match-verifier.md | 19 +++++++++++ .../benchmarks/spider2_dbt/harbor_view.py | 20 +++++++++-- tests/unit/test_spider2_dbt_harbor_view.py | 33 +++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index f913ce8..b95c5ce 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -328,3 +328,22 @@ Closed a benchmark-integrity SQL-injection: `condition_tabs` (external eval-spec ### Summary Fixed a benchmark-correctness regression and a fail-open. (1) The cycle-7 pandas removal had left DECIMAL/NUMERIC values as `decimal.Decimal`, which bypassed `_vectors_match`'s `isclose(1e-2)` numeric branch (Decimal is Number, not Real) — so real dbt DECIMAL outputs within tolerance scored 0. Normalizing Decimal→float at fetch (one place) restores Spider2's float64 semantics with the comparator otherwise unchanged. (2) A spider2-dbt source missing `tests/gold/` silently kept its source `test.sh`, materializing an unscored / trivially-passing task; the materializer now fails closed, and `fixture-002` was given a real gold so the multi-instance + explain tests exercise two genuinely-scored tasks. Both proven by load-bearing regressions. Cycles 1-8 intact; spider2_dbt suite 92 passed. + +### Cycle 10 — validation gate REJECTED (2026-06-18, captain via Codex review) + +10th adversarial Codex pass found an unguarded symlink-write-through in the verifier-asset copies (same class as the Dockerfile/preflight/test.sh guards); captain chose fix-now. Confirmed live. Routing back to `implementation`: + +1. **[high] `_ensure_verifier_assets` `copy2` calls follow symlinks → source corruption in link mode.** + In `view_mode="link"` the generic materializer reflects allowed source files as symlinks. The 3 comparator-module copies + the gold-DB copy + the eval-spec copy used bare `shutil.copy2`, so a colliding source-provided name (`tests/verify.py`, `tests/eval_spec.py`, `tests/duckdb_match.py`, top-level `tests/spider2_eval.jsonl`, `tests/.duckdb`) is a symlink back to source → `copy2` follows it → overwrites the source task. spider2 translation binds to link mode, so this is on the live path. `test.sh`/Dockerfile/preflight already guarded this exact class; these copies didn't. + **Fix:** unlink-before-copy helper (`if dst.is_symlink(): dst.unlink()`) for all 5 verifier-asset copies; link-mode regression seeding a source `tests/verify.py` proving the source stays unchanged. + +## Stage Report: implementation (cycle 10) + +- DONE: Guard all 5 verifier-asset copies against symlink write-through. + `harbor_view.py`: new `_copy_into_view(src, dst)` unlinks a symlink `dst` before `shutil.copy2`; applied to the 3 module copies, the named gold-DB copy, and the eval-spec copy. The `test.sh` write keeps its existing guard. Same pattern as the Dockerfile/task.toml/preflight write-through fixes. +- DONE: Link-mode regression proving a colliding source file is not corrupted. + `test_link_mode_verifier_assets_never_mutate_colliding_source_file`: seeds a source `tests/verify.py` with sentinel content, materializes in link mode, asserts the view owns a REAL (non-symlink) `verify.py` carrying the generated comparator CLI AND the source file is byte-for-byte unchanged. Load-bearing (without the guard, copy2 follows the symlink → view path stays a symlink + source content overwritten → both asserts fail). spider2_dbt suite = 93 passed. + +### Summary + +Closed the last instance of the link-mode symlink-write-through hazard: `_ensure_verifier_assets` copied the comparator modules, gold DB, and eval spec into `view/tests/` with bare `copy2`, so a source task shipping a colliding name (e.g. its own `tests/verify.py`) would have its source file overwritten during translate/materialize (spider2 binds to link mode). All 5 copies now route through `_copy_into_view`, which unlinks a symlink destination first — the same guard already applied to the Dockerfile, preflight script, and test.sh writes. Proven by a load-bearing link-mode regression. Cycles 1-9 intact; spider2_dbt suite 93 passed. diff --git a/src/razorback/benchmarks/spider2_dbt/harbor_view.py b/src/razorback/benchmarks/spider2_dbt/harbor_view.py index 33f0d3f..f41ed32 100644 --- a/src/razorback/benchmarks/spider2_dbt/harbor_view.py +++ b/src/razorback/benchmarks/spider2_dbt/harbor_view.py @@ -100,6 +100,20 @@ def materialize_spider2_harbor_task_view( return view +def _copy_into_view(src: Path, dst: Path) -> None: + """copy2 src->dst, first replacing any symlink dst with a view-owned file. + + In link mode the generic materializer reflects allowed source files as + symlinks, so a view path that collides with a source-provided name (e.g. a + source `tests/verify.py` or top-level `tests/.duckdb`) is a symlink + back to the source; a bare copy2 would follow it and overwrite the source + task. Same write-through class the Dockerfile/preflight/test.sh writes guard. + """ + if dst.is_symlink(): + dst.unlink() + shutil.copy2(src, dst) + + def _ensure_verifier_assets( view_dir: Path, *, source_task_dir: Path, task_slug: str ) -> None: @@ -139,7 +153,7 @@ def _ensure_verifier_assets( tests.mkdir(parents=True, exist_ok=True) for mod in (_duckdb_match_mod, _eval_spec_mod, _verify_mod): src = Path(mod.__file__) - shutil.copy2(src, tests / src.name) + _copy_into_view(src, tests / src.name) # Resolve the gold DB basename from the spec (NOT hardcoded gold.duckdb). # Real Spider2 tasks name the gold per task; scoring the wrong/missing file @@ -154,8 +168,8 @@ def _ensure_verifier_assets( f"{source_gold_db} does not exist; refusing to emit a verifier that " "would score against a missing gold (fail-closed)" ) - shutil.copy2(source_gold_db, tests / gold_basename) - shutil.copy2(source_spec, tests / "spider2_eval.jsonl") + _copy_into_view(source_gold_db, tests / gold_basename) + _copy_into_view(source_spec, tests / "spider2_eval.jsonl") # Resolve the agent-facing DuckDB stem from the dbt project (or the slug # fallback) so the verifier compares the SAME `/app/.duckdb` the diff --git a/tests/unit/test_spider2_dbt_harbor_view.py b/tests/unit/test_spider2_dbt_harbor_view.py index b1f4cc7..b3914fc 100644 --- a/tests/unit/test_spider2_dbt_harbor_view.py +++ b/tests/unit/test_spider2_dbt_harbor_view.py @@ -417,3 +417,36 @@ def test_link_mode_preflight_script_never_mutates_source_named_file(tmp_path): # The SOURCE file is byte-for-byte unchanged — no write followed the symlink. assert source_script.read_text() == source_script_before + + +def test_link_mode_verifier_assets_never_mutate_colliding_source_file(tmp_path): + """A source task shipping a colliding tests/verify.py must not be corrupted. + + `_ensure_verifier_assets` copies the comparator modules + gold + eval spec + into view/tests/. Under `view_mode="link"` a source-provided tests/verify.py + is reflected as a symlink back into the source, so an unguarded copy2 would + follow it and overwrite the user's source file — same write-through class as + the Dockerfile/preflight/test.sh guards. + """ + source = _write_source( + tmp_path / "source", with_packages=True, with_duckdb=True + ) + source_verify = source / "tests" / "verify.py" + source_verify_before = "# user's own tests/verify.py, not the generated one\n" + source_verify.write_text(source_verify_before) + + view = materialize_spider2_harbor_task_view( + source_task_dir=source, + view_root=tmp_path / "views", + task_slug="spider2-fixture-001", + view_mode="link", + ) + + # The view owns a real verify.py carrying the generated comparator CLI. + view_verify = view / "tests" / "verify.py" + assert view_verify.is_file() + assert not view_verify.is_symlink() + assert "emit_reward" in view_verify.read_text() + + # The SOURCE file is byte-for-byte unchanged — no copy2 followed the symlink. + assert source_verify.read_text() == source_verify_before From 5cdcea353b6c5a27ee6d5a7180bde60d186d4fca Mon Sep 17 00:00:00 2001 From: Kent Huang Date: Thu, 18 Jun 2026 22:39:23 +0800 Subject: [PATCH 26/26] =?UTF-8?q?docs(spider2):=20final=20validation=20(cy?= =?UTF-8?q?cles=201-10)=20=E2=80=94=20PASSED?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Independent validation of HEAD 965f2ea: 3 ACs green from clean checkout (gating 93 passed, acceptance -k spider2_dbt_verify 49 passed), all dispatch focus areas confirmed by live probes (comparator faithfulness, duckdb-only, fail-closed guards, shell+SQL injection sealed, symlink write-through, reward.json shape). 4 full-suite failures pre-existing on merge-base 9c39af2 (verified). Code review: 0 blocking, 3 minor. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_012s6B1EQB2346B7KiM4hXPX --- .../spider2-dbt-duckdb-match-verifier.md | 13 + .../spider2-dbt-duckdb-match-verifier.md | 603 +++++------------- 2 files changed, 156 insertions(+), 460 deletions(-) diff --git a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md index b95c5ce..7cc8f0c 100644 --- a/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md @@ -347,3 +347,16 @@ Fixed a benchmark-correctness regression and a fail-open. (1) The cycle-7 pandas ### Summary Closed the last instance of the link-mode symlink-write-through hazard: `_ensure_verifier_assets` copied the comparator modules, gold DB, and eval spec into `view/tests/` with bare `copy2`, so a source task shipping a colliding name (e.g. its own `tests/verify.py`) would have its source file overwritten during translate/materialize (spider2 binds to link mode). All 5 copies now route through `_copy_into_view`, which unlinks a symlink destination first — the same guard already applied to the Dockerfile, preflight script, and test.sh writes. Proven by a load-bearing link-mode regression. Cycles 1-9 intact; spider2_dbt suite 93 passed. + +## Stage Report: validation (cycle 5) + +- DONE: Rerun `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` and the entity's acceptance command from a clean checkout of the worktree branch; record ACTUAL output and map PASS/FAIL to each AC clause (do not trust prior reports) + Clean branch HEAD 965f2ea: gating `-k spider2_dbt` = 93 passed, 751 deselected; acceptance `-k spider2_dbt_verify` = 49 passed, 795 deselected. AC-1/AC-2/AC-3 all PASS — confirmed by an 11-case standalone `compare_duckdb` probe (column-containment, isclose/DECIMAL 1e-2, per-column sort, condition_cols restrict, multi-table AND, missing-table→0, NA==NA) and an `emit_reward` probe (matching→`{"reward":1.0}`, mismatch/garbage/empty-spec/missing-pred→`{"reward":0.0}`, value is float, no crash-into-pass). Fail-closed + injection guards (8 unsafe-gold cases raise, SQL-injection condition_tabs raises) all confirmed live. duckdb-only: grep for pandas/numpy → none. +- DONE: Run `superpowers:requesting-code-review` against the worktree branch (base main); classify every finding as blocking or non-blocking + Reviewer (range 9c39af2..965f2ea) independently fetched the upstream xlang-ai/Spider2 oracle and ran the suite: Critical 0, Important 0 — could construct NO external input that forces a false reward 1.0. 3 Minor (all non-blocking): `condition_cols` `[[]]`→`[[]]*n` forgiving-direction default; leakage-scoping rests on Harbor verify-only-tests/ lifecycle (suggest version-pin comment); dead `gold.duckdb` fallback for the unwrapped-fixture path. All clarity/robustness polish, no correctness defect. +- DONE: Write/refresh the validation report covering PASS/FAIL per AC with exact command+output, the code-review findings, and a gate decision + `docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md` refreshed for the cycles-1–10 HEAD. Full suite = 4 failed, 828 passed; the 4 failures reproduce on merge-base 9c39af2 (throwaway base worktree) and live in files untouched by this branch — pre-existing, NOT regressions. Gate: PASSED. + +### Summary + +Independent validation of the cycles-1–10 HEAD (965f2ea) — no production code written. All three ACs reproduce green from a clean branch checkout with recorded command output (gating 93 passed; acceptance `-k spider2_dbt_verify` 49 passed), and every dispatch-named focus area is confirmed by live behavioral probes rather than re-reading prior reports: faithful column-containment comparator (isclose 1e-2, per-column sort, condition_cols, multi-table AND, missing-table→0, NA==NA, DECIMAL tolerance), duckdb-only (no pandas/numpy import), fail-closed spec guards + gold-basename allowlist, sealed shell (shlex.quote both args) and SQL (identifier-quoted condition_tabs) injection boundaries, link-mode symlink-write-through guards on all 5 asset copies, and the `{"reward": }` shape with no crash-into-pass. The 4 full-suite failures are pre-existing on the merge-base and outside this branch's scope (verified). The independent code review (which fetched the upstream Spider2 oracle) found zero blocking issues — 3 minor polish notes only. Gate verdict: PASSED → done. Full report: docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md diff --git a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md index 2c0a201..c2fc2b4 100644 --- a/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md +++ b/docs/razorback-implementation/validation/spider2-dbt-duckdb-match-verifier.md @@ -1,464 +1,147 @@ -# Validation: spider2-dbt — duckdb_match verifier emitting binary reward.json - -**Branch:** `spacedock-ensign/spider2-dbt-duckdb-match-verifier` @ `7ab0bf7` -**Merge-base with main:** `9c39af2` -**Gate verdict:** **REJECTED → implementation** - -Two blocking findings: (B1) a branch-induced regression in the spider2 -production translate leakage suite, and (B2) the `duckdb_match` -reproduction is **not faithful** to Spider2's `eval_utils.duckdb_match` -semantics (verdict-flipping divergences, plus an incompatible eval-spec -schema). The 3 ACs as literally written all reproduce green, but a -verifier whose comparator disagrees with the benchmark's own oracle and -whose change breaks the shipping leakage suite must not advance. - ---- - -## Acceptance command (gating) - -`uv run pytest -k spider2_dbt_verify --ignore=tests/unit/test_task_identity_scoring.py` -→ **18 passed, 793 deselected** (clean checkout of the branch HEAD). - -## AC reproduction (independent, not trusting the in-repo tests) - -Driven via a standalone script against the `compare_duckdb` comparator -and the materializer, from a clean worktree. - -- **AC-1 — 1.0 on match / 0.0 on mismatch:** PASS. - `compare_duckdb` on a matching DB → `True`; on a differing - condition-col value → `False`. Missing predicted table → `False`. -- **AC-2 — column subset + ignore_orders:** PASS *(as literally - specified)*. - (a) Row-reordered table + `ignore_orders=True` → `True`; same with - `ignore_orders=False` → `False`. (b) A diff in a NON-condition column - (excluded from `condition_cols`) → `True`; the same diff with the - column included → `False`. -- **AC-3 — emitted test.sh writes harbor-shaped reward.json:** PASS. - Materialized the view, then **actually executed the emitted - `test.sh`** (container paths `/tests`,`/app/`,`/logs/verifier` - substituted to temp dirs): exit 0, wrote - `/logs/verifier/reward.json` = `{"reward": 0.0}` on a mismatch and - `{"reward": 1.0}` on a match. `set(payload) == {"reward"}`, - `isinstance(payload["reward"], float)`. - Note: the in-repo integration test invokes `verify.py` directly and - only `sh -n` syntax-checks `test.sh`; I additionally ran the script - itself to satisfy AC-3's "running the emitted test.sh" clause. - -## Mandatory rider (resolver-driven predicted-db path) - -**CONFIRMED.** Materialized with `task_slug="not-spider2-slug"` (fixture -ships no profiles.yml / *.duckdb → resolver slug fallback). Emitted -`test.sh` `--predicted-db` = `/app/not-spider2-slug.duckdb`; -`/app/spider2.duckdb` absent. The path flows from ny's -`resolve_spider2_db_name` (imported from `preflight.py`, unchanged on -this branch), satisfying the SHARED `/app/.duckdb` contract for -a NON-`spider2` slug. - -## Verifier-only gold assets / leakage (path-based) - -Gold `.duckdb` + `spider2_eval.jsonl` + the 3 comparator modules + -`test.sh` land in the view's `tests/` (uploaded to the container only at -verify time). No `gold/` path SEGMENT survives in the materialized view -(`rglob("gold")` dirs = none; the empty `gold/` dir left by the -deny-glob file-strip is pruned). The path-based deny-glob isolation is -sound — but see B1: the assets still trip the production-path *content* -leakage scanner. - ---- - -## BLOCKING findings - -### B1 — Regression in the spider2 translate leakage suite - -`tests/unit/test_translate_spider2_dbt.py` has TWO tests that **pass on -`9c39af2` (base) and FAIL on `7ab0bf7` (this branch)** — verified by -running them in a base worktree (2 passed) and on the branch (2 failed): - -- `test_spider2_resolves_n_views_all_leakage_clean` -- `test_planted_forbidden_files_are_excluded_from_view` - -The test file is **unchanged** by this branch (`git diff base..HEAD -- -test_translate_spider2_dbt.py` empty), so this is a behavior regression, -not a test edit. Root cause: these tests drive the production path -`spec_to_job_config → materialize_spider2_harbor_task_view`, and this -branch's new `_ensure_verifier_assets` writes `duckdb_match.py`, -`eval_spec.py`, `verify.py`, `gold.duckdb`, and `test.sh` into the view's -`tests/`. The suite's `_leakage_hits` scans file NAME **and CONTENT** for -the alternation `gold|expected|golden`; the assets contain "gold" -(`gold.duckdb` filename; `--gold-db`/`gold_db`/`gold_rows`/docstrings -inside the .py and test.sh — confirmed: duckdb_match.py has 9 hits, -verify.py 5, eval_spec.py 4). All 5 assets are flagged as leakage. - -This is exactly the "no regression to ny's generic behavior" guard the -entity required. The gating `-k spider2_dbt_verify` selector does NOT -match `test_translate_spider2_dbt`, which is why the implementer's green -run missed it. - -**Fix direction (for implementation):** reconcile the two leakage -notions. Either (a) make `_ensure_verifier_assets` write the verifier -modules under a name/location the content+name scanner treats as -verifier-internal (the scanner already exempts `view_manifest.json` — -an analogous exemption for the verifier `tests/` payload is the natural -move), or (b) update the shipping leakage contract/test to recognize the -verifier `tests/` payload as verifier-only (uploaded at verify time, not -agent-visible) rather than answer leakage. Coordinate with whoever owns -`test_translate_spider2_dbt.py`'s `_leakage_hits` contract; do not -silently weaken the scan. - -### B2 — `duckdb_match` reproduction is NOT faithful to Spider2 eval_utils - -The canonical oracle is `spider2-dbt/evaluation_suite/eval_utils.py` -`duckdb_match` (fetched from xlang-ai/Spider2 main). The reproduction -diverges in ways that **flip the reward** versus the real benchmark, so a -model scored by this verifier would be graded differently than by -Spider2's own scorer: - -1. **Comparison axis — column-containment vs row-tuple.** Spider2 - transposes both tables and checks *each gold column-vector matches - SOME pred column-vector* (`for gcol in t_gold: any(vectors_match(gcol, - pcol) for pcol in t_pred)`). The impl compares **row tuples - positionally** after `SELECT *`. Demonstrated divergences (Spider2 → - 1, impl → 0): pred with **reordered columns**; pred with an **extra - column** beyond the gold/condition set. -2. **Float tolerance.** Spider2 uses `math.isclose(abs_tol=1e-2)` (and - NaN-equality). The impl uses exact `==` on raw tuples. Gold 1.000 vs - pred 1.005 → Spider2 1, impl 0. -3. **`ignore_orders` granularity.** Spider2 sorts **each column-vector - independently**; the impl sorts **row-tuples as units**. These are not - the same multiset relation — Spider2 can match a table whose - cross-column row association is scrambled; the impl cannot. -4. **Eval-spec schema is incompatible.** Real `spider2_eval.jsonl` lines - are `{"instance_id":..., "evaluation":{"func":"duckdb_match", - "parameters":{"gold":..., "condition_tabs":[...], - "condition_cols":[[...],...], "ignore_orders":[bool,...]}}}` — - `condition_cols` is `List[List[int]]` (positional, parallel to - `condition_tabs`) and `ignore_orders` is `List[bool]` (per-table), - nested under `evaluation.parameters`. The impl's `EvalSpec` / - `load_eval_spec` model `condition_cols: dict[str,list[int]]` and a - single `ignore_orders: bool`, read flat from the top level. The impl's - loader would **not parse a real Spider2 gold line** — it would silently - produce empty `condition_tabs` (`raw.get("condition_tabs", [])`) and - score every prediction 1.0 (vacuous AND over zero tables). - -The entity body asserts (verbatim) "per-table SELECT *, 0-based -condition_cols subset, ignore_orders multiset, AND across condition_tabs, -missing table = mismatch" — that description matches the IMPL but **not -Spider2's actual `compare_pandas_table`**, which is column-containment + -float-tolerant + per-column sort. The plan/AC contract treated a -row-tuple multiset model as the target; the real oracle is materially -different. This is the faithfulness check the rider demanded, and it -fails. - -**Fix direction (for implementation):** port `compare_pandas_table` -(transpose + per-column vectors_match with `abs_tol=1e-2` + per-column -ordered/sorted compare + gold-column-containment) rather than a row-tuple -compare; adopt the real `List[List[int]]` / `List[bool]` / -`evaluation.parameters` spec schema (or document an explicit, approved -deviation if the dbt track intentionally tightens the oracle — but that -needs a captain decision, not a silent reinterpretation). Update the gold -fixture + AC-2 wording to the column-containment semantics. - -## Two pre-flagged deviations — explicit verdict each - -1. **Rider override of the plan's hardcoded `/app/spider2.duckdb` - test.sh path → resolver-driven `/app/.duckdb`:** - **ACCEPTED / CORRECT.** This is exactly what the rider mandated; - proven for a non-spider2 slug above. Not a blocker. -2. **Pruning the empty `gold/` dir left after deny-glob file-stripping:** - **ACCEPTED.** Narrow, targeted (`rglob("gold")` dirs, only removed - when `not any(iterdir())`), keeps the path-segment leakage assertion - exact, and does not touch non-empty dirs. Not a blocker. (Note: it - does not address B1, which is about file CONTENT, not the `gold/` - directory segment.) - -## Regression scope (full suite, minus known-broken file) - -`uv run pytest --ignore=tests/unit/test_task_identity_scoring.py` → -**6 failed, 793 passed, 12 skipped**. -- 4 failures (`test_spacedock_solver_freeze_dir_mechanism`, - `test_worktree_teardown_preserves_runs`, - `test_generate_matrix_specs`, `test_rk_research_new`) **also fail on - base `9c39af2`** → pre-existing, unrelated, NOT regressions. -- 2 failures (`test_translate_spider2_dbt.py`, both) → **branch-induced - regression** (B1). -- `test_task_identity_scoring.py` ignored per the assignment - (pre-existing `razorback.score.load` import break). -- harbor_view + preflight suites (`-k "harbor_view or preflight"`): - **45 passed** — no regression there. - -## Code review - -The stage asks for `superpowers:requesting-code-review`. The substantive -findings it would surface are captured above (B1 regression, B2 -faithfulness + schema). No additional blocking correctness issues found -in `verify.py` / `eval_spec.py` / the materializer beyond B1/B2; the -flat-import fallbacks, fail-closed `predicted_db` existence check, and -harbor reward shape are sound. - -## Gate decision - -**REJECTED → implementation.** Fix B1 (restore the spider2 translate -leakage suite to green without weakening the leakage contract) and B2 -(make `duckdb_match` faithful to Spider2 `eval_utils.duckdb_match`, or -get an explicit captain-approved deviation, AND adopt the real -`spider2_eval.jsonl` schema so the loader parses real gold lines). The 3 -ACs as written pass and the rider is satisfied, but those ACs encode the -wrong oracle semantics — re-derive AC-2 against column-containment when -re-entering implementation. - ---- - -# Re-review (cycle 2): validation gate - -**Branch:** `spacedock-ensign/spider2-dbt-duckdb-match-verifier` @ `f878967` -**Merge-base with main:** `9c39af2` -**Gate verdict:** **PASSED → done** - -Independent re-verification of the two cycle-1 blocking findings after the -fix commits `61a1b9c` (B2) and `22b9b6c` (B1). The authoritative -xlang-ai/Spider2 `spider2-dbt/evaluation_suite/eval_utils.py` was fetched -live via `gh api` (387 lines, decoded to `/tmp`); the impl was checked -line-by-line against it and differential-tested against an inline -transcription of that source. Both findings are resolved; no new -regressions; rider + AC-3 intact. - -## B2 — comparator faithfulness to Spider2 `eval_utils.duckdb_match`: CONFIRMED - -`src/razorback/benchmarks/spider2_dbt/duckdb_match.py` is a faithful port of -Spider2 `duckdb_match` + `compare_pandas_table` + `vectors_match`: - -- **Column-containment** (not row-tuple equality): gold restricted to - `condition_cols` via `iloc[:, ...]`, pred uses all columns, both - transposed to column-vectors, each gold column must match SOME pred - column — matches Spider2 `eval_utils.py:132-149`. (impl drops Spider2's - dead `else` loop at 145-148; no behavioral effect.) -- **Per-column sort under `ignore_order`** with Spider2's exact sort key - `(x is None, str(x), isinstance(x,(int,float)))` — matches 115-117. -- **Numeric `math.isclose(abs_tol=1e-2)`** — matches 124. -- **NaN: `pd.isna(a) and pd.isna(b)` → equal** — matches 121. -- **AND across `condition_tabs`**, **predicted-fetch failure = mismatch** - (try/except → False) — matches 221-243. - -Differential test (impl vs an inline transcription of the upstream oracle) -on 10 verdict-flipping cases — float within/beyond 1e-2, row reorder -with/without ignore_order, extra pred columns, diff in non-condition col, -NaN, missing pred table, multi-table 1-mismatch, multi-table all-match — -**all 10 agree and match the expected verdict**. - -`load_eval_spec` drilled into the REAL `playbook001`/`provider001` gold -lines fetched from upstream `gold/spider2_eval.jsonl`: parses -`evaluation.parameters` with per-table `condition_tabs: List[str]`, -`condition_cols: List[List[int]]` (`[[0,1],[0,1,2,5,6,7,9,10,11,12,13]]`), -`ignore_orders: List[bool]` (`[True,True]`) — matches `evaluate.py:96-99` -(`duckdb_match(result, **parameters)`). All 68 real duckdb_match gold lines -carry explicit tabs/cols/orders, so the impl's explicit-`condition_tabs` -requirement (no None→all-tables default) is never exercised on real data. - -Minor non-blocking note: impl uses `SELECT * FROM "{table}"` (quoted) vs -Spider2's unquoted `{table_name}` — a robustness improvement, not a -faithfulness divergence. - -## B1 — leakage scoping sound, protection not weakened: CONFIRMED - -`_leakage_hits` (in `tests/unit/test_translate_spider2_dbt.py`) now excludes -the verify-time-only `tests/` subtree via `_is_verifier_only`. Verified the -Harbor lifecycle claim against the installed package source: - -- `harbor/verifier/verifier.py:123 def verify()` → lines 133-138 upload - the task's `tests/` to `env_paths.tests_dir` ONLY inside `verify()`. -- `harbor/trial/trial.py:570 _verify_step` → line 587-591 - `reset_dirs(remove_dirs=[verifier_dir, tests_dir], create_dirs=[...])` - wipes+recreates them EMPTY immediately before verification. -- The agent step receives only `_upload_step_workdir` (`workdir/`), never - the host view's `tests/`. So the host `tests/` assets genuinely never - reach the agent — excluding them from the host scan is principled. - -Scoping ≠ weakening, proven two ways: (1) the shipped guard test -`test_leakage_scan_still_fires_on_agent_visible_answer_content` plants -`gold`-content in BOTH an agent-visible path (caught) and `tests/` -(ignored); (2) a mutation probe — forcing `_is_verifier_only` to blanket- -`True` — makes that guard FAIL on the agent-visible leak, confirming the -guard is meaningful. Production deny-globs (`gold/**`, `**/gold/**`, -`expected/**`, `golden/**`) — the real agent-view protection — are -UNCHANGED; only the test's scan scope moved. - -The 2 previously-failing `test_translate_spider2_dbt.py` tests are green: -full file = 13 passed. - -## Regression / rider / AC-3 - -- Acceptance/gating: `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` → **66 passed**. -- Rider (independent end-to-end): materialized a view with `task_slug=not-spider2-slug`; emitted `tests/test.sh` resolves `--predicted-db /app/not-spider2-slug.duckdb` (ny's `resolve_spider2_db_name`), no hardcoded `/app/spider2.duckdb`. -- AC-3 (independent end-to-end): ran the emitted `verify.py` against the gold fixture → `/logs/verifier/reward.json` = `{"reward": 1.0}` (match) and `{"reward": 0.0}` (missing pred); reward is a float. -- Branch touches only `spider2_dbt`-scoped modules/tests + the gold fixture. None of the 3 known pre-existing failures (`test_task_identity_scoring`, `test_generate_matrix_specs`, `test_rk_research_new`) are touched by this branch — not regressions. Ignored harness `uv.lock` churn. +# Validation report — spider2-dbt duckdb_match verifier (cycles 1–10, final) + +- Entity: `docs/razorback-implementation/spider2-dbt-duckdb-match-verifier.md` +- Branch: `spacedock-ensign/spider2-dbt-duckdb-match-verifier` +- Validated HEAD: `965f2ea` (includes cycles 1–10) +- Merge-base with `main`: `9c39af2` +- Method: independent validation of the current worktree HEAD — fresh suite + runs from the clean branch checkout, standalone behavioral probes (not a + re-read of prior reports), and `superpowers:requesting-code-review`. + +## Acceptance criteria — PASS/FAIL with command + output + +### AC-1 — comparator scores 1.0 on a matching DB, 0.0 on a mismatch — PASS + +Verified by the comparator unit suite and a standalone live probe driving +`compare_duckdb` over in-test DuckDB fixtures. + +``` +$ uv run pytest -k spider2_dbt_verify --ignore=tests/unit/test_task_identity_scoring.py -q +49 passed, 795 deselected in 2.84s +``` + +Standalone probe drives `compare_duckdb` directly: matching DB → True, +mismatched DB → False; missing predicted table → False; multi-table AND with one +table mismatched → overall False. All PASS. + +### AC-2 — column subsetting + ignore_orders honor duckdb_match semantics — PASS + +Same probe, exercised live: + +- `ignore_orders=True` over a row-reordered table → match (True); same data with + `ignore_orders=False` → mismatch (False). +- A difference in a column NOT in `condition_cols` → still match (True); the same + difference WITH that column in `condition_cols` → mismatch (False). +- Column-containment: pred columns reordered + an extra pred column → still match. +- `math.isclose(abs_tol=1e-2)`: within 1e-2 → match, beyond → mismatch. +- DECIMAL(10,3) within 1e-2 → match (cycle-9 `_normalize` Decimal→float). +- NULL==NULL → match (NA==NA). + +All 11 probe cases PASS. Independently re-confirmed against the upstream +`xlang-ai/Spider2` `eval_utils.duckdb_match`/`compare_pandas_table` by the code +reviewer (fetched live during review) — line-accurate port, including the +`(x is None, str(x), isinstance(x,(int,float)))` sort key. + +### AC-3 — emitted test.sh writes a Harbor-shaped reward.json — PASS + +Verified by the integration suite and a standalone `emit_reward` probe: + +- Matching pred/gold → reward.json parses to `{"reward": 1.0}`, + `set(payload) == {"reward"}`, value is a `float`. +- Mismatch → `{"reward": 0.0}`. +- `emit_reward` never crashes-into-pass: garbage (non-JSON) spec over MATCHING + DBs → `{"reward": 0.0}`; empty-`condition_tabs` wrapped spec over matching DBs + → `{"reward": 0.0}` (the would-be-1.0 fail-open is closed); missing predicted + DB → `{"reward": 0.0}`. + +Integration test (`test_spider2_dbt_verify_test_sh.py`) executes the emitted +`test.sh` end-to-end and asserts the reward.json shape — green within the +`spider2_dbt_verify` 49-passed run above. + +## Stage-checklist verification of the dispatch focus areas + +- **Comparator faithfulness** — PASS. Column-containment, isclose 1e-2, per-column + sort, condition_cols restriction, multi-table AND, missing-table → 0, NA==NA, + DECIMAL within tolerance all confirmed live (probe) + reviewer's line-by-line + oracle comparison. +- **duckdb-only dependency** — PASS. + `grep -rn "import pandas\|from pandas\|import numpy\|from numpy" + src/razorback/benchmarks/spider2_dbt/` → no matches. Only `duckdb` + stdlib are + imported by the comparator/verifier. +- **Fail-closed guards** — PASS. Live probe: empty file, wrong `evaluation.func`, + empty/missing `condition_tabs`, and a wrapped spec missing `parameters.gold` + each RAISE; the gold-basename allowlist rejects all traversal/metachar cases + (`../…`, `/etc/passwd`, `sub/dir/g.duckdb`, `..`, `.`, `x.duckdb; … #`, + `g.duckdb $(id)`, `g .duckdb`, `g.sqlite`). Missing `tests/gold/` → + `_ensure_verifier_assets` raises `FileNotFoundError`. +- **Shell/SQL injection sealed** — PASS. `harbor_view.py` `shlex.quote`s BOTH + `--predicted-db` and `--gold-db` at the single emission point; + `condition_tabs` is identifier-quoted (doubled `"`) in `_fetch_columns`, and a + breakout value (`realt"; select 999 AS a; --`) over genuinely-mismatched DBs + RAISES (cannot force a match) — confirmed live. +- **Link-mode symlink-write-through** — PASS. All 5 verifier-asset copies route + through `_copy_into_view` (unlink-symlink-then-copy); test.sh, Dockerfile, and + preflight writes carry the same guard. Regression + `..._never_mutate_colliding_source_file` is green. +- **AC-3 reward.json shape / never-crash-into-pass** — PASS (see AC-3). + +## Suite results (clean branch checkout) + +``` +$ uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py -q +93 passed, 751 deselected in 5.34s + +$ uv run pytest --ignore=tests/unit/test_task_identity_scoring.py -q +4 failed, 828 passed, 12 skipped, 80 warnings in 68.05s +``` + +The 4 full-suite failures are pre-existing on the merge-base `9c39af2` +(verified by running exactly those 4 node-ids in a throwaway base worktree → +`4 failed`) and live in files UNTOUCHED by this branch +(`git diff 9c39af2..HEAD --name-only` is spider2_dbt scope + docs only). +NOT regressions: + +- `test_spacedock_solver_freeze_dir_mechanism.py::test_codex_runtime_dispatch_constructs_inner_agent` +- `test_worktree_teardown_preserves_runs.py::test_worktree_remove_force_does_not_destroy_runs` +- `test_generate_matrix_specs.py::test_matrix_specs_carry_query_mode_batch` +- `test_rk_research_new.py::test_rk_research_new_creates_scaffold_tree` + +## Code review findings (superpowers:requesting-code-review) + +Reviewer dispatched against `9c39af2..965f2ea`; it independently fetched the +upstream Spider2 oracle and ran the suite. + +- **Critical:** none. The reviewer could not construct any external input + (eval-spec JSON, gold basename, condition_tabs, task profile) that forces a + false reward 1.0. +- **Important:** none. +- **Minor (non-blocking):** + 1. `eval_spec.py:73` — `condition_cols` `[[]]`/`[None]` is expanded to + `[[]]*n` for any `n`; this is the *forgiving* direction and mirrors + upstream `compare_multi_pandas_table` defaulting (stricter `duckdb_match` + would assert). No correctness risk for well-formed specs; a clarifying + comment was suggested. + 2. The leakage-scan scoping that excludes the verify-only `tests/` subtree + rests on the Harbor "tests/ uploaded only at verify time, reset around the + agent run" lifecycle; a version-pinned comment was suggested so a future + Harbor bump can't silently invalidate it. The guard test proves the change + is scoping not weakening. + 3. `harbor_view.py:163` `spec.gold or "gold.duckdb"` fallback is effectively + dead for wrapped specs (they already raise without gold); reachable only + for the unwrapped-fixture path. A clarifying comment was suggested. + +All three are clarity/robustness polish, not correctness defects — classified +non-blocking. No production code was changed during validation. ## Gate decision -**PASSED → done.** Both cycle-1 blocking findings are independently -resolved against the live Spider2 source: B2 — the comparator and eval-spec -schema faithfully reproduce `eval_utils.duckdb_match` (verified by a 10-case -differential test against the upstream oracle and a real multi-table gold -line); B1 — the leakage fix is scoping, not weakening (mutation-probed), -justified by the verified Harbor verify-only-tests/ lifecycle. Rider and -AC-3 reward.json shape intact; no regressions outside the known pre-existing -set. No blocking findings remain. - ---- - -# Re-review (cycle 3): validation gate — fail-closed fix - -**Branch:** `spacedock-ensign/spider2-dbt-duckdb-match-verifier` @ `11c66f4` -**Merge-base with main:** `9c39af2` -**Fix commit under review:** `869918d` (fail-closed eval-spec validation) -**Gate verdict:** **PASSED → done** - -Independent re-verification of the cycle-2 fail-open hazard fix (Codex -finding: empty/malformed eval spec awarded `{"reward": 1.0}`). No -production code written. - -## Fail-closed fix is real and load-bearing: CONFIRMED - -`load_eval_spec` / `EvalSpec.__post_init__` (`eval_spec.py`) now fail closed, -and `emit_reward` (`verify.py:33-42`) wraps spec-load + compare in -`try/except → reward 0`. Independent probe at clean branch HEAD: - -- **RAISE cases** (all `ValueError`): empty file; `evaluation.parameters` - with no `condition_tabs`; explicit empty `condition_tabs`; non-`duckdb_match` - `evaluation.func` (`string_match`); direct `EvalSpec(condition_tabs=[])`. -- **Hazard-beating** — over a MATCHING pred/gold pair (cloned tables): - an empty/zero-table spec → `{"reward": 0.0}` and a wrong-func spec → - `{"reward": 0.0}` — **not the prior 1.0**. Garbage non-JSON → `0.0` with - no exception (no crash-into-pass). -- **Positive control:** a VALID matching spec still → `{"reward": 1.0}`, - so the guards are targeted, not blanket-failing. - -**Mutation test (guards are load-bearing):** stripping the `n==0` guard, -the empty-file raise, and the wrong-func raise from `eval_spec.py` made all -6 negative tests FAIL — and the CLI negative reproduced the exact -`{'reward': 1.0}` == `{'reward': 0.0}` assertion failure, i.e. the original -hazard returned. Tree restored to clean (`git diff` on `eval_spec.py` empty). - -Root-cause confirmed in source: `compare_duckdb` (`duckdb_match.py:116`) -returns `True` after the per-table AND-loop, so a zero-`condition_tabs` spec -would vacuously match — the guards prevent that spec from ever being built. - -## Cycle-1 fixes intact (B2 comparator + B1 leakage): CONFIRMED - -`git diff 0b64e92..869918d --name-only` (the cycle-2 fix delta) touches ONLY -`eval_spec.py`, `verify.py`, `test_spider2_dbt_verify_cli.py`, -`test_spider2_dbt_verify_comparator.py`. `duckdb_match.py` (B2), -`harbor_view.py` + `test_translate_spider2_dbt.py` (B1) are **untouched** — -cycle-1 is intact by construction, and re-confirmed live: - -- **B2 faithfulness:** column-containment with reordered + extra pred - columns → True; `math.isclose(1e-2)` within → True / beyond → False; - per-column sort under `ignore_order` → True; ordered mismatch → False; - missing pred table → False (mismatch). -- **B1 leakage scoping:** `test_translate_spider2_dbt.py` leakage/planted/ - resolves tests → 3 passed; the `leakage_scan_still_fires_on_agent_visible` - guard among 9 rider/AC-3/leakage tests → 9 passed (agent-visible leaks - still caught; only verify-only `tests/` excluded). - -## Regression / rider / AC-3 - -- **Gating:** `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` - → **73 passed, 751 deselected** (was 66 in cycle 2; +7 negatives). -- **Rider:** materialized view with `task_slug=not-spider2-slug` → emitted - `test.sh` `--predicted-db /app/not-spider2-slug.duckdb`; `/app/spider2.duckdb` - absent (ny's `resolve_spider2_db_name`). -- **AC-3:** independent end-to-end via `emit_reward` against the materialized - gold fixture — matching pred → `{"reward": 1.0}`, mismatch → `{"reward": 0.0}`; - `set(payload) == {"reward"}`, reward is `float`. -- Working tree clean apart from the known harness `uv.lock` churn (ignored - per assignment). Pre-existing unrelated failures on base - (`test_task_identity_scoring`, `test_generate_matrix_specs`, - `test_rk_research_new`, etc.) are not regressions and not touched. - -## Gate decision - -**PASSED → done.** The cycle-2 fail-open hazard is independently closed: -empty/malformed/schema-drifted specs raise or score 0, the matching-pair -hazard now yields 0 not 1.0, and a mutation test proves the guards (and their -negative tests) are load-bearing. Cycle-1 comparator faithfulness and leakage -scoping are intact (untouched by the fix + re-confirmed live); rider and AC-3 -reward shape intact; no new regressions. Per the convergence plan this was the -last re-validation before PR. - ---- - -## Re-validation (cycle-3 fix: gold DB basename) — 2026-06-18 - -Independent re-review after the cycle-3 fix (commit `a677c36`) that threads -the eval spec's `gold` basename through `EvalSpec` → materializer copy → -`test.sh --gold-db`, replacing the hardcoded `gold.duckdb`. No production code -written by this validator. - -### Fix is real (production code, branch HEAD `bac0705`) - -- `eval_spec.py`: `EvalSpec` gains `gold: str | None`; `load_eval_spec` parses - `evaluation.parameters.gold` and RAISES `ValueError` when a wrapped - `duckdb_match` spec omits/empties it (`eval_spec.py:128`). -- `harbor_view.py`: `_ensure_verifier_assets` loads the spec, resolves - `gold_basename = spec.gold or "gold.duckdb"`, copies the EXACT named file - from `tests/gold/`, raises `FileNotFoundError` if absent - (`harbor_view.py:141`), and formats `_TEST_SH_TEMPLATE` with - `--gold-db /tests/{gold_db}` (`:63`, `:164`) — no hardcoded `gold.duckdb`. - -### Independent exercise (standalone probe, no test infra) — 11/11 PASS - -- Loader: wrapped spec missing `gold` RAISES; empty-string `gold` RAISES; - `gold` parsed verbatim (`playbook.duckdb`). -- Materializer with `parameters.gold = playbook.duckdb` and NO `gold.duckdb` - present: `playbook.duckdb` copied; `gold.duckdb` NOT invented; test.sh emits - `--gold-db /tests/playbook.duckdb`, no `/tests/gold.duckdb`; leakage-clean - (no `gold/` path segment survives). -- End-to-end over the materialized named gold via `emit_reward`: matching pred - → `{"reward": 1.0}`, mismatch → `{"reward": 0.0}`. -- Missing named gold → `FileNotFoundError` (fail-closed). - -### Load-bearing (mutation tests) - -- Hardcoding `gold_basename = "gold.duckdb"` in `harbor_view.py` → - `test_spider2_dbt_verify_view_uses_non_default_gold_basename` and - `..._named_gold_actually_scores` FAIL (2 failed, 6 passed). Restored. -- Disabling the wrapped-spec missing-`gold` raise in `eval_spec.py` → - `test_...load_eval_spec_missing_gold_in_wrapped_spec_raises` FAILS - (1 failed, 7 passed). Restored. Tree clean (only `uv.lock`). - -### No regression to prior cycles / rider / AC-3 - -- Cycle-3 diff (`a677c36`) touches ONLY `eval_spec.py` + `harbor_view.py` - + their 2 test files. `duckdb_match.py` (cycle-1 comparator, last `61a1b9c`) - and `verify.py` (cycle-2 / AC-3, last `869918d`) UNTOUCHED — cycle-1/cycle-2 - intact by construction. -- Re-confirmed live: cycle-1 column-containment + `math.isclose(1e-2)` + - per-column sort under `ignore_order` + missing-table mismatch (3/3); - cycle-2 fail-closed empty-file / wrong-func / empty-`condition_tabs` raises - (3/3). -- Rider: `task_slug=not-spider2-slug` → emitted test.sh - `--predicted-db /app/not-spider2-slug.duckdb`, no `/app/spider2.duckdb` - (`...test_sh_uses_resolved_db_name` 1 passed). -- AC-3 reward shape `{"reward": }` intact (end-to-end probe above). - -### Suites (clean checkout) - -- `uv run pytest -k spider2_dbt --ignore=tests/unit/test_task_identity_scoring.py` - = **78 passed**, 751 deselected. -- `tests/unit/test_translate_spider2_dbt.py` = **13 passed**. -- Full suite (`--ignore` the broken scoring file) = 813 passed, 4 failed. - The 4 failures (`test_spacedock_solver_freeze_dir_mechanism`, - `test_worktree_teardown_preserves_runs`, `test_generate_matrix_specs`, - `test_rk_research_new`) ALSO fail on merge-base `9c39af2` (verified in a - throwaway base worktree) — pre-existing, unrelated, NOT regressions. Branch - touches only spider2_dbt-scoped files. - -## Gate decision (cycle-3 re-validation) +**PASSED → done.** -**PASSED → done.** The cycle-3 gold-basename fix is real and load-bearing: a -non-default `playbook.duckdb` is honored (copied + scored, leakage-clean), a -missing named gold fails closed, and a wrapped spec lacking `gold` raises — -each proven by an independent probe and a mutation test that flips the -regression tests red. Cycles 1-2, the predicted-DB rider, and the AC-3 reward -shape are intact (untouched by the fix + re-confirmed live). The only -remaining full-suite failures are pre-existing on base. Per the convergence -plan, this is the last re-validation before PR. +Rationale: all three ACs reproduce green from a clean checkout with actual +command output (gating suite 93 passed; acceptance `-k spider2_dbt_verify` 49 +passed), every dispatch-named focus area is confirmed by live behavioral probes +(comparator faithfulness, duckdb-only, fail-closed guards, shell+SQL injection +sealed, symlink write-through, reward.json shape), the only full-suite failures +are pre-existing on the merge-base and outside this branch's scope, and the +independent code review found zero blocking issues (3 minor polish notes). No +external input forces a false reward 1.0.