diff --git a/packages/razorback-plugin-dab/src/razorback_plugin_dab/verify/verify_batch.py b/packages/razorback-plugin-dab/src/razorback_plugin_dab/verify/verify_batch.py
index 33b6bf8f..4d2ecec5 100644
--- a/packages/razorback-plugin-dab/src/razorback_plugin_dab/verify/verify_batch.py
+++ b/packages/razorback-plugin-dab/src/razorback_plugin_dab/verify/verify_batch.py
@@ -24,7 +24,15 @@ def emit_reward(
         answer = answers.get(key, "") if isinstance(answers, dict) else ""
         validate_fn = _load_validate(validators[query_id])
         if answer:
-            is_valid, reason = validate_fn(answer)
+            # Isolate per-query validator failures: one validator raising at call time
+            # (e.g. calling .lower() on a non-string answer) must NOT abort grading for
+            # the whole dataset — that would write no reward.json and silently drop the
+            # dataset from the run (RewardFileNotFoundError). Score that query 0 with the
+            # error as its reason and continue. _load_validate sits outside this try, so load-time errors still propagate.
+            try:
+                is_valid, reason = validate_fn(answer)
+            except Exception as exc:  # noqa: BLE001 — robustness boundary, any validator error
+                is_valid, reason = False, f"validator error: {type(exc).__name__}: {exc}"
         else:
             is_valid, reason = False, "empty answer"
         reward = 1.0 if is_valid else 0.0
diff --git a/packages/razorback-plugin-dab/tests/unit/test_verify_batch_reward_shape.py b/packages/razorback-plugin-dab/tests/unit/test_verify_batch_reward_shape.py
index 09202dc7..1de97b86 100644
--- a/packages/razorback-plugin-dab/tests/unit/test_verify_batch_reward_shape.py
+++ b/packages/razorback-plugin-dab/tests/unit/test_verify_batch_reward_shape.py
@@ -114,3 +114,39 @@ def test_batch_verify_does_not_mask_validator_import_errors(tmp_path: Path) -> None:
     assert "missing_verifier_dependency" in result.stderr
     assert not reward_out.exists()
     assert not per_query_out.exists()
+
+
+def test_batch_verify_isolates_per_query_runtime_validator_error(tmp_path: Path) -> None:
+    """A single query's validator raising at call time (e.g. a non-string answer)
+    must score that query 0 and continue grading the rest — not abort the whole
+    dataset (which would drop it from the run as a RewardFileNotFoundError)."""
+    tests_dir = tmp_path / "tests"
+    tests_dir.mkdir()
+    shutil.copy2(Path(verify_batch_module.__file__), tests_dir / "verify_batch.py")
+    # q1 validator crashes on a non-string answer; q2 validator is well-behaved.
+    (tests_dir / "validate_q1.py").write_text(
+        "def validate(answer):\n"
+        " return (answer.lower() == 'x', 'checked')\n"
+    )
+    (tests_dir / "validate_q2.py").write_text(
+        "def validate(answer):\n"
+        " return (answer == 'ok', 'checked')\n"
+    )
+    answers = tmp_path / "answers.json"
+    answers.write_text(json.dumps({"q1": ["a", "b"], "q2": "ok"}))  # q1 is a LIST
+    reward_out = tmp_path / "reward.json"
+    per_query_out = tmp_path / "reward_per_query.json"
+
+    result = _run_generated_verify_batch(
+        tests_dir=tests_dir,
+        answers_path=answers,
+        reward_out=reward_out,
+        per_query_out=per_query_out,
+    )
+
+    assert result.returncode == 0, result.stderr
+    per_query = json.loads(per_query_out.read_text())
+    assert per_query["q1"]["reward"] == 0.0
+    assert "validator error" in per_query["q1"]["reason"]
+    assert per_query["q2"]["reward"] == 1.0  # the good query still graded
+    assert json.loads(reward_out.read_text()) == {"reward": 0.5}