Skip to content

Commit 779acfe

Browse files
committed
feat: add eval matrix runs and failure artifacts
Run eval suites repeatedly across a frontier-model matrix while persisting per-run reports and failed-fixture artifacts, so that flaky or broken live runs are easier to compare and debug. Made-with: Cursor
1 parent 11466cc commit 779acfe

File tree

18 files changed

+669
-12
lines changed

18 files changed

+669
-12
lines changed

eval/fixtures/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,30 @@ diffscope eval \
5454
--output eval-report.json
5555
```
5656

57+
Matrix + repeat run with artifacts:
58+
59+
```bash
60+
OPENROUTER_API_KEY=... \
61+
diffscope \
62+
--adapter openrouter \
63+
--base-url https://openrouter.ai/api/v1 \
64+
--model anthropic/claude-opus-4.1 \
65+
eval \
66+
--fixtures eval/fixtures \
67+
--suite review-depth-core \
68+
--matrix-model openai/o3 \
69+
--repeat 2 \
70+
--label frontier-smoke \
71+
--trend-file eval/trends/frontier-smoke.json \
72+
--artifact-dir eval/artifacts/frontier-smoke \
73+
--output eval/batch/frontier-smoke.json
74+
```
75+
5776
Notes:
5877
- Fixtures call the configured model and API provider; they are not deterministic unit tests.
5978
- Treat this set as a baseline and tighten `must_find`/`must_not_find` thresholds over time.
6079
- Benchmark-pack fixtures now preserve category/language/source metadata in the JSON report so live runs can be sliced by dimension.
6180
- Use `--baseline` together with the dimension drop flags when you want regressions to fail on shared suites, categories, or languages instead of only on the whole run.
6281
- Use `--trend-file` with `--label` to append comparable live-run checkpoints to a reusable `QualityTrend` JSON history, including suite/category/language micro-F1 series and verifier-health counters.
82+
- Use `--matrix-model` plus `--repeat` to compare the configured primary model against a small frontier-model matrix and to spot flaky live-run variance.
83+
- Use `--artifact-dir` to persist failed-fixture artifacts and per-run JSON reports for debugging.

src/commands/eval/command.rs

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#[path = "command/batch.rs"]
2+
mod batch;
13
#[path = "command/fixtures.rs"]
24
mod fixtures;
35
#[path = "command/options.rs"]
@@ -11,6 +13,7 @@ use std::path::{Path, PathBuf};
1113
use crate::config;
1214

1315
use super::{EvalRunFilters, EvalRunMetadata, EvalRunOptions};
16+
use batch::run_eval_batch;
1417
use fixtures::run_eval_fixtures;
1518
use options::prepare_eval_options;
1619
use report::emit_eval_report;
@@ -22,12 +25,30 @@ pub async fn eval_command(
2225
options: EvalRunOptions,
2326
) -> Result<()> {
2427
config.verification_fail_open = true;
28+
if options.repeat > 1 || !options.matrix_models.is_empty() {
29+
return run_eval_batch(config, &fixtures_dir, output_path.as_deref(), &options).await;
30+
}
31+
2532
let execution = run_eval_fixtures(&config, &fixtures_dir, &options).await?;
2633
let prepared_options = prepare_eval_options(&options)?;
27-
let run_metadata = build_eval_run_metadata(&config, &fixtures_dir, &options, &execution);
34+
let report_output_path = output_path.clone().or_else(|| {
35+
options
36+
.artifact_dir
37+
.as_ref()
38+
.map(|dir| dir.join("report.json"))
39+
});
40+
let run_metadata = build_eval_run_metadata(
41+
&config,
42+
&fixtures_dir,
43+
&options,
44+
&execution,
45+
None,
46+
None,
47+
options.artifact_dir.as_deref(),
48+
);
2849
emit_eval_report(
2950
execution.results,
30-
output_path.as_deref(),
51+
report_output_path.as_deref(),
3152
prepared_options,
3253
run_metadata,
3354
)
@@ -39,6 +60,9 @@ fn build_eval_run_metadata(
3960
fixtures_dir: &Path,
4061
options: &EvalRunOptions,
4162
execution: &fixtures::EvalFixtureExecution,
63+
repeat_index: Option<usize>,
64+
repeat_total: Option<usize>,
65+
artifact_dir: Option<&Path>,
4266
) -> EvalRunMetadata {
4367
let (_, resolved_base_url, resolved_adapter) = config.resolve_provider();
4468
let provider = inferred_provider(
@@ -68,6 +92,9 @@ fn build_eval_run_metadata(
6892
.trend_file
6993
.as_ref()
7094
.map(|path| path.display().to_string()),
95+
artifact_dir: artifact_dir.map(|path| path.display().to_string()),
96+
repeat_index,
97+
repeat_total,
7198
}
7299
}
73100

0 commit comments

Comments
 (0)