evalops
diff --git a/eval/fixtures/README.md b/eval/fixtures/README.md
Lines changed: 21 additions & 0 deletions
diff --git a/src/commands/eval/command.rs b/src/commands/eval/command.rs
Lines changed: 29 additions & 2 deletions
@@ -54,9 +54,30 @@ diffscope eval \
   --output eval-report.json
 ```
 
+Model-matrix run with repeats and persisted artifacts:
+
+```bash
+OPENROUTER_API_KEY=... \
+diffscope \
+  --adapter openrouter \
+  --base-url https://openrouter.ai/api/v1 \
+  --model anthropic/claude-opus-4.1 \
+  eval \
+  --fixtures eval/fixtures \
+  --suite review-depth-core \
+  --matrix-model openai/o3 \
+  --repeat 2 \
+  --label frontier-smoke \
+  --trend-file eval/trends/frontier-smoke.json \
+  --artifact-dir eval/artifacts/frontier-smoke \
+  --output eval/batch/frontier-smoke.json
+```
+
 Notes:
 - Fixtures call the configured model and API provider; they are not deterministic unit tests.
 - Treat this set as a baseline and tighten `must_find`/`must_not_find` thresholds over time.
 - Benchmark-pack fixtures now preserve category/language/source metadata in the JSON report so live runs can be sliced by dimension.
 - Use `--baseline` together with the dimension drop flags when you want regressions to fail on shared suites, categories, or languages instead of only on the whole run.
 - Use `--trend-file` with `--label` to append comparable live-run checkpoints to a reusable `QualityTrend` JSON history, including suite/category/language micro-F1 series and verifier-health counters.
+- Use `--matrix-model` plus `--repeat` to compare the configured primary model against a small frontier-model matrix and to spot flaky live-run variance.
+- Use `--artifact-dir` to persist failed-fixture artifacts and per-run JSON reports for debugging.
@@ -1,3 +1,5 @@
+#[path = "command/batch.rs"]
+mod batch;
 #[path = "command/fixtures.rs"]
 mod fixtures;
 #[path = "command/options.rs"]
@@ -11,6 +13,7 @@ use std::path::{Path, PathBuf};
 use crate::config;
 
 use super::{EvalRunFilters, EvalRunMetadata, EvalRunOptions};
+use batch::run_eval_batch;
 use fixtures::run_eval_fixtures;
 use options::prepare_eval_options;
 use report::emit_eval_report;
@@ -22,12 +25,30 @@ pub async fn eval_command(
     options: EvalRunOptions,
 ) -> Result<()> {
     config.verification_fail_open = true;
+    if options.repeat > 1 || !options.matrix_models.is_empty() {
+        return run_eval_batch(config, &fixtures_dir, output_path.as_deref(), &options).await;
+    }
+
     let execution = run_eval_fixtures(&config, &fixtures_dir, &options).await?;
     let prepared_options = prepare_eval_options(&options)?;
-    let run_metadata = build_eval_run_metadata(&config, &fixtures_dir, &options, &execution);
+    let report_output_path = output_path.clone().or_else(|| {
+        options
+            .artifact_dir
+            .as_ref()
+            .map(|dir| dir.join("report.json"))
+    });
+    let run_metadata = build_eval_run_metadata(
+        &config,
+        &fixtures_dir,
+        &options,
+        &execution,
+        None,
+        None,
+        options.artifact_dir.as_deref(),
+    );
     emit_eval_report(
         execution.results,
-        output_path.as_deref(),
+        report_output_path.as_deref(),
         prepared_options,
         run_metadata,
     )
@@ -39,6 +60,9 @@ fn build_eval_run_metadata(
     fixtures_dir: &Path,
     options: &EvalRunOptions,
     execution: &fixtures::EvalFixtureExecution,
+    repeat_index: Option<usize>,
+    repeat_total: Option<usize>,
+    artifact_dir: Option<&Path>,
 ) -> EvalRunMetadata {
     let (_, resolved_base_url, resolved_adapter) = config.resolve_provider();
     let provider = inferred_provider(
@@ -68,6 +92,9 @@ fn build_eval_run_metadata(
             .trend_file
             .as_ref()
             .map(|path| path.display().to_string()),
+        artifact_dir: artifact_dir.map(|path| path.display().to_string()),
+        repeat_index,
+        repeat_total,
     }
 }