evalops
diff --git a/TODO.md b/TODO.md
Lines changed: 1 addition & 1 deletion
diff --git a/src/commands/eval/command.rs b/src/commands/eval/command.rs
Lines changed: 9 additions & 1 deletion
diff --git a/src/commands/eval/command/batch.rs b/src/commands/eval/command/batch.rs
Lines changed: 2 additions & 2 deletions
diff --git a/src/commands/eval/command/fixtures.rs b/src/commands/eval/command/fixtures.rs
Lines changed: 1 addition & 1 deletion
diff --git a/src/commands/eval/command/options.rs b/src/commands/eval/command/options.rs
Lines changed: 20 additions & 3 deletions
diff --git a/src/commands/eval/report/output.rs b/src/commands/eval/report/output.rs
Lines changed: 12 additions & 0 deletions
diff --git a/src/commands/eval/runner/execute/repro.rs b/src/commands/eval/runner/execute/repro.rs
Lines changed: 2 additions & 2 deletions
diff --git a/src/commands/eval/types/report.rs b/src/commands/eval/types/report.rs
Lines changed: 6 additions & 0 deletions
diff --git a/src/commands/review/command/check.rs b/src/commands/review/command/check.rs
Lines changed: 1 addition & 1 deletion
diff --git a/src/commands/review/command/review.rs b/src/commands/review/command/review.rs
Lines changed: 4 additions & 1 deletion
@@ -147,7 +147,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
 96. [ ] Add production replay evals using anonymized accepted/rejected review outcomes.
 97. [x] Add leaderboard reporting for reviewer usefulness metrics, not just precision/recall.
 98. [x] Add regression gates for feedback coverage, verifier health, and lifecycle-state accuracy.
-99. [ ] Add model-routing policies that explicitly separate generation, verification, and auditing roles.
+99. [x] Add model-routing policies that explicitly separate generation, verification, and auditing roles.
 100. [x] Publish a repeatable "independent auditor" benchmark story in the UI and CLI so DiffScope's differentiation is measurable.
 
 ## Current Execution Slice
 
@@ -88,6 +88,7 @@ fn build_eval_run_metadata(
         resolved_base_url.as_deref().or(config.base_url.as_deref()),
         resolved_adapter.as_deref().or(config.adapter.as_deref()),
     );
+    let generation_model = config.generation_model_name().to_string();
     let mut verification_judges = Vec::new();
     let mut seen_verification_judges = HashSet::new();
     for role in std::iter::once(config.verification.model_role)
@@ -106,7 +107,8 @@ fn build_eval_run_metadata(
         fixtures_selected: execution.selected_count,
         label: options.label.clone(),
         comparison_group: options.comparison_group.clone(),
-        model: config.model.clone(),
+        model: generation_model,
+        generation_model_role: Some(config.generation_model_role.as_str().to_string()),
         review_mode: review_mode_label(config.agent.enabled).to_string(),
         adapter: resolved_adapter.or_else(|| config.adapter.clone()),
         provider,
@@ -124,6 +126,12 @@ fn build_eval_run_metadata(
             .verification
             .enabled
             .then(|| config.verification.consensus_mode.as_str().to_string()),
+        auditing_model: options
+            .repro_validate
+            .then(|| config.auditing_model_name().to_string()),
+        auditing_model_role: options
+            .repro_validate
+            .then(|| config.auditing_model_role.as_str().to_string()),
         trend_file: options
             .trend_file
             .as_ref()
 
@@ -161,7 +161,7 @@ pub(super) async fn run_eval_batch(
         for review_mode in &review_modes {
             for repeat_index in 1..=repeat_total {
                 let mut run_config = config.clone();
-                run_config.model = model.clone();
+                run_config.set_model_for_role(run_config.generation_model_role, model.clone());
                 run_config.agent.enabled = review_mode.agent_enabled();
 
                 let mut run_options = options.clone();
@@ -272,7 +272,7 @@ fn batch_review_modes(
 
 fn matrix_models(config: &config::Config, options: &EvalRunOptions) -> Vec<String> {
     let mut models = Vec::new();
-    push_unique_model(&mut models, &config.model);
+    push_unique_model(&mut models, config.generation_model_name());
     for model in &options.matrix_models {
         let normalized = model.trim();
         if !normalized.is_empty() {
 
@@ -43,7 +43,7 @@ pub(super) async fn run_eval_fixtures(
             .map(|artifact_dir| EvalFixtureArtifactContext {
                 artifact_dir: artifact_dir.clone(),
                 run_label: options.label.clone(),
-                model: config.model.clone(),
+                model: config.generation_model_name().to_string(),
             });
 
     let mut results = Vec::new();
 
@@ -1,7 +1,7 @@
 use anyhow::Result;
 use std::collections::HashSet;
 
-use crate::config::{self, ModelRole};
+use crate::config;
 
 use super::super::fixtures::load_eval_report;
 use super::super::thresholds::{parse_rule_threshold_args, EvalThresholdOptions};
@@ -56,7 +56,7 @@ pub(super) fn ensure_frontier_eval_models(
 
     let mut models = Vec::new();
     let mut seen_models = HashSet::new();
-    for model in std::iter::once(config.model.clone())
+    for model in std::iter::once(config.generation_model_name().to_string())
         .chain(options.matrix_models.iter().cloned())
         .chain(
             std::iter::once(config.verification.model_role)
@@ -66,7 +66,7 @@ pub(super) fn ensure_frontier_eval_models(
         .chain(
             options
                 .repro_validate
-                .then(|| config.model_for_role(ModelRole::Fast).to_string()),
+                .then(|| config.auditing_model_name().to_string()),
         )
     {
         if seen_models.insert(model.clone()) {
@@ -111,6 +111,7 @@ fn is_frontier_review_model(model: &str) -> bool {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::config::ModelRole;
 
     #[test]
     fn is_frontier_review_model_accepts_requested_defaults() {
@@ -124,4 +125,20 @@ mod tests {
         assert!(!is_frontier_review_model("gpt-4o-mini"));
         assert!(!is_frontier_review_model("anthropic/claude-opus-4.1"));
     }
+
+    #[test] // Regression test: with repro validation on, the auditing model goes through the frontier-model check.
+    fn ensure_frontier_eval_models_rejects_non_frontier_auditor_when_repro_enabled() {
+        let config = config::Config {
+            model_reasoning: Some("gpt-4o-mini".to_string()), // a model name that is_frontier_review_model rejects
+            auditing_model_role: ModelRole::Reasoning, // presumably makes auditing_model_name() resolve to model_reasoning; verify in config
+            ..config::Config::default()
+        };
+        let options = EvalRunOptions {
+            repro_validate: true, // the auditing model is only added to the checked set when this flag is set
+            ..EvalRunOptions::default()
+        };
+
+        let error = ensure_frontier_eval_models(&config, &options).unwrap_err(); // panics (fails the test) if the check returns Ok
+        assert!(error.to_string().contains("gpt-4o-mini")); // the error text must name the rejected model
+    }
 }
@@ -16,6 +16,12 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
         if let Some(label) = report.run.label.as_deref() {
             println!("Run label: {label}");
         }
+        if let Some(generation_role) = report.run.generation_model_role.as_deref() {
+            println!(
+                "Generation route: {generation_role} -> {}",
+                report.run.model
+            );
+        }
         if !report.run.review_mode.is_empty() {
             println!("Review mode: {}", report.run.review_mode);
         }
@@ -42,6 +48,12 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
         if let Some(consensus_mode) = report.run.verification_consensus_mode.as_deref() {
             println!("Verification consensus: {consensus_mode}");
         }
+        if let (Some(auditing_role), Some(auditing_model)) = (
+            report.run.auditing_model_role.as_deref(),
+            report.run.auditing_model.as_deref(),
+        ) {
+            println!("Auditing route: {auditing_role} -> {auditing_model}");
+        }
         if let Some(trend_file) = report.run.trend_file.as_deref() {
             println!("Trend file: {trend_file}");
         }
 
@@ -8,7 +8,7 @@ use uuid::Uuid;
 
 use crate::adapters;
 use crate::adapters::llm::{LLMRequest, StructuredOutputSchema};
-use crate::config::{self, ModelRole};
+use crate::config;
 use crate::core;
 use crate::core::agent_loop::AgentToolCallLog;
 use crate::core::agent_tools::{build_review_tools, ReviewTool, ReviewToolContext};
@@ -52,7 +52,7 @@ pub(super) async fn maybe_run_reproduction_validation(
         return Ok(None);
     }
 
-    let model_config = config.to_model_config_for_role(ModelRole::Fast);
+    let model_config = config.to_model_config_for_role(config.auditing_model_role);
     let model_name = model_config.model_name.clone();
     let adapter: Arc<dyn adapters::llm::LLMAdapter> =
         Arc::from(adapters::llm::create_adapter(&model_config)?);
 
@@ -40,6 +40,8 @@ pub struct EvalRunMetadata {
     pub comparison_group: Option<String>,
     #[serde(default)]
     pub model: String,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub generation_model_role: Option<String>,
     #[serde(default, skip_serializing_if = "String::is_empty")]
     pub review_mode: String,
     #[serde(default, skip_serializing_if = "Option::is_none")]
@@ -57,6 +59,10 @@ pub struct EvalRunMetadata {
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub verification_consensus_mode: Option<String>,
     #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub auditing_model: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub auditing_model_role: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
     pub trend_file: Option<String>,
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub artifact_dir: Option<String>,
 
@@ -13,7 +13,7 @@ pub async fn check_command(
     format: OutputFormat,
 ) -> Result<()> {
     info!("Checking repository at: {}", path.display());
-    info!("Using model: {}", config.model);
+    info!("Using generation model: {}", config.generation_model_name());
 
     let git = core::GitIntegration::new(&path)?;
     let diff_content = git.get_uncommitted_diff()?;
 
@@ -15,7 +15,10 @@ pub async fn review_command(
     output_path: Option<PathBuf>,
     format: OutputFormat,
 ) -> Result<()> {
-    info!("Starting diff review with model: {}", config.model);
+    info!(
+        "Starting diff review with generation model: {}",
+        config.generation_model_name()
+    );
 
     let (repo_root, diff_content) = load_review_input(diff_path).await?;
     if diff_content.trim().is_empty() {