Skip to content

Commit 1cb4201

Browse files
committed
feat(config): add explicit review model routing
1 parent 3c101d7 commit 1cb4201

File tree

18 files changed

+275
-27
lines changed

18 files changed

+275
-27
lines changed

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -147,7 +147,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
147147
96. [ ] Add production replay evals using anonymized accepted/rejected review outcomes.
148148
97. [x] Add leaderboard reporting for reviewer usefulness metrics, not just precision/recall.
149149
98. [x] Add regression gates for feedback coverage, verifier health, and lifecycle-state accuracy.
150-
99. [ ] Add model-routing policies that explicitly separate generation, verification, and auditing roles.
150+
99. [x] Add model-routing policies that explicitly separate generation, verification, and auditing roles.
151151
100. [x] Publish a repeatable "independent auditor" benchmark story in the UI and CLI so DiffScope's differentiation is measurable.
152152

153153
## Current Execution Slice

src/commands/eval/command.rs

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ fn build_eval_run_metadata(
8888
resolved_base_url.as_deref().or(config.base_url.as_deref()),
8989
resolved_adapter.as_deref().or(config.adapter.as_deref()),
9090
);
91+
let generation_model = config.generation_model_name().to_string();
9192
let mut verification_judges = Vec::new();
9293
let mut seen_verification_judges = HashSet::new();
9394
for role in std::iter::once(config.verification.model_role)
@@ -106,7 +107,8 @@ fn build_eval_run_metadata(
106107
fixtures_selected: execution.selected_count,
107108
label: options.label.clone(),
108109
comparison_group: options.comparison_group.clone(),
109-
model: config.model.clone(),
110+
model: generation_model,
111+
generation_model_role: Some(config.generation_model_role.as_str().to_string()),
110112
review_mode: review_mode_label(config.agent.enabled).to_string(),
111113
adapter: resolved_adapter.or_else(|| config.adapter.clone()),
112114
provider,
@@ -124,6 +126,12 @@ fn build_eval_run_metadata(
124126
.verification
125127
.enabled
126128
.then(|| config.verification.consensus_mode.as_str().to_string()),
129+
auditing_model: options
130+
.repro_validate
131+
.then(|| config.auditing_model_name().to_string()),
132+
auditing_model_role: options
133+
.repro_validate
134+
.then(|| config.auditing_model_role.as_str().to_string()),
127135
trend_file: options
128136
.trend_file
129137
.as_ref()

src/commands/eval/command/batch.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -161,7 +161,7 @@ pub(super) async fn run_eval_batch(
161161
for review_mode in &review_modes {
162162
for repeat_index in 1..=repeat_total {
163163
let mut run_config = config.clone();
164-
run_config.model = model.clone();
164+
run_config.set_model_for_role(run_config.generation_model_role, model.clone());
165165
run_config.agent.enabled = review_mode.agent_enabled();
166166

167167
let mut run_options = options.clone();
@@ -272,7 +272,7 @@ fn batch_review_modes(
272272

273273
fn matrix_models(config: &config::Config, options: &EvalRunOptions) -> Vec<String> {
274274
let mut models = Vec::new();
275-
push_unique_model(&mut models, &config.model);
275+
push_unique_model(&mut models, config.generation_model_name());
276276
for model in &options.matrix_models {
277277
let normalized = model.trim();
278278
if !normalized.is_empty() {

src/commands/eval/command/fixtures.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ pub(super) async fn run_eval_fixtures(
4343
.map(|artifact_dir| EvalFixtureArtifactContext {
4444
artifact_dir: artifact_dir.clone(),
4545
run_label: options.label.clone(),
46-
model: config.model.clone(),
46+
model: config.generation_model_name().to_string(),
4747
});
4848

4949
let mut results = Vec::new();

src/commands/eval/command/options.rs

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
use anyhow::Result;
22
use std::collections::HashSet;
33

4-
use crate::config::{self, ModelRole};
4+
use crate::config;
55

66
use super::super::fixtures::load_eval_report;
77
use super::super::thresholds::{parse_rule_threshold_args, EvalThresholdOptions};
@@ -56,7 +56,7 @@ pub(super) fn ensure_frontier_eval_models(
5656

5757
let mut models = Vec::new();
5858
let mut seen_models = HashSet::new();
59-
for model in std::iter::once(config.model.clone())
59+
for model in std::iter::once(config.generation_model_name().to_string())
6060
.chain(options.matrix_models.iter().cloned())
6161
.chain(
6262
std::iter::once(config.verification.model_role)
@@ -66,7 +66,7 @@ pub(super) fn ensure_frontier_eval_models(
6666
.chain(
6767
options
6868
.repro_validate
69-
.then(|| config.model_for_role(ModelRole::Fast).to_string()),
69+
.then(|| config.auditing_model_name().to_string()),
7070
)
7171
{
7272
if seen_models.insert(model.clone()) {
@@ -111,6 +111,7 @@ fn is_frontier_review_model(model: &str) -> bool {
111111
#[cfg(test)]
112112
mod tests {
113113
use super::*;
114+
use crate::config::ModelRole;
114115

115116
#[test]
116117
fn is_frontier_review_model_accepts_requested_defaults() {
@@ -124,4 +125,20 @@ mod tests {
124125
assert!(!is_frontier_review_model("gpt-4o-mini"));
125126
assert!(!is_frontier_review_model("anthropic/claude-opus-4.1"));
126127
}
128+
129+
#[test]
130+
fn ensure_frontier_eval_models_rejects_non_frontier_auditor_when_repro_enabled() {
131+
let config = config::Config {
132+
model_reasoning: Some("gpt-4o-mini".to_string()),
133+
auditing_model_role: ModelRole::Reasoning,
134+
..config::Config::default()
135+
};
136+
let options = EvalRunOptions {
137+
repro_validate: true,
138+
..EvalRunOptions::default()
139+
};
140+
141+
let error = ensure_frontier_eval_models(&config, &options).unwrap_err();
142+
assert!(error.to_string().contains("gpt-4o-mini"));
143+
}
127144
}

src/commands/eval/report/output.rs

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,12 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
1616
if let Some(label) = report.run.label.as_deref() {
1717
println!("Run label: {label}");
1818
}
19+
if let Some(generation_role) = report.run.generation_model_role.as_deref() {
20+
println!(
21+
"Generation route: {generation_role} -> {}",
22+
report.run.model
23+
);
24+
}
1925
if !report.run.review_mode.is_empty() {
2026
println!("Review mode: {}", report.run.review_mode);
2127
}
@@ -42,6 +48,12 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
4248
if let Some(consensus_mode) = report.run.verification_consensus_mode.as_deref() {
4349
println!("Verification consensus: {consensus_mode}");
4450
}
51+
if let (Some(auditing_role), Some(auditing_model)) = (
52+
report.run.auditing_model_role.as_deref(),
53+
report.run.auditing_model.as_deref(),
54+
) {
55+
println!("Auditing route: {auditing_role} -> {auditing_model}");
56+
}
4557
if let Some(trend_file) = report.run.trend_file.as_deref() {
4658
println!("Trend file: {trend_file}");
4759
}

src/commands/eval/runner/execute/repro.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ use uuid::Uuid;
88

99
use crate::adapters;
1010
use crate::adapters::llm::{LLMRequest, StructuredOutputSchema};
11-
use crate::config::{self, ModelRole};
11+
use crate::config;
1212
use crate::core;
1313
use crate::core::agent_loop::AgentToolCallLog;
1414
use crate::core::agent_tools::{build_review_tools, ReviewTool, ReviewToolContext};
@@ -52,7 +52,7 @@ pub(super) async fn maybe_run_reproduction_validation(
5252
return Ok(None);
5353
}
5454

55-
let model_config = config.to_model_config_for_role(ModelRole::Fast);
55+
let model_config = config.to_model_config_for_role(config.auditing_model_role);
5656
let model_name = model_config.model_name.clone();
5757
let adapter: Arc<dyn adapters::llm::LLMAdapter> =
5858
Arc::from(adapters::llm::create_adapter(&model_config)?);

src/commands/eval/types/report.rs

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,8 @@ pub struct EvalRunMetadata {
4040
pub comparison_group: Option<String>,
4141
#[serde(default)]
4242
pub model: String,
43+
#[serde(default, skip_serializing_if = "Option::is_none")]
44+
pub generation_model_role: Option<String>,
4345
#[serde(default, skip_serializing_if = "String::is_empty")]
4446
pub review_mode: String,
4547
#[serde(default, skip_serializing_if = "Option::is_none")]
@@ -57,6 +59,10 @@ pub struct EvalRunMetadata {
5759
#[serde(default, skip_serializing_if = "Option::is_none")]
5860
pub verification_consensus_mode: Option<String>,
5961
#[serde(default, skip_serializing_if = "Option::is_none")]
62+
pub auditing_model: Option<String>,
63+
#[serde(default, skip_serializing_if = "Option::is_none")]
64+
pub auditing_model_role: Option<String>,
65+
#[serde(default, skip_serializing_if = "Option::is_none")]
6066
pub trend_file: Option<String>,
6167
#[serde(default, skip_serializing_if = "Option::is_none")]
6268
pub artifact_dir: Option<String>,

src/commands/review/command/check.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ pub async fn check_command(
1313
format: OutputFormat,
1414
) -> Result<()> {
1515
info!("Checking repository at: {}", path.display());
16-
info!("Using model: {}", config.model);
16+
info!("Using generation model: {}", config.generation_model_name());
1717

1818
let git = core::GitIntegration::new(&path)?;
1919
let diff_content = git.get_uncommitted_diff()?;

src/commands/review/command/review.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,10 @@ pub async fn review_command(
1515
output_path: Option<PathBuf>,
1616
format: OutputFormat,
1717
) -> Result<()> {
18-
info!("Starting diff review with model: {}", config.model);
18+
info!(
19+
"Starting diff review with generation model: {}",
20+
config.generation_model_name()
21+
);
1922

2023
let (repo_root, diff_content) = load_review_input(diff_path).await?;
2124
if diff_content.trim().is_empty() {

0 commit comments

Comments
 (0)