evalops
diff --git a/TODO.md b/TODO.md
Lines changed: 1 addition & 1 deletion
diff --git a/migrations/006_review_events_cost_breakdowns.sql b/migrations/006_review_events_cost_breakdowns.sql
Lines changed: 3 additions & 0 deletions
diff --git a/src/commands/eval/command.rs b/src/commands/eval/command.rs
Lines changed: 8 additions & 12 deletions
diff --git a/src/commands/eval/metrics/comparisons.rs b/src/commands/eval/metrics/comparisons.rs
Lines changed: 8 additions & 0 deletions
diff --git a/src/commands/eval/metrics/suites.rs b/src/commands/eval/metrics/suites.rs
Lines changed: 5 additions & 0 deletions
diff --git a/src/commands/eval/report/trend.rs b/src/commands/eval/report/trend.rs
Lines changed: 2 additions & 0 deletions
diff --git a/src/commands/eval/runner/execute/dag.rs b/src/commands/eval/runner/execute/dag.rs
Lines changed: 45 additions & 0 deletions
diff --git a/src/commands/eval/runner/execute/repro.rs b/src/commands/eval/runner/execute/repro.rs
Lines changed: 35 additions & 2 deletions
@@ -134,7 +134,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
 86. [ ] Add deployment docs for self-hosted review + analytics + trend retention setups.
 87. [ ] Add secret-management guidance and validation for multi-provider enterprise installs.
 88. [ ] Add background jobs for recomputing analytics after schema or scoring changes.
-89. [ ] Add cost dashboards by provider/model/role for review, verification, and eval workloads.
+89. [x] Add cost dashboards by provider/model/role for review, verification, and eval workloads.
 90. [ ] Add failure forensics bundles for self-hosted users when review or eval jobs degrade.
 
 ## 10. Eval, Benchmarking, and Model Governance
 
@@ -0,0 +1,3 @@
+-- Add workload/role/provider/model cost breakdown rows per review event.
+ALTER TABLE review_events
+ADD COLUMN IF NOT EXISTS cost_breakdowns JSONB NOT NULL DEFAULT '[]';
@@ -84,11 +84,14 @@ fn build_eval_run_metadata(
     artifact_dir: Option<&Path>,
 ) -> EvalRunMetadata {
     let (_, resolved_base_url, resolved_adapter) = config.resolve_provider();
-    let provider = inferred_provider(
-        resolved_base_url.as_deref().or(config.base_url.as_deref()),
-        resolved_adapter.as_deref().or(config.adapter.as_deref()),
-    );
+    let provider = config.inferred_provider_label_for_role(config.generation_model_role);
     let generation_model = config.generation_model_name().to_string();
+    let cost_breakdowns = crate::server::cost::aggregate_cost_breakdowns(
+        execution
+            .results
+            .iter()
+            .flat_map(|result| result.cost_breakdowns.clone()),
+    );
     let mut verification_judges = Vec::new();
     let mut seen_verification_judges = HashSet::new();
     for role in std::iter::once(config.verification.model_role)
@@ -140,17 +143,10 @@ fn build_eval_run_metadata(
         repeat_index,
         repeat_total,
         reproduction_validation: options.repro_validate,
+        cost_breakdowns,
     }
 }
 
-fn inferred_provider(base_url: Option<&str>, adapter: Option<&str>) -> Option<String> {
-    if base_url.is_some_and(|value| value.contains("openrouter.ai")) {
-        return Some("openrouter".to_string());
-    }
-
-    adapter.map(|value| value.to_string())
-}
-
 fn review_mode_label(agent_enabled: bool) -> &'static str {
     if agent_enabled {
         "agent-loop"
 
@@ -269,6 +269,7 @@ mod tests {
                 reproduction_summary: None,
                 artifact_path: None,
                 failures: vec![],
+                cost_breakdowns: vec![],
                 dag_traces: vec![],
             },
             EvalFixtureResult {
@@ -290,6 +291,7 @@ mod tests {
                 reproduction_summary: None,
                 artifact_path: None,
                 failures: vec![],
+                cost_breakdowns: vec![],
                 dag_traces: vec![],
             },
         ];
@@ -326,6 +328,7 @@ mod tests {
             reproduction_summary: None,
             artifact_path: None,
             failures: vec![],
+            cost_breakdowns: vec![],
             dag_traces: vec![],
         }];
 
@@ -353,6 +356,7 @@ mod tests {
             reproduction_summary: None,
             artifact_path: None,
             failures: vec![],
+            cost_breakdowns: vec![],
             dag_traces: vec![],
         }];
 
@@ -391,12 +395,14 @@ mod tests {
                     filtered_comments: 0,
                     abstained_comments: 0,
                     warnings: vec![],
+                    ..Default::default()
                 }],
             }),
             agent_activity: None,
             reproduction_summary: None,
             artifact_path: None,
             failures: vec![],
+            cost_breakdowns: vec![],
             dag_traces: vec![],
         }];
 
@@ -434,12 +440,14 @@ mod tests {
                     filtered_comments: 1,
                     abstained_comments: 1,
                     warnings: vec![],
+                    ..Default::default()
                 }],
             }),
             agent_activity: None,
             reproduction_summary: None,
             artifact_path: None,
             failures: vec![],
+            cost_breakdowns: vec![],
             dag_traces: vec![],
         }];
 
 
@@ -254,6 +254,7 @@ mod tests {
             reproduction_summary: None,
             artifact_path: None,
             failures: vec!["missing finding".to_string()],
+            cost_breakdowns: vec![],
             dag_traces: vec![],
         }];
 
@@ -292,6 +293,7 @@ mod tests {
                 reproduction_summary: None,
                 artifact_path: None,
                 failures: vec![],
+                cost_breakdowns: vec![],
                 dag_traces: vec![],
             },
             EvalFixtureResult {
@@ -318,6 +320,7 @@ mod tests {
                 reproduction_summary: None,
                 artifact_path: None,
                 failures: vec![],
+                cost_breakdowns: vec![],
                 dag_traces: vec![],
             },
         ];
@@ -369,6 +372,7 @@ mod tests {
                 reproduction_summary: None,
                 artifact_path: None,
                 failures: vec![],
+                cost_breakdowns: vec![],
                 dag_traces: vec![],
             },
             EvalFixtureResult {
@@ -390,6 +394,7 @@ mod tests {
                 reproduction_summary: None,
                 artifact_path: None,
                 failures: vec!["missing".to_string()],
+                cost_breakdowns: vec![],
                 dag_traces: vec![],
             },
         ];
 
@@ -90,6 +90,7 @@ fn trend_entry_for_report(report: &EvalReport) -> Option<TrendEntry> {
         verification_verified_checks: verification_health.map(|health| health.verified_checks),
         verification_total_checks: verification_health.map(|health| health.total_checks),
         verification_verified_pct: verification_health.map(|health| health.verified_pct),
+        cost_breakdowns: report.run.cost_breakdowns.clone(),
     })
 }
 
@@ -232,6 +233,7 @@ mod tests {
                 reproduction_summary: None,
                 artifact_path: None,
                 failures: vec![],
+                cost_breakdowns: vec![],
                 dag_traces: vec![],
             }],
         }
 
@@ -66,6 +66,7 @@ struct EvalFixtureDagContext {
     dag_config: EvalFixtureDagConfig,
     comments: Vec<core::Comment>,
     warnings: Vec<String>,
+    cost_breakdowns: Vec<crate::server::cost::CostBreakdownRow>,
     verification_report: Option<EvalVerificationReport>,
     agent_activity: Option<EvalAgentActivity>,
     reproduction_summary: Option<EvalReproductionSummary>,
@@ -82,6 +83,7 @@ enum EvalFixtureStageOutput {
     Review {
         comments: Vec<core::Comment>,
         warnings: Vec<String>,
+        cost_breakdowns: Vec<crate::server::cost::CostBreakdownRow>,
         verification_report: Option<EvalVerificationReport>,
         agent_activity: Option<EvalAgentActivity>,
         dag_traces: Vec<DagExecutionTrace>,
@@ -99,6 +101,7 @@ enum EvalFixtureStageOutput {
     ReproductionValidation {
         reproduction_summary: Option<EvalReproductionSummary>,
         warnings: Vec<String>,
+        cost_breakdowns: Vec<crate::server::cost::CostBreakdownRow>,
     },
     ArtifactCapture {
         artifact_path: Option<String>,
@@ -112,6 +115,7 @@ impl EvalFixtureDagContext {
             dag_config,
             comments: Vec::new(),
             warnings: Vec::new(),
+            cost_breakdowns: Vec::new(),
             verification_report: None,
             agent_activity: None,
             reproduction_summary: None,
@@ -147,6 +151,7 @@ impl EvalFixtureDagContext {
                 reproduction_summary: self.reproduction_summary,
                 artifact_path: self.artifact_path,
                 failures: self.failures,
+                cost_breakdowns: self.cost_breakdowns,
                 dag_traces,
             },
         })
@@ -409,9 +414,27 @@ fn spawn_stage(
             let repo_path = context.prepared.repo_path.clone();
             let config = config.clone();
             Ok(async move {
+                let generation_role = config.generation_model_role.as_str().to_string();
+                let generation_provider =
+                    config.inferred_provider_label_for_role(config.generation_model_role);
+                let generation_model = config.generation_model_name().to_string();
                 let review_result =
                     review_diff_content_raw(&diff_content, config, &repo_path).await?;
+                let cost_breakdowns = crate::server::cost::review_cost_breakdowns(
+                    crate::server::cost::CostBreakdownRequest {
+                        workload: "eval_generation",
+                        role: &generation_role,
+                        provider: generation_provider,
+                        model: &generation_model,
+                        prompt_tokens: review_result.total_prompt_tokens,
+                        completion_tokens: review_result.total_completion_tokens,
+                        total_tokens: review_result.total_tokens,
+                    },
+                    "eval_verification",
+                    review_result.verification_report.as_ref(),
+                );
                 Ok(EvalFixtureStageOutput::Review {
+                    cost_breakdowns,
                     verification_report: convert_verification_report(
                         review_result.verification_report,
                     ),
@@ -486,9 +509,27 @@ fn spawn_stage(
                     .as_ref()
                     .map(build_reproduction_warnings)
                     .unwrap_or_default();
+                let cost_breakdowns = reproduction_summary
+                    .as_ref()
+                    .and_then(|summary| {
+                        (summary.total_tokens > 0).then(|| {
+                            crate::server::cost::CostBreakdownRow::new(
+                                "eval_auditing",
+                                summary.role.as_str(),
+                                summary.provider.clone(),
+                                summary.model.as_str(),
+                                summary.prompt_tokens,
+                                summary.completion_tokens,
+                                summary.total_tokens,
+                            )
+                        })
+                    })
+                    .into_iter()
+                    .collect();
                 Ok(EvalFixtureStageOutput::ReproductionValidation {
                     reproduction_summary,
                     warnings,
+                    cost_breakdowns,
                 })
             }
             .boxed())
@@ -543,6 +584,7 @@ fn apply_stage_output(
             EvalFixtureStageOutput::Review {
                 comments,
                 warnings,
+                cost_breakdowns,
                 verification_report,
                 agent_activity,
                 dag_traces,
@@ -551,6 +593,7 @@ fn apply_stage_output(
             context.total_comments = comments.len();
             context.comments = comments;
             context.warnings = warnings;
+            context.cost_breakdowns = cost_breakdowns;
             context.verification_report = verification_report;
             context.agent_activity = agent_activity;
             context.dag_traces = dag_traces;
@@ -586,10 +629,12 @@ fn apply_stage_output(
             EvalFixtureStageOutput::ReproductionValidation {
                 reproduction_summary,
                 warnings,
+                cost_breakdowns,
             },
         ) => {
             context.reproduction_summary = reproduction_summary;
             context.warnings.extend(warnings);
+            context.cost_breakdowns.extend(cost_breakdowns);
             Ok(())
         }
         (
 
@@ -54,6 +54,8 @@ pub(super) async fn maybe_run_reproduction_validation(
 
     let model_config = config.to_model_config_for_role(config.auditing_model_role);
     let model_name = model_config.model_name.clone();
+    let role = config.auditing_model_role.as_str().to_string();
+    let provider = config.inferred_provider_label_for_role(config.auditing_model_role);
     let adapter: Arc<dyn adapters::llm::LLMAdapter> =
         Arc::from(adapters::llm::create_adapter(&model_config)?);
     let workspace = prepare_reproduction_workspace(prepared)?;
@@ -67,6 +69,9 @@ pub(super) async fn maybe_run_reproduction_validation(
     let tools = build_review_tools(tool_context, None);
 
     let mut checks = Vec::new();
+    let mut prompt_tokens = 0usize;
+    let mut completion_tokens = 0usize;
+    let mut total_tokens = 0usize;
     for comment in comments.iter().take(max_comments) {
         let (tool_evidence, tool_logs, tool_warnings) =
             gather_reproduction_evidence(&tools, comment, workspace.include_git_tools).await;
@@ -80,6 +85,11 @@ pub(super) async fn maybe_run_reproduction_validation(
         };
         match adapter.complete(request).await {
             Ok(response) => {
+                if let Some(usage) = response.usage.as_ref() {
+                    prompt_tokens += usage.prompt_tokens;
+                    completion_tokens += usage.completion_tokens;
+                    total_tokens += usage.total_tokens;
+                }
                 let parsed = parse_reproduction_response(&response.content);
                 let agent_activity = convert_agent_activity(Some(crate::review::AgentActivity {
                     total_iterations: usize::from(!tool_logs.is_empty()),
@@ -137,7 +147,15 @@ pub(super) async fn maybe_run_reproduction_validation(
         }
     }
 
-    Ok(Some(build_reproduction_summary(checks)))
+    Ok(Some(build_reproduction_summary(
+        checks,
+        model_name,
+        role,
+        provider,
+        prompt_tokens,
+        completion_tokens,
+        total_tokens,
+    )))
 }
 
 fn build_reproduction_prompt(
@@ -161,7 +179,15 @@ fn build_reproduction_prompt(
     )
 }
 
-fn build_reproduction_summary(checks: Vec<EvalReproductionCheck>) -> EvalReproductionSummary {
+fn build_reproduction_summary(
+    checks: Vec<EvalReproductionCheck>,
+    model: String,
+    role: String,
+    provider: Option<String>,
+    prompt_tokens: usize,
+    completion_tokens: usize,
+    total_tokens: usize,
+) -> EvalReproductionSummary {
     let mut summary = EvalReproductionSummary::default();
     for check in &checks {
         match check.reproduced {
@@ -170,6 +196,13 @@ fn build_reproduction_summary(checks: Vec<EvalReproductionCheck>) -> EvalReprodu
             None => summary.inconclusive += 1,
         }
     }
+    summary.model = model.clone();
+    summary.role = role;
+    summary.provider = provider;
+    summary.prompt_tokens = prompt_tokens;
+    summary.completion_tokens = completion_tokens;
+    summary.total_tokens = total_tokens;
+    summary.cost_estimate_usd = crate::server::cost::estimate_cost_usd(&model, total_tokens);
     summary.checks = checks;
     summary
 }
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+-- Add workload/role/provider/model cost breakdown rows per review event.`
	`2`	`+ALTER TABLE review_events`
	`3`	`+ADD COLUMN IF NOT EXISTS cost_breakdowns JSONB NOT NULL DEFAULT '[]';`
Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,7 @@ fn trend_entry_for_report(report: &EvalReport) -> Option<TrendEntry> {`
`90`	`90`	`verification_verified_checks: verification_health.map(\|health\| health.verified_checks),`
`91`	`91`	`verification_total_checks: verification_health.map(\|health\| health.total_checks),`
`92`	`92`	`verification_verified_pct: verification_health.map(\|health\| health.verified_pct),`
	`93`	`+ cost_breakdowns: report.run.cost_breakdowns.clone(),`
`93`	`94`	`})`
`94`	`95`	`}`
`95`	`96`
`@@ -232,6 +233,7 @@ mod tests {`
`232`	`233`	`reproduction_summary: None,`
`233`	`234`	`artifact_path: None,`
`234`	`235`	`failures: vec![],`
	`236`	`+ cost_breakdowns: vec![],`
`235`	`237`	`dag_traces: vec![],`
`236`	`238`	`}],`
`237`	`239`	`}`