|
1 | 1 | """Custom metrics using direct LLM integration.""" |
2 | 2 |
|
| 3 | +import json |
3 | 4 | import re |
4 | 5 | from typing import TYPE_CHECKING, Any, Optional |
5 | 6 |
|
|
9 | 10 | from lightspeed_evaluation.core.metrics.custom.prompts import ( |
10 | 11 | ANSWER_CORRECTNESS_PROMPT, |
11 | 12 | INTENT_EVALUATION_PROMPT, |
| 13 | + PROPOSAL_EVALUATION_CORRECTNESS_PROMPT, |
12 | 14 | ) |
13 | 15 | from lightspeed_evaluation.core.metrics.custom.proposal_eval import ( |
14 | 16 | evaluate_proposal_status, |
@@ -47,6 +49,9 @@ def __init__( |
47 | 49 | "intent_eval": self._evaluate_intent, |
48 | 50 | "tool_eval": self._evaluate_tool_calls, |
49 | 51 | "proposal_status": evaluate_proposal_status, |
| 52 | + "proposal_evaluation_correctness": ( |
| 53 | + self._evaluate_proposal_evaluation_correctness |
| 54 | + ), |
50 | 55 | } |
51 | 56 |
|
52 | 57 | print(f"✅ Custom Metrics initialized: {self.llm.model_name}") |
@@ -295,3 +300,119 @@ def _evaluate_intent( |
295 | 300 | return score, reason |
296 | 301 | except LLMError as e: |
297 | 302 | return None, f"Intent evaluation failed: {str(e)}" |
| 303 | + |
| 304 | + def _parse_proposal_eval_response( |
| 305 | + self, response: str |
| 306 | + ) -> tuple[Optional[float], str]: |
| 307 | + """Parse JSON LLM judge response for proposal evaluation. |
| 308 | +
|
| 309 | + Expected JSON schema:: |
| 310 | +
|
| 311 | + { |
| 312 | + "reasoning": "string", |
| 313 | + "diagnosis": float | null, |
| 314 | + "execution": float | null, |
| 315 | + "verification": float | null, |
| 316 | + "average": float |
| 317 | + } |
| 318 | + """ |
| 319 | + try: |
| 320 | + data = json.loads(response) |
| 321 | + except json.JSONDecodeError: |
| 322 | + return None, f"Invalid JSON from LLM: {response[:120]}" |
| 323 | + |
| 324 | + reasoning: str = data.get("reasoning", "") |
| 325 | + sub_scores: dict[str, Optional[float]] = { |
| 326 | + "diagnosis": self._try_parse_float(data.get("diagnosis")), |
| 327 | + "execution": self._try_parse_float(data.get("execution")), |
| 328 | + "verification": self._try_parse_float(data.get("verification")), |
| 329 | + } |
| 330 | + average: Optional[float] = self._try_parse_float(data.get("average")) |
| 331 | + |
| 332 | + present = [v for v in sub_scores.values() if v is not None] |
| 333 | + if average is None and present: |
| 334 | + average = sum(present) / len(present) |
| 335 | + |
| 336 | + parts = [ |
| 337 | + f"{dim}={v:.2f}" if v is not None else f"{dim}=N/A" |
| 338 | + for dim, v in sub_scores.items() |
| 339 | + ] |
| 340 | + if average is not None: |
| 341 | + parts.append(f"avg={average:.2f}") |
| 342 | + detail = ", ".join(parts) |
| 343 | + if reasoning: |
| 344 | + detail = f"{detail} — {reasoning}" |
| 345 | + |
| 346 | + return average, detail |
| 347 | + |
| 348 | + @staticmethod |
| 349 | + def _try_parse_float(value: Any) -> Optional[float]: |
| 350 | + """Try to parse a float from a value, return None on failure.""" |
| 351 | + try: |
| 352 | + return float(value) |
| 353 | + except (ValueError, TypeError): |
| 354 | + return None |
| 355 | + |
| 356 | + @staticmethod |
| 357 | + def _build_optional_expected_outcomes(turn_data: TurnData) -> str: |
| 358 | + """Build optional expected outcome sections for the judge prompt.""" |
| 359 | + sections: list[str] = [] |
| 360 | + mapping = { |
| 361 | + "Expected Analysis Outcome": turn_data.expected_analysis_outcome, |
| 362 | + "Expected Execution Outcome": turn_data.expected_execution_outcome, |
| 363 | + "Expected Verification Outcome": turn_data.expected_verification_outcome, |
| 364 | + } |
| 365 | + for label, value in mapping.items(): |
| 366 | + if value: |
| 367 | + sections.append(f"\n### {label}\n{value}") |
| 368 | + return "\n".join(sections) |
| 369 | + |
| 370 | + @staticmethod |
| 371 | + def _build_workflow_phases(turn_data: TurnData) -> str: |
| 372 | + """Build the workflow phases string for the judge prompt.""" |
| 373 | + phases = turn_data.proposal_phases |
| 374 | + if phases: |
| 375 | + return "Phases executed: " + ", ".join(phases) |
| 376 | + return "Phases executed: unknown (score only dimensions visible in the workflow summary)" |
| 377 | + |
| 378 | + def _evaluate_proposal_evaluation_correctness( |
| 379 | + self, |
| 380 | + _conv_data: Any, |
| 381 | + _turn_idx: Optional[int], |
| 382 | + turn_data: Optional[TurnData], |
| 383 | + is_conversation: bool, |
| 384 | + ) -> tuple[Optional[float], str]: |
| 385 | + """Evaluate agentic remediation workflow quality using LLM judge.""" |
| 386 | + if is_conversation: |
| 387 | + return None, "Proposal evaluation correctness is a turn-level metric" |
| 388 | + |
| 389 | + if turn_data is None or not turn_data.response: |
| 390 | + return None, "TurnData with response is required for proposal evaluation" |
| 391 | + |
| 392 | + if not turn_data.expected_outcome: |
| 393 | + return None, "No expected outcome provided for proposal evaluation" |
| 394 | + |
| 395 | + optional_sections = self._build_optional_expected_outcomes(turn_data) |
| 396 | + workflow_phases = self._build_workflow_phases(turn_data) |
| 397 | + |
| 398 | + prompt = PROPOSAL_EVALUATION_CORRECTNESS_PROMPT.format( |
| 399 | + request=turn_data.query or "N/A", |
| 400 | + workflow_phases=workflow_phases, |
| 401 | + workflow_summary=turn_data.response, |
| 402 | + expected_outcome=turn_data.expected_outcome, |
| 403 | + optional_expected_outcomes=optional_sections, |
| 404 | + ) |
| 405 | + |
| 406 | + try: |
| 407 | + llm_response = self._call_llm(prompt) |
| 408 | + score, reason = self._parse_proposal_eval_response(llm_response) |
| 409 | + |
| 410 | + if score is None: |
| 411 | + return ( |
| 412 | + None, |
| 413 | + f"Could not parse score from LLM response: {llm_response[:100]}...", |
| 414 | + ) |
| 415 | + |
| 416 | + return score, f"Proposal evaluation correctness: {reason}" |
| 417 | + except LLMError as e: |
| 418 | + return None, f"Proposal evaluation correctness failed: {str(e)}" |
0 commit comments