|
| 1 | +"""Proposal status evaluation for CRD-based agent workflows.""" |
| 2 | + |
| 3 | +from typing import Any, Optional |
| 4 | + |
| 5 | +from lightspeed_evaluation.core.models import TurnData |
| 6 | + |
| 7 | + |
| 8 | +def _derive_phase( |
| 9 | + conditions: list[dict[str, Any]], |
| 10 | + proposal_spec: Optional[dict[str, Any]] = None, |
| 11 | +) -> str: |
| 12 | + """Derive the terminal phase from CRD conditions. |
| 13 | +
|
| 14 | + Args: |
| 15 | + conditions: List of condition dicts from proposal_status. |
| 16 | + proposal_spec: Proposal spec to determine the last expected step. |
| 17 | +
|
| 18 | + Returns: |
| 19 | + Phase string: Completed, Failed, Denied, Escalated, or InProgress. |
| 20 | + """ |
| 21 | + by_type = {c["type"]: c for c in conditions} |
| 22 | + |
| 23 | + if by_type.get("Denied", {}).get("status") == "True": |
| 24 | + return "Denied" |
| 25 | + if by_type.get("Escalated", {}).get("status") == "True": |
| 26 | + return "Escalated" |
| 27 | + |
| 28 | + for c in conditions: |
| 29 | + if c.get("status") == "False" and c.get("reason") != "RetryingExecution": |
| 30 | + return "Failed" |
| 31 | + |
| 32 | + step_to_condition = {"verification": "Verified", "execution": "Executed"} |
| 33 | + if proposal_spec: |
| 34 | + last = next( |
| 35 | + (cond for step, cond in step_to_condition.items() if step in proposal_spec), |
| 36 | + "Analyzed", |
| 37 | + ) |
| 38 | + else: |
| 39 | + last = "Analyzed" |
| 40 | + for step in ("Verified", "Executed", "Analyzed"): |
| 41 | + if by_type.get(step, {}).get("status") == "True": |
| 42 | + last = step |
| 43 | + break |
| 44 | + |
| 45 | + if by_type.get(last, {}).get("status") == "True": |
| 46 | + return "Completed" |
| 47 | + |
| 48 | + return "InProgress" |
| 49 | + |
| 50 | + |
def _check_phase(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
    proposal_spec: Optional[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Compare the derived phase against the expected "phase" value.

    Args:
        expected: Expected-status assertions; only the "phase" key is read.
        conditions: List of condition dicts from proposal_status.
        proposal_spec: Proposal spec forwarded to the phase derivation.

    Returns:
        None when no "phase" assertion is present, otherwise a
        (passed, reason) tuple.
    """
    wanted = expected.get("phase")
    if wanted is None:
        # Nothing asserted about the exact phase; let the caller skip this check.
        return None

    derived = _derive_phase(conditions, proposal_spec)
    outcome = (
        (True, f"Phase matches: {derived}")
        if derived == wanted
        else (False, f"Phase mismatch: expected '{wanted}', got '{derived}'")
    )
    return outcome
| 65 | + |
| 66 | + |
def _check_phase_in(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
    proposal_spec: Optional[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Check that the derived phase is one of the values under "phase_in".

    Args:
        expected: Expected-status assertions; only the "phase_in" key is read.
        conditions: List of condition dicts from proposal_status.
        proposal_spec: Proposal spec forwarded to the phase derivation.

    Returns:
        None when no "phase_in" assertion is present, otherwise a
        (passed, reason) tuple.
    """
    allowed = expected.get("phase_in")
    if allowed is None:
        # Nothing asserted about phase membership; let the caller skip this check.
        return None

    derived = _derive_phase(conditions, proposal_spec)
    outcome = (
        (True, f"Phase '{derived}' in {allowed}")
        if derived in allowed
        else (False, f"Phase '{derived}' not in {allowed}")
    )
    return outcome
| 81 | + |
| 82 | + |
| 83 | +def _check_conditions( |
| 84 | + expected: dict[str, Any], |
| 85 | + conditions: list[dict[str, Any]], |
| 86 | +) -> Optional[tuple[bool, str]]: |
| 87 | + """Check specific condition assertions.""" |
| 88 | + expected_conditions = expected.get("conditions") |
| 89 | + if expected_conditions is None: |
| 90 | + return None |
| 91 | + |
| 92 | + by_type = {c["type"]: c for c in conditions} |
| 93 | + |
| 94 | + for exp_cond in expected_conditions: |
| 95 | + cond_type = exp_cond.get("type") |
| 96 | + if cond_type is None: |
| 97 | + return False, "Condition assertion missing 'type' field" |
| 98 | + |
| 99 | + actual_cond = by_type.get(cond_type) |
| 100 | + if actual_cond is None: |
| 101 | + return False, f"Condition '{cond_type}' not found in proposal status" |
| 102 | + |
| 103 | + exp_status = exp_cond.get("status") |
| 104 | + if exp_status is not None and actual_cond.get("status") != exp_status: |
| 105 | + return ( |
| 106 | + False, |
| 107 | + f"Condition '{cond_type}' status: " |
| 108 | + f"expected '{exp_status}', got '{actual_cond.get('status')}'", |
| 109 | + ) |
| 110 | + |
| 111 | + exp_reason = exp_cond.get("reason") |
| 112 | + if exp_reason is not None and actual_cond.get("reason") != exp_reason: |
| 113 | + return ( |
| 114 | + False, |
| 115 | + f"Condition '{cond_type}' reason: " |
| 116 | + f"expected '{exp_reason}', got '{actual_cond.get('reason')}'", |
| 117 | + ) |
| 118 | + |
| 119 | + return True, "All condition assertions passed" |
| 120 | + |
| 121 | + |
| 122 | +def _check_verification( |
| 123 | + expected: dict[str, Any], |
| 124 | + conditions: list[dict[str, Any]], |
| 125 | +) -> Optional[tuple[bool, str]]: |
| 126 | + """Check verification-specific assertions.""" |
| 127 | + verification = expected.get("verification") |
| 128 | + if verification is None: |
| 129 | + return None |
| 130 | + |
| 131 | + by_type = {c["type"]: c for c in conditions} |
| 132 | + verified = by_type.get("Verified") |
| 133 | + |
| 134 | + if verified is None: |
| 135 | + return False, "Verified condition not found in proposal status" |
| 136 | + |
| 137 | + passed = verification.get("passed") |
| 138 | + if passed is not None: |
| 139 | + actual_passed = verified.get("status") == "True" |
| 140 | + if actual_passed != passed: |
| 141 | + return ( |
| 142 | + False, |
| 143 | + f"Verification passed: expected {passed}, got {actual_passed}", |
| 144 | + ) |
| 145 | + |
| 146 | + summary_contains = verification.get("summary_contains") |
| 147 | + if summary_contains is not None: |
| 148 | + message = verified.get("message", "") |
| 149 | + if summary_contains.lower() not in message.lower(): |
| 150 | + return ( |
| 151 | + False, |
| 152 | + f"Verification summary does not contain '{summary_contains}': " |
| 153 | + f"got '{message[:200]}'", |
| 154 | + ) |
| 155 | + |
| 156 | + return True, "Verification assertions passed" |
| 157 | + |
| 158 | + |
def evaluate_proposal_status(
    _conv_data: Any,
    _turn_idx: Optional[int],
    turn_data: Optional[TurnData],
    is_conversation: bool,
) -> tuple[Optional[float], str]:
    """Evaluate proposal status against expected assertions.

    Runs the phase, phase_in, conditions and verification checks against the
    conditions found in turn_data.proposal_status. A check whose key is
    absent from expected_proposal_status is skipped.

    Args:
        _conv_data: Conversation data (unused).
        _turn_idx: Turn index (unused).
        turn_data: Turn data with proposal_status and expected_proposal_status.
        is_conversation: Whether this is conversation-level evaluation.

    Returns:
        Tuple of (score, reason). Score is 1.0 if all checks pass, 0.0 on
        first failure or when proposal_status is empty, None if metric
        should be skipped.
    """
    if is_conversation:
        return None, "Proposal status is a turn-level metric"

    if turn_data is None:
        return None, "TurnData is required for proposal status evaluation"

    if not turn_data.expected_proposal_status:
        return None, "No expected_proposal_status provided"

    if not turn_data.proposal_status:
        return 0.0, "proposal_status not populated by driver"

    expected = turn_data.expected_proposal_status
    # "conditions" may be present but null; normalise to an empty list so the
    # helpers can always iterate over it.
    conditions = turn_data.proposal_status.get("conditions") or []
    proposal_spec = turn_data.proposal_spec

    checks = [
        _check_phase(expected, conditions, proposal_spec),
        _check_phase_in(expected, conditions, proposal_spec),
        _check_conditions(expected, conditions),
        _check_verification(expected, conditions),
    ]

    reasons: list[str] = []
    for result in checks:
        if result is None:
            # The corresponding assertion was not configured.
            continue
        passed, reason = result
        if not passed:
            return 0.0, reason
        reasons.append(reason)

    return 1.0, "; ".join(reasons) if reasons else "All checks passed"
0 commit comments