Skip to content

Commit af6e041

Browse files
rioloc and claude committed
feat: integrate driver and rename to ProposalDriver
- Wire ProposalDriver into the evaluation pipeline: add "proposal" to SUPPORTED_AGENT_TYPES, move the agent config (ProposalAgentConfig, formerly SubprocessAgentConfig) to core/models, register in AgentDriverRegistry via new registry module, and update all exports. Fixes ProposalDriver.__init__ to accept enabled kwarg.
- Extend unit tests
- Add agentic integration tests
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d12dfb5 commit af6e041

23 files changed

Lines changed: 1722 additions & 419 deletions

src/lightspeed_evaluation/core/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666

6767
# Agent Constants
6868
DEFAULT_AGENT_TYPE = "http_api"
69-
SUPPORTED_AGENT_TYPES = ["http_api"]
69+
SUPPORTED_AGENT_TYPES = ["http_api", "proposal"]
7070

7171
# Frameworks that don't require judge LLM (NLP, script-based evaluations)
7272
NON_LLM_FRAMEWORKS = frozenset({"nlp", "script"})

src/lightspeed_evaluation/core/metrics/custom/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
ANSWER_CORRECTNESS_PROMPT,
77
INTENT_EVALUATION_PROMPT,
88
)
9+
from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
10+
evaluate_proposal_status,
11+
)
912
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
1013

1114
__all__ = [
1215
"CustomMetrics",
1316
"evaluate_keywords",
17+
"evaluate_proposal_status",
1418
"evaluate_tool_calls",
1519
# Prompts
1620
"ANSWER_CORRECTNESS_PROMPT",

src/lightspeed_evaluation/core/metrics/custom/custom.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
INTENT_EVALUATION_PROMPT,
1111
)
1212
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
13+
from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
14+
evaluate_proposal_status,
15+
)
1316
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
1417
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
1518
from lightspeed_evaluation.core.system.exceptions import LLMError
@@ -44,6 +47,7 @@ def __init__(
4447
"answer_correctness": self._evaluate_answer_correctness,
4548
"intent_eval": self._evaluate_intent,
4649
"tool_eval": self._evaluate_tool_calls,
50+
"proposal_status": evaluate_proposal_status,
4751
}
4852

4953
print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
"""Proposal status evaluation for CRD-based agent workflows."""
2+
3+
from typing import Any, Optional
4+
5+
from lightspeed_evaluation.core.models import TurnData
6+
7+
8+
def _derive_phase(
9+
conditions: list[dict[str, Any]],
10+
proposal_spec: Optional[dict[str, Any]] = None,
11+
) -> str:
12+
"""Derive the terminal phase from CRD conditions.
13+
14+
Args:
15+
conditions: List of condition dicts from proposal_status.
16+
proposal_spec: Proposal spec to determine the last expected step.
17+
18+
Returns:
19+
Phase string: Completed, Failed, Denied, Escalated, or InProgress.
20+
"""
21+
by_type = {c["type"]: c for c in conditions}
22+
23+
if by_type.get("Denied", {}).get("status") == "True":
24+
return "Denied"
25+
if by_type.get("Escalated", {}).get("status") == "True":
26+
return "Escalated"
27+
28+
for c in conditions:
29+
if c.get("status") == "False" and c.get("reason") != "RetryingExecution":
30+
return "Failed"
31+
32+
step_to_condition = {"verification": "Verified", "execution": "Executed"}
33+
if proposal_spec:
34+
last = next(
35+
(cond for step, cond in step_to_condition.items() if step in proposal_spec),
36+
"Analyzed",
37+
)
38+
else:
39+
last = "Analyzed"
40+
for step in ("Verified", "Executed", "Analyzed"):
41+
if by_type.get(step, {}).get("status") == "True":
42+
last = step
43+
break
44+
45+
if by_type.get(last, {}).get("status") == "True":
46+
return "Completed"
47+
48+
return "InProgress"
49+
50+
51+
def _check_phase(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
    proposal_spec: Optional[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Compare the derived phase with the ``phase`` assertion, if any.

    Args:
        expected: Expected-status assertions; only the ``phase`` key is read.
        conditions: Condition dicts passed through to ``_derive_phase``.
        proposal_spec: Proposal spec passed through to ``_derive_phase``.

    Returns:
        None when ``expected`` has no ``phase`` entry, otherwise a
        ``(passed, reason)`` tuple.
    """
    wanted = expected.get("phase")
    if wanted is None:
        # Nothing asserted about the phase; the caller skips this check.
        return None

    derived = _derive_phase(conditions, proposal_spec)
    if derived != wanted:
        return False, f"Phase mismatch: expected '{wanted}', got '{derived}'"
    return True, f"Phase matches: {derived}"
65+
66+
67+
def _check_phase_in(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
    proposal_spec: Optional[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Check that the derived phase is one of the ``phase_in`` values, if given.

    Args:
        expected: Expected-status assertions; only the ``phase_in`` key is read.
        conditions: Condition dicts passed through to ``_derive_phase``.
        proposal_spec: Proposal spec passed through to ``_derive_phase``.

    Returns:
        None when ``expected`` has no ``phase_in`` entry, otherwise a
        ``(passed, reason)`` tuple.
    """
    allowed = expected.get("phase_in")
    if allowed is None:
        # Nothing asserted about phase membership; the caller skips this check.
        return None

    derived = _derive_phase(conditions, proposal_spec)
    is_member = derived in allowed
    relation = "in" if is_member else "not in"
    return is_member, f"Phase '{derived}' {relation} {allowed}"
81+
82+
83+
def _check_conditions(
84+
expected: dict[str, Any],
85+
conditions: list[dict[str, Any]],
86+
) -> Optional[tuple[bool, str]]:
87+
"""Check specific condition assertions."""
88+
expected_conditions = expected.get("conditions")
89+
if expected_conditions is None:
90+
return None
91+
92+
by_type = {c["type"]: c for c in conditions}
93+
94+
for exp_cond in expected_conditions:
95+
cond_type = exp_cond.get("type")
96+
if cond_type is None:
97+
return False, "Condition assertion missing 'type' field"
98+
99+
actual_cond = by_type.get(cond_type)
100+
if actual_cond is None:
101+
return False, f"Condition '{cond_type}' not found in proposal status"
102+
103+
exp_status = exp_cond.get("status")
104+
if exp_status is not None and actual_cond.get("status") != exp_status:
105+
return (
106+
False,
107+
f"Condition '{cond_type}' status: "
108+
f"expected '{exp_status}', got '{actual_cond.get('status')}'",
109+
)
110+
111+
exp_reason = exp_cond.get("reason")
112+
if exp_reason is not None and actual_cond.get("reason") != exp_reason:
113+
return (
114+
False,
115+
f"Condition '{cond_type}' reason: "
116+
f"expected '{exp_reason}', got '{actual_cond.get('reason')}'",
117+
)
118+
119+
return True, "All condition assertions passed"
120+
121+
122+
def _check_verification(
123+
expected: dict[str, Any],
124+
conditions: list[dict[str, Any]],
125+
) -> Optional[tuple[bool, str]]:
126+
"""Check verification-specific assertions."""
127+
verification = expected.get("verification")
128+
if verification is None:
129+
return None
130+
131+
by_type = {c["type"]: c for c in conditions}
132+
verified = by_type.get("Verified")
133+
134+
if verified is None:
135+
return False, "Verified condition not found in proposal status"
136+
137+
passed = verification.get("passed")
138+
if passed is not None:
139+
actual_passed = verified.get("status") == "True"
140+
if actual_passed != passed:
141+
return (
142+
False,
143+
f"Verification passed: expected {passed}, got {actual_passed}",
144+
)
145+
146+
summary_contains = verification.get("summary_contains")
147+
if summary_contains is not None:
148+
message = verified.get("message", "")
149+
if summary_contains.lower() not in message.lower():
150+
return (
151+
False,
152+
f"Verification summary does not contain '{summary_contains}': "
153+
f"got '{message[:200]}'",
154+
)
155+
156+
return True, "Verification assertions passed"
157+
158+
159+
def evaluate_proposal_status(
    _conv_data: Any,
    _turn_idx: Optional[int],
    turn_data: Optional[TurnData],
    is_conversation: bool,
) -> tuple[Optional[float], str]:
    """Evaluate proposal status against expected assertions.

    The assertions in ``turn_data.expected_proposal_status`` are checked in
    a fixed order (phase, phase_in, conditions, verification). Assertion
    groups that are absent are skipped.

    Args:
        _conv_data: Conversation data (unused).
        _turn_idx: Turn index (unused).
        turn_data: Turn data with proposal_status and expected_proposal_status.
        is_conversation: Whether this is conversation-level evaluation.

    Returns:
        Tuple of (score, reason). Score is 1.0 if all checks pass, 0.0 on
        first failure or when proposal_status is empty, None if metric
        should be skipped. Note that an expected_proposal_status containing
        none of the recognised keys yields 1.0 with "All checks passed".
    """
    if is_conversation:
        return None, "Proposal status is a turn-level metric"

    if turn_data is None:
        return None, "TurnData is required for proposal status evaluation"

    if not turn_data.expected_proposal_status:
        return None, "No expected_proposal_status provided"

    if not turn_data.proposal_status:
        return 0.0, "proposal_status not populated by driver"

    expected = turn_data.expected_proposal_status
    # `or []` also covers an explicit null "conditions" value, which
    # .get("conditions", []) would pass through as None.
    conditions = turn_data.proposal_status.get("conditions") or []
    proposal_spec = turn_data.proposal_spec

    # Checks are run one at a time so that the first failing assertion is
    # reported without evaluating the remaining ones.
    checks = (
        (_check_phase, (expected, conditions, proposal_spec)),
        (_check_phase_in, (expected, conditions, proposal_spec)),
        (_check_conditions, (expected, conditions)),
        (_check_verification, (expected, conditions)),
    )

    reasons: list[str] = []
    for check, args in checks:
        result = check(*args)
        if result is None:
            # This assertion group was not requested.
            continue
        passed, reason = result
        if not passed:
            return 0.0, reason
        reasons.append(reason)

    return 1.0, ("; ".join(reasons) if reasons else "All checks passed")

src/lightspeed_evaluation/core/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
HttpApiAgentConfig,
77
MCPHeadersConfig,
88
MCPServerConfig,
9+
ProposalAgentConfig,
910
)
1011
from lightspeed_evaluation.core.models.api import (
1112
APIRequest,
@@ -57,6 +58,7 @@
5758
"HttpApiAgentConfig",
5859
"MCPHeadersConfig",
5960
"MCPServerConfig",
61+
"ProposalAgentConfig",
6062
# Data models
6163
"TurnData",
6264
"EvaluationData",

src/lightspeed_evaluation/core/models/agents.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,22 @@ class HttpApiAgentConfig(HttpApiBaseFields):
134134
)
135135

136136

137+
class ProposalAgentConfig(BaseModel):
    """Configuration for a Proposal CRD-based agent.

    Selected by ``type: "proposal"`` in the ``AgentDefinition`` union.
    ``namespace`` is the only field without a default.
    """

    # extra="forbid": presumably pydantic's ConfigDict, so unknown keys are
    # rejected at validation time -- confirm against the file's imports.
    model_config = ConfigDict(extra="forbid")

    # Fixed tag identifying this config variant.
    type: Literal["proposal"] = "proposal"
    # Required. Presumably the Kubernetes namespace the Proposal resources
    # live in -- TODO confirm against the driver.
    namespace: str
    # NOTE(review): names suggest proposals are approved automatically and
    # removed after the run; the consuming driver is not visible here.
    auto_approve: bool = True
    cleanup_proposals: bool = True
    # Both must be strictly positive (gt=0). Units are not stated here;
    # presumably seconds -- TODO confirm.
    timeout: int = Field(default=900, gt=0)
    poll_interval: int = Field(default=2, gt=0)
148+
149+
137150
# Discriminated union of all agent config types; extend by adding new
138151
# config classes to support additional agent types.
139-
AgentDefinition = Union[HttpApiAgentConfig]
152+
AgentDefinition = Union[HttpApiAgentConfig, ProposalAgentConfig]
140153

141154

142155
class AgentDefaultConfig(BaseModel):

src/lightspeed_evaluation/core/models/data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ class TurnData(StreamingMetricsMixin):
116116
description="Expected proposal status for assertion metrics",
117117
)
118118
proposal_status: Optional[dict[str, Any]] = Field(
119-
default=None, description="Raw CRD status populated by SubprocessDriver"
119+
default=None, description="Raw CRD status populated by ProposalDriver"
120120
)
121121

122122
# Set of turn metrics that don't pass the validation to ignore them later

src/lightspeed_evaluation/core/system/validator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@
5858
"with 'tool_name', 'arguments', and optional 'result'"
5959
),
6060
},
61+
"custom:proposal_status": {
62+
"required_fields": ["expected_proposal_status"],
63+
"description": "requires 'expected_proposal_status' field",
64+
},
6165
"script:action_eval": {
6266
"required_fields": ["verify_script"],
6367
"description": "requires 'verify_script' field",

src/lightspeed_evaluation/pipeline/evaluation/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender
1010
from lightspeed_evaluation.pipeline.evaluation.driver import (
1111
AgentDriver,
12-
AgentDriverRegistry,
12+
ProposalDriver,
1313
)
14+
from lightspeed_evaluation.pipeline.evaluation.registry import AgentDriverRegistry
1415
from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler
1516
from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator
1617
from lightspeed_evaluation.pipeline.evaluation.pipeline import EvaluationPipeline
@@ -32,7 +33,7 @@
3233
"AgentDriver",
3334
),
3435
"AgentDriverRegistry": (
35-
"lightspeed_evaluation.pipeline.evaluation.driver",
36+
"lightspeed_evaluation.pipeline.evaluation.registry",
3637
"AgentDriverRegistry",
3738
),
3839
"ConversationProcessor": (
@@ -47,6 +48,10 @@
4748
"lightspeed_evaluation.pipeline.evaluation.evaluator",
4849
"MetricsEvaluator",
4950
),
51+
"ProposalDriver": (
52+
"lightspeed_evaluation.pipeline.evaluation.driver",
53+
"ProposalDriver",
54+
),
5055
}
5156

5257
__getattr__ = create_lazy_getattr(_LAZY_IMPORTS, __name__)

0 commit comments

Comments
 (0)