Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/lightspeed_evaluation/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@

# Agent Constants
DEFAULT_AGENT_TYPE = "http_api"
SUPPORTED_AGENT_TYPES = ["http_api"]
SUPPORTED_AGENT_TYPES = ["http_api", "proposal"]

# Frameworks that don't require judge LLM (NLP, script-based evaluations)
NON_LLM_FRAMEWORKS = frozenset({"nlp", "script"})
Expand Down
4 changes: 4 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
ANSWER_CORRECTNESS_PROMPT,
INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
evaluate_proposal_status,
)
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls

__all__ = [
"CustomMetrics",
"evaluate_keywords",
"evaluate_proposal_status",
"evaluate_tool_calls",
# Prompts
"ANSWER_CORRECTNESS_PROMPT",
Expand Down
4 changes: 4 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
evaluate_proposal_status,
)
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.system.exceptions import LLMError
Expand Down Expand Up @@ -44,6 +47,7 @@ def __init__(
"answer_correctness": self._evaluate_answer_correctness,
"intent_eval": self._evaluate_intent,
"tool_eval": self._evaluate_tool_calls,
"proposal_status": evaluate_proposal_status,
}

print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
Expand Down
209 changes: 209 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
"""Proposal status evaluation for CRD-based agent workflows."""

from typing import Any, Optional

from lightspeed_evaluation.core.models import TurnData


def _derive_phase(
    conditions: list[dict[str, Any]],
    proposal_spec: Optional[dict[str, Any]] = None,
) -> str:
    """Derive the terminal phase from CRD conditions.

    Precedence, highest first:

    1. ``Denied`` / ``Escalated`` when that condition has status ``"True"``.
    2. ``Failed`` when any condition has status ``"False"``, unless its
       reason is ``"RetryingExecution"``.
    3. ``Completed`` when the last expected step's condition is ``"True"``.
    4. ``InProgress`` otherwise.

    With a non-empty ``proposal_spec`` the last expected step is ``Verified``
    if the spec has a ``verification`` key, else ``Executed`` if it has an
    ``execution`` key, else ``Analyzed``. Without a spec, any of ``Verified``,
    ``Executed`` or ``Analyzed`` being ``"True"`` counts as completion.

    Args:
        conditions: List of condition dicts from proposal_status.
        proposal_spec: Proposal spec to determine the last expected step.

    Returns:
        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
    """
    # Entries without a "type" cannot be looked up by name; leave them out of
    # the index instead of raising KeyError. They still take part in the
    # failure scan below, which only reads "status" and "reason".
    by_type = {c["type"]: c for c in conditions if "type" in c}

    def is_true(cond_type: str) -> bool:
        """Return whether the named condition exists with status "True"."""
        return by_type.get(cond_type, {}).get("status") == "True"

    if is_true("Denied"):
        return "Denied"
    if is_true("Escalated"):
        return "Escalated"

    # NOTE(review): a condition such as Denied or Escalated reported with
    # status "False" is also treated as a failure here -- confirm the
    # operator never publishes those with status "False".
    if any(
        c.get("status") == "False" and c.get("reason") != "RetryingExecution"
        for c in conditions
    ):
        return "Failed"

    if proposal_spec:
        # The spec tells us which step is expected to run last.
        if "verification" in proposal_spec:
            last = "Verified"
        elif "execution" in proposal_spec:
            last = "Executed"
        else:
            last = "Analyzed"
        completed = is_true(last)
    else:
        # No spec to consult: any finished step counts as completion.
        completed = any(
            is_true(step) for step in ("Verified", "Executed", "Analyzed")
        )

    return "Completed" if completed else "InProgress"


def _check_phase(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
    proposal_spec: Optional[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Compare the derived phase with the expected ``phase`` value.

    Args:
        expected: Expected-status assertions; only the ``phase`` key is read.
        conditions: List of condition dicts from proposal_status.
        proposal_spec: Proposal spec forwarded to the phase derivation.

    Returns:
        None when no ``phase`` assertion is present, otherwise a
        (passed, reason) tuple.
    """
    wanted = expected.get("phase")
    if wanted is None:
        # Nothing to assert for this check.
        return None

    derived = _derive_phase(conditions, proposal_spec)
    if derived == wanted:
        outcome = (True, f"Phase matches: {derived}")
    else:
        outcome = (False, f"Phase mismatch: expected '{wanted}', got '{derived}'")
    return outcome


def _check_phase_in(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
    proposal_spec: Optional[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Check that the derived phase is one of the ``phase_in`` values.

    Args:
        expected: Expected-status assertions; only the ``phase_in`` key is read.
        conditions: List of condition dicts from proposal_status.
        proposal_spec: Proposal spec forwarded to the phase derivation.

    Returns:
        None when no ``phase_in`` assertion is present, otherwise a
        (passed, reason) tuple.
    """
    allowed = expected.get("phase_in")
    if allowed is None:
        # Nothing to assert for this check.
        return None

    derived = _derive_phase(conditions, proposal_spec)
    if derived in allowed:
        outcome = (True, f"Phase '{derived}' in {allowed}")
    else:
        outcome = (False, f"Phase '{derived}' not in {allowed}")
    return outcome


def _check_conditions(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Check specific condition assertions.

    Each entry under ``expected["conditions"]`` names a condition ``type``
    and may pin its ``status`` and/or ``reason``. Checking stops at the
    first assertion that does not hold.

    Args:
        expected: Expected-status assertions; only ``conditions`` is read.
        conditions: List of condition dicts from proposal_status.

    Returns:
        None when no ``conditions`` assertion is present, otherwise a
        (passed, reason) tuple.
    """
    expected_conditions = expected.get("conditions")
    if expected_conditions is None:
        return None

    # Actual conditions without a "type" can never satisfy an assertion;
    # skip them rather than crashing the whole metric with KeyError.
    by_type = {c["type"]: c for c in conditions if "type" in c}

    for exp_cond in expected_conditions:
        cond_type = exp_cond.get("type")
        if cond_type is None:
            return False, "Condition assertion missing 'type' field"

        actual_cond = by_type.get(cond_type)
        if actual_cond is None:
            return False, f"Condition '{cond_type}' not found in proposal status"

        # "status" and "reason" are optional; only asserted keys are compared.
        for field in ("status", "reason"):
            wanted = exp_cond.get(field)
            if wanted is not None and actual_cond.get(field) != wanted:
                return (
                    False,
                    f"Condition '{cond_type}' {field}: "
                    f"expected '{wanted}', got '{actual_cond.get(field)}'",
                )

    return True, "All condition assertions passed"


def _check_verification(
    expected: dict[str, Any],
    conditions: list[dict[str, Any]],
) -> Optional[tuple[bool, str]]:
    """Check verification-specific assertions.

    Reads ``expected["verification"]``, which may contain ``passed``
    (compared with whether the ``Verified`` condition has status ``"True"``)
    and ``summary_contains`` (case-insensitive substring match against the
    ``Verified`` condition's ``message``).

    Args:
        expected: Expected-status assertions; only ``verification`` is read.
        conditions: List of condition dicts from proposal_status.

    Returns:
        None when no ``verification`` assertion is present, otherwise a
        (passed, reason) tuple.
    """
    verification = expected.get("verification")
    if verification is None:
        return None

    # Only the "Verified" condition matters here; entries without a "type"
    # are ignored instead of raising KeyError.
    verified = next(
        (c for c in reversed(conditions) if c.get("type") == "Verified"),
        None,
    )
    if verified is None:
        return False, "Verified condition not found in proposal status"

    passed = verification.get("passed")
    if passed is not None:
        actual_passed = verified.get("status") == "True"
        if actual_passed != passed:
            return (
                False,
                f"Verification passed: expected {passed}, got {actual_passed}",
            )

    summary_contains = verification.get("summary_contains")
    if summary_contains is not None:
        # "message" may be missing or explicitly null; treat both as empty.
        message = verified.get("message") or ""
        if summary_contains.lower() not in message.lower():
            return (
                False,
                f"Verification summary does not contain '{summary_contains}': "
                f"got '{message[:200]}'",
            )

    return True, "Verification assertions passed"


def evaluate_proposal_status(
    _conv_data: Any,
    _turn_idx: Optional[int],
    turn_data: Optional[TurnData],
    is_conversation: bool,
) -> tuple[Optional[float], str]:
    """Evaluate proposal status against expected assertions.

    Runs the phase, phase_in, conditions and verification checks in that
    order. Checks whose key is absent from ``expected_proposal_status`` are
    skipped; evaluation stops at the first failing check.

    Args:
        _conv_data: Conversation data (unused).
        _turn_idx: Turn index (unused).
        turn_data: Turn data with proposal_status and expected_proposal_status.
        is_conversation: Whether this is conversation-level evaluation.

    Returns:
        Tuple of (score, reason). Score is 1.0 if all checks pass, 0.0 on
        first failure, None if metric should be skipped.
    """
    if is_conversation:
        return None, "Proposal status is a turn-level metric"

    if turn_data is None:
        return None, "TurnData is required for proposal status evaluation"

    if not turn_data.expected_proposal_status:
        return None, "No expected_proposal_status provided"

    if not turn_data.proposal_status:
        return 0.0, "proposal_status not populated by driver"

    expected = turn_data.expected_proposal_status
    # "conditions" may be absent or explicitly null in the raw status; both
    # are treated as "no conditions reported yet".
    conditions = turn_data.proposal_status.get("conditions") or []
    proposal_spec = turn_data.proposal_spec

    # Deferred so that a failing check short-circuits the remaining ones.
    checks = (
        lambda: _check_phase(expected, conditions, proposal_spec),
        lambda: _check_phase_in(expected, conditions, proposal_spec),
        lambda: _check_conditions(expected, conditions),
        lambda: _check_verification(expected, conditions),
    )

    reasons: list[str] = []
    for run_check in checks:
        result = run_check()
        if result is None:
            # No assertion of this kind was requested.
            continue
        passed, reason = result
        if not passed:
            return 0.0, reason
        reasons.append(reason)

    # With no recognized assertion keys nothing was checked and the metric
    # passes vacuously.
    return 1.0, ("; ".join(reasons) if reasons else "All checks passed")
2 changes: 2 additions & 0 deletions src/lightspeed_evaluation/core/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
HttpApiAgentConfig,
MCPHeadersConfig,
MCPServerConfig,
ProposalAgentConfig,
)
from lightspeed_evaluation.core.models.api import (
APIRequest,
Expand Down Expand Up @@ -57,6 +58,7 @@
"HttpApiAgentConfig",
"MCPHeadersConfig",
"MCPServerConfig",
"ProposalAgentConfig",
# Data models
"TurnData",
"EvaluationData",
Expand Down
15 changes: 14 additions & 1 deletion src/lightspeed_evaluation/core/models/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,22 @@ class HttpApiAgentConfig(HttpApiBaseFields):
)


class ProposalAgentConfig(BaseModel):
    """Configuration for a Proposal CRD-based agent.

    Unknown keys are rejected (``extra="forbid"``). ``namespace`` is the only
    field without a default; ``timeout`` and ``poll_interval`` must be
    greater than zero.
    """

    # Reject any key that is not declared below.
    model_config = ConfigDict(extra="forbid")

    # Literal tag for this config type; the only accepted value is "proposal".
    type: Literal["proposal"] = "proposal"
    # Required. Presumably the Kubernetes namespace the proposals live in --
    # confirm against the driver.
    namespace: str
    # NOTE(review): presumably lets the driver approve proposals without
    # manual sign-off -- confirm against the driver.
    auto_approve: bool = True
    # NOTE(review): presumably deletes created proposals after a run --
    # confirm against the driver.
    cleanup_proposals: bool = True
    # Must be > 0. Units are not stated here (presumably seconds -- TODO confirm).
    timeout: int = Field(default=900, gt=0)
    # Must be > 0. Units are not stated here (presumably seconds -- TODO confirm).
    poll_interval: int = Field(default=2, gt=0)


# Discriminated union of all agent config types; extend by adding new
# config classes to support additional agent types.
AgentDefinition = Union[HttpApiAgentConfig]
AgentDefinition = Union[HttpApiAgentConfig, ProposalAgentConfig]


class AgentDefaultConfig(BaseModel):
Expand Down
15 changes: 15 additions & 0 deletions src/lightspeed_evaluation/core/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,21 @@ class TurnData(StreamingMetricsMixin):
default=None, description="Path to verify script for script-based evaluation"
)

# Subprocess driver fields
description: Optional[str] = Field(
default=None, description="Human-readable label for reports"
)
proposal_spec: Optional[dict[str, Any]] = Field(
default=None, description="Inline proposal spec for CRD-based agents"
)
expected_proposal_status: Optional[dict[str, Any]] = Field(
default=None,
description="Expected proposal status for assertion metrics",
)
proposal_status: Optional[dict[str, Any]] = Field(
default=None, description="Raw CRD status populated by ProposalDriver"
)

# Set of turn metrics that don't pass the validation to ignore them later
_invalid_metrics: set[str] = set()

Expand Down
4 changes: 4 additions & 0 deletions src/lightspeed_evaluation/core/system/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@
"with 'tool_name', 'arguments', and optional 'result'"
),
},
"custom:proposal_status": {
"required_fields": ["expected_proposal_status"],
"description": "requires 'expected_proposal_status' field",
},
"script:action_eval": {
"required_fields": ["verify_script"],
"description": "requires 'verify_script' field",
Expand Down
9 changes: 7 additions & 2 deletions src/lightspeed_evaluation/pipeline/evaluation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender
from lightspeed_evaluation.pipeline.evaluation.driver import (
AgentDriver,
AgentDriverRegistry,
ProposalDriver,
)
from lightspeed_evaluation.pipeline.evaluation.registry import AgentDriverRegistry
from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler
from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator
from lightspeed_evaluation.pipeline.evaluation.pipeline import EvaluationPipeline
Expand All @@ -32,7 +33,7 @@
"AgentDriver",
),
"AgentDriverRegistry": (
"lightspeed_evaluation.pipeline.evaluation.driver",
"lightspeed_evaluation.pipeline.evaluation.registry",
"AgentDriverRegistry",
),
"ConversationProcessor": (
Expand All @@ -47,6 +48,10 @@
"lightspeed_evaluation.pipeline.evaluation.evaluator",
"MetricsEvaluator",
),
"ProposalDriver": (
"lightspeed_evaluation.pipeline.evaluation.driver",
"ProposalDriver",
),
}

__getattr__ = create_lazy_getattr(_LAZY_IMPORTS, __name__)
Loading
Loading