lightspeed-core
diff --git a/src/lightspeed_evaluation/core/constants.py b/src/lightspeed_evaluation/core/constants.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/lightspeed_evaluation/core/metrics/custom/__init__.py b/src/lightspeed_evaluation/core/metrics/custom/__init__.py
Lines changed: 4 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py
Lines changed: 4 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
Lines changed: 209 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/core/models/__init__.py b/src/lightspeed_evaluation/core/models/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/core/models/agents.py b/src/lightspeed_evaluation/core/models/agents.py
Lines changed: 14 additions & 1 deletion
diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py
Lines changed: 4 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/__init__.py b/src/lightspeed_evaluation/pipeline/evaluation/__init__.py
Lines changed: 7 additions & 2 deletions
@@ -63,7 +63,7 @@
 
 # Agent Constants
 DEFAULT_AGENT_TYPE = "http_api"
-SUPPORTED_AGENT_TYPES = ["http_api"]
+SUPPORTED_AGENT_TYPES = ["http_api", "subprocess"]
 
 # Frameworks that don't require judge LLM (NLP, script-based evaluations)
 NON_LLM_FRAMEWORKS = frozenset({"nlp", "script"})
 
@@ -6,11 +6,15 @@
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
 )
+from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
+    evaluate_proposal_status,
+)
 from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
 
 __all__ = [
     "CustomMetrics",
     "evaluate_keywords",
+    "evaluate_proposal_status",
     "evaluate_tool_calls",
     # Prompts
     "ANSWER_CORRECTNESS_PROMPT",
 
@@ -10,6 +10,9 @@
     INTENT_EVALUATION_PROMPT,
 )
 from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
+from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
+    evaluate_proposal_status,
+)
 from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
 from lightspeed_evaluation.core.system.exceptions import LLMError
@@ -44,6 +47,7 @@ def __init__(
             "answer_correctness": self._evaluate_answer_correctness,
             "intent_eval": self._evaluate_intent,
             "tool_eval": self._evaluate_tool_calls,
+            "proposal_status": evaluate_proposal_status,
         }
 
         print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
 
@@ -0,0 +1,209 @@
+"""Proposal status evaluation for CRD-based agent workflows."""
+
+from typing import Any, Optional
+
+from lightspeed_evaluation.core.models import TurnData
+
+
+def _derive_phase(
+    conditions: list[dict[str, Any]],
+    proposal_spec: Optional[dict[str, Any]] = None,
+) -> str:
+    """Derive the terminal phase from CRD conditions.
+
+    Checks are applied in priority order: Denied, Escalated, Failed, then
+    Completed; anything else is reported as InProgress.
+
+    Args:
+        conditions: List of condition dicts from proposal_status. Entries
+            without a "type" key are skipped for by-type lookups (they still
+            take part in the failure scan).
+        proposal_spec: Proposal spec to determine the last expected step.
+            A "verification" key takes precedence over "execution"; with
+            neither key the last step is "Analyzed". When the spec is omitted
+            or empty, any of Verified/Executed/Analyzed being "True" counts
+            as completion.
+
+    Returns:
+        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
+    """
+    # Tolerate malformed entries instead of raising KeyError on c["type"].
+    # For duplicate types the last entry wins.
+    by_type = {c["type"]: c for c in conditions if "type" in c}
+
+    def is_true(cond_type: str) -> bool:
+        """Return whether the named condition exists with status "True"."""
+        return by_type.get(cond_type, {}).get("status") == "True"
+
+    for terminal in ("Denied", "Escalated"):
+        if is_true(terminal):
+            return terminal
+
+    # Any "False" condition is a failure, except while execution is retried.
+    # NOTE(review): this also catches a Denied/Escalated condition reported
+    # with status "False" - confirm the controller never emits those.
+    if any(
+        c.get("status") == "False" and c.get("reason") != "RetryingExecution"
+        for c in conditions
+    ):
+        return "Failed"
+
+    if proposal_spec:
+        # The spec names the workflow's final step explicitly.
+        if "verification" in proposal_spec:
+            last = "Verified"
+        elif "execution" in proposal_spec:
+            last = "Executed"
+        else:
+            last = "Analyzed"
+        return "Completed" if is_true(last) else "InProgress"
+
+    # Without a spec the final step is unknown: accept any finished step.
+    if any(is_true(step) for step in ("Verified", "Executed", "Analyzed")):
+        return "Completed"
+
+    return "InProgress"
+
+
+def _check_phase(
+    expected: dict[str, Any],
+    conditions: list[dict[str, Any]],
+    proposal_spec: Optional[dict[str, Any]],
+) -> Optional[tuple[bool, str]]:
+    """Compare the derived phase with the exact phase the test expects.
+
+    Args:
+        expected: Expected-status assertions; only the "phase" key is read.
+        conditions: Condition dicts forwarded to ``_derive_phase``.
+        proposal_spec: Proposal spec forwarded to ``_derive_phase``.
+
+    Returns:
+        None when no "phase" assertion is present, otherwise a
+        ``(passed, reason)`` tuple.
+    """
+    wanted = expected.get("phase")
+    if wanted is None:
+        # Nothing asserted about the exact phase; the caller skips this check.
+        return None
+
+    derived = _derive_phase(conditions, proposal_spec)
+    if derived != wanted:
+        return False, f"Phase mismatch: expected '{wanted}', got '{derived}'"
+    return True, f"Phase matches: {derived}"
+
+
+def _check_phase_in(
+    expected: dict[str, Any],
+    conditions: list[dict[str, Any]],
+    proposal_spec: Optional[dict[str, Any]],
+) -> Optional[tuple[bool, str]]:
+    """Check that the derived phase is one of the accepted phases.
+
+    Args:
+        expected: Expected-status assertions; only the "phase_in" key is read.
+            A bare string is treated as a single accepted phase.
+        conditions: Condition dicts forwarded to ``_derive_phase``.
+        proposal_spec: Proposal spec forwarded to ``_derive_phase``.
+
+    Returns:
+        None when no "phase_in" assertion is present, otherwise a
+        ``(passed, reason)`` tuple.
+    """
+    phase_in = expected.get("phase_in")
+    if phase_in is None:
+        return None
+
+    # `x in "some string"` is a substring test, so a scalar phase_in would
+    # match partial phase names; normalise it to a one-element list.
+    if isinstance(phase_in, str):
+        phase_in = [phase_in]
+
+    actual = _derive_phase(conditions, proposal_spec)
+    if actual in phase_in:
+        return True, f"Phase '{actual}' in {phase_in}"
+    return False, f"Phase '{actual}' not in {phase_in}"
+
+
+def _check_conditions(
+    expected: dict[str, Any],
+    conditions: list[dict[str, Any]],
+) -> Optional[tuple[bool, str]]:
+    """Check specific condition assertions.
+
+    Each expected entry must name a "type" that exists in the reported
+    conditions; its optional "status" and "reason" values must then match
+    exactly. Evaluation stops at the first failing assertion.
+
+    Args:
+        expected: Expected-status assertions; only "conditions" is read.
+        conditions: Condition dicts reported in the proposal status. Entries
+            without a "type" key are ignored.
+
+    Returns:
+        None when no "conditions" assertion is present, otherwise a
+        ``(passed, reason)`` tuple.
+    """
+    expected_conditions = expected.get("conditions")
+    if expected_conditions is None:
+        return None
+
+    # Tolerate malformed reported entries instead of raising KeyError.
+    by_type = {c["type"]: c for c in conditions if "type" in c}
+
+    for exp_cond in expected_conditions:
+        cond_type = exp_cond.get("type")
+        if cond_type is None:
+            return False, "Condition assertion missing 'type' field"
+
+        actual_cond = by_type.get(cond_type)
+        if actual_cond is None:
+            return False, f"Condition '{cond_type}' not found in proposal status"
+
+        # "status" is compared before "reason"; an omitted field is not asserted.
+        for field in ("status", "reason"):
+            wanted = exp_cond.get(field)
+            actual = actual_cond.get(field)
+            if wanted is not None and actual != wanted:
+                return (
+                    False,
+                    f"Condition '{cond_type}' {field}: "
+                    f"expected '{wanted}', got '{actual}'",
+                )
+
+    return True, "All condition assertions passed"
+
+
+def _check_verification(
+    expected: dict[str, Any],
+    conditions: list[dict[str, Any]],
+) -> Optional[tuple[bool, str]]:
+    """Check verification-specific assertions.
+
+    Supported assertions under "verification":
+        passed: Expected truth value of the Verified condition's status
+            (status "True" counts as passed).
+        summary_contains: Text that must occur, case-insensitively, in the
+            Verified condition's message.
+
+    Args:
+        expected: Expected-status assertions; only "verification" is read.
+        conditions: Condition dicts reported in the proposal status.
+
+    Returns:
+        None when no "verification" assertion is present, otherwise a
+        ``(passed, reason)`` tuple.
+    """
+    verification = expected.get("verification")
+    if verification is None:
+        return None
+
+    # Scan from the end so the last "Verified" entry wins; entries without
+    # a "type" key are skipped rather than raising KeyError.
+    verified = next(
+        (c for c in reversed(conditions) if c.get("type") == "Verified"),
+        None,
+    )
+    if verified is None:
+        return False, "Verified condition not found in proposal status"
+
+    passed = verification.get("passed")
+    if passed is not None:
+        actual_passed = verified.get("status") == "True"
+        if actual_passed != passed:
+            return (
+                False,
+                f"Verification passed: expected {passed}, got {actual_passed}",
+            )
+
+    summary_contains = verification.get("summary_contains")
+    if summary_contains is not None:
+        # A message that is present but null is treated as empty text.
+        message = str(verified.get("message") or "")
+        if str(summary_contains).lower() not in message.lower():
+            return (
+                False,
+                f"Verification summary does not contain '{summary_contains}': "
+                f"got '{message[:200]}'",
+            )
+
+    return True, "Verification assertions passed"
+
+
+def evaluate_proposal_status(
+    _conv_data: Any,
+    _turn_idx: Optional[int],
+    turn_data: Optional[TurnData],
+    is_conversation: bool,
+) -> tuple[Optional[float], str]:
+    """Evaluate proposal status against expected assertions.
+
+    The phase, phase_in, conditions and verification checks run in that
+    order; a check whose assertion key is absent is skipped, and evaluation
+    stops at the first failing check.
+
+    Args:
+        _conv_data: Conversation data (unused).
+        _turn_idx: Turn index (unused).
+        turn_data: Turn data with proposal_status and expected_proposal_status.
+        is_conversation: Whether this is conversation-level evaluation.
+
+    Returns:
+        Tuple of (score, reason). Score is 1.0 if all checks pass, 0.0 on
+        first failure (or when proposal_status is missing), None if metric
+        should be skipped.
+    """
+    if is_conversation:
+        return None, "Proposal status is a turn-level metric"
+
+    if turn_data is None:
+        return None, "TurnData is required for proposal status evaluation"
+
+    expected = turn_data.expected_proposal_status
+    if not expected:
+        return None, "No expected_proposal_status provided"
+
+    status = turn_data.proposal_status
+    if not status:
+        return 0.0, "proposal_status not populated by driver"
+
+    # "conditions" may be missing or explicitly null; both mean none reported.
+    conditions = status.get("conditions") or []
+    proposal_spec = turn_data.proposal_spec
+
+    # Deferred so checks after the first failure are never evaluated.
+    checks = (
+        lambda: _check_phase(expected, conditions, proposal_spec),
+        lambda: _check_phase_in(expected, conditions, proposal_spec),
+        lambda: _check_conditions(expected, conditions),
+        lambda: _check_verification(expected, conditions),
+    )
+
+    reasons: list[str] = []
+    for run_check in checks:
+        result = run_check()
+        if result is None:
+            continue
+        passed, reason = result
+        if not passed:
+            return 0.0, reason
+        reasons.append(reason)
+
+    # No recognised assertion key at all still counts as a pass.
+    return 1.0, "; ".join(reasons) if reasons else "All checks passed"
@@ -6,6 +6,7 @@
     HttpApiAgentConfig,
     MCPHeadersConfig,
     MCPServerConfig,
+    SubprocessAgentConfig,
 )
 from lightspeed_evaluation.core.models.api import (
     APIRequest,
@@ -43,6 +44,7 @@
     "HttpApiAgentConfig",
     "MCPHeadersConfig",
     "MCPServerConfig",
+    "SubprocessAgentConfig",
     # Data models
     "TurnData",
     "EvaluationData",
 
@@ -136,9 +136,22 @@ class HttpApiAgentConfig(HttpApiBaseFields):
     )
 
 
+class SubprocessAgentConfig(BaseModel):
+    """Configuration for a subprocess-based CRD agent.
+
+    Keys other than the fields declared here are rejected
+    (``extra="forbid"``).
+
+    Attributes:
+        type: Agent type tag; always ``"subprocess"``.
+        namespace: Required namespace string (no default).
+        auto_approve: Boolean flag, enabled by default.
+        cleanup_proposals: Boolean flag, enabled by default.
+        timeout: Positive integer, default 900.
+        poll_interval: Positive integer, default 2.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    type: Literal["subprocess"] = "subprocess"
+    namespace: str
+    # NOTE(review): the exact effect of these two flags lives in the driver,
+    # which is not visible here - confirm before documenting further.
+    auto_approve: bool = True
+    cleanup_proposals: bool = True
+    # NOTE(review): units are presumably seconds - confirm against the driver.
+    timeout: int = Field(900, gt=0)
+    poll_interval: int = Field(2, gt=0)
+
+
 # Discriminated union of all agent config types; extend by adding new
 # config classes to support additional agent types.
-AgentDefinition = Union[HttpApiAgentConfig]
+AgentDefinition = Union[HttpApiAgentConfig, SubprocessAgentConfig]
 
 
 class AgentDefaultConfig(BaseModel):
 
@@ -58,6 +58,10 @@
             "with 'tool_name', 'arguments', and optional 'result'"
         ),
     },
+    "custom:proposal_status": {
+        "required_fields": ["expected_proposal_status"],
+        "description": "requires 'expected_proposal_status' field",
+    },
     "script:action_eval": {
         "required_fields": ["verify_script"],
         "description": "requires 'verify_script' field",
 
@@ -9,8 +9,9 @@
     from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender
     from lightspeed_evaluation.pipeline.evaluation.driver import (
         AgentDriver,
-        AgentDriverRegistry,
+        SubprocessDriver,
     )
+    from lightspeed_evaluation.pipeline.evaluation.registry import AgentDriverRegistry
     from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler
     from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator
     from lightspeed_evaluation.pipeline.evaluation.pipeline import EvaluationPipeline
@@ -32,7 +33,7 @@
         "AgentDriver",
     ),
     "AgentDriverRegistry": (
-        "lightspeed_evaluation.pipeline.evaluation.driver",
+        "lightspeed_evaluation.pipeline.evaluation.registry",
         "AgentDriverRegistry",
     ),
     "ConversationProcessor": (
@@ -47,6 +48,10 @@
         "lightspeed_evaluation.pipeline.evaluation.evaluator",
         "MetricsEvaluator",
     ),
+    "SubprocessDriver": (
+        "lightspeed_evaluation.pipeline.evaluation.driver",
+        "SubprocessDriver",
+    ),
 }
 
 __getattr__ = create_lazy_getattr(_LAZY_IMPORTS, __name__)