lightspeed-core
diff --git a/‎Makefile‎
Lines changed: 1 addition & 1 deletion b/‎Makefile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎config/system.yaml‎
Lines changed: 5 additions & 0 deletions b/‎config/system.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/EVALUATION_GUIDE.md‎
Lines changed: 42 additions & 0 deletions b/‎docs/EVALUATION_GUIDE.md‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎src/lightspeed_evaluation/core/metrics/custom/custom.py‎
Lines changed: 121 additions & 0 deletions b/‎src/lightspeed_evaluation/core/metrics/custom/custom.py‎
Lines changed: 121 additions & 0 deletions
diff --git a/‎src/lightspeed_evaluation/core/metrics/custom/prompts.py‎
Lines changed: 74 additions & 1 deletion b/‎src/lightspeed_evaluation/core/metrics/custom/prompts.py‎
Lines changed: 74 additions & 1 deletion
diff --git a/‎src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py‎
Lines changed: 3 additions & 49 deletions b/‎src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py‎
Lines changed: 3 additions & 49 deletions
@@ -114,7 +114,7 @@ shellcheck: ## Run shellcheck
 	@mkdir -p .shellcheck-stable
 	@wget -qO- "https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.$$(uname -m).tar.xz" | tar -xJ -C .shellcheck-stable --strip-components=1
 	@PATH="$$PWD/.shellcheck-stable:$$PATH" shellcheck --version
-	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck {} +
+	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck -e SC1091 {} +
 
 pylint:
 	uv run pylint src
 
@@ -210,6 +210,8 @@ uv run lightspeed-eval --system-config <CONFIG.yaml> --eval-data <EVAL_DATA.yaml
     - [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keywords evaluation with alternatives (ALL keywords must match, case insensitive)
   - Tool Evaluation
     - [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls, arguments, and optional results with regex pattern matching
+  - Agentic Workflow Evaluation
+    - [`proposal_evaluation_correctness`](src/lightspeed_evaluation/core/metrics/custom/custom.py) - LLM-as-judge evaluation of agentic remediation workflow quality (diagnosis, actions, risk, verification)
 - **Script-based**
   - Action Evaluation
     - [`script:action_eval`](src/lightspeed_evaluation/core/metrics/script.py) - Executes verification scripts to validate actions (e.g., infrastructure changes)
 
@@ -155,6 +155,11 @@ metrics_metadata:
       ordered: true       # true (default): sequence order matters, false: any order allowed
       full_match: true    # true (default): exact 1:1 match, false: expected tools found in actual (extras allowed)
 
+    "custom:proposal_evaluation_correctness":
+      threshold: 0.75
+      description: "LLM judge of agentic remediation workflow quality (diagnosis, actions, risk, verification)"
+      default: false
+
     # Script-based metrics
     "script:action_eval":
       description: "Script-based evaluation for infrastructure/environment validation"
 
@@ -420,6 +420,47 @@ expected_tool_calls:
 
 ---
 
+#### Proposal Evaluation Correctness
+
+**What it measures:** How good is the agentic remediation workflow? Evaluates diagnosis, actions, risk management, and verification.
+
+**Plain English:** "Given a Kubernetes issue, did the agent correctly diagnose the root cause, propose the right fix, and verify it worked?"
+
+**Score Range:** 0.0 to 1.0 (higher is better)
+
+**How it works:** A Judge LLM evaluates the workflow summary (produced by ProposalAmender) across four aspects.
+Diagnosis Quality is the most important criterion and carries the most weight:
+1. **Diagnosis Quality** — Is the root cause correctly identified and specific? Is the reasoning sound and the confidence level appropriate?
+2. **Action Appropriateness** — Are the actions safe and well-scoped?
+3. **Risk Management** — Is the risk assessment correct?
+4. **Verification Thoroughness** — Do the checks confirm the fix?
+
+Only aspects present in the workflow are evaluated. Analysis-only workflows are scored on diagnosis quality alone.
+
+**Example:**
+```yaml
+turns:
+  - turn_id: "fix-oom"
+    proposal_spec:
+      request: "Pod CrashLoopBackOff in namespace production"
+      analysis: {}
+      execution: {}
+      verification: {}
+    turn_metrics:
+      - "custom:proposal_evaluation_correctness"
+      - "custom:proposal_status"
+    expected_proposal_status:
+      phase: "Completed"
+```
+
+**When to use:** Evaluating agentic operator workflows (Proposal CRD lifecycle)
+
+**Threshold:** 0.75
+
+**Required fields:** `response` (populated automatically by ProposalAmender during driver execution) and `expected_outcome` (the metric returns no score when either is missing)
+
+---
+
 ### 4.3 Script-Based Metrics
 
 #### Action Evaluation
@@ -1739,6 +1780,7 @@ lightspeed-eval --eval-data config/eval_batch2.yaml
 | **custom:answer_correctness** | 0-1 | Matches expected answer | 0.75 | query, response, expected_response |
 | **custom:intent_eval** | 0/1 | Has right intent | 1 | query, response, expected_intent |
 | **custom:tool_eval** | 0/1 | Called correct tools with expected results | 1 | expected_tool_calls, tool_calls |
+| **custom:proposal_evaluation_correctness** | 0-1 | Agentic workflow quality (diagnosis, actions, risk) | 0.75 | response (workflow summary), expected_outcome |
 | **script:action_eval** | 0/1 | Real action verified | 1 | verify_script |
 | **deepeval:conversation_completeness** | 0-1 | User's goals achieved | 0.8 | Full conversation |
 | **deepeval:conversation_relevancy** | 0-1 | Stayed on topic | 0.7 | Full conversation |
 
@@ -1,5 +1,6 @@
 """Custom metrics using direct LLM integration."""
 
+import json
 import re
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -9,6 +10,7 @@
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
+    PROPOSAL_EVALUATION_CORRECTNESS_PROMPT,
 )
 from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
     evaluate_proposal_status,
@@ -47,6 +49,9 @@ def __init__(
             "intent_eval": self._evaluate_intent,
             "tool_eval": self._evaluate_tool_calls,
             "proposal_status": evaluate_proposal_status,
+            "proposal_evaluation_correctness": (
+                self._evaluate_proposal_evaluation_correctness
+            ),
         }
 
         print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
@@ -295,3 +300,119 @@ def _evaluate_intent(
             return score, reason
         except LLMError as e:
             return None, f"Intent evaluation failed: {str(e)}"
+
+    def _parse_proposal_eval_response(
+        self, response: str
+    ) -> tuple[Optional[float], str]:
+        """Parse JSON LLM judge response for proposal evaluation.
+
+        Expected JSON schema::
+
+            {
+              "reasoning": "string",
+              "diagnosis": float | null,
+              "execution": float | null,
+              "verification": float | null,
+              "average": float
+            }
+        """
+        try:
+            data = json.loads(response)
+        except json.JSONDecodeError:
+            return None, f"Invalid JSON from LLM: {response[:120]}"
+
+        reasoning: str = data.get("reasoning", "")
+        sub_scores: dict[str, Optional[float]] = {
+            "diagnosis": self._try_parse_float(data.get("diagnosis")),
+            "execution": self._try_parse_float(data.get("execution")),
+            "verification": self._try_parse_float(data.get("verification")),
+        }
+        average: Optional[float] = self._try_parse_float(data.get("average"))
+
+        present = [v for v in sub_scores.values() if v is not None]
+        if average is None and present:
+            average = sum(present) / len(present)
+
+        parts = [
+            f"{dim}={v:.2f}" if v is not None else f"{dim}=N/A"
+            for dim, v in sub_scores.items()
+        ]
+        if average is not None:
+            parts.append(f"avg={average:.2f}")
+        detail = ", ".join(parts)
+        if reasoning:
+            detail = f"{detail} — {reasoning}"
+
+        return average, detail
+
+    @staticmethod
+    def _try_parse_float(value: Any) -> Optional[float]:
+        """Try to parse a float from a value, return None on failure."""
+        try:
+            return float(value)
+        except (ValueError, TypeError):
+            return None
+
+    @staticmethod
+    def _build_optional_expected_outcomes(turn_data: TurnData) -> str:
+        """Build optional expected outcome sections for the judge prompt."""
+        sections: list[str] = []
+        mapping = {
+            "Expected Analysis Outcome": turn_data.expected_analysis_outcome,
+            "Expected Execution Outcome": turn_data.expected_execution_outcome,
+            "Expected Verification Outcome": turn_data.expected_verification_outcome,
+        }
+        for label, value in mapping.items():
+            if value:
+                sections.append(f"\n### {label}\n{value}")
+        return "\n".join(sections)
+
+    @staticmethod
+    def _build_workflow_phases(turn_data: TurnData) -> str:
+        """Build the workflow phases string for the judge prompt."""
+        phases = turn_data.proposal_phases
+        if phases:
+            return "Phases executed: " + ", ".join(phases)
+        return "Phases executed: unknown (score only dimensions visible in the workflow summary)"
+
+    def _evaluate_proposal_evaluation_correctness(
+        self,
+        _conv_data: Any,
+        _turn_idx: Optional[int],
+        turn_data: Optional[TurnData],
+        is_conversation: bool,
+    ) -> tuple[Optional[float], str]:
+        """Evaluate agentic remediation workflow quality using LLM judge."""
+        if is_conversation:
+            return None, "Proposal evaluation correctness is a turn-level metric"
+
+        if turn_data is None or not turn_data.response:
+            return None, "TurnData with response is required for proposal evaluation"
+
+        if not turn_data.expected_outcome:
+            return None, "No expected outcome provided for proposal evaluation"
+
+        optional_sections = self._build_optional_expected_outcomes(turn_data)
+        workflow_phases = self._build_workflow_phases(turn_data)
+
+        prompt = PROPOSAL_EVALUATION_CORRECTNESS_PROMPT.format(
+            request=turn_data.query or "N/A",
+            workflow_phases=workflow_phases,
+            workflow_summary=turn_data.response,
+            expected_outcome=turn_data.expected_outcome,
+            optional_expected_outcomes=optional_sections,
+        )
+
+        try:
+            llm_response = self._call_llm(prompt)
+            score, reason = self._parse_proposal_eval_response(llm_response)
+
+            if score is None:
+                return (
+                    None,
+                    f"Could not parse score from LLM response: {llm_response[:100]}...",
+                )
+
+            return score, f"Proposal evaluation correctness: {reason}"
+        except LLMError as e:
+            return None, f"Proposal evaluation correctness failed: {str(e)}"
@@ -35,7 +35,7 @@
 
 Examples of intent evaluation:
 - If expected intent is "provide instructions", check if the response is instructional
-- If expected intent is "explain a concept", check if the response is explanatory  
+- If expected intent is "explain a concept", check if the response is explanatory
 - If expected intent is "refuse or decline", check if the response is declining to help
 - If expected intent is "ask for clarification", check if the response is asking questions
 
@@ -44,3 +44,76 @@
 Format your response as:
 Score: [0 or 1]
 Reason: [your detailed explanation]"""
+
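+# Proposal Evaluation Correctness Prompt
+# Filled via str.format() with: request, workflow_phases, workflow_summary,
+# expected_outcome, optional_expected_outcomes. The literal JSON braces in the
+# "Output Format" section are doubled ({{ }}) so format() leaves them intact.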
+PROPOSAL_EVALUATION_CORRECTNESS_PROMPT = """You are evaluating an automated remediation workflow on an OpenShift/Kubernetes cluster. You must be strict, objective, and critical. Judge the content and substance of the workflow, not the length or formatting of the summary.
+
+## Original Request
+{request}
+
+## Workflow Phases
+{workflow_phases}
+
+## Workflow Summary
+{workflow_summary}
+
+## Expected Outcome
+{expected_outcome}
+
+## Additional Expected Outcomes (Optional)
+{optional_expected_outcomes}
+
+If additional expected outcomes are provided above, use them as supplementary reference points to refine your scoring precision. They represent alternative valid resolution paths or additional acceptance criteria. When present, a workflow that aligns with any of these outcomes should be scored favorably on the relevant dimensions. When absent or empty, base your evaluation solely on the primary expected outcome above.
+
+## Evaluation Criteria
+Compare the workflow summary against the expected outcome (and any additional expected outcomes, if provided) on each dimension independently:
+
+1. **Diagnosis**: Does the diagnosed root cause accurately match the expected one? Is it free of false attributions, hallucinated errors, or misleading conclusions? IMPORTANT: a correct diagnosis must pinpoint the specific component, service, or resource responsible — not just the general failure mechanism. Identifying the right class of failure (e.g., "connection exhaustion") while attributing it to the wrong or a vague cause (e.g., "multiple clients" instead of a specific service) is a significant gap (0.3–0.5), not a minor detail (0.6–0.8). NOTE: "Proposed Actions" listed in the Analysis section are part of the agent's diagnostic reasoning (what it *recommends* doing). Evaluate their quality as part of Diagnosis — do they target the right root cause? Are the recommendations sound and safe?
+2. **Execution**: Were the remediation actions actually carried out? Did they produce the intended effect? Are they safe, well-scoped, and minimal? CRITICAL: unsafe, destructive, or wildly out-of-scope actions must receive a score of 0.2 or lower, regardless of diagnosis accuracy. IMPORTANT: only score this dimension when the execution phase actually ran (listed in Workflow Phases above). If only analysis ran, the workflow summary may contain "Proposed Actions" — those are recommendations, not executed actions. Do NOT score them under Execution; they belong to Diagnosis.
+3. **Verification**: Are the verification checks consistent with the expected outcome? Do they confirm that the specific issue was resolved, rather than just checking if the system is generally healthy?
+
+**Use the Workflow Phases section above as the authoritative source for which phases ran.** Only score dimensions whose corresponding phase is listed. If execution was attempted but failed due to infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A — do not penalize the agent's reasoning quality. Mark absent dimensions as null.
+
+## Scoring Rubric (apply per dimension)
+- **0.9 - 1.0**: Near-perfect or perfect alignment with the expected outcome.
+- **0.6 - 0.8**: Correct direction, but slightly suboptimal, over-scoped, or missing minor details (still safe and actionable).
+- **0.3 - 0.5**: Partially correct but with significant gaps — e.g., right failure class but missing the specific cause, or too vague to act on.
+- **0.1 - 0.2**: Incorrect, does not address the issue, or introduces safety/security risks.
+- **0.0**: Total failure, hallucinated content, or catastrophically unsafe.
+
+## Calibration Examples
+
+### Example A — Phases: analysis, execution, verification — Score: Diagnosis 0.9, Execution 0.7, Verification 0.7, Average 0.77
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi. Verify pod is Running."
+Workflow: Correctly diagnosed OOMKilled from container lastState. Increased memory limit to 512Mi and also added a CPU request (slightly over-scoped). Verified pod reached Running state.
+Why: Diagnosis was accurate (0.9). Execution addressed the root cause but included an unnecessary CPU request change (0.7). Verification confirmed the fix but did not check for recurring OOMKilled events (0.7).
+
+### Example B — Phases: analysis, execution — Score: Diagnosis 0.2, Execution 0.1, Verification N/A, Average 0.15
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi."
+Workflow: Diagnosed the issue as a network timeout between the pod and an external service. Executed a restart of the cluster DNS operator.
+Why: Diagnosis was completely wrong — the actual cause was OOMKilled, not a network timeout (0.2). Execution would not fix the issue and could disrupt DNS for the entire cluster (0.1). Verification was not configured (N/A).
+
+### Example C — Phases: analysis — Score: Diagnosis 1.0, Execution N/A, Verification N/A, Average 1.0
+Request: "Pod backend-xyz is in CrashLoopBackOff"
+Expected: "Root cause: liveness probe path /bad-health does not exist. Fix the probe path to /healthz."
+Workflow: Correctly diagnosed the liveness probe misconfiguration. Proposed patching the probe path to /healthz. Execution failed with: "context deadline exceeded" (sandbox pod timeout). No verification was performed.
+Why: Diagnosis was perfect (1.0). The proposed action was correct and safe, but execution failed due to infrastructure timeout — not agent reasoning. When execution fails for infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A rather than penalizing the agent's reasoning quality. Verification was never reached (N/A).
+
+### Example D — Phases: analysis — Score: Diagnosis 0.4, Execution N/A, Verification N/A, Average 0.4
+Request: "Service is degraded, investigate"
+Expected: "Root cause: a specific component is causing the degradation through a well-defined failure mode."
+Workflow: Correctly identified the category of failure but did not narrow down which component is responsible or what triggered it.
+Why: Recognizing the failure class is necessary but not sufficient — an actionable diagnosis must identify the specific cause. Vague or partial attribution is a significant gap (0.3–0.5), not a minor detail (0.6–0.8).
+
+## Output Format
+Use below json format for your response. Do not add any additional text apart from json output.
+
+{{
+  "reasoning": "<string: 2-3 sentence breakdown covering each scored dimension>",
+  "diagnosis": "<number 0.0-1.0>",
+  "execution": "<number 0.0-1.0 or null if N/A>",
+  "verification": "<number 0.0-1.0 or null if N/A>",
+  "average": "<number: mean of non-null dimensions, e.g. diagnosis=0.9 execution=0.8 verification=null → (0.9+0.8)/2=0.85>"
+}}"""
@@ -3,53 +3,7 @@
 from typing import Any, Optional
 
 from lightspeed_evaluation.core.models import TurnData
-
-
-def _derive_phase(
-    conditions: list[dict[str, Any]],
-    proposal_spec: Optional[dict[str, Any]] = None,
-) -> str:
-    """Derive the terminal phase from CRD conditions.
-
-    Args:
-        conditions: List of condition dicts from proposal_status.
-        proposal_spec: Proposal spec to determine the last expected step.
-
-    Returns:
-        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
-    """
-    by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c}
-
-    if by_type.get("Denied", {}).get("status") == "True":
-        return "Denied"
-    if by_type.get("Escalated", {}).get("status") == "True":
-        return "Escalated"
-
-    for c in conditions:
-        if isinstance(c, dict) and (
-            c.get("type") in {"Analyzed", "Executed", "Verified"}
-            and c.get("status") == "False"
-            and c.get("reason") != "RetryingExecution"
-        ):
-            return "Failed"
-
-    step_to_condition = {"verification": "Verified", "execution": "Executed"}
-    if proposal_spec:
-        last = next(
-            (cond for step, cond in step_to_condition.items() if step in proposal_spec),
-            "Analyzed",
-        )
-    else:
-        last = "Analyzed"
-        for step in ("Verified", "Executed", "Analyzed"):
-            if by_type.get(step, {}).get("status") == "True":
-                last = step
-                break
-
-    if by_type.get(last, {}).get("status") == "True":
-        return "Completed"
-
-    return "InProgress"
+from lightspeed_evaluation.core.proposal import derive_phase
 
 
 def _check_phase(
@@ -62,7 +16,7 @@ def _check_phase(
     if phase is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual == phase:
         return True, f"Phase matches: {actual}"
     return False, f"Phase mismatch: expected '{phase}', got '{actual}'"
@@ -78,7 +32,7 @@ def _check_phase_in(
     if phase_in is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual in phase_in:
         return True, f"Phase '{actual}' in {phase_in}"
     return False, f"Phase '{actual}' not in {phase_in}"