From 7f15b7fb2c296e124a804f49c1525dda8edcc18c Mon Sep 17 00:00:00 2001
From: Kaosiso Ezealigo <ezealigokosiso@gmail.com>
Date: Wed, 3 Jun 2026 11:46:05 +0200
Subject: [PATCH 01/36] Add unit tests for utility functions and configuration
 for Vitest

- Created unit tests for data transformation utilities, including error extraction, response status preservation, and metadata stripping.
- Added tests for formatting utilities covering number, currency, latency, and percentage formatting.
- Implemented tests for path utilities to validate object navigation and manipulation.
- Developed tests for slug generation and validation functions.
- Added tests for template variable validation and extraction.
- Included tests for various validators, including UUID and HTTP URL validation.
- Added unit tests for the agenta-annotation package (annotation form helpers and testset sync), with a lightweight @agenta/ui stub for the node test environment.
- Configured Vitest (agenta-shared and agenta-annotation) for running tests with coverage reporting and JUnit output.
---
 web/packages/agenta-annotation/package.json   |  11 +-
 .../agenta-annotation/test-results/junit.xml  | 163 +++++
 .../tests/__mocks__/agenta-ui.ts              |  11 +
 .../unit/annotation-form-helpers.test.ts      | 376 ++++++++++
 .../tests/unit/testset-sync.test.ts           | 659 ++++++++++++++++++
 .../agenta-annotation/vitest.config.ts        |  28 +
 web/packages/agenta-shared/package.json       |  11 +-
 .../agenta-shared/test-results/junit.xml      | 389 +++++++++++
 .../tests/unit/data-transforms.test.ts        | 165 +++++
 .../tests/unit/formatters.test.ts             | 222 ++++++
 .../tests/unit/path-utils.test.ts             | 166 +++++
 .../agenta-shared/tests/unit/slug.test.ts     | 234 +++++++
 .../tests/unit/template-variable.test.ts      | 147 ++++
 .../tests/unit/validators-and-ids.test.ts     | 138 ++++
 web/packages/agenta-shared/vitest.config.ts   |  19 +
 web/pnpm-lock.yaml                            |  12 +
 16 files changed, 2747 insertions(+), 4 deletions(-)
 create mode 100644 web/packages/agenta-annotation/test-results/junit.xml
 create mode 100644 web/packages/agenta-annotation/tests/__mocks__/agenta-ui.ts
 create mode 100644 web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
 create mode 100644 web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
 create mode 100644 web/packages/agenta-annotation/vitest.config.ts
 create mode 100644 web/packages/agenta-shared/test-results/junit.xml
 create mode 100644 web/packages/agenta-shared/tests/unit/data-transforms.test.ts
 create mode 100644 web/packages/agenta-shared/tests/unit/formatters.test.ts
 create mode 100644 web/packages/agenta-shared/tests/unit/path-utils.test.ts
 create mode 100644 web/packages/agenta-shared/tests/unit/slug.test.ts
 create mode 100644 web/packages/agenta-shared/tests/unit/template-variable.test.ts
 create mode 100644 web/packages/agenta-shared/tests/unit/validators-and-ids.test.ts
 create mode 100644 web/packages/agenta-shared/vitest.config.ts

diff --git a/web/packages/agenta-annotation/package.json b/web/packages/agenta-annotation/package.json
index 788f6308af..0874be43f7 100644
--- a/web/packages/agenta-annotation/package.json
+++ b/web/packages/agenta-annotation/package.json
@@ -8,7 +8,12 @@
     "scripts": {
         "build": "tsc --noEmit",
         "types:check": "tsc --noEmit",
-        "lint": "eslint --config ../eslint.config.mjs src/"
+        "lint": "eslint --config ../eslint.config.mjs src/",
+        "test": "pnpm run test:unit",
+        "test:unit": "vitest run",
+        "test:watch": "vitest",
+        "test:coverage": "vitest run --coverage",
+        "check": "pnpm run types:check && pnpm run lint"
     },
     "exports": {
         ".": "./src/index.ts",
@@ -26,6 +31,8 @@
     },
     "devDependencies": {
         "@types/node": "^20.8.10",
-        "typescript": "5.8.3"
+        "@vitest/coverage-v8": "^4.1.4",
+        "typescript": "5.8.3",
+        "vitest": "^4.1.4"
     }
 }
diff --git a/web/packages/agenta-annotation/test-results/junit.xml b/web/packages/agenta-annotation/test-results/junit.xml
new file mode 100644
index 0000000000..85cdaef2d1
--- /dev/null
+++ b/web/packages/agenta-annotation/test-results/junit.xml
@@ -0,0 +1,163 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<testsuites name="vitest tests" tests="78" failures="0" errors="0" time="0.112877977">
+    <testsuite name="tests/unit/annotation-form-helpers.test.ts" timestamp="2026-06-03T07:51:16.842Z" hostname="Kaosisos-MacBook-Pro.local" tests="38" failures="0" errors="0" skipped="0" time="0.079835364">
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for null" time="0.003193247">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for undefined" time="0.000426831">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for " time="0.00022392">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for " time="0.000394588">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for 0" time="0.000342228">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for false" time="0.000172727">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for 0" time="0.000318123">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for " time="0.000154355">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for {}" time="0.000593872">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for  " time="0.000258603">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getOutputsSchema &gt; returns the schema from resolveOutputSchema" time="0.018263232">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getOutputsSchema &gt; returns empty object when resolveOutputSchema returns null" time="0.021705992">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces a number field with null default" time="0.005147245">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces an integer field with null default" time="0.000455569">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces a boolean field with null default" time="0.000533427">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces a string field with empty-string default" time="0.000415174">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array type &gt; produces an array field with item schema" time="0.000462322">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array type &gt; defaults item type to string when items is missing" time="0.002111803">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — anyOf schema &gt; unwraps the first anyOf entry to get the real type" time="0.005339982">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array-of-types &gt; filters &apos;null&apos; from the type array and uses the remaining types" time="0.000613499">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array-of-types &gt; skips the property when only &apos;null&apos; type remains after filtering" time="0.005364649">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array-of-types &gt; includes non-null enum values and strips null/empty entries" time="0.000314292">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — edge cases &gt; returns empty object for an empty schema" time="0.000201713">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — edge cases &gt; skips unsupported types (e.g. &apos;object&apos;)" time="0.000369456">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — edge cases &gt; skips properties with no type field" time="0.000208122">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; fills a number field from flat outputs" time="0.000565318">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; fills a string field from flat outputs" time="0.000294481">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; uses schema default when key is absent in outputs" time="0.00023096">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; uses &apos;&apos; as default for a missing string field" time="0.000172772">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flattens metrics nested under &apos;metrics&apos; key" time="0.000358223">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flattens fields nested under &apos;notes&apos; key" time="0.000209121">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flattens fields nested under &apos;extra&apos; key" time="0.000184574">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flat keys outside of metrics/notes/extra are preserved directly" time="0.000175375">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; infers a number field from a numeric output value" time="0.000531908">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; infers a boolean field from a boolean output value" time="0.000429949">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; infers a string field from a string output value" time="0.000213978">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; serialises an object output to a JSON string field" time="0.000379304">
+        </testcase>
+        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; returns empty object when annotation outputs are empty" time="0.000287768">
+        </testcase>
+    </testsuite>
+    <testsuite name="tests/unit/testset-sync.test.ts" timestamp="2026-06-03T07:51:16.868Z" hostname="Kaosisos-MacBook-Pro.local" tests="40" failures="0" errors="0" skipped="0" time="0.033042613">
+        <testcase classname="tests/unit/testset-sync.test.ts" name="getQueueAnnotationTag &gt; formats queue ID into tag" time="0.003451276">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="getQueueAnnotationTag &gt; handles arbitrary queue IDs" time="0.000339992">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; always includes the queue tag and kind tag" time="0.00157184">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; merges existing tags without duplicates" time="0.001066907">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; adds output keys as tags" time="0.000432673">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; handles null existingTags gracefully" time="0.00031137">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; filters out falsy tags from existingTags" time="0.000652604">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — no match &gt; returns null annotation when list is empty" time="0.001475313">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — no match &gt; returns null annotation when no annotation matches the evaluator slug" time="0.001590326">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — queue-scoped matching &gt; returns the annotation when exactly one queue-scoped match exists" time="0.00062801">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — queue-scoped matching &gt; returns duplicate_queue_annotations when multiple queue-scoped annotations match" time="0.000372534">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — queue-scoped matching &gt; ignores annotations scoped to a different queue" time="0.000294816">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — legacy fallback &gt; falls back to a legacy annotation (no queue tags) when no queue-scoped match" time="0.000350558">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — legacy fallback &gt; returns duplicate_legacy_annotations when multiple legacy annotations match" time="0.000232561">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — evaluatorWorkflowId matching &gt; matches annotation by evaluator workflow ID" time="0.000220965">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; returns evaluator slug when no annotation supplied" time="0.000211145">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; prefers annotation&apos;s evaluator slug over evaluator.slug" time="0.000205561">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; falls back to evaluator.workflowId when slug is empty" time="0.000135544">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; returns empty string when evaluator has no slug or workflowId" time="0.000132458">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncOperations &gt; maps target rows to replace operations" time="0.00041353">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncOperations &gt; produces an empty replace list for a target with no rows" time="0.000249256">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; keeps rows whose rowId exists directly in baseRows" time="0.002495199">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; remaps a row using testcase_dedup_id when rowId is not in baseRows" time="0.000243903">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; also remaps using legacy __dedup_id__ key" time="0.00020136">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; drops rows with no matching rowId and no dedup key" time="0.000253405">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; updates rowCount to reflect mapped rows only" time="0.000469284">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; builds a row per scenario with trace inputs and output" time="0.001453622">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; expands a nested &apos;inputs&apos; key into top-level columns" time="0.001394469">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; merges annotation outputs into the row" time="0.001420822">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; handles a missing scenario gracefully (uses empty defaults)" time="0.000365183">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestcaseExportRows &gt; builds a row when annotation data exists for the testcase" time="0.000788308">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestcaseExportRows &gt; skips a scenario with no testcase mapping" time="0.000422125">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestcaseExportRows &gt; skips a testcase with no annotations" time="0.000299728">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; returns a missing_testcase conflict when testcase not found" time="0.001103675">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; returns a missing_testset conflict when testcase has no testset_id" time="0.000254052">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; returns a missing_latest_revision conflict when no revision for testset" time="0.00081726">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; produces a clean target when everything is resolved" time="0.000674425">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; records duplicate_queue_annotations conflict and skips the row" time="0.000710204">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; groups rows from different scenarios under the same testset target" time="0.000615301">
+        </testcase>
+        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; skips rows with no annotation data and does not add them as conflicts" time="0.000292529">
+        </testcase>
+    </testsuite>
+</testsuites>
diff --git a/web/packages/agenta-annotation/tests/__mocks__/agenta-ui.ts b/web/packages/agenta-annotation/tests/__mocks__/agenta-ui.ts
new file mode 100644
index 0000000000..5e8e4ebab4
--- /dev/null
+++ b/web/packages/agenta-annotation/tests/__mocks__/agenta-ui.ts
@@ -0,0 +1,11 @@
+/**
+ * Lightweight stub for @agenta/ui used in Vitest node-env tests.
+ * The real @agenta/ui pulls in antd which is enormous and causes the Vitest
+ * transformer to time out. Annotation tests exercise pure functions only.
+ */
+export const cn = (...args: unknown[]) => args.filter(Boolean).join(" ")
+export const textColors = {}
+export const bgColors = {}
+export const EnhancedModal = () => null
+export const ModalContent = () => null
+export const ModalFooter = () => null
diff --git a/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts b/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
new file mode 100644
index 0000000000..d0a246ce6c
--- /dev/null
+++ b/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
@@ -0,0 +1,376 @@
+/**
+ * Unit tests for pure helper functions exported from annotationFormController.ts:
+ *   - isEmptyValue
+ *   - getOutputsSchema
+ *   - getMetricFieldsFromEvaluator
+ *   - getMetricsFromAnnotation
+ *
+ * The module has many heavy imports (Jotai atoms, entity API calls, session
+ * controller). We mock the external packages so no network or Jotai store
+ * is touched during tests.
+ */
+
+import {beforeEach, describe, expect, it, vi} from "vitest"
+
+// ---------------------------------------------------------------------------
+// Module-level mocks — vi.mock is hoisted before imports by Vitest
+// ---------------------------------------------------------------------------
+
+const mockResolveOutputSchema = vi.fn()
+
+vi.mock("@agenta/entities/workflow", () => ({
+    resolveOutputSchema: (data: unknown) => mockResolveOutputSchema(data),
+    workflowQueryAtomFamily: () => ({isPending: false, data: null}),
+    workflowLatestRevisionQueryAtomFamily: () => ({isPending: false, data: null}),
+}))
+
+vi.mock("@agenta/entities/annotation", () => ({
+    createAnnotation: vi.fn(),
+    updateAnnotation: vi.fn(),
+    invalidateAnnotationCacheByLink: vi.fn(),
+}))
+
+vi.mock("@agenta/entities/evaluationRun", () => ({
+    evaluationRunMolecule: {selectors: {annotationSteps: vi.fn(), scenarioSteps: vi.fn()}},
+    queryEvaluationResults: vi.fn(),
+}))
+
+vi.mock("@agenta/entities/simpleQueue", () => ({
+    invalidateScenarioProgressCache: vi.fn(),
+    invalidateSimpleQueueCache: vi.fn(),
+    invalidateSimpleQueuesListCache: vi.fn(),
+    simpleQueuePaginatedStore: {refreshAtom: {}},
+}))
+
+vi.mock("@agenta/entities/trace", () => ({
+    fetchPreviewTrace: vi.fn(),
+}))
+
+vi.mock("@agenta/shared/api", () => ({
+    axios: {patch: vi.fn(), post: vi.fn()},
+    getAgentaApiUrl: () => "http://localhost",
+    queryClient: {invalidateQueries: vi.fn()},
+}))
+
+vi.mock("@agenta/shared/state", () => ({
+    projectIdAtom: {},
+}))
+
+vi.mock("../../src/state/controllers/annotationSessionController", () => ({
+    annotationSessionController: {
+        selectors: {
+            evaluatorStepRefs: () => ({}),
+            scenarioAnnotations: () => ({}),
+            scenarioStatuses: () => ({}),
+            activeRunId: () => ({}),
+            focusAutoNext: () => ({}),
+        },
+        set: {markCompleted: vi.fn(), navigateNext: vi.fn()},
+        cache: {invalidateScenarioAnnotations: vi.fn()},
+    },
+}))
+
+// Imported below the mocks for readability only: Vitest hoists vi.mock() above all imports
+import {
+    getMetricFieldsFromEvaluator,
+    getMetricsFromAnnotation,
+    getOutputsSchema,
+    isEmptyValue,
+} from "../../src/state/controllers/annotationFormController"
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function makeWorkflow(schemaProperties: Record<string, unknown> = {}) {
+    // resolveOutputSchema is mocked to return its input,
+    // so we set data to the schema shape directly.
+    return {
+        data: {properties: schemaProperties},
+        slug: "test-evaluator",
+        id: "wf-1",
+    } as any
+}
+
+function makeAnnotation(
+    outputs: Record<string, unknown>,
+    references?: {evaluator?: {slug?: string}},
+) {
+    return {
+        trace_id: "trace-1",
+        span_id: "span-1",
+        data: {outputs},
+        references,
+        meta: {},
+    } as any
+}
+
+beforeEach(() => {
+    // Reset queued once-values/call history, then default to pass-through (returns its input)
+    mockResolveOutputSchema.mockReset().mockImplementation((data: unknown) => data)
+})
+
+// ---------------------------------------------------------------------------
+// isEmptyValue — titles use %j: %s prints "", [] and [null] as the same blank
+// ---------------------------------------------------------------------------
+
+describe("isEmptyValue", () => {
+    it.each([
+        [null, true],
+        [undefined, true],
+        ["", true],
+        [[], true],
+    ])("returns true for %j", (value, expected) => {
+        expect(isEmptyValue(value)).toBe(expected)
+    })
+
+    it.each([
+        [0, false],
+        [false, false],
+        ["0", false],
+        [[null], false],
+        [{}, false],
+        [" ", false],
+    ])("returns false for %j", (value, expected) => {
+        expect(isEmptyValue(value)).toBe(expected)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getOutputsSchema
+// ---------------------------------------------------------------------------
+
+describe("getOutputsSchema", () => {
+    it("returns the schema from resolveOutputSchema", () => {
+        const schema = {properties: {score: {type: "number"}}}
+        const workflow = makeWorkflow(schema.properties)
+        const result = getOutputsSchema(workflow)
+        expect(result).toMatchObject({properties: {score: {type: "number"}}})
+    })
+
+    it("returns empty object when resolveOutputSchema returns null", () => {
+        mockResolveOutputSchema.mockReturnValueOnce(null)
+        const result = getOutputsSchema(makeWorkflow())
+        expect(result).toEqual({})
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricFieldsFromEvaluator — scalar types
+// ---------------------------------------------------------------------------
+
+describe("getMetricFieldsFromEvaluator — scalar types", () => {
+    it("produces a number field with null default", () => {
+        const wf = makeWorkflow({score: {type: "number", minimum: 0, maximum: 10}})
+        const fields = getMetricFieldsFromEvaluator(wf)
+        expect(fields.score).toMatchObject({value: null, type: "number", minimum: 0, maximum: 10})
+    })
+
+    it("produces an integer field with null default", () => {
+        const wf = makeWorkflow({count: {type: "integer"}})
+        expect(getMetricFieldsFromEvaluator(wf).count).toMatchObject({value: null, type: "integer"})
+    })
+
+    it("produces a boolean field with null default", () => {
+        const wf = makeWorkflow({approved: {type: "boolean"}})
+        expect(getMetricFieldsFromEvaluator(wf).approved).toMatchObject({
+            value: null,
+            type: "boolean",
+        })
+    })
+
+    it("produces a string field with empty-string default", () => {
+        const wf = makeWorkflow({notes: {type: "string"}})
+        expect(getMetricFieldsFromEvaluator(wf).notes).toMatchObject({value: "", type: "string"})
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — array type", () => {
+    it("produces an array field with item schema", () => {
+        const wf = makeWorkflow({
+            labels: {
+                type: "array",
+                items: {type: "string", enum: ["good", "bad"]},
+            },
+        })
+        const fields = getMetricFieldsFromEvaluator(wf)
+        expect(fields.labels).toMatchObject({
+            value: [],
+            type: "array",
+            items: {type: "string", enum: ["good", "bad"]},
+        })
+    })
+
+    it("defaults item type to string when items is missing", () => {
+        const wf = makeWorkflow({tags: {type: "array"}})
+        expect(getMetricFieldsFromEvaluator(wf).tags.items).toMatchObject({
+            type: "string",
+            enum: [],
+        })
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — anyOf schema", () => {
+    it("unwraps the first anyOf entry to get the real type", () => {
+        const wf = makeWorkflow({
+            score: {anyOf: [{type: "number", minimum: 0}, {type: "null"}]},
+        })
+        expect(getMetricFieldsFromEvaluator(wf).score).toMatchObject({value: null, type: "number"})
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — array-of-types", () => {
+    it("filters 'null' from the type array and uses the remaining types", () => {
+        const wf = makeWorkflow({status: {type: ["string", "null"]}})
+        const field = getMetricFieldsFromEvaluator(wf).status
+        expect(field.type).toEqual(["string"])
+        expect(field.value).toBe("")
+    })
+
+    it("skips the property when only 'null' type remains after filtering", () => {
+        const wf = makeWorkflow({x: {type: ["null"]}})
+        expect(getMetricFieldsFromEvaluator(wf)).not.toHaveProperty("x")
+    })
+
+    it("includes non-null enum values and strips null/empty entries", () => {
+        const wf = makeWorkflow({
+            choice: {type: ["string", "null"], enum: ["a", null, "", "b"]},
+        })
+        const field = getMetricFieldsFromEvaluator(wf).choice
+        expect(field.enum).toEqual(["a", "b"])
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — edge cases", () => {
+    it("returns empty object for an empty schema", () => {
+        mockResolveOutputSchema.mockReturnValueOnce(null)
+        expect(getMetricFieldsFromEvaluator(makeWorkflow())).toEqual({})
+    })
+
+    it("skips unsupported types (e.g. 'object')", () => {
+        const wf = makeWorkflow({meta: {type: "object"}})
+        expect(getMetricFieldsFromEvaluator(wf)).not.toHaveProperty("meta")
+    })
+
+    it("skips properties with no type field", () => {
+        const wf = makeWorkflow({weird: {description: "no type here"}})
+        expect(getMetricFieldsFromEvaluator(wf)).not.toHaveProperty("weird")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricsFromAnnotation — flat outputs
+// ---------------------------------------------------------------------------
+
+describe("getMetricsFromAnnotation — flat outputs matching schema", () => {
+    it("fills a number field from flat outputs", () => {
+        const wf = makeWorkflow({score: {type: "number"}})
+        const ann = makeAnnotation({score: 8.5})
+        const fields = getMetricsFromAnnotation(ann, wf)
+        expect(fields.score).toMatchObject({value: 8.5, type: "number"})
+    })
+
+    it("fills a string field from flat outputs", () => {
+        // "notes" is a reserved flattening key — use a plain field name
+        const wf = makeWorkflow({label: {type: "string"}})
+        const ann = makeAnnotation({label: "looks good"})
+        expect(getMetricsFromAnnotation(ann, wf).label).toMatchObject({
+            value: "looks good",
+            type: "string",
+        })
+    })
+
+    it("uses schema default when key is absent in outputs", () => {
+        const wf = makeWorkflow({score: {type: "number"}})
+        const ann = makeAnnotation({})
+        expect(getMetricsFromAnnotation(ann, wf).score).toMatchObject({value: null, type: "number"})
+    })
+
+    it("uses '' as default for a missing string field", () => {
+        const wf = makeWorkflow({label: {type: "string"}})
+        const ann = makeAnnotation({})
+        expect(getMetricsFromAnnotation(ann, wf).label.value).toBe("")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricsFromAnnotation — nested output structures
+// ---------------------------------------------------------------------------
+
+describe("getMetricsFromAnnotation — nested outputs", () => {
+    it("flattens metrics nested under 'metrics' key", () => {
+        const wf = makeWorkflow({score: {type: "number"}})
+        const ann = makeAnnotation({metrics: {score: 9}})
+        expect(getMetricsFromAnnotation(ann, wf).score.value).toBe(9)
+    })
+
+    it("flattens fields nested under 'notes' key", () => {
+        const wf = makeWorkflow({comment: {type: "string"}})
+        const ann = makeAnnotation({notes: {comment: "great"}})
+        expect(getMetricsFromAnnotation(ann, wf).comment.value).toBe("great")
+    })
+
+    it("flattens fields nested under 'extra' key", () => {
+        const wf = makeWorkflow({custom: {type: "string"}})
+        const ann = makeAnnotation({extra: {custom: "value"}})
+        expect(getMetricsFromAnnotation(ann, wf).custom.value).toBe("value")
+    })
+
+    it("flat keys outside of metrics/notes/extra are preserved directly", () => {
+        const wf = makeWorkflow({direct: {type: "number"}})
+        const ann = makeAnnotation({direct: 42})
+        expect(getMetricsFromAnnotation(ann, wf).direct.value).toBe(42)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricsFromAnnotation — schema-free (infer from outputs)
+// ---------------------------------------------------------------------------
+
+describe("getMetricsFromAnnotation — schema-free inference", () => {
+    beforeEach(() => {
+        // Empty schema → falls back to inferFieldsFromOutputs
+        mockResolveOutputSchema.mockReturnValue(null)
+    })
+
+    it("infers a number field from a numeric output value", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({score: 7})
+        const fields = getMetricsFromAnnotation(ann, wf)
+        expect(fields.score.type).toBe("integer") // 7 is inferred as "integer", not "number"
+        expect(fields.score.value).toBe(7)
+    })
+
+    it("infers a boolean field from a boolean output value", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({approved: true})
+        expect(getMetricsFromAnnotation(ann, wf).approved).toMatchObject({
+            value: true,
+            type: "boolean",
+        })
+    })
+
+    it("infers a string field from a string output value", () => {
+        // "notes" is a reserved key — use a plain field name
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({comment: "hello"})
+        expect(getMetricsFromAnnotation(ann, wf).comment).toMatchObject({
+            value: "hello",
+            type: "string",
+        })
+    })
+
+    it("serialises an object output to a JSON string field", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({meta: {key: "val"}})
+        const field = getMetricsFromAnnotation(ann, wf).meta
+        expect(field.type).toBe("string")
+        expect(field.value).toBe(JSON.stringify({key: "val"}))
+    })
+
+    it("returns empty object when annotation outputs are empty", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({})
+        expect(getMetricsFromAnnotation(ann, wf)).toEqual({})
+    })
+})
diff --git a/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts b/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
new file mode 100644
index 0000000000..598c60708d
--- /dev/null
+++ b/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
@@ -0,0 +1,659 @@
+/**
+ * Unit tests for pure functions in src/state/testsetSync.ts.
+ *
+ * All functions under test are pure data transformations with no side effects.
+ * The entity imports in testsetSync.ts are type-only, so no mocking is needed.
+ */
+
+import {describe, expect, it} from "vitest"
+
+import type {Annotation} from "../../src/state/testsetSync"
+import {
+    buildTestcaseExportRows,
+    buildTestsetSyncOperations,
+    buildTestsetSyncPreview,
+    buildTraceTestsetRows,
+    getQueueAnnotationTag,
+    getTestsetSyncEvaluatorColumnKey,
+    mergeTestcaseAnnotationTags,
+    remapTargetRowsToBaseRevision,
+    selectQueueScopedAnnotation,
+    TESTCASE_QUEUE_KIND_TAG,
+} from "../../src/state/testsetSync"
+
+// ---------------------------------------------------------------------------
+// Minimal fixture builders
+// ---------------------------------------------------------------------------
+
+function makeAnnotation(
+    overrides: {
+        evaluatorSlug?: string
+        evaluatorId?: string
+        tags?: string[]
+        outputs?: Record<string, unknown>
+        traceId?: string
+        spanId?: string
+    } = {},
+): Annotation {
+    // Unset overrides fall back to "trace-1" / "span-1", no tags and empty outputs.
+    const {evaluatorSlug: slug, evaluatorId: id, tags, outputs, traceId, spanId} = overrides
+    const evaluator = {id, slug}
+    const fixture = {
+        trace_id: traceId ?? "trace-1",
+        span_id: spanId ?? "span-1",
+        meta: {tags: tags ?? []},
+        references: {evaluator},
+        data: {outputs: outputs ?? {}},
+    }
+    // Only the fields populated above exist on the object, hence the double cast.
+    return fixture as unknown as Annotation
+}
+
+function queueTag(queueId: string) {
+    return "agenta:queue:".concat(queueId) // expected tag literal, built independently of src
+}
+
+// ---------------------------------------------------------------------------
+// getQueueAnnotationTag
+// ---------------------------------------------------------------------------
+
+describe("getQueueAnnotationTag", () => {
+    it("formats queue ID into tag", () => {
+        expect(getQueueAnnotationTag("q-abc")).toBe("agenta:queue:q-abc")
+    })
+
+    it("handles arbitrary queue IDs", () => {
+        expect(getQueueAnnotationTag("123-456-789")).toBe("agenta:queue:123-456-789")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// mergeTestcaseAnnotationTags
+// ---------------------------------------------------------------------------
+
+describe("mergeTestcaseAnnotationTags", () => {
+    it("always includes the queue tag and kind tag", () => {
+        const tags = mergeTestcaseAnnotationTags({queueId: "q-1"})
+        expect(tags).toContain(queueTag("q-1"))
+        expect(tags).toContain(TESTCASE_QUEUE_KIND_TAG)
+    })
+
+    it("merges existing tags without duplicates", () => {
+        const tags = mergeTestcaseAnnotationTags({
+            queueId: "q-1",
+            existingTags: ["score", "notes", queueTag("q-1")],
+            outputKeys: ["score"],
+        })
+        expect(tags.filter((t) => t === "score")).toHaveLength(1)
+        expect(tags.filter((t) => t === queueTag("q-1"))).toHaveLength(1)
+        expect(tags).toContain("notes")
+    })
+
+    it("adds output keys as tags", () => {
+        const tags = mergeTestcaseAnnotationTags({
+            queueId: "q-1",
+            outputKeys: ["relevance", "fluency"],
+        })
+        expect(tags).toContain("relevance")
+        expect(tags).toContain("fluency")
+    })
+
+    it("handles null existingTags gracefully", () => {
+        const tags = mergeTestcaseAnnotationTags({queueId: "q-1", existingTags: null})
+        expect(tags).toContain(queueTag("q-1"))
+        expect(tags).toContain(TESTCASE_QUEUE_KIND_TAG)
+    })
+
+    it("filters out falsy tags from existingTags", () => {
+        const tags = mergeTestcaseAnnotationTags({
+            queueId: "q-1",
+            existingTags: ["", null as unknown as string, "valid-tag"],
+        })
+        expect(tags).not.toContain("")
+        expect(tags).not.toContain(null)
+        expect(tags).toContain("valid-tag")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// selectQueueScopedAnnotation
+// ---------------------------------------------------------------------------
+
+describe("selectQueueScopedAnnotation — no match", () => {
+    it("returns null annotation when list is empty", () => {
+        const result = selectQueueScopedAnnotation({
+            annotations: [],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: null})
+    })
+
+    it("returns null annotation when no annotation matches the evaluator slug", () => {
+        const ann = makeAnnotation({evaluatorSlug: "other-evaluator"})
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: null})
+    })
+})
+
+describe("selectQueueScopedAnnotation — queue-scoped matching", () => {
+    it("returns the annotation when exactly one queue-scoped match exists", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: ann, conflictCode: null})
+    })
+
+    it("returns duplicate_queue_annotations when multiple queue-scoped annotations match", () => {
+        const ann1 = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            traceId: "trace-1",
+        })
+        const ann2 = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            traceId: "trace-2",
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann1, ann2],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: "duplicate_queue_annotations"})
+    })
+
+    it("ignores annotations scoped to a different queue", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-OTHER"), TESTCASE_QUEUE_KIND_TAG],
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        // Not a queue-scoped match for q-1, and it has a queue tag → not legacy either
+        expect(result.annotation).toBeNull()
+        expect(result.conflictCode).toBeNull()
+    })
+})
+
+describe("selectQueueScopedAnnotation — legacy fallback", () => {
+    it("falls back to a legacy annotation (no queue tags) when no queue-scoped match", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [], // no queue tags → legacy
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: ann, conflictCode: null})
+    })
+
+    it("returns duplicate_legacy_annotations when multiple legacy annotations match", () => {
+        const ann1 = makeAnnotation({evaluatorSlug: "relevance", tags: [], traceId: "trace-1"})
+        const ann2 = makeAnnotation({evaluatorSlug: "relevance", tags: [], traceId: "trace-2"})
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann1, ann2],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: "duplicate_legacy_annotations"})
+    })
+})
+
+describe("selectQueueScopedAnnotation — evaluatorWorkflowId matching", () => {
+    it("matches annotation by evaluator workflow ID", () => {
+        const ann = makeAnnotation({
+            evaluatorId: "wf-abc",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+            evaluatorWorkflowId: "wf-abc",
+        })
+        expect(result).toEqual({annotation: ann, conflictCode: null})
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getTestsetSyncEvaluatorColumnKey
+// ---------------------------------------------------------------------------
+
+describe("getTestsetSyncEvaluatorColumnKey", () => {
+    const evaluator = {slug: "relevance", workflowId: "wf-1"}
+
+    it("returns evaluator slug when no annotation supplied", () => {
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator})).toBe("relevance")
+    })
+
+    it("prefers annotation's evaluator slug over evaluator.slug", () => {
+        const ann = makeAnnotation({evaluatorSlug: "resolved-slug"})
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator, annotation: ann})).toBe("resolved-slug")
+    })
+
+    it("falls back to evaluator.workflowId when slug is empty", () => {
+        const noSlugEval = {slug: "", workflowId: "wf-fallback"}
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator: noSlugEval})).toBe("wf-fallback")
+    })
+
+    it("returns empty string when evaluator has no slug or workflowId", () => {
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator: {slug: "", workflowId: ""}})).toBe("")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTestsetSyncOperations
+// ---------------------------------------------------------------------------
+
+describe("buildTestsetSyncOperations", () => {
+    it("maps target rows to replace operations", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 2,
+            rows: [
+                {
+                    scenarioId: "s-1",
+                    testcaseId: "tc-1",
+                    testsetId: "ts-1",
+                    rowId: "r-1",
+                    data: {x: 1},
+                },
+                {
+                    scenarioId: "s-2",
+                    testcaseId: "tc-2",
+                    testsetId: "ts-1",
+                    rowId: "r-2",
+                    data: {x: 2},
+                },
+            ],
+        }
+
+        const ops = buildTestsetSyncOperations(target)
+        expect(ops).toEqual({
+            rows: {
+                replace: [
+                    {id: "r-1", data: {x: 1}},
+                    {id: "r-2", data: {x: 2}},
+                ],
+            },
+        })
+    })
+
+    it("produces an empty replace list for a target with no rows", () => {
+        const ops = buildTestsetSyncOperations({
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 0,
+            rows: [],
+        })
+        expect(ops.rows.replace).toHaveLength(0)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// remapTargetRowsToBaseRevision
+// ---------------------------------------------------------------------------
+
+describe("remapTargetRowsToBaseRevision", () => {
+    it("keeps rows whose rowId exists directly in baseRows", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {scenarioId: "s-1", testcaseId: "tc-1", testsetId: "ts-1", rowId: "r-1", data: {}},
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "r-1"}],
+        })
+        expect(result.rows).toHaveLength(1)
+        expect(result.rows[0].rowId).toBe("r-1")
+        expect(droppedRowCount).toBe(0)
+    })
+
+    it("remaps a row using testcase_dedup_id when rowId is not in baseRows", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {
+                    scenarioId: "s-1",
+                    testcaseId: "tc-1",
+                    testsetId: "ts-1",
+                    rowId: "old-id",
+                    data: {testcase_dedup_id: "dedup-abc"},
+                },
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "new-id", data: {testcase_dedup_id: "dedup-abc"}}],
+        })
+        expect(result.rows[0].rowId).toBe("new-id")
+        expect(droppedRowCount).toBe(0)
+    })
+
+    it("also remaps using legacy __dedup_id__ key", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {
+                    scenarioId: "s-1",
+                    testcaseId: "tc-1",
+                    testsetId: "ts-1",
+                    rowId: "old-id",
+                    data: {__dedup_id__: "dedup-xyz"},
+                },
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "mapped-id", data: {__dedup_id__: "dedup-xyz"}}],
+        })
+        expect(result.rows[0].rowId).toBe("mapped-id")
+        expect(droppedRowCount).toBe(0)
+    })
+
+    it("drops rows with no matching rowId and no dedup key", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {scenarioId: "s-1", testcaseId: "tc-1", testsetId: "ts-1", rowId: "gone", data: {}},
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "other-id"}],
+        })
+        expect(result.rows).toHaveLength(0)
+        expect(droppedRowCount).toBe(1)
+    })
+
+    it("updates rowCount to reflect mapped rows only", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 2,
+            rows: [
+                {scenarioId: "s-1", testcaseId: "tc-1", testsetId: "ts-1", rowId: "r-1", data: {}},
+                {scenarioId: "s-2", testcaseId: "tc-2", testsetId: "ts-1", rowId: "gone", data: {}},
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "r-1"}],
+        })
+        expect(result.rowCount).toBe(1)
+        expect(droppedRowCount).toBe(1)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTraceTestsetRows
+// ---------------------------------------------------------------------------
+
+describe("buildTraceTestsetRows", () => {
+    it("builds a row per scenario with trace inputs and output", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-1"],
+            traceInputsByScenario: new Map([["s-1", {question: "What is AI?"}]]),
+            traceOutputsByScenario: new Map([["s-1", "AI is..."]]),
+            annotationsByScenario: new Map(),
+            outputColumnName: "answer",
+        })
+        expect(rows).toHaveLength(1)
+        expect(rows[0].scenarioId).toBe("s-1")
+        expect(rows[0].data.question).toBe("What is AI?")
+        expect(rows[0].data.answer).toBe("AI is...")
+    })
+
+    it("expands a nested 'inputs' key into top-level columns", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-1"],
+            traceInputsByScenario: new Map([["s-1", {inputs: {a: 1, b: 2}}]]),
+            traceOutputsByScenario: new Map(),
+            annotationsByScenario: new Map(),
+            outputColumnName: "output",
+        })
+        expect(rows[0].data.a).toBe(1)
+        expect(rows[0].data.b).toBe(2)
+        expect(rows[0].data).not.toHaveProperty("inputs")
+    })
+
+    it("merges annotation outputs into the row", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-1"],
+            traceInputsByScenario: new Map([["s-1", {q: "hi"}]]),
+            traceOutputsByScenario: new Map([["s-1", "hello"]]),
+            annotationsByScenario: new Map([["s-1", {relevance: {score: 5}}]]),
+            outputColumnName: "output",
+        })
+        expect(rows[0].data.relevance).toMatchObject({score: 5})
+    })
+
+    it("handles a missing scenario gracefully (uses empty defaults)", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-missing"],
+            traceInputsByScenario: new Map(),
+            traceOutputsByScenario: new Map(),
+            annotationsByScenario: new Map(),
+            outputColumnName: "output",
+        })
+        expect(rows).toHaveLength(1)
+        expect(rows[0].data.output).toBeUndefined()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTestcaseExportRows
+// ---------------------------------------------------------------------------
+
+describe("buildTestcaseExportRows", () => {
+    const evaluator = {slug: "quality", workflowId: "wf-q"}
+
+    function makeTestcase(id: string, testsetId: string) {
+        return {id, testset_id: testsetId, data: {prompt: "hello"}}
+    }
+
+    it("builds a row when annotation data exists for the testcase", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "quality",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            outputs: {score: 8},
+        })
+        const rows = buildTestcaseExportRows({
+            scenarioIds: ["s-1"],
+            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1") as any]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
+            evaluators: [evaluator],
+            queueId: "q-1",
+        })
+        expect(rows).toHaveLength(1)
+        expect(rows[0].testcaseId).toBe("tc-1")
+        expect(rows[0].testsetId).toBe("ts-1")
+        expect((rows[0].data as any).quality).toMatchObject({score: 8})
+    })
+
+    it("skips a scenario with no testcase mapping", () => {
+        const rows = buildTestcaseExportRows({
+            scenarioIds: ["s-missing"],
+            testcasesByScenarioId: new Map(),
+            annotationsByTestcaseId: new Map(),
+            evaluators: [evaluator],
+            queueId: "q-1",
+        })
+        expect(rows).toHaveLength(0)
+    })
+
+    it("skips a testcase with no annotations", () => {
+        const rows = buildTestcaseExportRows({
+            scenarioIds: ["s-1"],
+            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1") as any]]),
+            annotationsByTestcaseId: new Map([["tc-1", []]]),
+            evaluators: [evaluator],
+            queueId: "q-1",
+        })
+        expect(rows).toHaveLength(0)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTestsetSyncPreview
+// ---------------------------------------------------------------------------
+
+describe("buildTestsetSyncPreview", () => {
+    const evaluator = {slug: "quality", workflowId: "wf-q"}
+
+    function makeTestcase(id: string, testsetId: string) {
+        return {id, testset_id: testsetId, data: {}}
+    }
+
+    function makeQueueAnn(traceId = "trace-1") {
+        return makeAnnotation({
+            evaluatorSlug: "quality",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            outputs: {score: 7},
+            traceId,
+        })
+    }
+
+    it("returns a missing_testcase conflict when testcase not found", () => {
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-missing"}],
+            testcasesById: new Map(),
+            annotationsByTestcaseId: new Map(),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map(),
+        })
+        expect(preview.conflicts).toHaveLength(1)
+        expect(preview.conflicts[0].code).toBe("missing_testcase")
+        expect(preview.hasBlockingConflicts).toBe(true)
+    })
+
+    it("returns a missing_testset conflict when testcase has no testset_id", () => {
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", {id: "tc-1", data: {}} as any]]),
+            annotationsByTestcaseId: new Map(),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map(),
+        })
+        expect(preview.conflicts[0].code).toBe("missing_testset")
+    })
+
+    it("returns a missing_latest_revision conflict when no revision for testset", () => {
+        const ann = makeQueueAnn()
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map(), // ts-1 has no revision
+        })
+        expect(preview.conflicts.some((c) => c.code === "missing_latest_revision")).toBe(true)
+    })
+
+    it("produces a clean target when everything is resolved", () => {
+        const ann = makeQueueAnn()
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.conflicts).toHaveLength(0)
+        expect(preview.targets).toHaveLength(1)
+        expect(preview.targets[0].testsetId).toBe("ts-1")
+        expect(preview.targets[0].baseRevisionId).toBe("rev-1")
+        expect(preview.exportableRows).toBe(1)
+        expect(preview.hasBlockingConflicts).toBe(false)
+    })
+
+    it("records duplicate_queue_annotations conflict and skips the row", () => {
+        const ann1 = makeQueueAnn("trace-1")
+        const ann2 = makeQueueAnn("trace-2")
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann1, ann2]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.conflicts[0].code).toBe("duplicate_queue_annotations")
+        expect(preview.exportableRows).toBe(0)
+        expect(preview.hasBlockingConflicts).toBe(true)
+    })
+
+    it("groups rows from different scenarios under the same testset target", () => {
+        const ann1 = makeQueueAnn("trace-1")
+        const ann2 = makeQueueAnn("trace-2")
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [
+                {scenarioId: "s-1", testcaseId: "tc-1"},
+                {scenarioId: "s-2", testcaseId: "tc-2"},
+            ],
+            testcasesById: new Map([
+                ["tc-1", makeTestcase("tc-1", "ts-1") as any],
+                ["tc-2", makeTestcase("tc-2", "ts-1") as any],
+            ]),
+            annotationsByTestcaseId: new Map([
+                ["tc-1", [ann1]],
+                ["tc-2", [ann2]],
+            ]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.targets).toHaveLength(1)
+        expect(preview.targets[0].rowCount).toBe(2)
+        expect(preview.exportableRows).toBe(2)
+    })
+
+    it("skips rows with no annotation data and does not add them as conflicts", () => {
+        const annNoOutputs = makeAnnotation({
+            evaluatorSlug: "quality",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            outputs: {}, // empty
+        })
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            annotationsByTestcaseId: new Map([["tc-1", [annNoOutputs]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.conflicts).toHaveLength(0)
+        expect(preview.exportableRows).toBe(0)
+    })
+})
diff --git a/web/packages/agenta-annotation/vitest.config.ts b/web/packages/agenta-annotation/vitest.config.ts
new file mode 100644
index 0000000000..92bca1ab9d
--- /dev/null
+++ b/web/packages/agenta-annotation/vitest.config.ts
@@ -0,0 +1,28 @@
+import path from "path"
+
+import {defineConfig} from "vitest/config"
+
+export default defineConfig({
+    resolve: {
+        alias: {
+            // Stub @agenta/ui to avoid pulling in the full antd tree.
+            // Annotation tests only exercise pure functions — no React rendering.
+            "@agenta/ui": path.resolve(__dirname, "tests/__mocks__/agenta-ui.ts"),
+        },
+    },
+    test: {
+        include: ["tests/unit/**/*.test.ts"],
+        environment: "node",
+        reporters: ["default", "junit"],
+        outputFile: {
+            junit: "./test-results/junit.xml",
+        },
+        coverage: {
+            provider: "v8",
+            include: ["src/**/*.ts"],
+            exclude: ["src/**/index.ts"],
+            reporter: ["text", "lcov", "json-summary"],
+            reportsDirectory: "./coverage",
+        },
+    },
+})
diff --git a/web/packages/agenta-shared/package.json b/web/packages/agenta-shared/package.json
index 2d49bad1cb..7b13e46b14 100644
--- a/web/packages/agenta-shared/package.json
+++ b/web/packages/agenta-shared/package.json
@@ -9,7 +9,12 @@
         "build": "pnpm run types:check",
         "types:check": "tsc --noEmit",
         "lint": "eslint --config ../eslint.config.mjs src/ --max-warnings 0",
-        "lint:fix": "eslint --config ../eslint.config.mjs src/ --max-warnings 0 --fix"
+        "lint:fix": "eslint --config ../eslint.config.mjs src/ --max-warnings 0 --fix",
+        "test": "pnpm run test:unit",
+        "test:unit": "vitest run",
+        "test:watch": "vitest",
+        "test:coverage": "vitest run --coverage",
+        "check": "pnpm run types:check && pnpm run lint"
     },
     "exports": {
         ".": "./src/index.ts",
@@ -31,7 +36,9 @@
     "devDependencies": {
         "@types/node": "^20.8.10",
         "@types/react": "^19.0.10",
-        "typescript": "5.8.3"
+        "@vitest/coverage-v8": "^4.1.4",
+        "typescript": "5.8.3",
+        "vitest": "^4.1.4"
     },
     "peerDependencies": {
         "@tanstack/react-query": ">=5.0.0",
diff --git a/web/packages/agenta-shared/test-results/junit.xml b/web/packages/agenta-shared/test-results/junit.xml
new file mode 100644
index 0000000000..ba991a034e
--- /dev/null
+++ b/web/packages/agenta-shared/test-results/junit.xml
@@ -0,0 +1,389 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<testsuites name="vitest tests" tests="187" failures="0" errors="0" time="0.15012691">
+    <testsuite name="tests/unit/data-transforms.test.ts" timestamp="2026-06-03T07:51:20.543Z" hostname="Kaosisos-MacBook-Pro.local" tests="22" failures="0" errors="0" skipped="0" time="0.014309687">
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from response.data.detail string" time="0.002546081">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from response.data.message string" time="0.000365527">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from response.data.error string" time="0.0002325">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from nested response.data.detail.message" time="0.000289911">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from an array of detail strings" time="0.000366091">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Error instances &gt; returns error.message for a plain Error" time="0.000310955">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — direct string/object &gt; returns a non-empty string value directly" time="0.000275387">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — direct string/object &gt; falls back to String(error) for unknown shapes" time="0.00019925">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="preserveResponseStatus &gt; wraps an error with a custom message" time="0.000816785">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="preserveResponseStatus &gt; preserves the response status from the original error" time="0.00033335">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="preserveResponseStatus &gt; preserves the original error message when no override is given" time="0.000300095">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; removes agenta_metadata keys from objects" time="0.000901889">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; removes __agenta_metadata keys from objects" time="0.000661765">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; recursively strips metadata from nested objects" time="0.000290145">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; strips metadata from objects inside arrays" time="0.00021932">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; returns primitives unchanged" time="0.000203792">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; unwraps a simple {__id, __metadata, value} wrapper" time="0.000355085">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; strips __id and __metadata from plain objects (non-wrapper)" time="0.000448585">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; recursively strips wrappers from nested objects" time="0.000242952">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; processes arrays recursively" time="0.001055276">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; returns null/undefined unchanged" time="0.000209983">
+        </testcase>
+        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; returns primitives unchanged" time="0.000148744">
+        </testcase>
+    </testsuite>
+    <testsuite name="tests/unit/formatters.test.ts" timestamp="2026-06-03T07:51:20.555Z" hostname="Kaosisos-MacBook-Pro.local" tests="44" failures="0" errors="0" skipped="0" time="0.029119193">
+        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; formats with locale thousand separators and 2 decimal places" time="0.003713118">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; returns &apos;-&apos; for null" time="0.000494948">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; returns &apos;-&apos; for undefined" time="0.000334845">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; formats zero" time="0.000426467">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; formats negative numbers" time="0.000340483">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatCompact &gt; formats thousands as K" time="0.000465451">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatCompact &gt; formats millions as M" time="0.000407826">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatCompact &gt; returns &apos;-&apos; for null" time="0.000305715">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatCurrency &gt; formats with dollar sign and 2 decimals for typical values" time="0.00254155">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatCurrency &gt; formats small values without trailing zeros (maximumFractionDigits: 6)" time="0.000456143">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatCurrency &gt; returns &apos;-&apos; for null" time="0.000727972">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats sub-millisecond values in μs" time="0.000334701">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats millisecond-range values in ms" time="0.000224338">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats second-range values in s" time="0.000194387">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats exactly 1 second" time="0.000137998">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; returns &apos;-&apos; for null" time="0.000128351">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; returns &apos;-&apos; for undefined" time="0.000136314">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; formats values under 1000 as plain integers" time="0.000364411">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; formats thousands as K with 1 decimal" time="0.000143129">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; formats millions as M with 1 decimal" time="0.000140819">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; returns &apos;-&apos; for null" time="0.00011791">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; formats decimal as percentage with 1 decimal for values &gt;= 10%" time="0.000228929">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; formats small values with 2 decimal places" time="0.000129466">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; returns &apos;100%&apos; for values &gt;= 99.95%" time="0.000478414">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; returns &apos;0%&apos; for zero" time="0.000144519">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; treats negative values as 0%" time="0.000189746">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; returns &apos;-&apos; for null" time="0.000144275">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; formats values with significant-figure-aware decimals" time="0.000419335">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; returns &apos;0&apos; for zero" time="0.000219374">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; uses scientific notation for extreme values" time="0.000279056">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; returns &apos;-&apos; for null" time="0.000256581">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; wraps strings in quotes" time="0.000306751">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; truncates long strings and adds ellipsis" time="0.00017171">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats numbers as-is" time="0.000130479">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats booleans as-is" time="0.000140545">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats arrays with length" time="0.000113721">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats small objects with key names" time="0.000124305">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; truncates objects with more than 3 keys" time="0.000131993">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; returns &apos;(null)&apos; for null" time="0.000111097">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; returns &apos;(undefined)&apos; for undefined" time="0.000152774">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; applies multiplier, prefix, suffix, and fixed decimals" time="0.000473756">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; uses the custom fallback for null/undefined" time="0.000223072">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; uses compact notation when compact: true" time="0.000384642">
+        </testcase>
+        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; prepends a prefix" time="0.000216185">
+        </testcase>
+    </testsuite>
+    <testsuite name="tests/unit/path-utils.test.ts" timestamp="2026-06-03T07:51:20.571Z" hostname="Kaosisos-MacBook-Pro.local" tests="25" failures="0" errors="0" skipped="0" time="0.037525386">
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; retrieves a deeply nested value" time="0.002557808">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; returns the root when the path is empty" time="0.000322721">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; returns undefined for a missing key" time="0.000256621">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; returns undefined when traversal hits null" time="0.000267802">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; accesses array elements by numeric index" time="0.000327434">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; accesses array elements by string index" time="0.000195651">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; returns undefined for out-of-bounds index" time="0.000250685">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; navigates mixed array/object paths" time="0.000218769">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — JSON string traversal &gt; parses a JSON string and continues traversal" time="0.00074782">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — JSON string traversal &gt; returns undefined when the string is not valid JSON" time="0.000317394">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — object mutation (immutable) &gt; sets a nested value without mutating the original" time="0.000377345">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — object mutation (immutable) &gt; creates intermediate objects for new paths" time="0.00015727">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — object mutation (immutable) &gt; replaces the root when path is empty" time="0.000134198">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — array mutation (immutable) &gt; sets an array element by index" time="0.000164713">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — array mutation (immutable) &gt; handles nested array+object paths" time="0.000166792">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — JSON string re-serialisation &gt; parses a JSON string, sets the value, and re-stringifies" time="0.000172575">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="deleteValueAtPath — object &gt; removes a key from a nested object (immutable)" time="0.002067215">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="deleteValueAtPath — object &gt; returns data unchanged when path is empty" time="0.020732904">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="deleteValueAtPath — array &gt; removes an element from an array by index" time="0.001636661">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns true when the key exists" time="0.000583877">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns false when the key is missing" time="0.000163844">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns false when a parent is null" time="0.000124015">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns true for valid array index" time="0.000115465">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns false for out-of-bounds array index" time="0.000115521">
+        </testcase>
+        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns true for the root when path is empty and data is defined" time="0.000104562">
+        </testcase>
+    </testsuite>
+    <testsuite name="tests/unit/slug.test.ts" timestamp="2026-06-03T07:51:20.581Z" hostname="Kaosisos-MacBook-Pro.local" tests="40" failures="0" errors="0" skipped="0" time="0.030062015">
+        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; lowercases and trims" time="0.003214561">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; replaces spaces with hyphens" time="0.00045722">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; collapses multiple spaces into one hyphen" time="0.000251821">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; strips leading and trailing hyphens" time="0.000354766">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; preserves allowed chars: digits, underscore, dot, hyphen" time="0.000235035">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; removes disallowed special characters" time="0.00021152">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; returns empty string for a blank input" time="0.000533555">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithSuffix &gt; produces &lt;base&gt;-&lt;4 chars&gt; format" time="0.003529513">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithSuffix &gt; falls back to &apos;resource&apos; when name slugifies to empty" time="0.000746449">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithSuffix &gt; produces different slugs on repeated calls (randomness)" time="0.000734153">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithExistingSuffix &gt; appends the provided suffix to the slugified name" time="0.000325176">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithExistingSuffix &gt; generates a new random suffix when suffix is null" time="0.000376281">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithExistingSuffix &gt; generates a new random suffix when suffix is undefined" time="0.000230762">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="getSlugSuffix &gt; returns the 4-char suffix when present" time="0.000250414">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="getSlugSuffix &gt; returns null when the trailing segment is not exactly 4 chars" time="0.00025666">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="getSlugSuffix &gt; returns null when there is no hyphen-separated suffix" time="0.000218056">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="stripSlugSuffix &gt; removes the 4-char suffix" time="0.000213417">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="stripSlugSuffix &gt; leaves the slug unchanged when no suffix is present" time="0.000179517">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="regenerateSlugSuffix &gt; replaces the known suffix with a new random one" time="0.001827256">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="regenerateSlugSuffix &gt; appends a new suffix when the slug does not end with the given suffix" time="0.000207291">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="regenerateSlugSuffix &gt; always produces a 4-char suffix" time="0.000200284">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug a" time="0.000257971">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug abc" time="0.000181899">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug my-app" time="0.000093801">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug my_app" time="0.000083682">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug app.v2" time="0.000083173">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug app-v2-ab12" time="0.00007983">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for empty string" time="0.00011098">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for slugs longer than 255 characters" time="0.000129017">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for double hyphens" time="0.0001072">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for double dots" time="0.000214997">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for slugs starting or ending with non-alphanumeric" time="0.000242711">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="buildGatewayToolSlug &gt; builds the correct double-underscore format" time="0.000756189">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isGatewayToolSlug &gt; returns true for a valid gateway tool slug" time="0.001401067">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="isGatewayToolSlug &gt; returns false for a non-gateway slug" time="0.003921047">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; parses all four parts correctly" time="0.001736217">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null for a slug with wrong number of parts" time="0.000184061">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null for a slug that does not start with &apos;tools&apos;" time="0.000400336">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null for undefined input" time="0.000282527">
+        </testcase>
+        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null when any segment is empty" time="0.000151165">
+        </testcase>
+    </testsuite>
+    <testsuite name="tests/unit/template-variable.test.ts" timestamp="2026-06-03T07:51:20.594Z" hostname="Kaosisos-MacBook-Pro.local" tests="23" failures="0" errors="0" skipped="0" time="0.01161637">
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — empty / malformed &gt; rejects an empty expression" time="0.002987017">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — empty / malformed &gt; rejects expressions with consecutive dots (..)" time="0.000484413">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — empty / malformed &gt; rejects expressions with consecutive slashes (//)" time="0.000282633">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; accepts a well-formed JSONPath" time="0.000333902">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; accepts bare &apos;$&apos; (whole context shorthand)" time="0.000213164">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; rejects &apos;$&lt;no-dot&gt;&apos; (malformed root)" time="0.000193018">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; rejects &apos;$.&apos; with no field after the dot" time="0.000297891">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; accepts any root segment — does NOT validate against envelope slots (permissive)" time="0.000347021">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; accepts a pointer rooted at a known envelope slot" time="0.00073819">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; rejects a multi-segment pointer with an unknown root slot" time="0.000411952">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; includes a &apos;did-you-mean&apos; suggestion for near-miss slot names" time="0.000275967">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; accepts a single-segment identifier-shaped pointer unconditionally (mustache close tag)" time="0.000139059">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; rejects &apos;/&apos; with no segments" time="0.000125541">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — plain names &gt; accepts plain identifiers" time="0.000245851">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — plain names &gt; accepts dot-notation paths" time="0.000119536">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="isValidTemplateVariable &gt; returns true for a valid expression" time="0.00015515">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="isValidTemplateVariable &gt; returns false for an invalid expression" time="0.000137059">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {{ }} wrappers" time="0.00019578">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {% %} wrappers" time="0.000117001">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {%- -%} wrappers (whitespace-trimming variants)" time="0.000164">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {# #} comment wrappers" time="0.000285222">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; returns the raw text when no wrapper is present" time="0.000128854">
+        </testcase>
+        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; returns empty string for empty input" time="0.000122009">
+        </testcase>
+    </testsuite>
+    <testsuite name="tests/unit/validators-and-ids.test.ts" timestamp="2026-06-03T07:51:20.602Z" hostname="Kaosisos-MacBook-Pro.local" tests="33" failures="0" errors="0" skipped="0" time="0.027494259">
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns true for valid UUID 123e4567-e89b-12d3-a456-426614174000" time="0.003901645">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns true for valid UUID 00000000-0000-0000-0000-000000000000" time="0.001252744">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns true for valid UUID FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF" time="0.000217274">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input " time="0.000379108">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input not-a-uuid" time="0.000236402">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input 123e4567-e89b-12d3-a456" time="0.000159416">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input 123e4567-e89b-12d3-a456-42661417400Z" time="0.00024089">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input 123e4567e89b12d3a456426614174000" time="0.000520701">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="validateUUID &gt; does not throw for a valid UUID" time="0.001772324">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="validateUUID &gt; throws with a descriptive message for an invalid UUID" time="0.000672121">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns true for http://example.com" time="0.000331258">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns true for https://example.com/path?q=1" time="0.000188039">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for ftp://example.com" time="0.000192804">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for not-a-url" time="0.000215639">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for " time="0.000138057">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for javascript:alert(1)" time="0.000163908">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex ^[a-z]+$" time="0.000265106">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex \d+" time="0.000129412">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex (foo|bar)" time="0.000198754">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex .*" time="0.000118527">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns false for invalid regex [invalid" time="0.000173268">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns false for invalid regex (unclosed" time="0.000099736">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns false for invalid regex *bad" time="0.000091557">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToTraceId &gt; strips dashes from a UUID" time="0.000158752">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToTraceId &gt; returns undefined for undefined input" time="0.008773454">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToTraceId &gt; returns undefined for empty string" time="0.000167004">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToSpanId &gt; returns the last 16 hex chars of the stripped UUID" time="0.000202041">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToSpanId &gt; returns undefined for undefined input" time="0.000172146">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToSpanId &gt; span ID length is always 16" time="0.001682264">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; removes a trailing slash" time="0.000315106">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; leaves a URI without trailing slash unchanged" time="0.000141222">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; removes only the last slash, not interior ones" time="0.000687124">
+        </testcase>
+        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; handles empty string" time="0.000208165">
+        </testcase>
+    </testsuite>
+</testsuites>
diff --git a/web/packages/agenta-shared/tests/unit/data-transforms.test.ts b/web/packages/agenta-shared/tests/unit/data-transforms.test.ts
new file mode 100644
index 0000000000..148b7a78dd
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/data-transforms.test.ts
@@ -0,0 +1,165 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    extractApiErrorMessage,
+    preserveResponseStatus,
+} from "../../src/utils/extractApiErrorMessage"
+import {stripAgentaMetadataDeep, stripEnhancedWrappers} from "../../src/utils/valueExtraction"
+
+// ---------------------------------------------------------------------------
+// extractApiErrorMessage
+// ---------------------------------------------------------------------------
+
+describe("extractApiErrorMessage — Axios-style errors", () => {
+    it("extracts from response.data.detail string", () => {
+        const error = {response: {data: {detail: "Not found"}}}
+        expect(extractApiErrorMessage(error)).toBe("Not found")
+    })
+
+    it("extracts from response.data.message string", () => {
+        const error = {response: {data: {message: "Forbidden"}}}
+        expect(extractApiErrorMessage(error)).toBe("Forbidden")
+    })
+
+    it("extracts from response.data.error string", () => {
+        const error = {response: {data: {error: "Internal error"}}}
+        expect(extractApiErrorMessage(error)).toBe("Internal error")
+    })
+
+    it("extracts from nested response.data.detail.message", () => {
+        const error = {response: {data: {detail: {message: "Nested message"}}}}
+        expect(extractApiErrorMessage(error)).toBe("Nested message")
+    })
+
+    it("extracts from an array of detail strings", () => {
+        const error = {response: {data: {detail: ["error one", "error two"]}}}
+        const result = extractApiErrorMessage(error)
+        expect(result).toContain("error one")
+    })
+})
+
+describe("extractApiErrorMessage — Error instances", () => {
+    it("returns error.message for a plain Error", () => {
+        expect(extractApiErrorMessage(new Error("Something failed"))).toBe("Something failed")
+    })
+})
+
+describe("extractApiErrorMessage — direct string/object", () => {
+    it("returns a non-empty string value directly", () => {
+        expect(extractApiErrorMessage("plain error string")).toBe("plain error string")
+    })
+
+    it("falls back to String(error) for unknown shapes", () => {
+        expect(extractApiErrorMessage(42)).toBe("42")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// preserveResponseStatus
+// ---------------------------------------------------------------------------
+
+describe("preserveResponseStatus", () => {
+    it("wraps an error with a custom message", () => {
+        const err = preserveResponseStatus(new Error("original"), "custom message")
+        expect(err.message).toBe("custom message")
+    })
+
+    it("preserves the response status from the original error", () => {
+        const axiosError = {response: {status: 404}, message: "Not found"}
+        const err = preserveResponseStatus(axiosError, "Not found")
+        expect(err.response?.status).toBe(404)
+    })
+
+    it("preserves the original error message when no override is given", () => {
+        const err = preserveResponseStatus(new Error("original"))
+        expect(err.message).toBe("original")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// stripAgentaMetadataDeep
+// ---------------------------------------------------------------------------
+
+describe("stripAgentaMetadataDeep", () => {
+    it("removes agenta_metadata keys from objects", () => {
+        const input = {name: "Alice", agenta_metadata: {source: "api"}}
+        const result = stripAgentaMetadataDeep(input)
+        expect(result).not.toHaveProperty("agenta_metadata")
+        expect((result as typeof input).name).toBe("Alice")
+    })
+
+    it("removes __agenta_metadata keys from objects", () => {
+        const input = {value: 1, __agenta_metadata: {}}
+        expect(stripAgentaMetadataDeep(input)).not.toHaveProperty("__agenta_metadata")
+    })
+
+    it("recursively strips metadata from nested objects", () => {
+        const input = {
+            user: {name: "Alice", agenta_metadata: {x: 1}},
+        }
+        const result = stripAgentaMetadataDeep(input) as typeof input
+        expect(result.user).not.toHaveProperty("agenta_metadata")
+        expect(result.user.name).toBe("Alice")
+    })
+
+    it("strips metadata from objects inside arrays", () => {
+        const input = [{score: 5, agenta_metadata: {}}]
+        const result = stripAgentaMetadataDeep(input) as typeof input
+        expect(result[0]).not.toHaveProperty("agenta_metadata")
+        expect(result[0].score).toBe(5)
+    })
+
+    it("returns primitives unchanged", () => {
+        expect(stripAgentaMetadataDeep("hello")).toBe("hello")
+        expect(stripAgentaMetadataDeep(42)).toBe(42)
+        expect(stripAgentaMetadataDeep(null)).toBeNull()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// stripEnhancedWrappers
+// ---------------------------------------------------------------------------
+
+describe("stripEnhancedWrappers", () => {
+    it("unwraps a simple {__id, __metadata, value} wrapper", () => {
+        const input = {__id: "x", __metadata: {}, value: "hello"}
+        expect(stripEnhancedWrappers(input)).toBe("hello")
+    })
+
+    it("strips __id and __metadata from plain objects (non-wrapper)", () => {
+        const input = {__id: "x", __metadata: {}, name: "Alice", age: 30}
+        const result = stripEnhancedWrappers(input) as {name: string; age: number}
+        expect(result).not.toHaveProperty("__id")
+        expect(result).not.toHaveProperty("__metadata")
+        expect(result.name).toBe("Alice")
+        expect(result.age).toBe(30)
+    })
+
+    it("recursively strips wrappers from nested objects", () => {
+        const input = {
+            user: {__id: "u1", __metadata: {}, name: "Alice"},
+        }
+        const result = stripEnhancedWrappers(input) as {user: {name: string}}
+        expect(result.user).not.toHaveProperty("__id")
+        expect(result.user.name).toBe("Alice")
+    })
+
+    it("processes arrays recursively", () => {
+        const input = [
+            {__id: "1", __metadata: {}, value: 1},
+            {__id: "2", __metadata: {}, value: 2},
+        ]
+        const result = stripEnhancedWrappers(input) as number[]
+        expect(result).toEqual([1, 2])
+    })
+
+    it("returns null/undefined unchanged", () => {
+        expect(stripEnhancedWrappers(null)).toBeNull()
+        expect(stripEnhancedWrappers(undefined)).toBeUndefined()
+    })
+
+    it("returns primitives unchanged", () => {
+        expect(stripEnhancedWrappers("hello")).toBe("hello")
+        expect(stripEnhancedWrappers(42)).toBe(42)
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/formatters.test.ts b/web/packages/agenta-shared/tests/unit/formatters.test.ts
new file mode 100644
index 0000000000..eb7bddca4e
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/formatters.test.ts
@@ -0,0 +1,222 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    createFormatter,
+    formatCompact,
+    formatCurrency,
+    formatLatency,
+    formatNumber,
+    formatPercent,
+    formatPreviewValue,
+    formatSignificant,
+    formatTokens,
+} from "../../src/utils/formatters/formatters"
+
+// ---------------------------------------------------------------------------
+// formatNumber
+// ---------------------------------------------------------------------------
+
+describe("formatNumber", () => {
+    it("formats with locale thousand separators and up to 2 decimal places", () => {
+        expect(formatNumber(1234.567)).toBe("1,234.57")
+    })
+
+    it("returns '-' for null", () => expect(formatNumber(null)).toBe("-"))
+    it("returns '-' for undefined", () => expect(formatNumber(undefined)).toBe("-"))
+
+    it("formats zero", () => expect(formatNumber(0)).toBe("0"))
+    it("formats negative numbers", () => expect(formatNumber(-1234)).toBe("-1,234"))
+})
+
+// ---------------------------------------------------------------------------
+// formatCompact
+// ---------------------------------------------------------------------------
+
+describe("formatCompact", () => {
+    it("formats thousands as K", () => expect(formatCompact(1500)).toBe("1.5K"))
+    it("formats millions as M", () => expect(formatCompact(1_500_000)).toBe("1.5M"))
+    it("returns '-' for null", () => expect(formatCompact(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatCurrency
+// ---------------------------------------------------------------------------
+
+describe("formatCurrency", () => {
+    it("formats with dollar sign and 2 decimals for typical values", () => {
+        expect(formatCurrency(1234.56)).toBe("$1,234.56")
+    })
+
+    it("formats small values without trailing zeros (maximumFractionDigits: 6)", () => {
+        expect(formatCurrency(0.00123)).toBe("$0.00123")
+    })
+
+    it("returns '-' for null", () => expect(formatCurrency(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatLatency
+// ---------------------------------------------------------------------------
+
+describe("formatLatency", () => {
+    it("formats sub-millisecond values in μs", () => {
+        expect(formatLatency(0.0001)).toBe("100μs")
+    })
+
+    it("formats millisecond-range values in ms", () => {
+        expect(formatLatency(0.5)).toBe("500ms")
+    })
+
+    it("formats second-range values in s", () => {
+        expect(formatLatency(2.5)).toBe("2.5s")
+    })
+
+    it("formats exactly 1 second", () => {
+        expect(formatLatency(1)).toBe("1s")
+    })
+
+    it("returns '-' for null", () => expect(formatLatency(null)).toBe("-"))
+    it("returns '-' for undefined", () => expect(formatLatency(undefined)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatTokens
+// ---------------------------------------------------------------------------
+
+describe("formatTokens", () => {
+    it("formats values under 1000 as plain integers", () => {
+        expect(formatTokens(500)).toBe("500")
+    })
+
+    it("formats thousands as K with 1 decimal", () => {
+        expect(formatTokens(1500)).toBe("1.5K")
+    })
+
+    it("formats millions as M with 1 decimal", () => {
+        expect(formatTokens(1_500_000)).toBe("1.5M")
+    })
+
+    it("returns '-' for null", () => expect(formatTokens(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatPercent
+// ---------------------------------------------------------------------------
+
+describe("formatPercent", () => {
+    it("formats decimal as percentage with 1 decimal for values >= 10%", () => {
+        expect(formatPercent(0.856)).toBe("85.6%")
+    })
+
+    it("formats small values with 2 decimal places", () => {
+        expect(formatPercent(0.001)).toBe("0.10%")
+    })
+
+    it("returns '100%' for values >= 99.95%", () => {
+        expect(formatPercent(1)).toBe("100%")
+        expect(formatPercent(0.9995)).toBe("100%")
+    })
+
+    it("returns '0%' for zero", () => {
+        expect(formatPercent(0)).toBe("0%")
+    })
+
+    it("treats negative values as 0%", () => {
+        expect(formatPercent(-0.1)).toBe("0%")
+    })
+
+    it("returns '-' for null", () => expect(formatPercent(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatSignificant
+// ---------------------------------------------------------------------------
+
+describe("formatSignificant", () => {
+    it("formats values with significant-figure-aware decimals", () => {
+        // 1234: exponent=3 → decimals=max(0, 2-3)=0 → "1234" (integer, no rounding)
+        expect(formatSignificant(1234)).toBe("1234")
+        // 0.00456: exponent=-3 → decimals=max(0, 2-(-3))=5 → "0.00456"
+        expect(formatSignificant(0.00456)).toBe("0.00456")
+    })
+
+    it("returns '0' for zero", () => {
+        expect(formatSignificant(0)).toBe("0")
+    })
+
+    it("uses scientific notation for extreme values", () => {
+        const result = formatSignificant(1.5e12)
+        expect(result).toMatch(/e/)
+    })
+
+    it("returns '-' for null", () => expect(formatSignificant(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatPreviewValue
+// ---------------------------------------------------------------------------
+
+describe("formatPreviewValue", () => {
+    it("wraps strings in quotes", () => {
+        expect(formatPreviewValue("hello")).toBe('"hello"')
+    })
+
+    it("truncates long strings and adds ellipsis", () => {
+        const long = "a".repeat(60)
+        const result = formatPreviewValue(long, 50)
+        expect(result).toBe(`"${"a".repeat(50)}..."`)
+    })
+
+    it("formats numbers as-is", () => {
+        expect(formatPreviewValue(123)).toBe("123")
+    })
+
+    it("formats booleans as-is", () => {
+        expect(formatPreviewValue(true)).toBe("true")
+        expect(formatPreviewValue(false)).toBe("false")
+    })
+
+    it("formats arrays with length", () => {
+        expect(formatPreviewValue([1, 2, 3])).toBe("[Array(3)]")
+    })
+
+    it("formats small objects with key names", () => {
+        expect(formatPreviewValue({a: 1, b: 2})).toBe("{a, b}")
+    })
+
+    it("truncates objects with more than 3 keys", () => {
+        const result = formatPreviewValue({a: 1, b: 2, c: 3, d: 4})
+        expect(result).toBe("{a, b, c...}")
+    })
+
+    it("returns '(null)' for null", () => expect(formatPreviewValue(null)).toBe("(null)"))
+    it("returns '(undefined)' for undefined", () =>
+        expect(formatPreviewValue(undefined)).toBe("(undefined)"))
+})
+
+// ---------------------------------------------------------------------------
+// createFormatter
+// ---------------------------------------------------------------------------
+
+describe("createFormatter", () => {
+    it("applies multiplier, suffix, and fixed decimals", () => {
+        const fmt = createFormatter({multiplier: 100, suffix: "%", decimals: 1})
+        expect(fmt(0.856)).toBe("85.6%")
+    })
+
+    it("uses the custom fallback for null/undefined", () => {
+        const fmt = createFormatter({fallback: "n/a"})
+        expect(fmt(null)).toBe("n/a")
+        expect(fmt(undefined)).toBe("n/a")
+    })
+
+    it("uses compact notation when compact: true", () => {
+        const fmt = createFormatter({compact: true})
+        expect(fmt(1500)).toBe("1.5K")
+    })
+
+    it("prepends a prefix", () => {
+        const fmt = createFormatter({prefix: "$", decimals: 2})
+        expect(fmt(10)).toBe("$10.00")
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/path-utils.test.ts b/web/packages/agenta-shared/tests/unit/path-utils.test.ts
new file mode 100644
index 0000000000..57c875af13
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/path-utils.test.ts
@@ -0,0 +1,166 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    deleteValueAtPath,
+    getValueAtPath,
+    hasValueAtPath,
+    setValueAtPath,
+} from "../../src/utils/pathUtils"
+
+// ---------------------------------------------------------------------------
+// getValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("getValueAtPath — basic object navigation", () => {
+    const data = {user: {profile: {name: "Alice", age: 30}}}
+
+    it("retrieves a deeply nested value", () => {
+        expect(getValueAtPath(data, ["user", "profile", "name"])).toBe("Alice")
+    })
+
+    it("returns the root when the path is empty", () => {
+        expect(getValueAtPath(data, [])).toBe(data)
+    })
+
+    it("returns undefined for a missing key", () => {
+        expect(getValueAtPath(data, ["user", "missing"])).toBeUndefined()
+    })
+
+    it("returns undefined when traversal hits null", () => {
+        expect(getValueAtPath({a: null}, ["a", "b"])).toBeUndefined()
+    })
+})
+
+describe("getValueAtPath — array indexing", () => {
+    it("accesses array elements by numeric index", () => {
+        expect(getValueAtPath([10, 20, 30], [1])).toBe(20)
+    })
+
+    it("accesses array elements by string index", () => {
+        expect(getValueAtPath([10, 20, 30], ["2"])).toBe(30)
+    })
+
+    it("returns undefined for out-of-bounds index", () => {
+        expect(getValueAtPath([10, 20], [5])).toBeUndefined()
+    })
+
+    it("navigates mixed array/object paths", () => {
+        const data = {items: [{id: "a"}, {id: "b"}]}
+        expect(getValueAtPath(data, ["items", 1, "id"])).toBe("b")
+    })
+})
+
+describe("getValueAtPath — JSON string traversal", () => {
+    it("parses a JSON string and continues traversal", () => {
+        const data = {messages: '{"content": "hello"}'}
+        expect(getValueAtPath(data, ["messages", "content"])).toBe("hello")
+    })
+
+    it("returns undefined when the string is not valid JSON", () => {
+        const data = {messages: "not json"}
+        expect(getValueAtPath(data, ["messages", "content"])).toBeUndefined()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// setValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("setValueAtPath — object mutation (immutable)", () => {
+    it("sets a nested value without mutating the original", () => {
+        const data = {user: {name: "Alice"}}
+        const updated = setValueAtPath(data, ["user", "name"], "Bob")
+        expect((updated as typeof data).user.name).toBe("Bob")
+        expect(data.user.name).toBe("Alice")
+    })
+
+    it("creates intermediate objects for new paths", () => {
+        const data = {}
+        const updated = setValueAtPath(data, ["a", "b"], 42) as {a: {b: number}}
+        expect(updated.a.b).toBe(42)
+    })
+
+    it("replaces the root when path is empty", () => {
+        expect(setValueAtPath({a: 1}, [], "new")).toBe("new")
+    })
+})
+
+describe("setValueAtPath — array mutation (immutable)", () => {
+    it("sets an array element by index", () => {
+        const arr = [1, 2, 3]
+        const updated = setValueAtPath(arr, [1], 99) as number[]
+        expect(updated[1]).toBe(99)
+        expect(arr[1]).toBe(2)
+    })
+
+    it("handles nested array+object paths", () => {
+        const data = {items: [{id: "a"}, {id: "b"}]}
+        const updated = setValueAtPath(data, ["items", 0, "id"], "z") as typeof data
+        expect(updated.items[0].id).toBe("z")
+        expect(updated.items[1].id).toBe("b")
+    })
+})
+
+describe("setValueAtPath — JSON string re-serialisation", () => {
+    it("parses a JSON string, sets the value, and re-stringifies", () => {
+        const data = {messages: '{"content": "hello"}'}
+        const updated = setValueAtPath(data, ["messages", "content"], "world") as typeof data
+        expect(updated.messages).toBe('{"content":"world"}')
+    })
+})
+
+// ---------------------------------------------------------------------------
+// deleteValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("deleteValueAtPath — object", () => {
+    it("removes a key from a nested object (immutable)", () => {
+        const data = {user: {name: "Alice", age: 30}}
+        const updated = deleteValueAtPath(data, ["user", "age"]) as typeof data
+        expect(updated.user).not.toHaveProperty("age")
+        expect(updated.user.name).toBe("Alice")
+        expect(data.user.age).toBe(30)
+    })
+
+    it("returns data unchanged when path is empty", () => {
+        const data = {a: 1}
+        expect(deleteValueAtPath(data, [])).toBe(data)
+    })
+})
+
+describe("deleteValueAtPath — array", () => {
+    it("removes an element from an array by index", () => {
+        const result = deleteValueAtPath([10, 20, 30], [1]) as number[]
+        expect(result).toEqual([10, 30])
+    })
+})
+
+// ---------------------------------------------------------------------------
+// hasValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("hasValueAtPath", () => {
+    it("returns true when the key exists", () => {
+        expect(hasValueAtPath({a: {b: 1}}, ["a", "b"])).toBe(true)
+    })
+
+    it("returns false when the key is missing", () => {
+        expect(hasValueAtPath({a: {}}, ["a", "missing"])).toBe(false)
+    })
+
+    it("returns false when a parent is null", () => {
+        expect(hasValueAtPath({a: null}, ["a", "b"])).toBe(false)
+    })
+
+    it("returns true for valid array index", () => {
+        expect(hasValueAtPath([10, 20, 30], [2])).toBe(true)
+    })
+
+    it("returns false for out-of-bounds array index", () => {
+        expect(hasValueAtPath([10, 20], [5])).toBe(false)
+    })
+
+    it("returns true for the root when path is empty and data is defined", () => {
+        expect(hasValueAtPath({a: 1}, [])).toBe(true)
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/slug.test.ts b/web/packages/agenta-shared/tests/unit/slug.test.ts
new file mode 100644
index 0000000000..02369b9644
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/slug.test.ts
@@ -0,0 +1,234 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    generateSlugWithExistingSuffix,
+    generateSlugWithSuffix,
+    getSlugSuffix,
+    isValidSlug,
+    regenerateSlugSuffix,
+    slugifyName,
+    stripSlugSuffix,
+} from "../../src/utils/slug"
+import {
+    buildGatewayToolSlug,
+    isGatewayToolSlug,
+    parseGatewayToolSlug,
+} from "../../src/utils/toolSlug"
+
+// ---------------------------------------------------------------------------
+// slugifyName
+// ---------------------------------------------------------------------------
+
+describe("slugifyName", () => {
+    it("lowercases and trims", () => {
+        expect(slugifyName("  Hello World  ")).toBe("hello-world")
+    })
+
+    it("replaces spaces with hyphens", () => {
+        expect(slugifyName("my app name")).toBe("my-app-name")
+    })
+
+    it("collapses multiple spaces into one hyphen", () => {
+        expect(slugifyName("foo   bar")).toBe("foo-bar")
+    })
+
+    it("strips leading and trailing hyphens", () => {
+        expect(slugifyName("-leading")).toBe("leading")
+        expect(slugifyName("trailing-")).toBe("trailing")
+    })
+
+    it("preserves allowed chars: digits, underscore, dot, hyphen", () => {
+        expect(slugifyName("my_app.v2-beta")).toBe("my_app.v2-beta")
+    })
+
+    it("removes disallowed special characters", () => {
+        expect(slugifyName("hello! @world#")).toBe("hello-world")
+    })
+
+    it("returns empty string for a blank input", () => {
+        expect(slugifyName("")).toBe("")
+        expect(slugifyName("   ")).toBe("")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// generateSlugWithSuffix
+// ---------------------------------------------------------------------------
+
+describe("generateSlugWithSuffix", () => {
+    it("produces <base>-<4 chars> format", () => {
+        const slug = generateSlugWithSuffix("My App")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+
+    it("falls back to 'resource' when name slugifies to empty", () => {
+        const slug = generateSlugWithSuffix("!!!!")
+        expect(slug).toMatch(/^resource-[a-z0-9]{4}$/)
+    })
+
+    it("produces different slugs on repeated calls (randomness)", () => {
+        const slugs = new Set(Array.from({length: 10}, () => generateSlugWithSuffix("app")))
+        // Fails only if all 10 draws are identical; negligible assuming ~36^4 (1.7M) suffixes
+        expect(slugs.size).toBeGreaterThan(1)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// generateSlugWithExistingSuffix
+// ---------------------------------------------------------------------------
+
+describe("generateSlugWithExistingSuffix", () => {
+    it("appends the provided suffix to the slugified name", () => {
+        expect(generateSlugWithExistingSuffix("My App", "ab12")).toBe("my-app-ab12")
+    })
+
+    it("generates a new random suffix when suffix is null", () => {
+        const slug = generateSlugWithExistingSuffix("My App", null)
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+
+    it("generates a new random suffix when suffix is undefined", () => {
+        const slug = generateSlugWithExistingSuffix("My App")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getSlugSuffix
+// ---------------------------------------------------------------------------
+
+describe("getSlugSuffix", () => {
+    it("returns the 4-char suffix when present", () => {
+        expect(getSlugSuffix("my-app-ab12")).toBe("ab12")
+    })
+
+    it("returns null when the trailing segment is not exactly 4 chars", () => {
+        expect(getSlugSuffix("my-app-abc")).toBeNull()
+        expect(getSlugSuffix("my-app-abcde")).toBeNull()
+    })
+
+    it("returns null when there is no hyphen-separated suffix", () => {
+        expect(getSlugSuffix("myapp")).toBeNull()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// stripSlugSuffix
+// ---------------------------------------------------------------------------
+
+describe("stripSlugSuffix", () => {
+    it("removes the 4-char suffix", () => {
+        expect(stripSlugSuffix("my-app-ab12")).toBe("my-app")
+    })
+
+    it("leaves the slug unchanged when no suffix is present", () => {
+        expect(stripSlugSuffix("myapp")).toBe("myapp")
+        expect(stripSlugSuffix("my-app-toolong")).toBe("my-app-toolong")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// regenerateSlugSuffix
+// ---------------------------------------------------------------------------
+
+describe("regenerateSlugSuffix", () => {
+    it("replaces the known suffix with a new random one", () => {
+        const slug = regenerateSlugSuffix("my-app-ab12", "ab12")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+        // The new suffix should differ from the old one (probabilistically)
+        // We just assert the format is correct
+    })
+
+    it("appends a new suffix when the slug does not end with the given suffix", () => {
+        const slug = regenerateSlugSuffix("my-app", "other")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+
+    it("always produces a 4-char suffix", () => {
+        const slug = regenerateSlugSuffix("app-xyz1")
+        expect(slug).toMatch(/-[a-z0-9]{4}$/) // NOTE(review): input already matches this regex
+    })
+})
+
+// ---------------------------------------------------------------------------
+// isValidSlug
+// ---------------------------------------------------------------------------
+
+describe("isValidSlug", () => {
+    it.each(["a", "abc", "my-app", "my_app", "app.v2", "app-v2-ab12"])(
+        "returns true for valid slug %s",
+        (s) => expect(isValidSlug(s)).toBe(true),
+    )
+
+    it("returns false for empty string", () => {
+        expect(isValidSlug("")).toBe(false)
+    })
+
+    it("returns false for slugs longer than 255 characters", () => {
+        expect(isValidSlug("a".repeat(256))).toBe(false)
+    })
+
+    it("returns false for double hyphens", () => {
+        expect(isValidSlug("my--app")).toBe(false)
+    })
+
+    it("returns false for double dots", () => {
+        expect(isValidSlug("my..app")).toBe(false)
+    })
+
+    it("returns false for slugs starting or ending with non-alphanumeric", () => {
+        expect(isValidSlug("-app")).toBe(false)
+        expect(isValidSlug("app-")).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildGatewayToolSlug / isGatewayToolSlug / parseGatewayToolSlug
+// ---------------------------------------------------------------------------
+
+describe("buildGatewayToolSlug", () => {
+    it("builds the correct double-underscore format", () => {
+        expect(buildGatewayToolSlug("google", "gmail", "SEND_EMAIL", "my-connection")).toBe(
+            "tools__google__gmail__SEND_EMAIL__my-connection",
+        )
+    })
+})
+
+describe("isGatewayToolSlug", () => {
+    it("returns true for a valid gateway tool slug", () => {
+        expect(isGatewayToolSlug("tools__google__gmail__SEND__conn")).toBe(true)
+    })
+
+    it("returns false for a non-gateway slug", () => {
+        expect(isGatewayToolSlug("get_weather")).toBe(false)
+        expect(isGatewayToolSlug(undefined)).toBe(false)
+    })
+})
+
+describe("parseGatewayToolSlug", () => {
+    it("parses all four parts correctly", () => {
+        const result = parseGatewayToolSlug("tools__google__gmail__SEND_EMAIL__my-conn")
+        expect(result).toEqual({
+            provider: "google",
+            integration: "gmail",
+            action: "SEND_EMAIL",
+            connection: "my-conn",
+        })
+    })
+
+    it("returns null for a slug with wrong number of parts", () => {
+        expect(parseGatewayToolSlug("tools__google__gmail")).toBeNull()
+    })
+
+    it("returns null for a slug that does not start with 'tools'", () => {
+        expect(parseGatewayToolSlug("nottools__a__b__c__d")).toBeNull()
+    })
+
+    it("returns null for undefined input", () => {
+        expect(parseGatewayToolSlug(undefined)).toBeNull()
+    })
+
+    it("returns null when any segment is empty", () => {
+        expect(parseGatewayToolSlug("tools__google____SEND__conn")).toBeNull()
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/template-variable.test.ts b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
new file mode 100644
index 0000000000..9fa5aafe9a
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
@@ -0,0 +1,147 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    extractTemplateExpression,
+    isValidTemplateVariable,
+    validateTemplateVariable,
+} from "../../src/utils/templateVariable"
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — empty / malformed
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — empty / malformed", () => {
+    it("rejects an empty expression", () => {
+        const result = validateTemplateVariable("")
+        expect(result.valid).toBe(false)
+        expect(result.reason).toMatch(/empty/i)
+    })
+
+    it("rejects expressions with consecutive dots (..)", () => {
+        expect(validateTemplateVariable("$.inputs..country").valid).toBe(false)
+    })
+
+    it("rejects expressions with consecutive slashes (//)", () => {
+        expect(validateTemplateVariable("/inputs//country").valid).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — JSONPath ($.)
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — JSONPath", () => {
+    it("accepts a well-formed JSONPath", () => {
+        expect(validateTemplateVariable("$.inputs.country").valid).toBe(true)
+    })
+
+    it("accepts bare '$' (whole context shorthand)", () => {
+        expect(validateTemplateVariable("$").valid).toBe(true)
+    })
+
+    it("rejects '$<no-dot>' (malformed root)", () => {
+        const result = validateTemplateVariable("$outputs.country")
+        expect(result.valid).toBe(false)
+    })
+
+    it("rejects '$.' with no field after the dot", () => {
+        expect(validateTemplateVariable("$.").valid).toBe(false)
+    })
+
+    it("accepts any root segment — does NOT validate against envelope slots (permissive)", () => {
+        // Per mustache QA principle: $.arbitrary is valid; runtime validates
+        expect(validateTemplateVariable("$.arbitrary_column").valid).toBe(true)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — JSON Pointer (/)
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — JSON Pointer", () => {
+    it("accepts a pointer rooted at a known envelope slot", () => {
+        expect(validateTemplateVariable("/inputs/country").valid).toBe(true)
+        expect(validateTemplateVariable("/outputs/result").valid).toBe(true)
+    })
+
+    it("rejects a multi-segment pointer with an unknown root slot", () => {
+        const result = validateTemplateVariable("/unknown/field")
+        expect(result.valid).toBe(false)
+        expect(result.reason).toMatch(/unknown envelope slot/i)
+    })
+
+    it("includes a 'did-you-mean' suggestion for near-miss slot names", () => {
+        const result = validateTemplateVariable("/input/country") // 'input' ≈ 'inputs'
+        expect(result.valid).toBe(false)
+        expect(result.suggestion).toBe("inputs")
+    })
+
+    it("accepts a single-segment identifier-shaped pointer unconditionally (mustache close tag)", () => {
+        // e.g. {{/section}} — single segment, identifier-shaped → valid
+        expect(validateTemplateVariable("/section").valid).toBe(true)
+    })
+
+    it("rejects '/' with no segments", () => {
+        expect(validateTemplateVariable("/").valid).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — plain names / dot notation
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — plain names", () => {
+    it("accepts plain identifiers", () => {
+        expect(validateTemplateVariable("question").valid).toBe(true)
+        expect(validateTemplateVariable("my_variable").valid).toBe(true)
+    })
+
+    it("accepts dot-notation paths", () => {
+        expect(validateTemplateVariable("user.name").valid).toBe(true)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// isValidTemplateVariable
+// ---------------------------------------------------------------------------
+
+describe("isValidTemplateVariable", () => {
+    it("returns true for a valid expression", () => {
+        expect(isValidTemplateVariable("$.inputs.country")).toBe(true)
+    })
+
+    it("returns false for an invalid expression", () => {
+        expect(isValidTemplateVariable("")).toBe(false)
+        expect(isValidTemplateVariable("$outputs.x")).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// extractTemplateExpression
+// ---------------------------------------------------------------------------
+
+describe("extractTemplateExpression", () => {
+    it("strips {{ }} wrappers", () => {
+        expect(extractTemplateExpression("{{ $.inputs.country }}")).toBe("$.inputs.country")
+    })
+
+    it("strips {% %} wrappers", () => {
+        expect(extractTemplateExpression("{% if condition %}")).toBe("if condition")
+    })
+
+    it("strips {%- -%} wrappers (whitespace-trimming variants)", () => {
+        expect(extractTemplateExpression("{%- block -%}")).toBe("block")
+    })
+
+    it("strips {# #} comment wrappers", () => {
+        expect(extractTemplateExpression("{# comment #}")).toBe("comment")
+    })
+
+    it("returns the raw text when no wrapper is present", () => {
+        expect(extractTemplateExpression("plain")).toBe("plain")
+    })
+
+    it("returns empty string for empty input", () => {
+        expect(extractTemplateExpression("")).toBe("")
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/validators-and-ids.test.ts b/web/packages/agenta-shared/tests/unit/validators-and-ids.test.ts
new file mode 100644
index 0000000000..92fc346e63
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/validators-and-ids.test.ts
@@ -0,0 +1,138 @@
+import {describe, expect, it} from "vitest"
+
+import {isValidHttpUrl, isValidRegex, isValidUUID, validateUUID} from "../../src/utils/validators"
+import {uuidToSpanId, uuidToTraceId} from "../../src/utils/traceIds"
+import {removeTrailingSlash} from "../../src/utils/uriUtils"
+
+// ---------------------------------------------------------------------------
+// isValidUUID
+// ---------------------------------------------------------------------------
+
+describe("isValidUUID", () => {
+    it.each([
+        "123e4567-e89b-12d3-a456-426614174000", // lowercase hex digits
+        "00000000-0000-0000-0000-000000000000", // all zeros
+        "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF", // uppercase hex digits
+    ])("returns true for valid UUID %j", (uuid) => {
+        expect(isValidUUID(uuid)).toBe(true)
+    })
+
+    it.each([
+        "", // empty string (rendered as "" in the title thanks to %j)
+        "not-a-uuid", // free text
+        "123e4567-e89b-12d3-a456", // final group missing
+        "123e4567-e89b-12d3-a456-42661417400Z", // non-hex character
+        "123e4567e89b12d3a456426614174000", // dashes stripped
+    ])("returns false for invalid input %j", (input) => {
+        expect(isValidUUID(input)).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateUUID
+// ---------------------------------------------------------------------------
+
+describe("validateUUID", () => {
+    it("does not throw for a valid UUID", () => {
+        expect(() => validateUUID("123e4567-e89b-12d3-a456-426614174000", "id")).not.toThrow()
+    })
+
+    it("throws with a descriptive message for an invalid UUID", () => {
+        expect(() => validateUUID("not-valid", "userId")).toThrow(
+            "Invalid userId: must be a valid UUID",
+        )
+    })
+})
+
+// ---------------------------------------------------------------------------
+// isValidHttpUrl
+// ---------------------------------------------------------------------------
+
+describe("isValidHttpUrl", () => {
+    it.each(["http://example.com", "https://example.com/path?q=1"])("returns true for %j", (url) =>
+        expect(isValidHttpUrl(url)).toBe(true),
+    )
+
+    it.each(["ftp://example.com", "not-a-url", "", "javascript:alert(1)"])(
+        "returns false for %j", // %j keeps the empty-string case visible in the title
+        (url) => expect(isValidHttpUrl(url)).toBe(false),
+    )
+})
+
+// ---------------------------------------------------------------------------
+// isValidRegex
+// ---------------------------------------------------------------------------
+
+describe("isValidRegex", () => {
+    it.each(["^[a-z]+$", "\\d+", "(foo|bar)", ".*"])("returns true for valid regex %s", (re) =>
+        expect(isValidRegex(re)).toBe(true),
+    )
+
+    it.each(["[invalid", "(unclosed", "*bad"])("returns false for invalid regex %s", (re) => {
+        expect(isValidRegex(re)).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// uuidToTraceId
+// ---------------------------------------------------------------------------
+
+describe("uuidToTraceId", () => {
+    it("strips dashes from a UUID", () => {
+        expect(uuidToTraceId("123e4567-e89b-12d3-a456-426614174000")).toBe(
+            "123e4567e89b12d3a456426614174000",
+        )
+    })
+
+    it("returns undefined for undefined input", () => {
+        expect(uuidToTraceId(undefined)).toBeUndefined()
+    })
+
+    it("returns undefined for empty string", () => {
+        expect(uuidToTraceId("")).toBeUndefined()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// uuidToSpanId
+// ---------------------------------------------------------------------------
+
+describe("uuidToSpanId", () => {
+    it("returns the last 16 hex chars of the stripped UUID", () => {
+        // UUID: 123e4567-e89b-12d3-a456-426614174000
+        // Full hex: 123e4567e89b12d3a456426614174000  (32 chars)
+        // Last 16:                  a456426614174000
+        expect(uuidToSpanId("123e4567-e89b-12d3-a456-426614174000")).toBe("a456426614174000")
+    })
+
+    it("returns undefined for undefined input", () => {
+        expect(uuidToSpanId(undefined)).toBeUndefined()
+    })
+
+    it("span ID length is always 16", () => {
+        const spanId = uuidToSpanId("ffffffff-ffff-ffff-ffff-ffffffffffff")
+        expect(spanId).toHaveLength(16)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// removeTrailingSlash
+// ---------------------------------------------------------------------------
+
+describe("removeTrailingSlash", () => {
+    it("removes a trailing slash", () => {
+        expect(removeTrailingSlash("http://example.com/")).toBe("http://example.com")
+    })
+
+    it("leaves a URI without trailing slash unchanged", () => {
+        expect(removeTrailingSlash("http://example.com")).toBe("http://example.com")
+    })
+
+    it("removes only the last slash, not interior ones", () => {
+        expect(removeTrailingSlash("http://example.com/path/")).toBe("http://example.com/path")
+    })
+
+    it("handles empty string", () => {
+        expect(removeTrailingSlash("")).toBe("")
+    })
+})
diff --git a/web/packages/agenta-shared/vitest.config.ts b/web/packages/agenta-shared/vitest.config.ts
new file mode 100644
index 0000000000..a9a2cfed1d
--- /dev/null
+++ b/web/packages/agenta-shared/vitest.config.ts
@@ -0,0 +1,19 @@
+import {defineConfig} from "vitest/config"
+
+export default defineConfig({
+    test: {
+        include: ["tests/unit/**/*.test.ts"], // only *.test.ts files under tests/unit are collected
+        environment: "node",
+        reporters: ["default", "junit"], // console output plus a JUnit XML report
+        outputFile: {
+            junit: "./test-results/junit.xml", // path is relative to this package
+        },
+        coverage: {
+            provider: "v8", // provided by the @vitest/coverage-v8 devDependency
+            include: ["src/**/*.ts"],
+            exclude: ["src/**/index.ts"], // index.ts files are left out of coverage
+            reporter: ["text", "lcov", "json-summary"],
+            reportsDirectory: "./coverage",
+        },
+    },
+})
diff --git a/web/pnpm-lock.yaml b/web/pnpm-lock.yaml
index a93061a109..58efcbad6f 100644
--- a/web/pnpm-lock.yaml
+++ b/web/pnpm-lock.yaml
@@ -693,9 +693,15 @@ importers:
       '@types/node':
         specifier: ^20.8.10
         version: 20.19.39
+      '@vitest/coverage-v8':
+        specifier: ^4.1.4
+        version: 4.1.6(vitest@4.1.6)
       typescript:
         specifier: 5.8.3
         version: 5.8.3
+      vitest:
+        specifier: ^4.1.4
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@20.19.39)(@vitest/coverage-v8@4.1.6)(vite@8.0.12(@types/node@20.19.39)(esbuild@0.27.7)(jiti@2.7.0)(terser@5.47.0)(tsx@4.21.0)(yaml@2.8.4))
 
   packages/agenta-annotation-ui:
     dependencies:
@@ -1213,9 +1219,15 @@ importers:
       '@types/react':
         specifier: ^19.0.10
         version: 19.2.14
+      '@vitest/coverage-v8':
+        specifier: ^4.1.4
+        version: 4.1.6(vitest@4.1.6)
       typescript:
         specifier: 5.8.3
         version: 5.8.3
+      vitest:
+        specifier: ^4.1.4
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@20.19.39)(@vitest/coverage-v8@4.1.6)(vite@8.0.12(@types/node@20.19.39)(esbuild@0.27.7)(jiti@2.7.0)(terser@5.47.0)(tsx@4.21.0)(yaml@2.8.4))
 
   packages/agenta-ui:
     dependencies:

From a1ded4a202fb99503afa823f73b99027c4fad2bb Mon Sep 17 00:00:00 2001
From: Kaosiso Ezealigo <ezealigokosiso@gmail.com>
Date: Wed, 3 Jun 2026 11:51:31 +0200
Subject: [PATCH 02/36] chore(test): gitignore test-results and coverage dirs
 for shared and annotation packages

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 web/packages/agenta-annotation/.gitignore     |   3 +
 .../agenta-annotation/test-results/junit.xml  | 163 --------
 web/packages/agenta-shared/.gitignore         |   3 +
 .../agenta-shared/test-results/junit.xml      | 389 ------------------
 4 files changed, 6 insertions(+), 552 deletions(-)
 create mode 100644 web/packages/agenta-annotation/.gitignore
 delete mode 100644 web/packages/agenta-annotation/test-results/junit.xml
 create mode 100644 web/packages/agenta-shared/.gitignore
 delete mode 100644 web/packages/agenta-shared/test-results/junit.xml

diff --git a/web/packages/agenta-annotation/.gitignore b/web/packages/agenta-annotation/.gitignore
new file mode 100644
index 0000000000..96d253c48e
--- /dev/null
+++ b/web/packages/agenta-annotation/.gitignore
@@ -0,0 +1,3 @@
+# Generated by Vitest — do not commit
+test-results/
+coverage/
diff --git a/web/packages/agenta-annotation/test-results/junit.xml b/web/packages/agenta-annotation/test-results/junit.xml
deleted file mode 100644
index 85cdaef2d1..0000000000
--- a/web/packages/agenta-annotation/test-results/junit.xml
+++ /dev/null
@@ -1,163 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<testsuites name="vitest tests" tests="78" failures="0" errors="0" time="0.112877977">
-    <testsuite name="tests/unit/annotation-form-helpers.test.ts" timestamp="2026-06-03T07:51:16.842Z" hostname="Kaosisos-MacBook-Pro.local" tests="38" failures="0" errors="0" skipped="0" time="0.079835364">
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for null" time="0.003193247">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for undefined" time="0.000426831">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for " time="0.00022392">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns true for " time="0.000394588">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for 0" time="0.000342228">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for false" time="0.000172727">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for 0" time="0.000318123">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for " time="0.000154355">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for {}" time="0.000593872">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="isEmptyValue &gt; returns false for  " time="0.000258603">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getOutputsSchema &gt; returns the schema from resolveOutputSchema" time="0.018263232">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getOutputsSchema &gt; returns empty object when resolveOutputSchema returns null" time="0.021705992">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces a number field with null default" time="0.005147245">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces an integer field with null default" time="0.000455569">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces a boolean field with null default" time="0.000533427">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — scalar types &gt; produces a string field with empty-string default" time="0.000415174">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array type &gt; produces an array field with item schema" time="0.000462322">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array type &gt; defaults item type to string when items is missing" time="0.002111803">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — anyOf schema &gt; unwraps the first anyOf entry to get the real type" time="0.005339982">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array-of-types &gt; filters &apos;null&apos; from the type array and uses the remaining types" time="0.000613499">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array-of-types &gt; skips the property when only &apos;null&apos; type remains after filtering" time="0.005364649">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — array-of-types &gt; includes non-null enum values and strips null/empty entries" time="0.000314292">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — edge cases &gt; returns empty object for an empty schema" time="0.000201713">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — edge cases &gt; skips unsupported types (e.g. &apos;object&apos;)" time="0.000369456">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricFieldsFromEvaluator — edge cases &gt; skips properties with no type field" time="0.000208122">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; fills a number field from flat outputs" time="0.000565318">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; fills a string field from flat outputs" time="0.000294481">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; uses schema default when key is absent in outputs" time="0.00023096">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — flat outputs matching schema &gt; uses &apos;&apos; as default for a missing string field" time="0.000172772">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flattens metrics nested under &apos;metrics&apos; key" time="0.000358223">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flattens fields nested under &apos;notes&apos; key" time="0.000209121">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flattens fields nested under &apos;extra&apos; key" time="0.000184574">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — nested outputs &gt; flat keys outside of metrics/notes/extra are preserved directly" time="0.000175375">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; infers a number field from a numeric output value" time="0.000531908">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; infers a boolean field from a boolean output value" time="0.000429949">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; infers a string field from a string output value" time="0.000213978">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; serialises an object output to a JSON string field" time="0.000379304">
-        </testcase>
-        <testcase classname="tests/unit/annotation-form-helpers.test.ts" name="getMetricsFromAnnotation — schema-free inference &gt; returns empty object when annotation outputs are empty" time="0.000287768">
-        </testcase>
-    </testsuite>
-    <testsuite name="tests/unit/testset-sync.test.ts" timestamp="2026-06-03T07:51:16.868Z" hostname="Kaosisos-MacBook-Pro.local" tests="40" failures="0" errors="0" skipped="0" time="0.033042613">
-        <testcase classname="tests/unit/testset-sync.test.ts" name="getQueueAnnotationTag &gt; formats queue ID into tag" time="0.003451276">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="getQueueAnnotationTag &gt; handles arbitrary queue IDs" time="0.000339992">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; always includes the queue tag and kind tag" time="0.00157184">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; merges existing tags without duplicates" time="0.001066907">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; adds output keys as tags" time="0.000432673">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; handles null existingTags gracefully" time="0.00031137">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="mergeTestcaseAnnotationTags &gt; filters out falsy tags from existingTags" time="0.000652604">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — no match &gt; returns null annotation when list is empty" time="0.001475313">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — no match &gt; returns null annotation when no annotation matches the evaluator slug" time="0.001590326">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — queue-scoped matching &gt; returns the annotation when exactly one queue-scoped match exists" time="0.00062801">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — queue-scoped matching &gt; returns duplicate_queue_annotations when multiple queue-scoped annotations match" time="0.000372534">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — queue-scoped matching &gt; ignores annotations scoped to a different queue" time="0.000294816">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — legacy fallback &gt; falls back to a legacy annotation (no queue tags) when no queue-scoped match" time="0.000350558">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — legacy fallback &gt; returns duplicate_legacy_annotations when multiple legacy annotations match" time="0.000232561">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="selectQueueScopedAnnotation — evaluatorWorkflowId matching &gt; matches annotation by evaluator workflow ID" time="0.000220965">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; returns evaluator slug when no annotation supplied" time="0.000211145">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; prefers annotation&apos;s evaluator slug over evaluator.slug" time="0.000205561">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; falls back to evaluator.workflowId when slug is empty" time="0.000135544">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="getTestsetSyncEvaluatorColumnKey &gt; returns empty string when evaluator has no slug or workflowId" time="0.000132458">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncOperations &gt; maps target rows to replace operations" time="0.00041353">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncOperations &gt; produces an empty replace list for a target with no rows" time="0.000249256">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; keeps rows whose rowId exists directly in baseRows" time="0.002495199">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; remaps a row using testcase_dedup_id when rowId is not in baseRows" time="0.000243903">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; also remaps using legacy __dedup_id__ key" time="0.00020136">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; drops rows with no matching rowId and no dedup key" time="0.000253405">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="remapTargetRowsToBaseRevision &gt; updates rowCount to reflect mapped rows only" time="0.000469284">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; builds a row per scenario with trace inputs and output" time="0.001453622">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; expands a nested &apos;inputs&apos; key into top-level columns" time="0.001394469">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; merges annotation outputs into the row" time="0.001420822">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTraceTestsetRows &gt; handles a missing scenario gracefully (uses empty defaults)" time="0.000365183">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestcaseExportRows &gt; builds a row when annotation data exists for the testcase" time="0.000788308">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestcaseExportRows &gt; skips a scenario with no testcase mapping" time="0.000422125">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestcaseExportRows &gt; skips a testcase with no annotations" time="0.000299728">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; returns a missing_testcase conflict when testcase not found" time="0.001103675">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; returns a missing_testset conflict when testcase has no testset_id" time="0.000254052">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; returns a missing_latest_revision conflict when no revision for testset" time="0.00081726">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; produces a clean target when everything is resolved" time="0.000674425">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; records duplicate_queue_annotations conflict and skips the row" time="0.000710204">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; groups rows from different scenarios under the same testset target" time="0.000615301">
-        </testcase>
-        <testcase classname="tests/unit/testset-sync.test.ts" name="buildTestsetSyncPreview &gt; skips rows with no annotation data and does not add them as conflicts" time="0.000292529">
-        </testcase>
-    </testsuite>
-</testsuites>
diff --git a/web/packages/agenta-shared/.gitignore b/web/packages/agenta-shared/.gitignore
new file mode 100644
index 0000000000..96d253c48e
--- /dev/null
+++ b/web/packages/agenta-shared/.gitignore
@@ -0,0 +1,3 @@
+# Generated by Vitest — do not commit
+test-results/
+coverage/
diff --git a/web/packages/agenta-shared/test-results/junit.xml b/web/packages/agenta-shared/test-results/junit.xml
deleted file mode 100644
index ba991a034e..0000000000
--- a/web/packages/agenta-shared/test-results/junit.xml
+++ /dev/null
@@ -1,389 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<testsuites name="vitest tests" tests="187" failures="0" errors="0" time="0.15012691">
-    <testsuite name="tests/unit/data-transforms.test.ts" timestamp="2026-06-03T07:51:20.543Z" hostname="Kaosisos-MacBook-Pro.local" tests="22" failures="0" errors="0" skipped="0" time="0.014309687">
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from response.data.detail string" time="0.002546081">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from response.data.message string" time="0.000365527">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from response.data.error string" time="0.0002325">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from nested response.data.detail.message" time="0.000289911">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Axios-style errors &gt; extracts from an array of detail strings" time="0.000366091">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — Error instances &gt; returns error.message for a plain Error" time="0.000310955">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — direct string/object &gt; returns a non-empty string value directly" time="0.000275387">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="extractApiErrorMessage — direct string/object &gt; falls back to String(error) for unknown shapes" time="0.00019925">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="preserveResponseStatus &gt; wraps an error with a custom message" time="0.000816785">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="preserveResponseStatus &gt; preserves the response status from the original error" time="0.00033335">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="preserveResponseStatus &gt; preserves the original error message when no override is given" time="0.000300095">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; removes agenta_metadata keys from objects" time="0.000901889">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; removes __agenta_metadata keys from objects" time="0.000661765">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; recursively strips metadata from nested objects" time="0.000290145">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; strips metadata from objects inside arrays" time="0.00021932">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripAgentaMetadataDeep &gt; returns primitives unchanged" time="0.000203792">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; unwraps a simple {__id, __metadata, value} wrapper" time="0.000355085">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; strips __id and __metadata from plain objects (non-wrapper)" time="0.000448585">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; recursively strips wrappers from nested objects" time="0.000242952">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; processes arrays recursively" time="0.001055276">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; returns null/undefined unchanged" time="0.000209983">
-        </testcase>
-        <testcase classname="tests/unit/data-transforms.test.ts" name="stripEnhancedWrappers &gt; returns primitives unchanged" time="0.000148744">
-        </testcase>
-    </testsuite>
-    <testsuite name="tests/unit/formatters.test.ts" timestamp="2026-06-03T07:51:20.555Z" hostname="Kaosisos-MacBook-Pro.local" tests="44" failures="0" errors="0" skipped="0" time="0.029119193">
-        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; formats with locale thousand separators and 2 decimal places" time="0.003713118">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; returns &apos;-&apos; for null" time="0.000494948">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; returns &apos;-&apos; for undefined" time="0.000334845">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; formats zero" time="0.000426467">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatNumber &gt; formats negative numbers" time="0.000340483">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatCompact &gt; formats thousands as K" time="0.000465451">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatCompact &gt; formats millions as M" time="0.000407826">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatCompact &gt; returns &apos;-&apos; for null" time="0.000305715">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatCurrency &gt; formats with dollar sign and 2 decimals for typical values" time="0.00254155">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatCurrency &gt; formats small values without trailing zeros (maximumFractionDigits: 6)" time="0.000456143">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatCurrency &gt; returns &apos;-&apos; for null" time="0.000727972">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats sub-millisecond values in μs" time="0.000334701">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats millisecond-range values in ms" time="0.000224338">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats second-range values in s" time="0.000194387">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; formats exactly 1 second" time="0.000137998">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; returns &apos;-&apos; for null" time="0.000128351">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatLatency &gt; returns &apos;-&apos; for undefined" time="0.000136314">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; formats values under 1000 as plain integers" time="0.000364411">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; formats thousands as K with 1 decimal" time="0.000143129">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; formats millions as M with 1 decimal" time="0.000140819">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatTokens &gt; returns &apos;-&apos; for null" time="0.00011791">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; formats decimal as percentage with 1 decimal for values &gt;= 10%" time="0.000228929">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; formats small values with 2 decimal places" time="0.000129466">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; returns &apos;100%&apos; for values &gt;= 99.95%" time="0.000478414">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; returns &apos;0%&apos; for zero" time="0.000144519">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; treats negative values as 0%" time="0.000189746">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPercent &gt; returns &apos;-&apos; for null" time="0.000144275">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; formats values with significant-figure-aware decimals" time="0.000419335">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; returns &apos;0&apos; for zero" time="0.000219374">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; uses scientific notation for extreme values" time="0.000279056">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatSignificant &gt; returns &apos;-&apos; for null" time="0.000256581">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; wraps strings in quotes" time="0.000306751">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; truncates long strings and adds ellipsis" time="0.00017171">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats numbers as-is" time="0.000130479">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats booleans as-is" time="0.000140545">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats arrays with length" time="0.000113721">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; formats small objects with key names" time="0.000124305">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; truncates objects with more than 3 keys" time="0.000131993">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; returns &apos;(null)&apos; for null" time="0.000111097">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="formatPreviewValue &gt; returns &apos;(undefined)&apos; for undefined" time="0.000152774">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; applies multiplier, prefix, suffix, and fixed decimals" time="0.000473756">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; uses the custom fallback for null/undefined" time="0.000223072">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; uses compact notation when compact: true" time="0.000384642">
-        </testcase>
-        <testcase classname="tests/unit/formatters.test.ts" name="createFormatter &gt; prepends a prefix" time="0.000216185">
-        </testcase>
-    </testsuite>
-    <testsuite name="tests/unit/path-utils.test.ts" timestamp="2026-06-03T07:51:20.571Z" hostname="Kaosisos-MacBook-Pro.local" tests="25" failures="0" errors="0" skipped="0" time="0.037525386">
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; retrieves a deeply nested value" time="0.002557808">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; returns the root when the path is empty" time="0.000322721">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; returns undefined for a missing key" time="0.000256621">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — basic object navigation &gt; returns undefined when traversal hits null" time="0.000267802">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; accesses array elements by numeric index" time="0.000327434">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; accesses array elements by string index" time="0.000195651">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; returns undefined for out-of-bounds index" time="0.000250685">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — array indexing &gt; navigates mixed array/object paths" time="0.000218769">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — JSON string traversal &gt; parses a JSON string and continues traversal" time="0.00074782">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="getValueAtPath — JSON string traversal &gt; returns undefined when the string is not valid JSON" time="0.000317394">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — object mutation (immutable) &gt; sets a nested value without mutating the original" time="0.000377345">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — object mutation (immutable) &gt; creates intermediate objects for new paths" time="0.00015727">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — object mutation (immutable) &gt; replaces the root when path is empty" time="0.000134198">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — array mutation (immutable) &gt; sets an array element by index" time="0.000164713">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — array mutation (immutable) &gt; handles nested array+object paths" time="0.000166792">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="setValueAtPath — JSON string re-serialisation &gt; parses a JSON string, sets the value, and re-stringifies" time="0.000172575">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="deleteValueAtPath — object &gt; removes a key from a nested object (immutable)" time="0.002067215">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="deleteValueAtPath — object &gt; returns data unchanged when path is empty" time="0.020732904">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="deleteValueAtPath — array &gt; removes an element from an array by index" time="0.001636661">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns true when the key exists" time="0.000583877">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns false when the key is missing" time="0.000163844">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns false when a parent is null" time="0.000124015">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns true for valid array index" time="0.000115465">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns false for out-of-bounds array index" time="0.000115521">
-        </testcase>
-        <testcase classname="tests/unit/path-utils.test.ts" name="hasValueAtPath &gt; returns true for the root when path is empty and data is defined" time="0.000104562">
-        </testcase>
-    </testsuite>
-    <testsuite name="tests/unit/slug.test.ts" timestamp="2026-06-03T07:51:20.581Z" hostname="Kaosisos-MacBook-Pro.local" tests="40" failures="0" errors="0" skipped="0" time="0.030062015">
-        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; lowercases and trims" time="0.003214561">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; replaces spaces with hyphens" time="0.00045722">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; collapses multiple spaces into one hyphen" time="0.000251821">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; strips leading and trailing hyphens" time="0.000354766">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; preserves allowed chars: digits, underscore, dot, hyphen" time="0.000235035">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; removes disallowed special characters" time="0.00021152">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="slugifyName &gt; returns empty string for a blank input" time="0.000533555">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithSuffix &gt; produces &lt;base&gt;-&lt;4 chars&gt; format" time="0.003529513">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithSuffix &gt; falls back to &apos;resource&apos; when name slugifies to empty" time="0.000746449">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithSuffix &gt; produces different slugs on repeated calls (randomness)" time="0.000734153">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithExistingSuffix &gt; appends the provided suffix to the slugified name" time="0.000325176">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithExistingSuffix &gt; generates a new random suffix when suffix is null" time="0.000376281">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="generateSlugWithExistingSuffix &gt; generates a new random suffix when suffix is undefined" time="0.000230762">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="getSlugSuffix &gt; returns the 4-char suffix when present" time="0.000250414">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="getSlugSuffix &gt; returns null when the trailing segment is not exactly 4 chars" time="0.00025666">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="getSlugSuffix &gt; returns null when there is no hyphen-separated suffix" time="0.000218056">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="stripSlugSuffix &gt; removes the 4-char suffix" time="0.000213417">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="stripSlugSuffix &gt; leaves the slug unchanged when no suffix is present" time="0.000179517">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="regenerateSlugSuffix &gt; replaces the known suffix with a new random one" time="0.001827256">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="regenerateSlugSuffix &gt; appends a new suffix when the slug does not end with the given suffix" time="0.000207291">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="regenerateSlugSuffix &gt; always produces a 4-char suffix" time="0.000200284">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug a" time="0.000257971">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug abc" time="0.000181899">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug my-app" time="0.000093801">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug my_app" time="0.000083682">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug app.v2" time="0.000083173">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns true for valid slug app-v2-ab12" time="0.00007983">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for empty string" time="0.00011098">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for slugs longer than 255 characters" time="0.000129017">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for double hyphens" time="0.0001072">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for double dots" time="0.000214997">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isValidSlug &gt; returns false for slugs starting or ending with non-alphanumeric" time="0.000242711">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="buildGatewayToolSlug &gt; builds the correct double-underscore format" time="0.000756189">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isGatewayToolSlug &gt; returns true for a valid gateway tool slug" time="0.001401067">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="isGatewayToolSlug &gt; returns false for a non-gateway slug" time="0.003921047">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; parses all four parts correctly" time="0.001736217">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null for a slug with wrong number of parts" time="0.000184061">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null for a slug that does not start with &apos;tools&apos;" time="0.000400336">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null for undefined input" time="0.000282527">
-        </testcase>
-        <testcase classname="tests/unit/slug.test.ts" name="parseGatewayToolSlug &gt; returns null when any segment is empty" time="0.000151165">
-        </testcase>
-    </testsuite>
-    <testsuite name="tests/unit/template-variable.test.ts" timestamp="2026-06-03T07:51:20.594Z" hostname="Kaosisos-MacBook-Pro.local" tests="23" failures="0" errors="0" skipped="0" time="0.01161637">
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — empty / malformed &gt; rejects an empty expression" time="0.002987017">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — empty / malformed &gt; rejects expressions with consecutive dots (..)" time="0.000484413">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — empty / malformed &gt; rejects expressions with consecutive slashes (//)" time="0.000282633">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; accepts a well-formed JSONPath" time="0.000333902">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; accepts bare &apos;$&apos; (whole context shorthand)" time="0.000213164">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; rejects &apos;$&lt;no-dot&gt;&apos; (malformed root)" time="0.000193018">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; rejects &apos;$.&apos; with no field after the dot" time="0.000297891">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSONPath &gt; accepts any root segment — does NOT validate against envelope slots (permissive)" time="0.000347021">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; accepts a pointer rooted at a known envelope slot" time="0.00073819">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; rejects a multi-segment pointer with an unknown root slot" time="0.000411952">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; includes a &apos;did-you-mean&apos; suggestion for near-miss slot names" time="0.000275967">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; accepts a single-segment identifier-shaped pointer unconditionally (mustache close tag)" time="0.000139059">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — JSON Pointer &gt; rejects &apos;/&apos; with no segments" time="0.000125541">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — plain names &gt; accepts plain identifiers" time="0.000245851">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="validateTemplateVariable — plain names &gt; accepts dot-notation paths" time="0.000119536">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="isValidTemplateVariable &gt; returns true for a valid expression" time="0.00015515">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="isValidTemplateVariable &gt; returns false for an invalid expression" time="0.000137059">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {{ }} wrappers" time="0.00019578">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {% %} wrappers" time="0.000117001">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {%- -%} wrappers (whitespace-trimming variants)" time="0.000164">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; strips {# #} comment wrappers" time="0.000285222">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; returns the raw text when no wrapper is present" time="0.000128854">
-        </testcase>
-        <testcase classname="tests/unit/template-variable.test.ts" name="extractTemplateExpression &gt; returns empty string for empty input" time="0.000122009">
-        </testcase>
-    </testsuite>
-    <testsuite name="tests/unit/validators-and-ids.test.ts" timestamp="2026-06-03T07:51:20.602Z" hostname="Kaosisos-MacBook-Pro.local" tests="33" failures="0" errors="0" skipped="0" time="0.027494259">
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns true for valid UUID 123e4567-e89b-12d3-a456-426614174000" time="0.003901645">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns true for valid UUID 00000000-0000-0000-0000-000000000000" time="0.001252744">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns true for valid UUID FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF" time="0.000217274">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input " time="0.000379108">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input not-a-uuid" time="0.000236402">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input 123e4567-e89b-12d3-a456" time="0.000159416">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input 123e4567-e89b-12d3-a456-42661417400Z" time="0.00024089">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidUUID &gt; returns false for invalid input 123e4567e89b12d3a456426614174000" time="0.000520701">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="validateUUID &gt; does not throw for a valid UUID" time="0.001772324">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="validateUUID &gt; throws with a descriptive message for an invalid UUID" time="0.000672121">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns true for http://example.com" time="0.000331258">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns true for https://example.com/path?q=1" time="0.000188039">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for ftp://example.com" time="0.000192804">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for not-a-url" time="0.000215639">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for " time="0.000138057">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidHttpUrl &gt; returns false for javascript:alert(1)" time="0.000163908">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex ^[a-z]+$" time="0.000265106">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex \d+" time="0.000129412">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex (foo|bar)" time="0.000198754">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns true for valid regex .*" time="0.000118527">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns false for invalid regex [invalid" time="0.000173268">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns false for invalid regex (unclosed" time="0.000099736">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="isValidRegex &gt; returns false for invalid regex *bad" time="0.000091557">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToTraceId &gt; strips dashes from a UUID" time="0.000158752">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToTraceId &gt; returns undefined for undefined input" time="0.008773454">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToTraceId &gt; returns undefined for empty string" time="0.000167004">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToSpanId &gt; returns the last 16 hex chars of the stripped UUID" time="0.000202041">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToSpanId &gt; returns undefined for undefined input" time="0.000172146">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="uuidToSpanId &gt; span ID length is always 16" time="0.001682264">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; removes a trailing slash" time="0.000315106">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; leaves a URI without trailing slash unchanged" time="0.000141222">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; removes only the last slash, not interior ones" time="0.000687124">
-        </testcase>
-        <testcase classname="tests/unit/validators-and-ids.test.ts" name="removeTrailingSlash &gt; handles empty string" time="0.000208165">
-        </testcase>
-    </testsuite>
-</testsuites>

From 6b8a8022bcca0b527d36baa000c9457f3054bf30 Mon Sep 17 00:00:00 2001
From: Kaosiso Ezealigo <ezealigokosiso@gmail.com>
Date: Wed, 3 Jun 2026 12:37:23 +0200
Subject: [PATCH 03/36] =?UTF-8?q?fix(test):=20address=20PR=20review=20comm?=
 =?UTF-8?q?ents=20=E2=80=94=20typed=20fixtures,=20falsy-root=20coverage,?=
 =?UTF-8?q?=20template-variable=20alignment?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace `as any` fixture casts with `as unknown as T` in annotation tests
- Fix incorrect Annotation import source in testset-sync (now from @agenta/entities/annotation)
- Add Testcase type import and remove all as-any call-site casts in testset-sync
- Add falsy-root short-circuit tests for getValueAtPath (0, false, "", null)
- Realign template-variable tests to the strict envelope-slot behavior on main

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../unit/annotation-form-helpers.test.ts      | 10 +++--
 .../tests/unit/testset-sync.test.ts           | 31 +++++++-------
 .../tests/unit/path-utils.test.ts             | 18 ++++++++
 .../tests/unit/template-variable.test.ts      | 41 ++++++++-----------
 4 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts b/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
index d0a246ce6c..f796a10c8c 100644
--- a/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
+++ b/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
@@ -77,32 +77,34 @@ import {
     getOutputsSchema,
     isEmptyValue,
 } from "../../src/state/controllers/annotationFormController"
+import type {Annotation} from "@agenta/entities/annotation"
+import type {Workflow} from "@agenta/entities/workflow"
 
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
 
-function makeWorkflow(schemaProperties: Record<string, unknown> = {}) {
+function makeWorkflow(schemaProperties: Record<string, unknown> = {}): Workflow {
     // resolveOutputSchema is mocked to return its input,
     // so we set data to the schema shape directly.
     return {
         data: {properties: schemaProperties},
         slug: "test-evaluator",
         id: "wf-1",
-    } as any
+    } as unknown as Workflow
 }
 
 function makeAnnotation(
     outputs: Record<string, unknown>,
     references?: {evaluator?: {slug?: string}},
-) {
+): Annotation {
     return {
         trace_id: "trace-1",
         span_id: "span-1",
         data: {outputs},
         references,
         meta: {},
-    } as any
+    } as unknown as Annotation
 }
 
 beforeEach(() => {
diff --git a/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts b/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
index 598c60708d..4c7ce5c783 100644
--- a/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
+++ b/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
@@ -7,7 +7,8 @@
 
 import {describe, expect, it} from "vitest"
 
-import type {Annotation} from "../../src/state/testsetSync"
+import type {Annotation} from "@agenta/entities/annotation"
+import type {Testcase} from "@agenta/entities/testcase"
 import {
     buildTestcaseExportRows,
     buildTestsetSyncOperations,
@@ -475,8 +476,8 @@ describe("buildTraceTestsetRows", () => {
 describe("buildTestcaseExportRows", () => {
     const evaluator = {slug: "quality", workflowId: "wf-q"}
 
-    function makeTestcase(id: string, testsetId: string) {
-        return {id, testset_id: testsetId, data: {prompt: "hello"}}
+    function makeTestcase(id: string, testsetId: string): Testcase {
+        return {id, testset_id: testsetId, data: {prompt: "hello"}} as unknown as Testcase
     }
 
     it("builds a row when annotation data exists for the testcase", () => {
@@ -487,7 +488,7 @@ describe("buildTestcaseExportRows", () => {
         })
         const rows = buildTestcaseExportRows({
             scenarioIds: ["s-1"],
-            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1") as any]]),
+            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1")]]),
             annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
             evaluators: [evaluator],
             queueId: "q-1",
@@ -495,7 +496,7 @@ describe("buildTestcaseExportRows", () => {
         expect(rows).toHaveLength(1)
         expect(rows[0].testcaseId).toBe("tc-1")
         expect(rows[0].testsetId).toBe("ts-1")
-        expect((rows[0].data as any).quality).toMatchObject({score: 8})
+        expect((rows[0].data as Record<string, unknown>).quality).toMatchObject({score: 8})
     })
 
     it("skips a scenario with no testcase mapping", () => {
@@ -512,7 +513,7 @@ describe("buildTestcaseExportRows", () => {
     it("skips a testcase with no annotations", () => {
         const rows = buildTestcaseExportRows({
             scenarioIds: ["s-1"],
-            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1") as any]]),
+            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1")]]),
             annotationsByTestcaseId: new Map([["tc-1", []]]),
             evaluators: [evaluator],
             queueId: "q-1",
@@ -528,8 +529,8 @@ describe("buildTestcaseExportRows", () => {
 describe("buildTestsetSyncPreview", () => {
     const evaluator = {slug: "quality", workflowId: "wf-q"}
 
-    function makeTestcase(id: string, testsetId: string) {
-        return {id, testset_id: testsetId, data: {}}
+    function makeTestcase(id: string, testsetId: string): Testcase {
+        return {id, testset_id: testsetId, data: {}} as unknown as Testcase
     }
 
     function makeQueueAnn(traceId = "trace-1") {
@@ -559,7 +560,7 @@ describe("buildTestsetSyncPreview", () => {
         const preview = buildTestsetSyncPreview({
             queueId: "q-1",
             completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
-            testcasesById: new Map([["tc-1", {id: "tc-1", data: {}} as any]]),
+            testcasesById: new Map([["tc-1", {id: "tc-1", data: {}} as unknown as Testcase]]),
             annotationsByTestcaseId: new Map(),
             evaluators: [evaluator],
             latestRevisionIdsByTestsetId: new Map(),
@@ -572,7 +573,7 @@ describe("buildTestsetSyncPreview", () => {
         const preview = buildTestsetSyncPreview({
             queueId: "q-1",
             completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
-            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
             annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
             evaluators: [evaluator],
             latestRevisionIdsByTestsetId: new Map(), // ts-1 has no revision
@@ -585,7 +586,7 @@ describe("buildTestsetSyncPreview", () => {
         const preview = buildTestsetSyncPreview({
             queueId: "q-1",
             completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
-            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
             annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
             evaluators: [evaluator],
             latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
@@ -604,7 +605,7 @@ describe("buildTestsetSyncPreview", () => {
         const preview = buildTestsetSyncPreview({
             queueId: "q-1",
             completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
-            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
             annotationsByTestcaseId: new Map([["tc-1", [ann1, ann2]]]),
             evaluators: [evaluator],
             latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
@@ -624,8 +625,8 @@ describe("buildTestsetSyncPreview", () => {
                 {scenarioId: "s-2", testcaseId: "tc-2"},
             ],
             testcasesById: new Map([
-                ["tc-1", makeTestcase("tc-1", "ts-1") as any],
-                ["tc-2", makeTestcase("tc-2", "ts-1") as any],
+                ["tc-1", makeTestcase("tc-1", "ts-1")],
+                ["tc-2", makeTestcase("tc-2", "ts-1")],
             ]),
             annotationsByTestcaseId: new Map([
                 ["tc-1", [ann1]],
@@ -648,7 +649,7 @@ describe("buildTestsetSyncPreview", () => {
         const preview = buildTestsetSyncPreview({
             queueId: "q-1",
             completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
-            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1") as any]]),
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
             annotationsByTestcaseId: new Map([["tc-1", [annNoOutputs]]]),
             evaluators: [evaluator],
             latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
diff --git a/web/packages/agenta-shared/tests/unit/path-utils.test.ts b/web/packages/agenta-shared/tests/unit/path-utils.test.ts
index 57c875af13..b330ac54c3 100644
--- a/web/packages/agenta-shared/tests/unit/path-utils.test.ts
+++ b/web/packages/agenta-shared/tests/unit/path-utils.test.ts
@@ -31,6 +31,24 @@ describe("getValueAtPath — basic object navigation", () => {
     })
 })
 
+describe("getValueAtPath — falsy root short-circuit", () => {
+    it("returns 0 immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath(0, ["a"])).toBe(0)
+    })
+
+    it("returns false immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath(false, ["a"])).toBe(false)
+    })
+
+    it("returns empty string immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath("", ["a"])).toBe("")
+    })
+
+    it("returns null immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath(null, ["a"])).toBeNull()
+    })
+})
+
 describe("getValueAtPath — array indexing", () => {
     it("accesses array elements by numeric index", () => {
         expect(getValueAtPath([10, 20, 30], [1])).toBe(20)
diff --git a/web/packages/agenta-shared/tests/unit/template-variable.test.ts b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
index 9fa5aafe9a..edcee7157d 100644
--- a/web/packages/agenta-shared/tests/unit/template-variable.test.ts
+++ b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
@@ -27,30 +27,30 @@ describe("validateTemplateVariable — empty / malformed", () => {
 })
 
 // ---------------------------------------------------------------------------
-// validateTemplateVariable — JSONPath ($.)
+// validateTemplateVariable — JSONPath ($)
 // ---------------------------------------------------------------------------
 
 describe("validateTemplateVariable — JSONPath", () => {
-    it("accepts a well-formed JSONPath", () => {
-        expect(validateTemplateVariable("$.inputs.country").valid).toBe(true)
+    it("rejects bare '$' (no envelope slot after root)", () => {
+        // On main: tokens after stripping '$.' are empty → invalid
+        expect(validateTemplateVariable("$").valid).toBe(false)
     })
 
-    it("accepts bare '$' (whole context shorthand)", () => {
-        expect(validateTemplateVariable("$").valid).toBe(true)
+    it("accepts a well-formed JSONPath rooted at a known slot", () => {
+        expect(validateTemplateVariable("$.inputs.country").valid).toBe(true)
+        expect(validateTemplateVariable("$.outputs.result").valid).toBe(true)
     })
 
-    it("rejects '$<no-dot>' (malformed root)", () => {
-        const result = validateTemplateVariable("$outputs.country")
+    it("rejects a JSONPath whose root is not a known envelope slot", () => {
+        const result = validateTemplateVariable("$.arbitrary_column")
         expect(result.valid).toBe(false)
+        expect(result.reason).toMatch(/unknown envelope slot/i)
     })
 
-    it("rejects '$.' with no field after the dot", () => {
-        expect(validateTemplateVariable("$.").valid).toBe(false)
-    })
-
-    it("accepts any root segment — does NOT validate against envelope slots (permissive)", () => {
-        // Per mustache QA principle: $.arbitrary is valid; runtime validates
-        expect(validateTemplateVariable("$.arbitrary_column").valid).toBe(true)
+    it("includes a 'did-you-mean' suggestion for near-miss slot names", () => {
+        const result = validateTemplateVariable("$.input.country") // 'input' ≈ 'inputs'
+        expect(result.valid).toBe(false)
+        expect(result.suggestion).toBe("inputs")
     })
 })
 
@@ -64,23 +64,18 @@ describe("validateTemplateVariable — JSON Pointer", () => {
         expect(validateTemplateVariable("/outputs/result").valid).toBe(true)
     })
 
-    it("rejects a multi-segment pointer with an unknown root slot", () => {
-        const result = validateTemplateVariable("/unknown/field")
+    it("rejects a pointer with an unknown root slot", () => {
+        const result = validateTemplateVariable("/section")
         expect(result.valid).toBe(false)
         expect(result.reason).toMatch(/unknown envelope slot/i)
     })
 
     it("includes a 'did-you-mean' suggestion for near-miss slot names", () => {
-        const result = validateTemplateVariable("/input/country") // 'input' ≈ 'inputs'
+        const result = validateTemplateVariable("/input/country")
         expect(result.valid).toBe(false)
         expect(result.suggestion).toBe("inputs")
     })
 
-    it("accepts a single-segment identifier-shaped pointer unconditionally (mustache close tag)", () => {
-        // e.g. {{/section}} — single segment, identifier-shaped → valid
-        expect(validateTemplateVariable("/section").valid).toBe(true)
-    })
-
     it("rejects '/' with no segments", () => {
         expect(validateTemplateVariable("/").valid).toBe(false)
     })
@@ -112,7 +107,7 @@ describe("isValidTemplateVariable", () => {
 
     it("returns false for an invalid expression", () => {
         expect(isValidTemplateVariable("")).toBe(false)
-        expect(isValidTemplateVariable("$outputs.x")).toBe(false)
+        expect(isValidTemplateVariable("$.unknown_slot")).toBe(false)
     })
 })
 

From 724f827a3569816605ecc31f07f64c2d23599976 Mon Sep 17 00:00:00 2001
From: junaway <7041392+junaway@users.noreply.github.com>
Date: Thu, 4 Jun 2026 15:01:43 +0000
Subject: [PATCH 04/36] v0.102.0

---
 api/pyproject.toml                          | 2 +-
 api/uv.lock                                 | 6 +++---
 clients/python/pyproject.toml               | 2 +-
 clients/python/uv.lock                      | 2 +-
 hosting/kubernetes/helm/Chart.yaml          | 4 ++--
 sdks/python/pyproject.toml                  | 2 +-
 sdks/python/uv.lock                         | 4 ++--
 services/pyproject.toml                     | 2 +-
 services/uv.lock                            | 6 +++---
 web/ee/package.json                         | 2 +-
 web/oss/package.json                        | 2 +-
 web/package.json                            | 2 +-
 web/packages/agenta-api-client/package.json | 2 +-
 13 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/api/pyproject.toml b/api/pyproject.toml
index f189225c27..bed728e76f 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "api"
-version = "0.101.1"
+version = "0.102.0"
 description = "Agenta API"
 requires-python = ">=3.11,<3.14"
 authors = [
diff --git a/api/uv.lock b/api/uv.lock
index 8e399a88c5..e03183d4a2 100644
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -8,7 +8,7 @@ resolution-markers = [
 
 [[package]]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../sdks/python" }
 dependencies = [
     { name = "agenta-client" },
@@ -70,7 +70,7 @@ dev = [
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../clients/python" }
 dependencies = [
     { name = "httpx" },
@@ -259,7 +259,7 @@ wheels = [
 
 [[package]]
 name = "api"
-version = "0.101.1"
+version = "0.102.0"
 source = { virtual = "." }
 dependencies = [
     { name = "agenta" },
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index eb0167cfb9..b6b760f4c4 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 description = "Fern-generated Python client for the Agenta API."
 requires-python = ">=3.11,<3.14"
 authors = [
diff --git a/clients/python/uv.lock b/clients/python/uv.lock
index bcf7e18af6..96b0d44d15 100644
--- a/clients/python/uv.lock
+++ b/clients/python/uv.lock
@@ -4,7 +4,7 @@ requires-python = ">=3.11, <3.14"
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },
diff --git a/hosting/kubernetes/helm/Chart.yaml b/hosting/kubernetes/helm/Chart.yaml
index 5a80e7f5c7..3c88f08f85 100644
--- a/hosting/kubernetes/helm/Chart.yaml
+++ b/hosting/kubernetes/helm/Chart.yaml
@@ -2,8 +2,8 @@ apiVersion: v2
 name: agenta
 description: A Helm chart for deploying Agenta (OSS or EE) on Kubernetes
 type: application
-version: 0.101.1
-appVersion: "v0.101.1"
+version: 0.102.0
+appVersion: "v0.102.0"
 keywords:
   - agenta
   - llm
diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml
index 94d30e10eb..f5dffe454d 100644
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 description = "The SDK for agenta is an open-source LLMOps platform."
 readme = "README.md"
 requires-python = ">=3.11,<3.14"
diff --git a/sdks/python/uv.lock b/sdks/python/uv.lock
index 45f9972644..33bb5ed7cd 100644
--- a/sdks/python/uv.lock
+++ b/sdks/python/uv.lock
@@ -4,7 +4,7 @@ requires-python = ">=3.11, <3.14"
 
 [[package]]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "." }
 dependencies = [
     { name = "agenta-client" },
@@ -83,7 +83,7 @@ dev = [
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../../clients/python" }
 dependencies = [
     { name = "httpx" },
diff --git a/services/pyproject.toml b/services/pyproject.toml
index fb3bcc09f2..b29077b98f 100644
--- a/services/pyproject.toml
+++ b/services/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "services"
-version = "0.101.1"
+version = "0.102.0"
 description = "Agenta Services (Chat & Completion)"
 requires-python = ">=3.11,<3.14"
 authors = [
diff --git a/services/uv.lock b/services/uv.lock
index 120dd8a54e..61aea0ab8f 100644
--- a/services/uv.lock
+++ b/services/uv.lock
@@ -8,7 +8,7 @@ resolution-markers = [
 
 [[package]]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../sdks/python" }
 dependencies = [
     { name = "agenta-client" },
@@ -70,7 +70,7 @@ dev = [
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../clients/python" }
 dependencies = [
     { name = "httpx" },
@@ -2363,7 +2363,7 @@ wheels = [
 
 [[package]]
 name = "services"
-version = "0.101.1"
+version = "0.102.0"
 source = { virtual = "." }
 dependencies = [
     { name = "agenta" },
diff --git a/web/ee/package.json b/web/ee/package.json
index 9b6f2b07a9..9e87a9a17a 100644
--- a/web/ee/package.json
+++ b/web/ee/package.json
@@ -1,6 +1,6 @@
 {
     "name": "@agenta/ee",
-    "version": "0.101.1",
+    "version": "0.102.0",
     "private": true,
     "engines": {
         "node": "24.x"
diff --git a/web/oss/package.json b/web/oss/package.json
index bb9da8b419..29dc761848 100644
--- a/web/oss/package.json
+++ b/web/oss/package.json
@@ -1,6 +1,6 @@
 {
     "name": "@agenta/oss",
-    "version": "0.101.1",
+    "version": "0.102.0",
     "private": true,
     "engines": {
         "node": "24.x"
diff --git a/web/package.json b/web/package.json
index 023d05a446..e9e9a989de 100644
--- a/web/package.json
+++ b/web/package.json
@@ -1,6 +1,6 @@
 {
     "name": "agenta-web",
-    "version": "0.101.1",
+    "version": "0.102.0",
     "workspaces": [
         "ee",
         "oss",
diff --git a/web/packages/agenta-api-client/package.json b/web/packages/agenta-api-client/package.json
index 93f9d20f6b..ef00623602 100644
--- a/web/packages/agenta-api-client/package.json
+++ b/web/packages/agenta-api-client/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agentaai/api-client",
-  "version": "0.101.1",
+  "version": "0.102.0",
   "private": true,
   "type": "module",
   "main": "./dist/index.js",

From 6c32b05911e30bf557afaa9d8205ac90d98e9c31 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Thu, 28 May 2026 12:31:10 +0200
Subject: [PATCH 05/36] fix(frontend): re-enable full-page playground for
 evaluator workflows

PR #4384 disabled EVALUATOR_FULL_PAGE_NAV_ENABLED because the app-style
playground was a regression for evaluators (lost the upstream-app
connection) and app-scoped observability defaulted to "invocation"
instead of "annotation" for evaluator workflows. This change addresses
both blockers and re-enables the flow by default.

Playground
- ConfigureEvaluatorPage: upstream app workflow can be connected via
  EntityPicker (skip-variant adapter, filtered to non-evaluator
  non-feedback workflows). Disconnect affordance on the picker
  trigger and as a popup footer.
- Standalone evaluator runs no longer require an upstream app
  (TestsetDropdown is always available; runDisabled gate removed).
- Playground chain traces now write evaluator references
  (evaluator / evaluator_variant / evaluator_revision slots) so the
  per-evaluator observability page can find them. EntityPicker
  search bar respects a new parentLabel option so app pickers no
  longer show "Search evaluator...".

Observability filters
- Per-workflow-kind trace_type default extracted into
  @agenta/entities (defaultTraceTypeForWorkflow): annotation for
  evaluators, invocation otherwise. Pure helper unit-tested with
  vitest.
- References scope filter adapts to the effective trace_type:
  evaluators with trace_type=annotation pin to references.evaluator,
  invocation pins to references.application, and "no trace_type"
  ORs across both slots so all traces mentioning the evaluator
  surface.
- Dialog reconciliation: live label flip while editing trace_type
  in the filter dialog ("Application ID" / "Evaluator ID") via an
  opt-in reconcileFilterRows callback on Filters; observability
  page provides an evaluator-workflow-aware reconciler.
- Filter persistence across reloads: per-app via atomWithStorage
  under "agenta:observability:filters", with __global__ fallback
  for project-level pages. Both userFilters and traceTypeChoice
  share one packed storage atom.
- Cleaner state machine for trace_type intent: tagged union
  (default / value / cleared) replaces the dual-atom dance that
  could silently revert.
- application_id URL param dropped for evaluator workflows; the
  query is gated on workflow context being settled to avoid
  firing with the wrong scope.

Tests
- vitest unit tests for defaultTraceTypeForWorkflow.
- Playwright acceptance for full-page playground: post-create
  nav, row click for LLM and declarative evaluators, direct URL,
  sidebar switcher; fixes the previously broken
  select-app-and-run test for the new flow.
---
 .../EvaluatorPlaygroundHeader.tsx             |  46 +-
 .../components/ConfigureEvaluator/atoms.ts    |  64 ++-
 .../components/ConfigureEvaluator/index.tsx   |  50 +-
 web/oss/src/components/Evaluators/index.tsx   |  23 +-
 web/oss/src/components/Filters/Filters.tsx    |  50 +-
 web/oss/src/components/Filters/types.d.ts     |  14 +
 .../assets/PlaygroundVariantConfigHeader.tsx  |   4 +
 .../src/components/PlaygroundRouter/index.tsx | 119 +----
 .../Sidebar/components/WorkflowEntityCard.tsx |  30 +-
 .../WorkflowRevisionDrawerWrapper/index.tsx   |  43 +-
 .../CreateEvaluatorDrawer/index.tsx           |   8 +
 .../assets/filters/fieldAdapter.ts            |  24 +-
 .../components/ObservabilityHeader/index.tsx  |  90 ++++
 .../state/newObservability/atoms/controls.ts  | 307 +++++++++---
 .../state/newObservability/atoms/queries.ts   |  23 +-
 web/oss/src/state/workflow/flags.ts           |  26 +-
 .../playwright/acceptance/evaluators/index.ts | 458 ++++++++++++++++--
 .../playwright/acceptance/evaluators/tests.ts |   8 +
 .../src/workflow/core/index.ts                |   8 +
 .../src/workflow/core/schema.ts               |   6 +
 .../src/workflow/core/traceTypeDefault.ts     |  41 ++
 .../agenta-entities/src/workflow/index.ts     |   5 +
 .../tests/unit/traceTypeDefault.test.ts       |  59 +++
 .../workflowRevisionRelationAdapter.ts        |  41 +-
 .../src/state/execution/executionRunner.ts    | 109 ++++-
 25 files changed, 1330 insertions(+), 326 deletions(-)
 create mode 100644 web/packages/agenta-entities/src/workflow/core/traceTypeDefault.ts
 create mode 100644 web/packages/agenta-entities/tests/unit/traceTypeDefault.test.ts

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
index 879f13436f..174f7273f2 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
@@ -6,7 +6,7 @@
  * Reads evaluator info from playground nodes (URL-driven, no props needed).
  */
 
-import {useMemo} from "react"
+import {useCallback, useMemo} from "react"
 
 import {workflowMolecule} from "@agenta/entities/workflow"
 import {EntityPicker} from "@agenta/entity-ui"
@@ -15,11 +15,12 @@ import type {
     WorkflowRevisionSelectionResult,
 } from "@agenta/entity-ui/selection"
 import {playgroundController} from "@agenta/playground"
-import {Typography} from "antd"
-import {useAtomValue} from "jotai"
+import {X} from "@phosphor-icons/react"
+import {Button, Tooltip, Typography} from "antd"
+import {useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 
-import {selectedAppLabelAtom} from "./atoms"
+import {disconnectAppFromEvaluatorAtom, selectedAppLabelAtom} from "./atoms"
 
 const TestsetDropdown = dynamic(
     () => import("@/oss/components/Playground/Components/TestsetDropdown"),
@@ -71,10 +72,25 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
 
     // Selected app label for display in the picker trigger
     const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom)
+    const handleDisconnect = useCallback(() => {
+        disconnectApp()
+    }, [disconnectApp])
 
     // Check if we have an app node (depth-0 with a different entity than evaluator)
     const hasAppSelected = nodes.some((n) => n.depth === 0 && n.entityId !== evaluatorEntityId)
 
+    // Footer inside the picker popover — only when an app is currently connected.
+    // Mirrors the "Disconnect all" pattern used by the evaluator picker in
+    // `Playground/Components/PlaygroundHeader/index.tsx`.
+    const popupFooter = hasAppSelected ? (
+        <div className="border-0 border-t border-solid border-[rgba(5,23,41,0.06)] p-2">
+            <Button size="small" danger className="w-full" onClick={handleDisconnect}>
+                Disconnect app
+            </Button>
+        </div>
+    ) : undefined
+
     return (
         <div className="flex items-center justify-between gap-4 px-2.5 py-2 bg-[var(--ag-rgba-000-02)] border-0 border-b border-solid border-[var(--ag-rgba-051729-06)]">
             <div className="flex shrink-0 items-center gap-2 pl-2">
@@ -83,15 +99,33 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
                 </Typography>
             </div>
 
-            <div className="flex min-w-0 flex-1 items-center justify-end gap-2">
+            <div className="flex min-w-0 flex-1 items-center justify-end gap-1">
                 <EntityPicker<WorkflowRevisionSelectionResult>
                     variant="popover-cascader"
                     adapter={appWorkflowAdapter}
                     onSelect={onAppSelect}
                     size="small"
                     placeholder={selectedAppLabel ?? "Select app"}
+                    popupFooter={popupFooter}
                 />
-                {hasAppSelected && <TestsetDropdown />}
+                {hasAppSelected && (
+                    <Tooltip title="Disconnect app">
+                        <Button
+                            type="text"
+                            size="small"
+                            icon={<X size={12} />}
+                            onClick={handleDisconnect}
+                            aria-label="Disconnect app"
+                        />
+                    </Tooltip>
+                )}
+                {/* Testset is always connectable, with or without an upstream
+                 * app. The earlier `hasAppSelected` gate matched the
+                 * runDisabled gate we removed in T7 — same regression, same
+                 * fix: standalone evaluator runs need a testset just as much
+                 * as chained ones (the evaluator's prompt template variables
+                 * still come from testcase row fields). */}
+                <TestsetDropdown />
             </div>
         </div>
     )
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
index fdbd5d271b..0a6c2d3625 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
@@ -115,8 +115,27 @@ export const hasAppConnectedAtom = atom((get) => {
     return nodes.some((n) => n.depth > 0)
 })
 
-/** Label of the currently selected app workflow (for display in header picker). */
-export const selectedAppLabelAtom = atom<string | null>(null)
+/**
+ * Label of the currently selected app workflow (for display in header picker).
+ *
+ * Derived from the node graph: when an evaluator-as-downstream (depth > 0)
+ * exists, the primary (depth-0) node is the connected app, and its `label`
+ * is what we want to show. Returns `null` in standalone mode (no downstream).
+ *
+ * Derived (not a primitive atom) so URL-hydration of the snapshot — which
+ * restores `playgroundNodesAtom` along with each node's `label` — automatically
+ * surfaces the right label without any explicit re-seeding from the page.
+ * Previously the atom was a primitive `atom<string | null>(null)`, which left
+ * the picker placeholder empty after reload while the disconnect button and
+ * testset dropdown (both gated on the node graph) showed normally.
+ */
+export const selectedAppLabelAtom = atom<string | null>((get) => {
+    const nodes = get(playgroundNodesAtom)
+    const hasDownstream = nodes.some((n) => n.depth > 0)
+    if (!hasDownstream) return null
+    const primary = nodes.find((n) => n.depth === 0)
+    return primary?.label ?? null
+})
 
 // ============================================================================
 // CONNECT APP (on app select)
@@ -143,8 +162,9 @@ export const connectAppToEvaluatorAtom = atom(
     ) => {
         const {appRevisionId, appLabel, evaluatorRevisionId, evaluatorLabel} = params
 
-        // Track selected app label for display + persist across sessions
-        set(selectedAppLabelAtom, appLabel)
+        // Persist across sessions. The picker display label is derived from
+        // the depth-0 node's `label` via `selectedAppLabelAtom`, so no extra
+        // write needed here.
         set(persistedAppSelectionAtom, {appRevisionId, appLabel})
 
         // Replace primary node with app
@@ -167,3 +187,39 @@ export const connectAppToEvaluatorAtom = atom(
         })
     },
 )
+
+// ============================================================================
+// DISCONNECT APP (reverse the connect)
+// ============================================================================
+
+/**
+ * Disconnect the upstream app and return to standalone evaluator mode.
+ *
+ * Reverse of `connectAppToEvaluatorAtom`:
+ * 1. Capture the downstream evaluator's identity (we need it after removal).
+ * 2. Remove the downstream evaluator node (`removeNodeAtom` keeps primary if
+ *    target is depth > 0). If no downstream (depth > 0) node exists, the
+ *    atom returns early and leaves everything untouched.
+ * 3. Swap the primary node back to the evaluator. `changePrimaryNodeAtom`
+ *    clears `outputConnectionsAtom` for us as a side-effect.
+ * 4. Clear the persisted app selection. The display label is derived from the
+ *    node graph, so the picker placeholder reverts to "Select app" on its own.
+ */
+export const disconnectAppFromEvaluatorAtom = atom(null, (get, set) => {
+    const nodes = get(playgroundController.selectors.nodes())
+    const downstreamEvaluator = nodes.find((n) => n.depth > 0)
+    if (!downstreamEvaluator) return
+
+    const evaluatorEntity = {
+        type: downstreamEvaluator.entityType,
+        id: downstreamEvaluator.entityId,
+        label: downstreamEvaluator.label ?? "Evaluator",
+    }
+
+    set(playgroundController.actions.removeNode, downstreamEvaluator.id)
+    set(playgroundController.actions.changePrimaryNode, evaluatorEntity)
+    // `selectedAppLabelAtom` is derived from the node graph — clearing the
+    // downstream above is what flips it back to `null`. Only the persisted
+    // localStorage cache needs an explicit clear.
+    set(persistedAppSelectionAtom, null)
+})
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
index 35ff909bbd..3b94c3a163 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
@@ -14,7 +14,6 @@ import {useCallback, useEffect, useMemo} from "react"
 
 import {loadableController} from "@agenta/entities/loadable"
 import {testcaseMolecule} from "@agenta/entities/testcase"
-import {EntityPicker} from "@agenta/entity-ui"
 import {
     createWorkflowRevisionAdapter,
     type WorkflowRevisionSelectionResult,
@@ -22,7 +21,6 @@ import {
 import {playgroundController} from "@agenta/playground"
 import {type PlaygroundUIProviders} from "@agenta/playground-ui"
 import {preloadEditorPlugins, SyncStateTag} from "@agenta/ui"
-import {Typography} from "antd"
 import {useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 
@@ -32,12 +30,7 @@ import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell
 import SharedGenerationResultUtils from "@/oss/components/SharedGenerationResultUtils"
 import {playgroundSyncAtom} from "@/oss/state/url/playground"
 
-import {
-    connectAppToEvaluatorAtom,
-    evaluatorConfigEntityIdsAtom,
-    hasAppConnectedAtom,
-    selectedAppLabelAtom,
-} from "./atoms"
+import {connectAppToEvaluatorAtom, evaluatorConfigEntityIdsAtom} from "./atoms"
 import EvaluatorPlaygroundHeader from "./EvaluatorPlaygroundHeader"
 
 const PlaygroundMainView = dynamic(
@@ -77,13 +70,11 @@ const ConfigureEvaluatorPageInner = () => {
     useAtomValue(playgroundSyncAtom)
 
     const configEntityIds = useAtomValue(evaluatorConfigEntityIdsAtom)
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
     const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
 
     // Read the current evaluator entity from playground nodes
-    // Phase 1: evaluator is at depth 0 (primary)
-    // Phase 2: evaluator is at depth 1 (downstream)
+    // Phase 1: evaluator is at depth 0 (primary, standalone run)
+    // Phase 2: evaluator is at depth 1 (downstream of a connected app — chain run)
     const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
     const evaluatorNode = useMemo(() => {
         const downstream = nodes.find((n) => n.depth > 0)
@@ -96,13 +87,21 @@ const ConfigureEvaluatorPageInner = () => {
         void preloadEditorPlugins()
     }, [])
 
-    // App workflow picker (shared between header and empty state)
+    // App workflow picker — opt-in for chain-mode execution. The evaluator can
+    // also run standalone: the user fills the testcase row's template variables
+    // (e.g. `{{inputs}}`, `{{outputs}}` for LLM-as-a-judge) directly. The
+    // header surfaces this picker; we never block the run panel on it.
     const appWorkflowAdapter = useMemo(
         () =>
             createWorkflowRevisionAdapter({
                 skipVariantLevel: true,
                 excludeRevisionZero: true,
                 flags: {is_evaluator: false, is_feedback: false},
+                // The picker on the evaluator playground header is picking an
+                // upstream *app* workflow to connect to — without this the
+                // search bar would say "Search evaluator…" (the adapter's
+                // historical default) while the user is choosing an app.
+                parentLabel: "Application",
             }),
         [],
     )
@@ -120,24 +119,6 @@ const ConfigureEvaluatorPageInner = () => {
         [connectApp, evaluatorNode],
     )
 
-    const runDisabledContent = useMemo(
-        () => (
-            <>
-                <Typography.Text type="secondary" className="text-sm">
-                    Select an app to run the evaluator chain
-                </Typography.Text>
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="middle"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-            </>
-        ),
-        [appWorkflowAdapter, handleAppSelect, selectedAppLabel],
-    )
-
     const providers = useMemo(
         () =>
             ({
@@ -156,12 +137,7 @@ const ConfigureEvaluatorPageInner = () => {
                     appWorkflowAdapter={appWorkflowAdapter}
                     onAppSelect={handleAppSelect}
                 />
-                <PlaygroundMainView
-                    mode="evaluator"
-                    configEntityIdsOverride={configEntityIds}
-                    runDisabled={!hasAppConnected}
-                    runDisabledContent={runDisabledContent}
-                />
+                <PlaygroundMainView mode="evaluator" configEntityIdsOverride={configEntityIds} />
             </div>
         </OSSPlaygroundShell>
     )
diff --git a/web/oss/src/components/Evaluators/index.tsx b/web/oss/src/components/Evaluators/index.tsx
index ed318a1469..00e8737b30 100644
--- a/web/oss/src/components/Evaluators/index.tsx
+++ b/web/oss/src/components/Evaluators/index.tsx
@@ -3,7 +3,6 @@ import {memo, useCallback, useEffect, useMemo, useState} from "react"
 import {
     createEvaluatorFromTemplate,
     type EvaluatorCatalogTemplate,
-    hasFullPagePlaygroundUX,
     invalidateEvaluatorsListCache,
     workflowMolecule,
 } from "@agenta/entities/workflow"
@@ -260,22 +259,18 @@ const EvaluatorsRegistry = ({scope = "project", mode = "active"}: EvaluatorsRegi
                 return
             }
 
-            // Only prompt/code-authored evaluators open in the full-page
-            // playground. Declarative classifiers (match, contains, regex,
-            // json_multi_field_match, …) fall back to the drawer-edit flow —
-            // their config is a handful of form fields and the playground
-            // page would surface misleading envelope variable inputs.
+            // All non-archived automatic evaluators open in the full-page
+            // playground. Earlier this was gated on classifier type
+            // (`hasFullPagePlaygroundUX`) so declarative classifiers stayed in
+            // the drawer-edit flow, but in practice that meant whole evaluator
+            // types had no UI path into the per-evaluator pages (variants,
+            // traces). Drawer stays available as a secondary affordance via
+            // the row context menu's Configure action.
             //
             // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is
-            // off, every row click resolves to the drawer regardless of the
-            // evaluator's classifier (the new flow stays code-complete but
-            // hidden until follow-up fixes land).
-            const entity = record.revisionId ? workflowMolecule.get.data(record.revisionId) : null
+            // off, every row click resolves to the drawer.
             const shouldNavigateToFullPage = Boolean(
-                EVALUATOR_FULL_PAGE_NAV_ENABLED &&
-                record.workflowId &&
-                entity &&
-                hasFullPagePlaygroundUX(entity as Parameters<typeof hasFullPagePlaygroundUX>[0]),
+                EVALUATOR_FULL_PAGE_NAV_ENABLED && record.workflowId,
             )
 
             const navigated =
diff --git a/web/oss/src/components/Filters/Filters.tsx b/web/oss/src/components/Filters/Filters.tsx
index b1dd2d8385..42a53d736b 100644
--- a/web/oss/src/components/Filters/Filters.tsx
+++ b/web/oss/src/components/Filters/Filters.tsx
@@ -283,6 +283,7 @@ const Filters: React.FC<Props> = ({
     onApplyFilter,
     onClearFilter,
     buttonProps,
+    reconcileFilterRows,
 }) => {
     const evaluatorPreviews = useAtomValue(evaluatorsListDataAtom)
 
@@ -358,6 +359,37 @@ const Filters: React.FC<Props> = ({
                             : item.value == null
                               ? []
                               : [item.value]
+
+                        // Prefer a candidate whose `referenceCategory` matches
+                        // the entry's `"attributes.key"`. This disambiguates
+                        // the `references` family — application.id /
+                        // evaluator.id / environment.id all share
+                        // `baseField: "references"` and
+                        // `referenceProperty: "id"`, so without this check the
+                        // first match (application.id) always wins, mislabelling
+                        // an evaluator-scoped filter as "Application ID".
+                        const attributesKey = (() => {
+                            for (const entry of valuesArray) {
+                                if (entry && typeof entry === "object") {
+                                    const ak = (entry as Record<string, unknown>)["attributes.key"]
+                                    if (typeof ak === "string") return ak
+                                }
+                            }
+                            return undefined
+                        })()
+                        if (attributesKey) {
+                            for (const candidate of matches) {
+                                if (candidate.referenceCategory !== attributesKey) continue
+                                if (!candidate.referenceProperty) continue
+                                const refProp = candidate.referenceProperty
+                                const hasMatch = valuesArray.some(
+                                    (entry) =>
+                                        entry && typeof entry === "object" && refProp in entry,
+                                )
+                                if (hasMatch) return candidate
+                            }
+                        }
+
                         for (const candidate of matches) {
                             if (!candidate.referenceProperty) continue
                             const refProp = candidate.referenceProperty
@@ -511,6 +543,22 @@ const Filters: React.FC<Props> = ({
     const [isFilterOpen, setIsFilterOpen] = useState(false)
     const [keySearchTerms, setKeySearchTerms] = useState<Record<number, string>>({})
 
+    /**
+     * Display-only projection of `filter`. The reconciler is opt-in (passed by
+     * the parent) and may rewrite *cosmetic* row fields like `selectedField` /
+     * `selectedLabel` so the UI reflects an in-flight choice (e.g.,
+     * observability flipping the references row's label between "Application
+     * ID" / "Evaluator ID" as the user picks a trace_type, before Apply).
+     *
+     * Mutations still call `setFilter(filter)` by index, so the reconciler is
+     * required to preserve array length and per-index order — that contract
+     * is documented on the prop.
+     */
+    const displayedFilter = useMemo(
+        () => (reconcileFilterRows ? reconcileFilterRows(filter) : filter),
+        [filter, reconcileFilterRows],
+    )
+
     const sanitizedFilters = useMemo(() => {
         return sanitizeFilterItems(
             filter.filter(({field, operator, isPermanent, isCustomField}) => {
@@ -816,7 +864,7 @@ const Filters: React.FC<Props> = ({
                     </div>
 
                     <div className={filterContainerClass}>
-                        {filter.map((item, idx) => {
+                        {displayedFilter.map((item, idx) => {
                             const uiKey = item.selectedField || item.field || ""
                             const baseFieldCfg = getField(uiKey)
                             const field = effectiveFieldForRow(baseFieldCfg, item)
diff --git a/web/oss/src/components/Filters/types.d.ts b/web/oss/src/components/Filters/types.d.ts
index b25512a170..03f00d6a63 100644
--- a/web/oss/src/components/Filters/types.d.ts
+++ b/web/oss/src/components/Filters/types.d.ts
@@ -8,6 +8,20 @@ export interface Props {
     onApplyFilter: (filters: Filter[]) => void
     onClearFilter: (filters: Filter[]) => void
     buttonProps?: ButtonProps
+    /**
+     * Optional callback to derive a *display-only* view of the local filter
+     * state. Re-run whenever that state or the callback identity changes, so
+     * pass a memoized function. The dialog renders from the returned array, but
+     * mutations still target the underlying `filter` state by index, so the
+     * reconciler MUST preserve array length and per-index order.
+     *
+     * Used by observability to keep the permanent references row's label
+     * ("Application ID" vs "Evaluator ID") in sync with the dialog's local
+     * `trace_type` selection *before* the user clicks Apply — without the
+     * reconciler, the label only refreshes after Apply when the atom
+     * re-derives the permanent row.
+     */
+    reconcileFilterRows?: (rows: FilterItem[]) => FilterItem[]
 }
 
 export type CustomValueType = "string" | "number" | "boolean"
diff --git a/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx b/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx
index 864f2d938d..94e2478278 100644
--- a/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx
+++ b/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx
@@ -72,6 +72,10 @@ const PlaygroundVariantConfigHeader = ({
                 skipVariantLevel: true,
                 excludeRevisionZero: true,
                 flags: {is_evaluator: false, is_feedback: false},
+                // App browse picker — without this the search bar would say
+                // "Search evaluator…" (the adapter's default in skip-variant
+                // mode) while the user is browsing apps.
+                parentLabel: "Application",
             }),
         [],
     )
diff --git a/web/oss/src/components/PlaygroundRouter/index.tsx b/web/oss/src/components/PlaygroundRouter/index.tsx
index bd983f1461..3158366096 100644
--- a/web/oss/src/components/PlaygroundRouter/index.tsx
+++ b/web/oss/src/components/PlaygroundRouter/index.tsx
@@ -1,24 +1,13 @@
-import {memo, useEffect, useMemo, useRef} from "react"
+import {memo} from "react"
 
-import {
-    hasFullPagePlaygroundUX,
-    workflowLatestRevisionIdAtomFamily,
-    workflowMolecule,
-} from "@agenta/entities/workflow"
 import {bgColors} from "@agenta/ui"
 import {DownOutlined} from "@ant-design/icons"
 import {Flask, Plus} from "@phosphor-icons/react"
 import {Button, Space, Typography} from "antd"
 import {useAtomValue} from "jotai"
 import dynamic from "next/dynamic"
-import {useRouter} from "next/router"
 
-import {appIdentifiersAtom} from "@/oss/state/appState"
-import {
-    currentWorkflowAtom,
-    currentWorkflowContextAtom,
-    EVALUATOR_FULL_PAGE_NAV_ENABLED,
-} from "@/oss/state/workflow"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 const PlaygroundLoadingShell = () => {
     return (
@@ -60,92 +49,28 @@ const Playground = dynamic(() => import("../Playground/Playground"), {
     loading: PlaygroundLoadingShell,
 })
 
-/**
- * Stale-URL guard for evaluator playgrounds. Most evaluators (classifiers,
- * matchers, JSON validators, …) have no meaningful full-page playground UX —
- * just a handful of form fields the drawer already renders. When the
- * resolved workflow is one of those evaluators, redirect to the evaluators
- * registry with the revision pre-selected so the drawer opens automatically.
- * Prompt/code-authored evaluators (auto_ai_critique, llm, code) are kept on
- * the playground page.
- *
- * Classification source: the workflow LIST entry has no `data.uri` (data is
- * only populated on revision-detail responses), so we resolve the latest
- * revision via `workflowLatestRevisionIdAtomFamily` and read its seeded
- * entity from the molecule to get the URI. Without this, every evaluator
- * playground briefly looks "unknown" and the guard would mis-redirect
- * prompt-based evaluators like LLM-as-a-judge.
- */
-const useEvaluatorPlaygroundGuard = () => {
-    const ctx = useAtomValue(currentWorkflowContextAtom)
-    const workflow = useAtomValue(currentWorkflowAtom)
-    const {workspaceId, projectId} = useAtomValue(appIdentifiersAtom)
-    const router = useRouter()
-    const redirectedFor = useRef<string | null>(null)
-
-    const workflowId = ctx.workflowId ?? ""
-    const latestRevisionId = useAtomValue(
-        useMemo(() => workflowLatestRevisionIdAtomFamily(workflowId), [workflowId]),
-    )
-
-    useEffect(() => {
-        if (ctx.isResolving || ctx.isError || ctx.isNotFound) return
-        if (ctx.workflowKind !== "evaluator") return
-        if (!workflow || !ctx.workflowId) return
-        if (!workspaceId || !projectId) return
-        if (redirectedFor.current === ctx.workflowId) return
-
-        // Resolve the latest revision data — it carries `data.uri` and the
-        // URI-derived flags (`is_llm`, `is_code`) that classifier vs prompt
-        // evaluators differ on. The workflow list entry has neither.
-        const latestRevision = latestRevisionId
-            ? (workflowMolecule.get.data(latestRevisionId) as
-                  | Parameters<typeof hasFullPagePlaygroundUX>[0]
-                  | null)
-            : null
-
-        // Bail until we have a classifiable record. Redirecting on a half-
-        // loaded workflow would bounce prompt-based evaluators (whose URI
-        // hasn't been seeded yet) into the drawer mid-load.
-        const hasUri = Boolean(latestRevision?.data?.uri)
-        const hasTypeFlag = Boolean(
-            latestRevision?.flags?.is_llm ||
-            latestRevision?.flags?.is_code ||
-            workflow.flags?.is_llm ||
-            workflow.flags?.is_code,
-        )
-        if (!hasUri && !hasTypeFlag) return
-
-        // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off,
-        // skip the "stay on /playground" early return so every evaluator URL
-        // (including direct visits / bookmarks) bounces back to /evaluators
-        // and opens the drawer.
-        const classifyTarget = latestRevision ?? workflow
-        if (EVALUATOR_FULL_PAGE_NAV_ENABLED && hasFullPagePlaygroundUX(classifyTarget)) return
-
-        const base = `/w/${encodeURIComponent(workspaceId)}/p/${encodeURIComponent(projectId)}`
-        const target = latestRevisionId
-            ? `${base}/evaluators?revisionId=${encodeURIComponent(latestRevisionId)}`
-            : `${base}/evaluators`
-
-        redirectedFor.current = ctx.workflowId
-        router.replace(target)
-    }, [
-        ctx.isResolving,
-        ctx.isError,
-        ctx.isNotFound,
-        ctx.workflowKind,
-        ctx.workflowId,
-        workflow,
-        latestRevisionId,
-        workspaceId,
-        projectId,
-        router,
-    ])
-}
+// When the current workflow is an evaluator we render the evaluator-flavored
+// page (with `EvaluatorPlaygroundHeader` + `connectAppToEvaluatorAtom`) instead
+// of the generic app `<Playground />`. Same code path that powers
+// `/evaluators/playground` today — `playgroundSyncAtom` matches `/playground`
+// anywhere in the pathname so hydration works at both URLs unchanged.
+const ConfigureEvaluatorPage = dynamic(
+    () => import("@/oss/components/Evaluators/components/ConfigureEvaluator"),
+    {ssr: false, loading: PlaygroundLoadingShell},
+)
 
 const PlaygroundRouter = () => {
-    useEvaluatorPlaygroundGuard()
+    const ctx = useAtomValue(currentWorkflowContextAtom)
+
+    // Evaluators get the evaluator-flavored page so the upstream-app picker
+    // is visible (the generic header only exposes the reverse direction —
+    // app-needs-evaluator — not evaluator-needs-app). All evaluator kinds
+    // (LLM/code, declarative classifiers, custom hooks, …) land here on
+    // direct URL visits + sidebar switcher clicks; for simple classifiers
+    // ConfigureEvaluatorPage renders the same few form fields the drawer
+    // would, with the bonus of the evaluator-as-app surface (variants,
+    // traces, sidebar context).
+    if (ctx.workflowKind === "evaluator") return <ConfigureEvaluatorPage />
     return <Playground />
 }
 
diff --git a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
index 734ddbd2b4..b2ca96b43f 100644
--- a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
+++ b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
@@ -1,7 +1,6 @@
 import {memo, useCallback, useMemo, useState} from "react"
 
 import {
-    fullPagePlaygroundEvaluatorsAtom,
     nonArchivedAppWorkflowsAtom,
     nonArchivedEvaluatorsAtom,
     parseWorkflowKeyFromUri,
@@ -116,24 +115,15 @@ const SWITCHER_MENU_CLASS = clsx(
 const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => {
     const ctx = useAtomValue(currentWorkflowContextAtom)
     const apps = useAtomValue(nonArchivedAppWorkflowsAtom) as readonly Workflow[]
-    // Full set of evaluators — used for resolving the *active* workflow (the
-    // user may be inside a drawer-only evaluator currently). The switcher
-    // dropdown below uses `fullPagePlaygroundEvaluators` instead so it only
-    // lists evaluators whose destination is /apps/[id]/playground — clicking
-    // a declarative classifier or human evaluator from the sidebar would
-    // route through the route guard and bounce back to /evaluators, which is
-    // confusing.
     const evaluators = useAtomValue(nonArchivedEvaluatorsAtom) as readonly Workflow[]
     // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off, the
-    // switcher dropdown hides the "Evaluators" group entirely. Clicking an
-    // entry would route to `/apps/<evaluatorId>/playground`, which the
-    // (also-gated) `PlaygroundRouter` guard would immediately bounce back to
-    // `/evaluators` — exposing the entry would just produce a flicker.
-    const fullPagePlaygroundEvaluatorsRaw = useAtomValue(
-        fullPagePlaygroundEvaluatorsAtom,
-    ) as readonly Workflow[]
-    const fullPagePlaygroundEvaluators: readonly Workflow[] = EVALUATOR_FULL_PAGE_NAV_ENABLED
-        ? fullPagePlaygroundEvaluatorsRaw
+    // switcher dropdown hides the "Evaluators" group entirely. With the flag
+    // on, ALL non-archived evaluators are listed — every evaluator kind has
+    // a working `/apps/<id>/*` surface (PlaygroundRouter renders
+    // ConfigureEvaluatorPage for all evaluator workflows regardless of
+    // template type), so there's no reason to filter to LLM/code only.
+    const switcherEvaluators: readonly Workflow[] = EVALUATOR_FULL_PAGE_NAV_ENABLED
+        ? evaluators
         : EMPTY_WORKFLOWS
     const recentAppId = useAtomValue(recentAppIdAtom)
     const recentEvaluatorId = useAtomValue(recentEvaluatorIdAtom)
@@ -192,16 +182,16 @@ const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => {
                 children: apps.map((w) => toMenuItem(w, false)),
             })
         }
-        if (fullPagePlaygroundEvaluators.length) {
+        if (switcherEvaluators.length) {
             items.push({
                 key: "evaluators-header",
                 type: "group",
                 label: "Evaluators",
-                children: fullPagePlaygroundEvaluators.map((w) => toMenuItem(w, true)),
+                children: switcherEvaluators.map((w) => toMenuItem(w, true)),
             })
         }
         return items
-    }, [apps, fullPagePlaygroundEvaluators])
+    }, [apps, switcherEvaluators])
 
     const handleSwitcherClick = useCallback<NonNullable<MenuProps["onClick"]>>(
         ({key}) => {
diff --git a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
index 6726f2423c..7349e251d2 100644
--- a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
+++ b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
@@ -15,7 +15,6 @@ import {testcaseMolecule} from "@agenta/entities/testcase"
 import {
     registerWorkflowCommitCallbacks,
     getWorkflowCommitCallbacks,
-    hasFullPagePlaygroundUX,
     parseEvaluatorKeyFromUri,
     evaluatorTemplatesMapAtom,
     workflowMolecule,
@@ -200,7 +199,6 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
     const resetAll = useSetAtom(playgroundController.actions.resetAll)
     const clearAllRuns = useSetAtom(clearAllRunsMutationAtom)
     const setInitialized = useSetAtom(playgroundInitializedAtom)
-    const setSelectedAppLabel = useSetAtom(selectedAppLabelAtom)
     const setConnectedTestset = useSetAtom(connectedTestsetAtom)
     const connectApp = useSetAtom(connectAppToEvaluatorAtom)
     const setPersistedTestset = useSetAtom(persistedTestsetSelectionAtom)
@@ -211,10 +209,12 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
 
             const store = getDefaultStore()
 
-            // Restore persisted app selection (survives drawer close/reopen and commits)
+            // Restore persisted app selection (survives drawer close/reopen and commits).
+            // `selectedAppLabelAtom` is derived from the node graph now — the
+            // `connectApp` call below seeds the depth-0 node with the persisted
+            // label, which the derived atom picks up automatically.
             const persisted = store.get(persistedAppSelectionAtom)
             if (persisted) {
-                setSelectedAppLabel(persisted.appLabel)
                 connectApp({
                     appRevisionId: persisted.appRevisionId,
                     appLabel: persisted.appLabel,
@@ -272,7 +272,8 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
 
             resetAll()
             setInitialized(false)
-            setSelectedAppLabel(null)
+            // `selectedAppLabelAtom` is derived from the node graph — `resetAll`
+            // above clears the nodes, which flips the label back to `null`.
             setConnectedTestset(null)
         }
     }, [
@@ -281,7 +282,6 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
         resetAll,
         clearAllRuns,
         setInitialized,
-        setSelectedAppLabel,
         setConnectedTestset,
         connectApp,
     ])
@@ -334,6 +334,10 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
                 skipVariantLevel: true,
                 excludeRevisionZero: true,
                 flags: {is_evaluator: false, is_feedback: false},
+                // Picking an *app* to connect upstream of the evaluator — the
+                // adapter's default "Evaluator" label would make the search
+                // bar say "Search evaluator…" which is wrong here.
+                parentLabel: "Application",
             }),
         [],
     )
@@ -492,23 +496,18 @@ const useDrawerCreateCommitCallback = () => {
                     // (`Router.pathname` only flips on `routeChangeComplete`,
                     // so a synchronous close after `router.push` would patch
                     // the still-current `/evaluators` URL and push back to it.)
+                    //
                     // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the
-                    // flag is off, post-create stays in the drawer flow even
-                    // for evaluators whose classifier supports full-page UX.
-                    let eligibleForPlayground = false
-                    if (
-                        EVALUATOR_FULL_PAGE_NAV_ENABLED &&
-                        newAppId &&
-                        newRevisionId &&
-                        newWorkflow
-                    ) {
-                        eligibleForPlayground = hasFullPagePlaygroundUX({
-                            flags: newWorkflow.flags ?? null,
-                            data: newWorkflow.data ?? null,
-                            meta: newWorkflow.meta ?? null,
-                            slug: newWorkflow.slug ?? null,
-                        })
-                    }
+                    // flag is off, post-create stays in the drawer flow. When
+                    // on, every freshly committed evaluator (regardless of
+                    // template type) lands on `/apps/<id>/playground` —
+                    // mirroring app-create's post-commit navigation. The
+                    // earlier classifier-only gate was removed so declarative
+                    // evaluators get the same surface (variants, traces,
+                    // sidebar context) as LLM/code ones.
+                    const eligibleForPlayground = Boolean(
+                        EVALUATOR_FULL_PAGE_NAV_ENABLED && newAppId && newRevisionId,
+                    )
 
                     if (eligibleForPlayground && newAppId && newRevisionId) {
                         const url = `${baseAppURLRef.current}/${encodeURIComponent(
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
index eaad8e3de3..ae366bab51 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
@@ -88,6 +88,10 @@ const DrawerHeader = ({entityId, onClose}: {entityId: string; onClose: () => voi
                 skipVariantLevel: true,
                 excludeRevisionZero: true,
                 flags: {is_evaluator: false, is_feedback: false},
+                // Picking an *app* to attach to the evaluator — without this
+                // the search bar would say "Search evaluator…" (the adapter's
+                // historical default in skip-variant mode).
+                parentLabel: "Application",
             }),
         [],
     )
@@ -187,6 +191,10 @@ const DrawerContent = ({
                 skipVariantLevel: true,
                 excludeRevisionZero: true,
                 flags: {is_evaluator: false, is_feedback: false},
+                // Picking an *app* to attach to the evaluator — without this
+                // the search bar would say "Search evaluator…" (the adapter's
+                // historical default in skip-variant mode).
+                parentLabel: "Application",
             }),
         [],
     )
diff --git a/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts b/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts
index 243a71097a..15b68bdf68 100644
--- a/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts
+++ b/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts
@@ -25,6 +25,13 @@ export interface FieldConfig {
     valueDisplayText?: string
     queryKey?: string
     referenceProperty?: string
+    /**
+     * Category for the `references` family (application / evaluator /
+     * application_variant / environment). Used by `mapFilterData` to
+     * disambiguate which sub-column an incoming filter row maps to when
+     * multiple share `baseField: "references"` and `referenceProperty: "id"`.
+     */
+    referenceCategory?: string
     // reference/application/evaluator transforms
     toExternal?: (normalized: any) => any
     toUI?: (external: any) => any
@@ -86,6 +93,7 @@ const walk = (nodes: FilterMenuNode[], acc: FieldConfig[]) => {
             valueDisplayText: leaf.valueDisplayText,
             queryKey: leaf.queryKey,
             referenceProperty: leaf.referenceProperty,
+            referenceCategory: leaf.referenceCategory,
         }
 
         // references/application/evaluator → keep simple mapper
@@ -112,9 +120,23 @@ const walk = (nodes: FilterMenuNode[], acc: FieldConfig[]) => {
             }
             cfg.toUI = (external: any) => {
                 const arr = Array.isArray(external) ? external : external ? [external] : []
-                return arr.map((e: any) =>
+                // De-dup by extracted value. References can OR-match across
+                // slots in a single condition (e.g.,
+                // `[{id:X, key:eval}, {id:X, key:app}]` for "match this entity
+                // in either slot"). Without de-dup the UI shows the same id
+                // twice. The backend keeps the rich shape via `toExternal`.
+                const mapped = arr.map((e: any) =>
                     e && typeof e === "object" ? (e[leaf.referenceProperty!] ?? "") : e,
                 )
+                const seen = new Set<string>()
+                const out: any[] = []
+                for (const v of mapped) {
+                    const key = typeof v === "string" ? v : JSON.stringify(v)
+                    if (seen.has(key)) continue
+                    seen.add(key)
+                    out.push(v)
+                }
+                return out
             }
         }
 
diff --git a/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx b/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx
index 3b4d9db56a..be55e65fa3 100644
--- a/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx
+++ b/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx
@@ -13,6 +13,8 @@ import Papa from "papaparse"
 
 import EnhancedButton from "@/oss/components/EnhancedUIs/Button"
 import {SortResult} from "@/oss/components/Filters/Sort"
+import type {FilterItem} from "@/oss/components/Filters/types"
+import {fieldConfigByOptionKey} from "@/oss/components/pages/observability/assets/filters/fieldAdapter"
 import AddActionsDropdown from "@/oss/components/SharedActions/AddActionsDropdown"
 import {deleteTraceModalAtom} from "@/oss/components/SharedDrawers/TraceDrawer/components/DeleteTraceModal/store/atom"
 import useLazyEffect from "@/oss/hooks/useLazyEffect"
@@ -25,6 +27,7 @@ import {buildTraceQueryParams} from "@/oss/state/newObservability/atoms/queryHel
 import {createAdaptiveTracePageFetcher} from "@/oss/state/newObservability/etl/adaptiveTracePageFetcher"
 import {createExportWriter, PICKER_CANCELLED} from "@/oss/state/newObservability/etl/exportWriter"
 import {getAgData} from "@/oss/state/newObservability/selectors/tracing"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {createTraceObject, DEFAULT_TRACE_EXPORT_HEADERS} from "../../assets/exportUtils"
 import {buildAttributeKeyTreeOptions} from "../../assets/filters/attributeKeyOptions"
@@ -146,6 +149,92 @@ const ObservabilityHeader = ({
         () => getFilterColumns(attributeKeyOptions),
         [attributeKeyOptions],
     )
+
+    // --- Live label flip for the permanent references row in the dialog -----
+    //
+    // After Apply, the atom regenerates the references row's `attributes.key`
+    // from the effective trace_type (annotation → evaluator, invocation →
+    // application). That's what makes the label switch between "Evaluator ID"
+    // and "Application ID" in the chip outside the dialog. But while the user
+    // is still editing in the dialog, the row sits in local state — changing
+    // the trace_type dropdown there has no visual effect on the references
+    // row's label, which feels broken.
+    //
+    // The reconciler below produces a *display-only* projection of the local
+    // filter rows: if a trace_type row is present, it re-derives the permanent
+    // references row's `selectedField` / `selectedLabel` to match. The
+    // underlying `filter` state is untouched (the reconciler only runs in a
+    // `useMemo` inside the dialog) and the Apply path is unchanged — on
+    // Apply, the atom still strips and re-derives the permanent row, so the
+    // backend value matches the displayed label.
+    //
+    // Skipped for non-evaluator workflows: the references row is always pinned
+    // to `application` there, so flipping the label on trace_type changes
+    // would be misleading.
+    const workflowKind = useAtomValue(currentWorkflowContextAtom).workflowKind
+    const filterFieldMap = useMemo(() => fieldConfigByOptionKey(filterColumns), [filterColumns])
+    const reconcileFilterRows = useCallback(
+        (rows: FilterItem[]): FilterItem[] => {
+            if (workflowKind !== "evaluator") return rows
+
+            const tt = rows.find(
+                (r) => r.selectedField === "trace_type" || r.field === "trace_type",
+            )
+            // Mirror the atom's trace_type intent resolution (controls.ts):
+            // honour `is_not`/`not_in` against the 2-value enum by flipping.
+            const op = tt?.operator
+            const rawValue = Array.isArray(tt?.value) ? tt?.value[0] : tt?.value
+            const isAffirm = op === "is" || op === "in"
+            const isNeg = op === "is_not" || op === "not_in"
+            const normalize = (x: unknown): "annotation" | "invocation" | null =>
+                x === "annotation" ? "annotation" : x === "invocation" ? "invocation" : null
+            const flip = (x: unknown): "annotation" | "invocation" | null =>
+                x === "annotation" ? "invocation" : x === "invocation" ? "annotation" : null
+            let effective: "annotation" | "invocation" | null = null
+            if (tt && isAffirm) effective = normalize(rawValue)
+            else if (tt && isNeg) effective = flip(rawValue)
+
+            // When trace_type is absent (or its operator/value is one we
+            // can't resolve), fall through to "no opinion" — keep whatever the
+            // row currently shows (which came from the atom's default for this kind).
+            if (!effective) return rows
+
+            const targetCategory = effective === "invocation" ? "application" : "evaluator"
+
+            return rows.map((row) => {
+                if (!row.isPermanent) return row
+                const optionKey = row.selectedField || row.field
+                if (!optionKey) return row
+                const fc = filterFieldMap.get(optionKey)
+                if (!fc?.referenceCategory) return row
+                if (
+                    fc.referenceCategory !== "application" &&
+                    fc.referenceCategory !== "evaluator"
+                ) {
+                    return row
+                }
+                if (fc.referenceCategory === targetCategory) return row
+                // Find the corresponding FieldConfig for the target category
+                // with the same referenceProperty (id / slug).
+                let target: typeof fc | undefined
+                for (const candidate of filterFieldMap.values()) {
+                    if (candidate.referenceCategory !== targetCategory) continue
+                    if (candidate.referenceProperty !== fc.referenceProperty) continue
+                    target = candidate
+                    break
+                }
+                if (!target) return row
+                return {
+                    ...row,
+                    field: target.optionKey,
+                    selectedField: target.optionKey,
+                    selectedLabel: target.label,
+                    baseField: target.baseField,
+                }
+            })
+        },
+        [workflowKind, filterFieldMap],
+    )
     const selectedTraceIds = useMemo(
         () =>
             Array.from(
@@ -579,6 +668,7 @@ const ObservabilityHeader = ({
                             columns={filterColumns}
                             onApplyFilter={onApplyFilter}
                             onClearFilter={onClearFilter}
+                            reconcileFilterRows={reconcileFilterRows}
                         />
 
                         <Sort onSortApply={onSortApply} defaultSortValue="24 hours" />
diff --git a/web/oss/src/state/newObservability/atoms/controls.ts b/web/oss/src/state/newObservability/atoms/controls.ts
index 095f27444b..dfa51564c0 100644
--- a/web/oss/src/state/newObservability/atoms/controls.ts
+++ b/web/oss/src/state/newObservability/atoms/controls.ts
@@ -1,6 +1,7 @@
 // Query control atoms for the observability module
 import type {Key} from "react"
 
+import {defaultTraceTypeForWorkflow} from "@agenta/entities/workflow"
 import dayjs from "dayjs"
 import {atom} from "jotai"
 import {atomFamily, atomWithStorage} from "jotai/utils"
@@ -9,6 +10,7 @@ import type {SortResult} from "@/oss/components/Filters/Sort"
 import type {TestsetTraceData} from "@/oss/components/SharedDrawers/AddToTestsetDrawer/assets/types"
 import {onboardingStorageUserIdAtom} from "@/oss/lib/onboarding/atoms"
 import type {Filter} from "@/oss/lib/Types"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {routerAppIdAtom} from "../../app"
 import {SESSIONS_PAGE_SIZE, TRACES_PAGE_SIZE} from "../constants"
@@ -76,12 +78,135 @@ export const limitAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
 export const sortAtomFamily = atomFamily((_tab: ObservabilityTabInfo) =>
     atom<SortResult>(DEFAULT_SORT as SortResult),
 )
-export const traceTypeDefaultEnabledAtomFamily = atomFamily((_tab: ObservabilityTabInfo) =>
-    atom<boolean>(true),
+/**
+ * User's intent for the `trace_type` filter. Tagged union — explicit
+ * semantics instead of the dual-atom (default-enabled + filters-array) dance
+ * that preceded it, where state could revert silently on re-derivations.
+ *
+ *   - `"default"`  — user has never touched trace_type → fall back to
+ *                    `defaultTraceTypeForWorkflow(workflowKind, tab)`.
+ *   - `"value"`    — user picked a specific value (annotation or invocation).
+ *   - `"cleared"`  — user explicitly removed the trace_type filter.
+ *
+ * The effective trace_type is derived in `effectiveTraceTypeAtomFamily`;
+ * downstream atoms (scope filter, query body) read that derived value.
+ */
+export type TraceTypeChoice =
+    | {kind: "default"}
+    | {kind: "value"; value: "annotation" | "invocation"}
+    | {kind: "cleared"}
+
+// --- Persisted filter state (per app, per tab) -------------------------------
+//
+// Filter selections are persisted across reloads so users don't have to
+// re-apply the same filter every time they open a page. State is scoped by
+// `app_id` so two apps can carry different filter setups, and by tab
+// (`traces` vs `sessions`) because those have independent UIs.
+//
+// Storage shape:
+//   {
+//     "<appId>": {
+//       "traces":   { userFilters: Filter[], traceTypeChoice: TraceTypeChoice },
+//       "sessions": { userFilters: Filter[], traceTypeChoice: TraceTypeChoice },
+//     },
+//     "__global__": { ... }  // when there's no router app_id (project scope)
+//   }
+//
+// We pack both pieces into one storage atom (instead of two parallel ones)
+// so a single write doesn't race the other against localStorage, and so the
+// scoped record can be cleaned up atomically per app if we ever need it.
+
+interface PersistedFilterTabState {
+    userFilters: Filter[]
+    traceTypeChoice: TraceTypeChoice
+}
+
+type PersistedFilterAppState = Partial<Record<ObservabilityTabInfo, PersistedFilterTabState>>
+
+const FILTERS_STORAGE_KEY = "agenta:observability:filters"
+const GLOBAL_SCOPE_KEY = "__global__"
+
+const filtersByAppAtom = atomWithStorage<Record<string, PersistedFilterAppState>>(
+    FILTERS_STORAGE_KEY,
+    {},
+)
+
+const emptyTabState: PersistedFilterTabState = {
+    userFilters: [],
+    traceTypeChoice: {kind: "default"},
+}
+
+const readTabState = (
+    all: Record<string, PersistedFilterAppState>,
+    appKey: string,
+    tab: ObservabilityTabInfo,
+): PersistedFilterTabState => all[appKey]?.[tab] ?? emptyTabState
+
+const writeTabState = (
+    all: Record<string, PersistedFilterAppState>,
+    appKey: string,
+    tab: ObservabilityTabInfo,
+    next: PersistedFilterTabState,
+): Record<string, PersistedFilterAppState> => ({
+    ...all,
+    [appKey]: {
+        ...(all[appKey] ?? {}),
+        [tab]: next,
+    },
+})
+
+export const traceTypeChoiceAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
+    atom(
+        (get): TraceTypeChoice => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            return readTabState(get(filtersByAppAtom), appKey, tab).traceTypeChoice
+        },
+        (get, set, next: TraceTypeChoice) => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            const all = get(filtersByAppAtom)
+            const current = readTabState(all, appKey, tab)
+            set(
+                filtersByAppAtom,
+                writeTabState(all, appKey, tab, {...current, traceTypeChoice: next}),
+            )
+        },
+    ),
 )
 
-// User-defined filters family
-export const userFiltersAtomFamily = atomFamily((_tab: ObservabilityTabInfo) => atom<Filter[]>([]))
+/**
+ * Effective trace_type — read this anywhere downstream that needs to know
+ * "what trace_type filter is currently in effect". `null` means no
+ * trace_type filter (user cleared, or no default applies for this tab).
+ */
+export const effectiveTraceTypeAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
+    atom<"annotation" | "invocation" | null>((get) => {
+        const choice = get(traceTypeChoiceAtomFamily(tab))
+        if (choice.kind === "cleared") return null
+        if (choice.kind === "value") return choice.value
+        // default — look up the per-workflow-kind default
+        const workflowCtx = get(currentWorkflowContextAtom)
+        const def = defaultTraceTypeForWorkflow(workflowCtx.workflowKind, tab)
+        if (def === "annotation" || def === "invocation") return def
+        return null
+    }),
+)
+
+// User-defined filters (excluding `trace_type`, which has its own atom).
+// Persisted per-app (see `filtersByAppAtom` above).
+export const userFiltersAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
+    atom(
+        (get): Filter[] => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            return readTabState(get(filtersByAppAtom), appKey, tab).userFilters
+        },
+        (get, set, next: Filter[]) => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            const all = get(filtersByAppAtom)
+            const current = readTabState(all, appKey, tab)
+            set(filtersByAppAtom, writeTabState(all, appKey, tab, {...current, userFilters: next}))
+        },
+    ),
+)
 
 const isTraceType = (f: Filter) => (f.key ?? f.field) === "trace_type"
 
@@ -106,58 +231,87 @@ export const sortAtom = atom(
     (get, set, value: SortResult) => set(sortAtomFamily(get(observabilityTabAtom)), value),
 )
 
-// Computed Filters logic (centralized but applied per tab)
+/**
+ * Combined filter view — what consumers (query layer, dialog) see.
+ *
+ * Composed from three pieces, in order:
+ *
+ *   1. **Scope filter** (`isPermanent: true`) — pins traces to the current
+ *      entity. Shape depends on workflow kind and the effective trace_type:
+ *
+ *      - App workflows always pin to `references.application.id = <appId>`.
+ *      - Evaluator workflows route to different reference slots because the
+ *        two relevant trace shapes write the evaluator's id into different
+ *        slots:
+ *          * Annotation traces (real evaluation runs scoring an app) put the
+ *            evaluator id in `references.evaluator.id`.
+ *          * Invocation traces (evaluator run standalone as an app) put it
+ *            in `references.application.id`, same as a normal app trace.
+ *        With trace_type known, we target the matching slot; with no
+ *        trace_type, we OR-match both slots.
+ *
+ *   2. **trace_type filter** — derived from `effectiveTraceTypeAtomFamily`.
+ *      Renders as a regular filter row in the dialog so the user can change
+ *      or remove it. The atom is the single source of truth — there's no
+ *      separate "is the default still active?" toggle. User edits flow back
+ *      through the setter into `traceTypeChoiceAtomFamily`.
+ *
+ *   3. **Other user filters** — everything else the user has added via the
+ *      filter dialog (search, span_type, has_annotation, …). Stored verbatim
+ *      in `userFiltersAtomFamily`.
+ *
+ * The setter receives the merged array (from the dialog's Apply) and splits
+ * it back: trace_type → `traceTypeChoiceAtomFamily`, other → `userFilters`.
+ * The scope filter is always re-derived; the dialog can't write to it.
+ */
 export const filtersAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
     atom(
         (get) => {
             const appId = get(routerAppIdAtom)
             const userFilters = get(userFiltersAtomFamily(tab))
-            const defaultEnabled = get(traceTypeDefaultEnabledAtomFamily(tab))
-
-            // Only apply soft default for traces, maybe? or both?
-            // "Trace filter should apply on session tab filter" - keeping logic consistent for now
-            // But if we want different defaults per tab, we can branch here.
-            // For now, assuming similar behavior is desired but independent state.
-
-            const hasUserTraceType = userFilters.some(isTraceType)
-
-            // The soft default for the trace_type filter is always
-            // `"invocation"`. Earlier we flipped to `"annotation"` when the
-            // current workflow context was an evaluator, because standalone
-            // evaluator runs at the time only emitted annotation traces.
-            // That's no longer true — standalone evaluator runs in the
-            // playground now emit invocation traces with `references.
-            // application` set (see `runnableSetup.ts`, evaluator branch),
-            // so the app-scoped `/apps/{evaluatorId}/observability` page
-            // should show those by default rather than the more rare
-            // annotation flow. Users who want annotations can still pick
-            // the filter manually.
-            const softDefaults: Filter[] = []
-            if (defaultEnabled && !hasUserTraceType && tab === "traces") {
-                softDefaults.push({
-                    field: "trace_type",
-                    operator: "is",
-                    value: "invocation",
-                })
-            }
+            const workflowCtx = get(currentWorkflowContextAtom)
+            const effectiveTraceType = get(effectiveTraceTypeAtomFamily(tab))
 
-            const appScope: Filter[] = appId
-                ? [
-                      {
-                          field: "references",
-                          operator: "in",
-                          value: [
-                              {
-                                  id: String(appId),
-                                  "attributes.key": "application",
-                              },
-                          ],
-                          isPermanent: true,
-                      },
-                  ]
+            // Build the trace_type filter row (if any)
+            const traceTypeFilters: Filter[] = effectiveTraceType
+                ? [{field: "trace_type", operator: "is", value: effectiveTraceType}]
                 : []
 
-            return [...appScope, ...softDefaults, ...userFilters]
+            // Build the scope filter row
+            const isEvaluatorWorkflow = workflowCtx.workflowKind === "evaluator"
+            const buildEvalScopeValue = () => {
+                const id = String(appId)
+                if (effectiveTraceType === "annotation") {
+                    return [{id, "attributes.key": "evaluator"}]
+                }
+                if (effectiveTraceType === "invocation") {
+                    return [{id, "attributes.key": "application"}]
+                }
+                // No trace_type filter — OR both ref slots so every trace
+                // mentioning this evaluator in either slot shows.
+                return [
+                    {id, "attributes.key": "evaluator"},
+                    {id, "attributes.key": "application"},
+                ]
+            }
+            const appScopeValue = appId
+                ? isEvaluatorWorkflow
+                    ? buildEvalScopeValue()
+                    : [{id: String(appId), "attributes.key": "application"}]
+                : []
+            const appScope: Filter[] =
+                appScopeValue.length > 0
+                    ? [
+                          {
+                              field: "references",
+                              operator: "in",
+                              value: appScopeValue,
+                              isPermanent: true,
+                          },
+                      ]
+                    : []
+
+            return [...appScope, ...traceTypeFilters, ...userFilters]
         },
         (get, set, update: Filter[] | ((prev: Filter[]) => Filter[])) => {
             const currentCombined = get(filtersAtomFamily(tab))
@@ -165,21 +319,52 @@ export const filtersAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
                 typeof update === "function" ? (update as any)(currentCombined) : update
             const normalizedNext = nextCombined || []
 
-            // Persist only non-permanent filters
-            const nextUser = normalizedNext.filter((f: Filter) => !(f as any).isPermanent)
-            set(userFiltersAtomFamily(tab), nextUser)
+            // Strip the permanent scope filter — it's regenerated, not stored.
+            const nextNonPermanent = normalizedNext.filter((f: Filter) => !(f as any).isPermanent)
 
-            // If only permanent filters remain (or none at all), keep the soft default disabled
-            if (!normalizedNext.some((f: Filter) => !(f as any).isPermanent)) {
-                set(traceTypeDefaultEnabledAtomFamily(tab), false)
-                return
-            }
+            // Split the incoming non-permanent filters: trace_type → choice
+            // atom, everything else → userFilters atom.
+            const nextTraceType = nextNonPermanent.find(isTraceType)
+            const nextOthers = nextNonPermanent.filter((f: Filter) => !isTraceType(f))
+
+            set(userFiltersAtomFamily(tab), nextOthers)
 
-            // If trace_type was present and now is not, the user explicitly cleared it.
-            const hadTraceType = currentCombined.some(isTraceType)
-            const hasTraceTypeNext = normalizedNext.some(isTraceType)
-            if (hadTraceType && !hasTraceTypeNext) {
-                set(traceTypeDefaultEnabledAtomFamily(tab), false)
+            // Trace-type intent routing:
+            //   - User has trace_type in the incoming array → store as
+            //     {kind: "value", value: …}, or "cleared" if it can't be resolved.
+            //   - User HAD trace_type before, doesn't now → they cleared it
+            //     → store as {kind: "cleared"}.
+            //   - Neither: don't touch (e.g., updating only `search` shouldn't
+            //     overwrite the trace_type intent).
+            if (nextTraceType) {
+                // `in`/`not_in` rows may carry an array — resolve on its first entry,
+                // then normalize against the two-value enum to one affirmative value.
+                const raw = nextTraceType.value
+                const v = Array.isArray(raw) ? raw[0] : raw
+                const op = nextTraceType.operator
+                const isAffirm = op === "is" || op === "in"
+                const isNeg = op === "is_not" || op === "not_in"
+                const flip = (x: unknown): "annotation" | "invocation" | null =>
+                    x === "annotation" ? "invocation" : x === "invocation" ? "annotation" : null
+                let resolved: "annotation" | "invocation" | null = null
+                if (isAffirm) {
+                    resolved =
+                        v === "annotation" ? "annotation" : v === "invocation" ? "invocation" : null
+                } else if (isNeg) {
+                    resolved = flip(v)
+                }
+                if (resolved) {
+                    set(traceTypeChoiceAtomFamily(tab), {kind: "value", value: resolved})
+                } else {
+                    // Unknown shape (e.g., a future trace_type value we don't
+                    // map). Treat as "cleared" rather than fabricating a value.
+                    set(traceTypeChoiceAtomFamily(tab), {kind: "cleared"})
+                }
+            } else {
+                if (currentCombined.some(isTraceType)) {
+                    set(traceTypeChoiceAtomFamily(tab), {kind: "cleared"})
+                }
+                // else: don't touch — caller didn't intend to change trace_type
             }
         },
     ),
diff --git a/web/oss/src/state/newObservability/atoms/queries.ts b/web/oss/src/state/newObservability/atoms/queries.ts
index 26bf90a760..c01a7419df 100644
--- a/web/oss/src/state/newObservability/atoms/queries.ts
+++ b/web/oss/src/state/newObservability/atoms/queries.ts
@@ -18,6 +18,7 @@ import {TraceSpanNode} from "@/oss/services/tracing/types"
 import {selectedAppIdAtom} from "@/oss/state/app/selectors/app"
 import {getOrgValues} from "@/oss/state/org"
 import {projectIdAtom} from "@/oss/state/project"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {sessionExistsAtom} from "../../session"
 
@@ -37,6 +38,15 @@ import {buildTraceQueryParams, executeTraceQuery, mergeConditions} from "./query
 // Traces query ----------------------------------------------------------------
 export const tracesQueryAtom = atomWithInfiniteQuery((get) => {
     const appId = get(selectedAppIdAtom)
+    const workflowCtx = get(currentWorkflowContextAtom)
+    // `fetchAllPreviewTraces` writes the legacy `?application_id=` URL param
+    // off this value. For app workflows that's correct (and redundant with the
+    // body filter that also pins `references.application.id`). For evaluator
+    // workflows it would AND with the body's `references.evaluator.id` filter
+    // and return zero traces — `application.id` is a different reference slot
+    // than `evaluator.id`. Drop the URL param for evaluators; the body filter
+    // (from `filtersAtomFamily`'s appScope branch) already pins the scope.
+    const effectiveAppId = workflowCtx.workflowKind === "evaluator" ? "" : appId
     const sort = get(sortAtomFamily("traces"))
     const filters = get(filtersAtomFamily("traces"))
     const traceTabs = get(traceTabsAtomFamily("traces"))
@@ -48,6 +58,15 @@ export const tracesQueryAtom = atomWithInfiniteQuery((get) => {
 
     const sessionExists = get(sessionExistsAtom)
 
+    // Wait for workflow context to settle before firing the query. While
+    // `workflowCtx.isResolving` is true, `effectiveAppId` falls through to
+    // the app branch with the raw `appId` (which is the evaluator's id when
+    // we're on `/apps/<evalId>/traces`), causing a wrong `application_id`
+    // URL param to be sent. Gating on `!isResolving` skips that wasted
+    // request — once ctx settles, the atom re-evaluates with the correct
+    // `effectiveAppId` and queryFn fires.
+    const enabledFlag = sessionExists && Boolean(appId || projectId) && !workflowCtx.isResolving
+
     return {
         queryKey: ["traces", projectId, appId, params],
         initialPageParam: {
@@ -58,12 +77,12 @@ export const tracesQueryAtom = atomWithInfiniteQuery((get) => {
             executeTraceQuery({
                 params,
                 pageParam: pageParam as {newest?: string} | undefined,
-                appId: appId as string,
+                appId: effectiveAppId as string,
                 isHasAnnotationSelected,
                 hasAnnotationConditions,
                 hasAnnotationOperator,
             }),
-        enabled: sessionExists && Boolean(appId || projectId),
+        enabled: enabledFlag,
 
         getNextPageParam: (lastPage, _pages) => {
             const page = lastPage as any
diff --git a/web/oss/src/state/workflow/flags.ts b/web/oss/src/state/workflow/flags.ts
index b88f0dd610..e207c04f53 100644
--- a/web/oss/src/state/workflow/flags.ts
+++ b/web/oss/src/state/workflow/flags.ts
@@ -1,21 +1,29 @@
 /**
  * Feature flags for the workflow / evaluator full-page UX (PR #4288).
  *
- * The "Phase 5" change routed evaluator table row clicks (and post-create
+ * The "Phase 5" change routes evaluator table row clicks (and post-create
  * navigation) to a full-page playground at `/apps/<evaluatorId>/playground`,
- * with the drawer reduced to a quick-edit affordance. We're temporarily
- * disabling that routing while follow-up fixes land — when the flag flips to
- * `true`, the new flow takes over again with no other code changes required.
+ * with the drawer reduced to a quick-edit affordance.
  *
- * Call sites gated by this flag:
+ * History:
+ *   - #4288 (2026-05-14): shipped the full-page nav.
+ *   - #4384 (2026-05-20): disabled via this flag after two blockers surfaced:
+ *     (1) the full-page surface had no upstream-app picker (lost on the
+ *     generic `PlaygroundHeader`), and (2) the default `trace_type` filter
+ *     on `/apps/<evalId>/traces` reverted to `"invocation"`, leaving
+ *     evaluator users on an empty page.
+ *   - Both fixed: `PlaygroundRouter` now swaps to `ConfigureEvaluatorPage`
+ *     for evaluators (carries the app picker via `EvaluatorPlaygroundHeader`),
+ *     and `defaultTraceTypeForWorkflow` re-instates the annotation default.
+ *
+ * Call sites gated by this flag (no longer dark — flag is `true`):
  *   1. `components/Evaluators/index.tsx` — row-click navigation.
  *   2. `components/WorkflowRevisionDrawerWrapper/index.tsx` — post-create
  *      navigation after evaluator commit.
  *   3. `components/PlaygroundRouter/index.tsx` — guard that allows full-page
- *      UX evaluators to stay on `/playground`. With the flag off, all
- *      evaluator playground URLs redirect back to `/evaluators` so direct
- *      URL visits also fall back to the drawer flow.
+ *      UX evaluators to stay on `/playground` instead of bouncing to
+ *      `/evaluators` + drawer.
  *   4. `components/Sidebar/components/WorkflowEntityCard.tsx` — sidebar
  *      switcher that lists full-page-eligible evaluators.
  */
-export const EVALUATOR_FULL_PAGE_NAV_ENABLED = false
+export const EVALUATOR_FULL_PAGE_NAV_ENABLED = true
diff --git a/web/oss/tests/playwright/acceptance/evaluators/index.ts b/web/oss/tests/playwright/acceptance/evaluators/index.ts
index 5ea00839f7..df0e62bbec 100644
--- a/web/oss/tests/playwright/acceptance/evaluators/index.ts
+++ b/web/oss/tests/playwright/acceptance/evaluators/index.ts
@@ -5,9 +5,6 @@ import {
     selectEvaluatorTemplate,
     getEvaluatorCommitModal,
     waitForWorkflowCreation,
-    openEvaluatorViewDrawer,
-    expandEvaluatorToPlayground,
-    selectCompletionAppFromDrawer,
     fillTestcaseField,
     createHumanEvaluatorFromDrawer,
     editEvaluatorAndSaveNewVersion,
@@ -19,6 +16,13 @@ import {
     EVALUATOR_TAB_PARAM_HUMAN,
     EVALUATOR_CREATE_BUTTON_LABEL,
     EVALUATOR_EXACT_MATCH_TEMPLATE_NAME,
+    EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME,
+    EVALUATOR_SELECT_APP_PLACEHOLDER,
+    EVALUATOR_NO_APPS_TEXT,
+    EVALUATOR_NON_COMPLETION_TYPE_LABELS,
+    EVALUATOR_POPOVER_TEST_ID,
+    EVALUATOR_POPOVER_ROOT_PANEL_TEST_ID,
+    EVALUATOR_POPOVER_CHILD_PANEL_TEST_ID,
     EVALUATOR_DRAWER_CREATE_TITLE,
     EVALUATOR_DRAWER_CREATE_BUTTON_LABEL,
     EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER,
@@ -253,18 +257,26 @@ const testEvaluators = () => {
                 page.locator("[data-row-key]").filter({hasText: evaluatorName}).first(),
             ).toBeVisible({timeout: 5000})
 
-            // Step 2: Open the evaluator view drawer by clicking the row
-            const viewDrawer = await openEvaluatorViewDrawer(page, evaluatorName)
+            // Step 2: Post-commit navigates to /apps/<id>/playground — the full-page
+            // surface, not the drawer — now that EVALUATOR_FULL_PAGE_NAV_ENABLED is true.
+            // This test previously used the drawer flow; it now drives the page surface.
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            const surface = page.locator("body")
 
-            // Step 3: Expand the drawer into playground mode
-            await expandEvaluatorToPlayground(viewDrawer)
+            // Step 3: The evaluator-flavored page has a "Select app" picker in the header
+            const selectAppButton = page
+                .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                .first()
+            await expect(selectAppButton).toBeVisible({timeout: 15000})
 
-            // Step 4: Select a completion-type app
-            // Note 1: Skip if no apps are available in this environment
-            // Note 2: Skip if no completion-type app is available
-            const appSelectionResult = await selectCompletionAppFromDrawer(page, viewDrawer)
+            // Step 4: Open the picker and select a completion-type app.
+            // Skip gracefully if no apps or no completion app exist in this environment.
+            await selectAppButton.click()
+            const popover = page.getByTestId(EVALUATOR_POPOVER_TEST_ID).last()
+            await expect(popover).toBeVisible({timeout: 5000})
 
-            if (appSelectionResult === "no_apps") {
+            const noItemsText = popover.getByText(EVALUATOR_NO_APPS_TEXT)
+            if (await noItemsText.isVisible().catch(() => false)) {
                 test.skip(
                     true,
                     "No apps available in this environment to test the evaluator playground",
@@ -272,24 +284,46 @@ const testEvaluators = () => {
                 return
             }
 
-            if (appSelectionResult === "no_completion") {
+            const appItems = popover
+                .getByTestId(EVALUATOR_POPOVER_ROOT_PANEL_TEST_ID)
+                .locator('[role="option"]')
+            await expect(appItems.first()).toBeVisible({timeout: 10000})
+
+            // Pick the first non-Chat / non-Custom app — completion-type.
+            const allItems = await appItems.all()
+            let completionItem = null
+            for (const item of allItems) {
+                const itemText = await item.textContent()
+                const isNonCompletion = EVALUATOR_NON_COMPLETION_TYPE_LABELS.some((label) =>
+                    itemText?.includes(label),
+                )
+                if (!isNonCompletion) {
+                    completionItem = item
+                    break
+                }
+            }
+            if (!completionItem) {
                 test.skip(
                     true,
                     "No completion-type app available — evaluator requires a completion app",
                 )
                 return
             }
+            await completionItem.click()
 
-            // Step 5: Verify the selected app is a completion type by waiting for "Testcase Data".
-            // Chat and Custom apps render a different playground UI without this section.
-            // If it doesn't appear the type-detection heuristic picked a non-completion app —
-            // skip gracefully instead of failing.
+            // Wait for and pick the first revision in the right-side panel.
+            const revisionPanel = popover.getByTestId(EVALUATOR_POPOVER_CHILD_PANEL_TEST_ID)
+            await expect(revisionPanel).toBeVisible({timeout: 5000})
+            const revisionItems = revisionPanel.locator('[role="option"]')
+            await expect(revisionItems.first()).toBeVisible({timeout: 5000})
+            await revisionItems.first().click()
+
+            // Step 5: Verify completion-app UI (Testcase Data section) appears.
             const isCompletionApp = await page
                 .getByText("Testcase Data")
                 .first()
                 .isVisible({timeout: 10000})
                 .catch(() => false)
-
             if (!isCompletionApp) {
                 test.skip(
                     true,
@@ -298,24 +332,20 @@ const testEvaluators = () => {
                 return
             }
 
-            // Step 6: Fill in the testcase fields
-            // The testcase rows appear inside the expanded drawer's playground area.
-            // We fill in well-known fields if present; the exact schema depends on
-            // the connected app. For the standard "country capitals" completion app,
-            // "country" is the app input and "correct_answer" is the evaluator ground truth.
-            await fillTestcaseField(page, viewDrawer, "country", "Germany")
-            await fillTestcaseField(page, viewDrawer, "correct_answer", "Berlin")
-
-            // Step 7: Click the Run button
-            const runButton = viewDrawer
-                .getByRole("button", {name: EVALUATOR_RUN_BUTTON_LABEL})
-                .first()
+            // Step 6: Fill testcase fields. For the standard country-capitals completion
+            // app, "country" is the app input and "correct_answer" is the evaluator
+            // ground truth.
+            await fillTestcaseField(page, surface, "country", "Germany")
+            await fillTestcaseField(page, surface, "correct_answer", "Berlin")
+
+            // Step 7: Click Run
+            const runButton = page.getByRole("button", {name: EVALUATOR_RUN_BUTTON_LABEL}).first()
             await expect(runButton).toBeVisible({timeout: 10000})
             await expect(runButton).toBeEnabled()
             await runButton.click()
 
-            // Step 8: Verify the evaluation ran — the evaluator result card should appear
-            await expect(viewDrawer.locator(EVALUATOR_RESULT_CARD_SELECTOR).first()).toBeVisible({
+            // Step 8: Verify the evaluator result card appears
+            await expect(page.locator(EVALUATOR_RESULT_CARD_SELECTOR).first()).toBeVisible({
                 timeout: 30000,
             })
         },
@@ -480,6 +510,370 @@ const testEvaluators = () => {
             await deleteEvaluator(page, evaluatorName)
         },
     )
+
+    // ────────────────────────────────────────────────────────────────────────
+    // Full-page evaluator playground (PR #4288 / re-enable after #4384)
+    //
+    // Every non-archived automatic evaluator opens in the full-page surface
+    // at `/apps/<evalId>/playground` (powered by `ConfigureEvaluatorPage`)
+    // on row click + post-create + direct URL visit, regardless of template
+    // type. Earlier the gate restricted this to LLM/code evaluators only and
+    // declarative classifiers fell back to the drawer — that meant several
+    // evaluator types had no UI path into the per-evaluator pages (variants,
+    // traces, sidebar). The gate is gone now; the drawer remains available
+    // as a quick-edit affordance via the row context menu's Configure
+    // action.
+    // ────────────────────────────────────────────────────────────────────────
+
+    test(
+        "should navigate to the full-page playground for a declarative classifier (Exact Match) on post-create",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            const evaluatorName = `e2e-exact-match-fullpage-${Date.now()}`
+
+            await navigateToEvaluators()
+
+            // Create a fresh Exact Match evaluator
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+
+            const creationPromise = waitForWorkflowCreation(page)
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Post-create lands on the full-page playground (all evaluator
+            // kinds, not just LLM/code).
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+        },
+    )
+
+    test(
+        "should navigate to the full-page playground when clicking an LLM-as-a-judge row",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.SLOW,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            const evaluatorName = `e2e-llm-judge-row-${Date.now()}`
+
+            await navigateToEvaluators()
+
+            // Create an LLM-as-a-judge evaluator (flags.is_llm — full-page eligible)
+            const drawer = await selectEvaluatorTemplate(
+                page,
+                EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME,
+            )
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+
+            const creationPromise = waitForWorkflowCreation(page)
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Post-create navigation lands directly on the full-page playground (the
+            // evaluator-create branch in `components/WorkflowRevisionDrawerWrapper/index.tsx`).
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+
+            // The full-page evaluator surface renders ConfigureEvaluatorPage's
+            // header, whose marker is the upstream-app picker. The missing
+            // picker is one of the blockers that led #4384 to disable this
+            // flow: when the swap is wrong, the user lands on the generic
+            // <Playground /> with no way to pick the app the evaluator scores.
+            const selectAppButton = page
+                .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                .first()
+            await expect(selectAppButton).toBeVisible({timeout: 15000})
+
+            // Navigate back to /evaluators and click the row — same destination
+            // (validates the registry's row-click handler, not just post-create).
+            await navigateToEvaluators()
+            const searchInput = page.locator('input[placeholder="Search"]').first()
+            if (await searchInput.isVisible().catch(() => false)) {
+                await searchInput.fill(evaluatorName)
+            }
+            await expect
+                .poll(
+                    async () =>
+                        page.locator("[data-row-key]").filter({hasText: evaluatorName}).count(),
+                    {timeout: 15000},
+                )
+                .toBeGreaterThan(0)
+            const row = page.locator("[data-row-key]").filter({hasText: evaluatorName}).first()
+            await row.click()
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            await expect(
+                page
+                    .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                    .first(),
+            ).toBeVisible({timeout: 15000})
+        },
+    )
+
+    test(
+        "should navigate to the full-page playground when clicking a declarative classifier row (Exact Match)",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            // Verifies T17 (gate removal): declarative classifiers — not just
+            // LLM/code evaluators — open the full-page playground on row click.
+            const evaluatorName = `e2e-exact-match-rowclick-${Date.now()}`
+
+            await navigateToEvaluators()
+
+            // Create Exact Match
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+
+            const creationPromise = waitForWorkflowCreation(page)
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Navigate back to the registry, then click the row.
+            await navigateToEvaluators()
+            const searchInput = page.locator('input[placeholder="Search"]').first()
+            if (await searchInput.isVisible().catch(() => false)) {
+                await searchInput.fill(evaluatorName)
+            }
+            await expect
+                .poll(
+                    async () =>
+                        page.locator("[data-row-key]").filter({hasText: evaluatorName}).count(),
+                    {timeout: 15000},
+                )
+                .toBeGreaterThan(0)
+            const row = page.locator("[data-row-key]").filter({hasText: evaluatorName}).first()
+            await row.click()
+
+            // Row click navigates to the full-page playground — same surface as
+            // LLM/code evaluators (Phase 6 unification, gate removed in T17).
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            await expect(
+                page
+                    .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                    .first(),
+            ).toBeVisible({timeout: 15000})
+        },
+    )
+
+    test(
+        "should render the full-page playground on direct URL visit to /apps/<evalId>/playground",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            // Verifies T17: direct URL visits to a declarative classifier's
+            // /apps/<evalId>/playground page render the evaluator-flavored
+            // surface — no bounce to /evaluators (the bounce was the behavior
+            // pre-T17 via the now-removed useEvaluatorPlaygroundGuard).
+            const evaluatorName = `e2e-exact-match-direct-${Date.now()}`
+
+            await navigateToEvaluators()
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+            const creationPromise = waitForWorkflowCreation(page)
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Capture the post-create URL — it's the playground URL we want to
+            // re-visit directly. (Post-create navigation already lands here.)
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            const playgroundUrl = page.url()
+
+            // Navigate away, then revisit the URL directly. If the guard were
+            // still in place, this would bounce to /evaluators?revisionId=...
+            await navigateToEvaluators()
+            await expect(page).toHaveURL(/\/evaluators(\?|$)/, {timeout: 5000})
+
+            await page.goto(playgroundUrl)
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            await expect(
+                page
+                    .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                    .first(),
+            ).toBeVisible({timeout: 15000})
+        },
+    )
+
+    test(
+        "should list declarative classifiers in the sidebar switcher (not just LLM/code evaluators)",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            // Verifies T17: the sidebar workflow switcher lists ALL evaluator
+            // kinds, not just full-page-eligible (LLM/code) ones. Pre-T17 the
+            // dropdown used `fullPagePlaygroundEvaluatorsAtom` which filtered
+            // declarative classifiers out — leaving them unreachable via UI
+            // navigation from anywhere except the /evaluators table.
+            const evaluatorName = `e2e-exact-match-sidebar-${Date.now()}`
+
+            await navigateToEvaluators()
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+            const creationPromise = waitForWorkflowCreation(page)
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Post-create lands on the full-page playground; the
+            // WorkflowEntityCard switcher appears in the sidebar from there.
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+
+            // Click the switcher's "Switch workflow" button. The aria-label is
+            // only set on the expanded-sidebar variant in WorkflowEntityCard.tsx
+            // (the collapsed-sidebar trigger uses just the icon button) — this
+            // test therefore assumes the sidebar is expanded, which is the
+            // default state. If a test environment ever defaults to collapsed,
+            // this finder would need to also match the icon-only button.
+            const switchButton = page.getByRole("button", {name: "Switch workflow"}).first()
+            await expect(switchButton).toBeVisible({timeout: 15000})
+            await switchButton.click()
+
+            // The dropdown opens via AntD's Dropdown. The just-created
+            // declarative classifier should be in the list — pre-T17 it
+            // wouldn't be (the dropdown filtered to LLM/code-only evaluators).
+            await expect(
+                page.getByRole("menuitem").filter({hasText: evaluatorName}).first(),
+            ).toBeVisible({timeout: 10000})
+        },
+    )
 }
 
 export default testEvaluators
diff --git a/web/oss/tests/playwright/acceptance/evaluators/tests.ts b/web/oss/tests/playwright/acceptance/evaluators/tests.ts
index d04319b417..39adaa3e2d 100644
--- a/web/oss/tests/playwright/acceptance/evaluators/tests.ts
+++ b/web/oss/tests/playwright/acceptance/evaluators/tests.ts
@@ -17,6 +17,10 @@ const EVALUATOR_TAB_PARAM_HUMAN = "human"
 // Template dropdown
 const EVALUATOR_TEMPLATE_DROPDOWN_TITLE = "Select evaluator type"
 const EVALUATOR_EXACT_MATCH_TEMPLATE_NAME = "Exact Match"
+// Backend template key `auto_ai_critique`; display name lives in
+// api/oss/src/resources/evaluators/evaluators.py. LLM-as-a-judge is the
+// canonical "full-page playground" evaluator (has flags.is_llm).
+const EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME = "LLM-as-a-judge"
 
 // Drawer (create)
 const EVALUATOR_DRAWER_CREATE_TITLE = "New Evaluator"
@@ -563,6 +567,7 @@ export {
     EVALUATOR_SEARCH_PLACEHOLDER,
     EVALUATOR_TEMPLATE_DROPDOWN_TITLE,
     EVALUATOR_EXACT_MATCH_TEMPLATE_NAME,
+    EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME,
     EVALUATOR_DRAWER_CREATE_TITLE,
     EVALUATOR_CORRECT_ANSWER_PROP,
     EVALUATOR_DRAWER_CREATE_BUTTON_LABEL,
@@ -573,6 +578,9 @@ export {
     EVALUATOR_SELECT_APP_PLACEHOLDER,
     EVALUATOR_NO_APPS_TEXT,
     EVALUATOR_NON_COMPLETION_TYPE_LABELS,
+    EVALUATOR_POPOVER_TEST_ID,
+    EVALUATOR_POPOVER_ROOT_PANEL_TEST_ID,
+    EVALUATOR_POPOVER_CHILD_PANEL_TEST_ID,
     EVALUATOR_RUN_BUTTON_LABEL,
     EVALUATOR_RESULT_CARD_SELECTOR,
     createHumanEvaluatorFromDrawer,
diff --git a/web/packages/agenta-entities/src/workflow/core/index.ts b/web/packages/agenta-entities/src/workflow/core/index.ts
index dd0aa8b618..0f5e52659f 100644
--- a/web/packages/agenta-entities/src/workflow/core/index.ts
+++ b/web/packages/agenta-entities/src/workflow/core/index.ts
@@ -86,3 +86,11 @@ export {
     type EvaluatorDefinition,
     type MetricColumnDefinition,
 } from "./evaluatorResolution"
+
+// Observability defaults (kept pure for unit-testability)
+export {
+    defaultTraceTypeForWorkflow,
+    type TraceTypeDefault,
+    type ObservabilityTab,
+    type WorkflowKindForTraceDefault,
+} from "./traceTypeDefault"
diff --git a/web/packages/agenta-entities/src/workflow/core/schema.ts b/web/packages/agenta-entities/src/workflow/core/schema.ts
index 56aba6b29d..f1cb140e86 100644
--- a/web/packages/agenta-entities/src/workflow/core/schema.ts
+++ b/web/packages/agenta-entities/src/workflow/core/schema.ts
@@ -293,6 +293,12 @@ export const workflowSchema = z
         // Parent slugs (from revision responses; backend returns artifact_slug
         // and variant_slug alongside the IDs so callers can verify which
         // workflow/variant the revision belongs to without a second lookup).
+        //
+        // workflow_slug / workflow_variant_slug are also required for emitting
+        // evaluator references on playground chain runs — the trace storage
+        // layer identifies evaluator runs by slug (via
+        // `references.evaluator.slug`), and we want to write the parent
+        // workflow's slug, not the revision's.
         workflow_slug: z.string().nullable().optional(),
         workflow_variant_slug: z.string().nullable().optional(),
         artifact_slug: z.string().nullable().optional(),
diff --git a/web/packages/agenta-entities/src/workflow/core/traceTypeDefault.ts b/web/packages/agenta-entities/src/workflow/core/traceTypeDefault.ts
new file mode 100644
index 0000000000..b8ec1d880d
--- /dev/null
+++ b/web/packages/agenta-entities/src/workflow/core/traceTypeDefault.ts
@@ -0,0 +1,41 @@
+/**
+ * Soft default for the `trace_type` filter on the app-scoped observability
+ * page (`/apps/<entityId>/traces`).
+ *
+ * Lives in entities (not in OSS) so the truth table can be unit-tested with
+ * vitest. The OSS atom in `state/newObservability/atoms/controls.ts` calls
+ * this helper and applies the result as a filter when no user override is
+ * present.
+ *
+ * - `tab === "sessions"` → no default (Sessions tab is app-only; evaluators
+ *   don't emit them — the tab itself is hidden for evaluator workflows per
+ *   Phase 6.3.3, but a stale `?tab=sessions` URL still hits this code).
+ * - `workflowKind === "evaluator"` → `"annotation"`. Production evaluators
+ *   score app traces and emit annotation-type traces. The playground-
+ *   triggered standalone evaluator runs (which emit invocation traces with
+ *   `references.application` set) are the edge case, not the default.
+ * - everything else (`"app"`, `"snippet"`, `null`) → `"invocation"`. Apps
+ *   invoke models; the app-scoped observability page should default to
+ *   those.
+ *
+ * Returns `null` when no soft default should apply.
+ */
+export type TraceTypeDefault = "invocation" | "annotation"
+export type ObservabilityTab = "traces" | "sessions"
+
+/**
+ * Workflow role kind, mirrored locally to keep this helper free of OSS
+ * imports. OSS' canonical type lives at
+ * `web/oss/src/state/workflow/destinations.ts:11` with the same shape; the
+ * compiler will catch any drift at the wire-up site in `controls.ts`.
+ */
+export type WorkflowKindForTraceDefault = "app" | "evaluator" | "snippet"
+
+export function defaultTraceTypeForWorkflow(
+    workflowKind: WorkflowKindForTraceDefault | null,
+    tab: ObservabilityTab,
+): TraceTypeDefault | null {
+    if (tab !== "traces") return null
+    if (workflowKind === "evaluator") return "annotation"
+    return "invocation"
+}
diff --git a/web/packages/agenta-entities/src/workflow/index.ts b/web/packages/agenta-entities/src/workflow/index.ts
index 464f92f1dc..feb975eaf7 100644
--- a/web/packages/agenta-entities/src/workflow/index.ts
+++ b/web/packages/agenta-entities/src/workflow/index.ts
@@ -95,6 +95,11 @@ export {
     isOnlineCapableEvaluator,
     hasFullPagePlaygroundUX,
     collectEvaluatorCandidates,
+    // Observability defaults
+    defaultTraceTypeForWorkflow,
+    type TraceTypeDefault,
+    type ObservabilityTab,
+    type WorkflowKindForTraceDefault,
     // Output schema utilities
     resolveInputSchema,
     resolveOutputSchema,
diff --git a/web/packages/agenta-entities/tests/unit/traceTypeDefault.test.ts b/web/packages/agenta-entities/tests/unit/traceTypeDefault.test.ts
new file mode 100644
index 0000000000..4333d4b19c
--- /dev/null
+++ b/web/packages/agenta-entities/tests/unit/traceTypeDefault.test.ts
@@ -0,0 +1,59 @@
+/**
+ * Unit tests for defaultTraceTypeForWorkflow.
+ *
+ * The helper drives the soft-default `trace_type` filter on
+ * `/apps/<entityId>/traces` (see `web/oss/src/state/newObservability/atoms/
+ * controls.ts:filtersAtomFamily`). The truth table matters because getting
+ * this wrong means evaluator users land on an empty page by default — the
+ * regression that #4384 disabled the whole evaluator full-page flow over.
+ */
+
+import {describe, it, expect} from "vitest"
+
+import {defaultTraceTypeForWorkflow} from "../../src/workflow/core/traceTypeDefault"
+
+describe("defaultTraceTypeForWorkflow", () => {
+    describe("sessions tab", () => {
+        it("returns null for app workflow", () => {
+            expect(defaultTraceTypeForWorkflow("app", "sessions")).toBeNull()
+        })
+
+        it("returns null for evaluator workflow", () => {
+            expect(defaultTraceTypeForWorkflow("evaluator", "sessions")).toBeNull()
+        })
+
+        it("returns null for snippet workflow", () => {
+            expect(defaultTraceTypeForWorkflow("snippet", "sessions")).toBeNull()
+        })
+
+        it("returns null when workflow kind is unknown", () => {
+            expect(defaultTraceTypeForWorkflow(null, "sessions")).toBeNull()
+        })
+    })
+
+    describe("traces tab", () => {
+        it("defaults to annotation for evaluator workflows", () => {
+            // Production evaluators score app traces and emit annotation-type
+            // traces — that's the more common case for the per-evaluator
+            // observability view, not playground-triggered standalone runs.
+            expect(defaultTraceTypeForWorkflow("evaluator", "traces")).toBe("annotation")
+        })
+
+        it("defaults to invocation for app workflows", () => {
+            expect(defaultTraceTypeForWorkflow("app", "traces")).toBe("invocation")
+        })
+
+        it("defaults to invocation for snippet workflows", () => {
+            // Snippets behave like apps from an invocation perspective —
+            // they invoke models the same way and don't generate annotations.
+            expect(defaultTraceTypeForWorkflow("snippet", "traces")).toBe("invocation")
+        })
+
+        it("defaults to invocation when workflow kind is unknown (resolving)", () => {
+            // Cold-load fallback: when `currentWorkflowContextAtom` is still
+            // resolving, the kind comes through as `null`. Picking invocation
+            // is the safest default since most users land on app pages.
+            expect(defaultTraceTypeForWorkflow(null, "traces")).toBe("invocation")
+        })
+    })
+})
diff --git a/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts b/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts
index be063d1483..00d196dc69 100644
--- a/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts
+++ b/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts
@@ -327,12 +327,34 @@ export interface CreateWorkflowRevisionAdapterOptions {
     ) => WorkflowRevisionSelectionResult
 
     /**
-     * Empty state message.
+     * Display label for the parent (workflow) level. Drives the picker's
+     * search placeholder ("Search {parentLabel}…"), the empty-list "No
+     * {parentLabel} found" copy, and similar UI strings.
+     *
+     * Defaults to `"Evaluator"` when used in skip-variant mode (the adapter's
+     * original primary use case was evaluator selection), but consumers
+     * picking app workflows — e.g., `EvaluatorPlaygroundHeader` — should pass
+     * `"Application"` so the search bar doesn't say "Search evaluator…" while
+     * the user is actually picking an app.
+     *
+     * @example
+     * ```typescript
+     * createWorkflowRevisionAdapter({
+     *     skipVariantLevel: true,
+     *     flags: {is_evaluator: false},
+     *     parentLabel: "Application",
+     * })
+     * ```
+     */
+    parentLabel?: string
+
+    /**
+     * Empty state message. Defaults to "No {parentLabel}s found" (label lower-cased).
      */
     emptyMessage?: string
 
     /**
-     * Loading state message.
+     * Loading state message. Defaults to "Loading {parentLabel}s..." (label lower-cased).
      */
     loadingMessage?: string
 
@@ -421,12 +443,19 @@ export function createWorkflowRevisionAdapter(
         toSelection,
         emptyMessage,
         loadingMessage,
+        parentLabel = "Evaluator",
         flags,
         filterWorkflows,
         skipVariantLevel = false,
         workflowListAtom,
     } = options
 
+    // Derive empty/loading defaults from the parent label so callers picking
+    // app workflows don't see "No evaluators found" in an app picker.
+    const lowerParent = parentLabel.toLowerCase()
+    const resolvedEmptyMessage = emptyMessage ?? `No ${lowerParent}s found`
+    const resolvedLoadingMessage = loadingMessage ?? `Loading ${lowerParent}s...`
+
     const emptyListState: ListQueryState<unknown> = {
         data: [],
         isPending: false,
@@ -467,7 +496,7 @@ export function createWorkflowRevisionAdapter(
         return createTwoLevelAdapter<WorkflowRevisionSelectionResult>({
             name: "workflowRevision",
             parentType: "workflow",
-            parentLabel: "Evaluator",
+            parentLabel,
             parentListAtom: resolvedWorkflowsListAtom,
             parentOverrides: {
                 getId: (entity: unknown) => (entity as {id: string}).id,
@@ -502,7 +531,7 @@ export function createWorkflowRevisionAdapter(
                     return {
                         type: "workflowRevision",
                         id: revision.id,
-                        label: `${workflow?.label ?? "Evaluator"} / v${revision.version ?? 0}`,
+                        label: `${workflow?.label ?? parentLabel} / v${revision.version ?? 0}`,
                         path,
                         metadata: {
                             workflowId: workflow?.id ?? "",
@@ -513,8 +542,8 @@ export function createWorkflowRevisionAdapter(
                         },
                     }
                 }),
-            emptyMessage: emptyMessage ?? "No evaluators found",
-            loadingMessage: loadingMessage ?? "Loading evaluators...",
+            emptyMessage: resolvedEmptyMessage,
+            loadingMessage: resolvedLoadingMessage,
         })
     }
 
diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index 83339f1478..0c2457b050 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -162,6 +162,80 @@ function buildUpstreamReferences(params: {
     return normalizeApplicationReferences(sourcePayload?.references)
 }
 
+/**
+ * Build the `references.evaluator{,_variant,_revision}` map for a chain stage
+ * whose target node is an evaluator.
+ *
+ * The playground node's `entity.id` is a REVISION id. We read the merged
+ * revision record from the workflow molecule and pull both the revision-level
+ * fields (id / slug / version) and the parent workflow + variant identity
+ * (workflow_id, workflow_slug, workflow_variant_id, workflow_variant_slug)
+ * that the backend writes on revision responses.
+ *
+ * The trace storage layer indexes evaluator references by these fields:
+ *   - `references.evaluator.{id, slug}` ← parent workflow identity
+ *   - `references.evaluator_variant.{id, slug}` ← parent variant identity
+ *   - `references.evaluator_revision.{id, slug, version}` ← this revision
+ *
+ * Without these, traces emitted from playground chain runs don't surface on
+ * the evaluator's `/apps/<evalId>/traces` page — the page filters by
+ * `references.evaluator.slug`, and a missing slot returns 0 matches.
+ * Matches the shape backend evaluation runs emit (verified against real
+ * auto-evaluation trace data on 2026-05-28).
+ *
+ * Returns `undefined` when the node isn't an evaluator workflow, or when the
+ * revision data isn't available yet (rare — only during initial hydration).
+ */
+function buildEvaluatorSelfReferences(params: {
+    get: Getter
+    revisionId: string
+}): TraceReferenceMap | undefined {
+    const revision = params.get(workflowMolecule.selectors.data(params.revisionId)) as
+        | (Record<string, unknown> & {flags?: Record<string, unknown> | null})
+        | null
+    if (!revision) return undefined
+    if (!revision.flags?.is_evaluator) return undefined
+
+    const refs: TraceReferenceMap = {}
+
+    // evaluator (parent workflow)
+    const workflowId = readString(revision.workflow_id)
+    const workflowSlug = readString(revision.workflow_slug)
+    if (workflowId || workflowSlug) {
+        refs.evaluator = {
+            ...(workflowId ? {id: workflowId} : {}),
+            ...(workflowSlug ? {slug: workflowSlug} : {}),
+        }
+    }
+
+    // evaluator_variant (parent variant)
+    const variantId = readString(revision.workflow_variant_id) ?? readString(revision.variant_id)
+    const variantSlug = readString(revision.workflow_variant_slug)
+    if (variantId || variantSlug) {
+        refs.evaluator_variant = {
+            ...(variantId ? {id: variantId} : {}),
+            ...(variantSlug ? {slug: variantSlug} : {}),
+        }
+    }
+
+    // evaluator_revision (this revision)
+    const revisionId = readString(revision.id) ?? params.revisionId
+    const revisionSlug = readString(revision.slug)
+    const revisionVersion =
+        typeof revision.version === "number"
+            ? String(revision.version)
+            : readString(revision.version)
+    if (revisionId || revisionSlug || revisionVersion) {
+        refs.evaluator_revision = {
+            ...(revisionId ? {id: revisionId} : {}),
+            ...(revisionSlug ? {slug: revisionSlug} : {}),
+            ...(revisionVersion ? {version: revisionVersion} : {}),
+        }
+    }
+
+    return Object.keys(refs).length > 0 ? refs : undefined
+}
+
 function createConcurrencyLimiter(concurrency: number) {
     let active = 0
     const queue: (() => void)[] = []
@@ -471,20 +545,27 @@ export async function executeStepForSessionWithExecutionItems(
                                   nodeResults,
                               })
                             : undefined
-
-                    const isEvaluatorStage =
-                        node.depth > 0 &&
-                        get(workflowMolecule.selectors.isEvaluator(node.entity.id as string))
-                    const stageReferences =
-                        node.depth > 0 && !isEvaluatorStage
-                            ? buildUpstreamReferences({
-                                  get,
-                                  incomingConnection: allConnections.find(
-                                      (connection) => connection.targetNodeId === nodeId,
-                                  ),
-                                  runnableNodes,
-                              })
-                            : undefined
+                    const stageReferences = (() => {
+                        if (node.depth === 0) return undefined
+                        const upstream = buildUpstreamReferences({
+                            get,
+                            incomingConnection: allConnections.find(
+                                (connection) => connection.targetNodeId === nodeId,
+                            ),
+                            runnableNodes,
+                        })
+                        // For evaluator stages, also attach the evaluator's
+                        // own identity so the emitted trace can be found via
+                        // `references.evaluator.slug` on the evaluator's
+                        // /apps/<evalId>/traces page. Merges with upstream
+                        // application refs (the app being scored).
+                        const selfEval = buildEvaluatorSelfReferences({
+                            get,
+                            revisionId: node.entity.id as string,
+                        })
+                        if (!upstream && !selfEval) return undefined
+                        return {...(upstream ?? {}), ...(selfEval ?? {})}
+                    })()
 
                     const stageExecutionItem = stageHandle.run({
                         get,

From becb4862bb5aa1af2d0100239e215363ba775405 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Thu, 28 May 2026 15:13:19 +0200
Subject: [PATCH 06/36] fix(frontend): address PR #4474 review comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CodeRabbit flagged 5 issues on the evaluator-full-page rollout PR.
This commit addresses each:

1. PlaygroundRouter — `is_feedback` evaluators skip the full-page swap.
   `workflowKind === "evaluator"` was too broad. Human/feedback
   evaluators are drawer-only in /evaluators (they capture human input,
   they don't run), so routing them to ConfigureEvaluatorPage produced
   a run-controls UI for a workflow with nothing to run. Added a
   `flags.is_feedback` exclusion next to the workflowKind check.

2. Sidebar — switcher filters out `is_feedback` evaluators.
   `nonArchivedEvaluatorsAtom` only filters by `deleted_at` and
   includes human evaluators; the switcher was exposing entries that,
   when clicked, would land on the (now-correctly-gated) generic
   <Playground /> for a feedback workflow. Filtered the list at the
   switcher boundary.

3. controls.ts — handle array-valued `trace_type` for in/not_in.
   The dialog dispatches `{operator: "in", value: ["annotation"]}` for
   the IN operator family, but the intent setter only normalized
   scalars — so the user's choice was silently dropped to
   `{kind: "cleared"}`. Normalize to an array, filter to enum values,
   and collapse single-value arrays back to a scalar. Multi-value
   selections (which mean "no filter" for a 2-value enum) still map
   to `cleared`.

4. Playwright — drop stale `[data-row-key]` poll in select-app-and-run.
   The test asserted post-create navigation to /apps/<id>/playground
   AFTER polling for the new row in the evaluators table — but the
   redirect wins first, the table disappears, and the poll became a
   timing-dependent failure. Removed the registry-side wait;
   evaluator-in-registry assertion is covered by the
   post-create-row-click test alongside.

5. ConfigureEvaluator/atoms.ts — fix persistedAppSelectionAtom race.
   `connectAppToEvaluatorAtom` persisted the app selection BEFORE
   `changePrimaryNode` ran, so a failed swap (returns `null` with no
   primary to swap from) left a stale localStorage record that the
   next mount re-hydrated into a phantom "connected" state. Moved the
   persist call to after both graph mutations succeed.
   `disconnectAppFromEvaluatorAtom` early-returned on no-downstream
   without clearing the persisted state, allowing the same phantom
   record to survive a disconnect attempt. Clear it on that branch
   too.

No behavior change for the happy-path full-page flow — these all
narrow edge cases the reviewer flagged.
---
 .../components/ConfigureEvaluator/atoms.ts    | 30 +++++++---
 .../src/components/PlaygroundRouter/index.tsx | 14 ++++-
 .../Sidebar/components/WorkflowEntityCard.tsx | 18 +++---
 .../state/newObservability/atoms/controls.ts  | 33 +++++++----
 .../playwright/acceptance/evaluators/index.ts | 58 ++++++++-----------
 5 files changed, 93 insertions(+), 60 deletions(-)

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
index 0a6c2d3625..b0a0ed0426 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
@@ -162,12 +162,14 @@ export const connectAppToEvaluatorAtom = atom(
     ) => {
         const {appRevisionId, appLabel, evaluatorRevisionId, evaluatorLabel} = params
 
-        // Persist across sessions. The picker display label is derived from
-        // the depth-0 node's `label` via `selectedAppLabelAtom`, so no extra
-        // write needed here.
-        set(persistedAppSelectionAtom, {appRevisionId, appLabel})
-
-        // Replace primary node with app
+        // Replace primary node with the app FIRST — if the graph mutation
+        // bails out (changePrimaryNode returns null when there's no current
+        // primary to swap), we must not commit a stale persisted record.
+        // Before this fix, the persist ran ahead of this call, which could leave
+        // an `{appRevisionId, appLabel}` entry in localStorage referring to
+        // a connection that never actually formed; the next mount would
+        // re-hydrate from that record and the picker would show "connected"
+        // for an app the playground never linked.
         const nodeId = set(playgroundController.actions.changePrimaryNode, {
             type: "workflow",
             id: appRevisionId,
@@ -185,6 +187,11 @@ export const connectAppToEvaluatorAtom = atom(
                 label: evaluatorLabel,
             },
         })
+
+        // Persist only after both graph mutations succeeded. The picker
+        // display label is derived from the depth-0 node's `label` via
+        // `selectedAppLabelAtom`, so no extra write needed here.
+        set(persistedAppSelectionAtom, {appRevisionId, appLabel})
     },
 )
 
@@ -208,7 +215,16 @@ export const connectAppToEvaluatorAtom = atom(
 export const disconnectAppFromEvaluatorAtom = atom(null, (get, set) => {
     const nodes = get(playgroundController.selectors.nodes())
     const downstreamEvaluator = nodes.find((n) => n.depth > 0)
-    if (!downstreamEvaluator) return
+    if (!downstreamEvaluator) {
+        // No downstream node means the graph is already in the
+        // standalone-evaluator shape, but a stale `persistedAppSelectionAtom`
+        // entry could still be on disk (e.g., from a previous session where
+        // `connectAppToEvaluatorAtom` persisted before its swap silently
+        // failed mid-mutation). Clear it on this path too so the next mount
+        // doesn't re-hydrate a phantom "connected" app.
+        set(persistedAppSelectionAtom, null)
+        return
+    }
 
     const evaluatorEntity = {
         type: downstreamEvaluator.entityType,
diff --git a/web/oss/src/components/PlaygroundRouter/index.tsx b/web/oss/src/components/PlaygroundRouter/index.tsx
index 3158366096..4abb241beb 100644
--- a/web/oss/src/components/PlaygroundRouter/index.tsx
+++ b/web/oss/src/components/PlaygroundRouter/index.tsx
@@ -70,7 +70,19 @@ const PlaygroundRouter = () => {
     // ConfigureEvaluatorPage renders the same few form fields the drawer
     // would, with the bonus of the evaluator-as-app surface (variants,
     // traces, sidebar context).
-    if (ctx.workflowKind === "evaluator") return <ConfigureEvaluatorPage />
+    //
+    // Exception: `is_feedback` evaluators (human-annotation workflows) are
+    // intentionally drawer-only in /evaluators — they don't run, they capture
+    // human input. Routing them to `ConfigureEvaluatorPage` would render a
+    // page with no testset/run controls that make sense for them. Direct
+    // URL visits to `/apps/<human-id>/playground` fall through to the
+    // generic `<Playground />`, which will (correctly) treat them as an
+    // unsupported playground target and let the upstream route guard /
+    // landing logic redirect them back to /evaluators.
+    const isFeedbackEvaluator = ctx.workflow?.flags?.is_feedback === true
+    if (ctx.workflowKind === "evaluator" && !isFeedbackEvaluator) {
+        return <ConfigureEvaluatorPage />
+    }
     return <Playground />
 }
 
diff --git a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
index b2ca96b43f..818db127aa 100644
--- a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
+++ b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
@@ -118,13 +118,17 @@ const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => {
     const evaluators = useAtomValue(nonArchivedEvaluatorsAtom) as readonly Workflow[]
     // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off, the
     // switcher dropdown hides the "Evaluators" group entirely. With the flag
-    // on, ALL non-archived evaluators are listed — every evaluator kind has
-    // a working `/apps/<id>/*` surface (PlaygroundRouter renders
-    // ConfigureEvaluatorPage for all evaluator workflows regardless of
-    // template type), so there's no reason to filter to LLM/code only.
-    const switcherEvaluators: readonly Workflow[] = EVALUATOR_FULL_PAGE_NAV_ENABLED
-        ? evaluators
-        : EMPTY_WORKFLOWS
+    // on, list every evaluator EXCEPT human/feedback workflows:
+    // `is_feedback` evaluators are drawer-only in /evaluators (they capture
+    // human input, they don't run), so the corresponding `/apps/<id>/*`
+    // surface has no useful UI. PlaygroundRouter falls through to the
+    // generic `<Playground />` for those, which doesn't make sense to
+    // expose via the sidebar switcher — clicking would land on a
+    // run-controls page for a workflow that has nothing to run.
+    const switcherEvaluators: readonly Workflow[] = useMemo(() => {
+        if (!EVALUATOR_FULL_PAGE_NAV_ENABLED) return EMPTY_WORKFLOWS
+        return evaluators.filter((w) => !w.flags?.is_feedback)
+    }, [evaluators])
     const recentAppId = useAtomValue(recentAppIdAtom)
     const recentEvaluatorId = useAtomValue(recentEvaluatorIdAtom)
     const navigateToWorkflow = useSetAtom(routerAppNavigationAtom)
diff --git a/web/oss/src/state/newObservability/atoms/controls.ts b/web/oss/src/state/newObservability/atoms/controls.ts
index dfa51564c0..a300751a51 100644
--- a/web/oss/src/state/newObservability/atoms/controls.ts
+++ b/web/oss/src/state/newObservability/atoms/controls.ts
@@ -337,26 +337,35 @@ export const filtersAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
             //   - Neither: don't touch (e.g., updating only `search` shouldn't
             //     overwrite the trace_type intent).
             if (nextTraceType) {
-                const v = nextTraceType.value
-                // Normalize is/is_not against the two-value enum to a single
-                // affirmative value.
+                // The filter dialog sends `value` as a scalar for `is`/
+                // `is_not` and as an array for `in`/`not_in` (e.g.,
+                // `["annotation"]`). Normalize to an array, filter to known
+                // enum values, then collapse single-value arrays back to a
+                // scalar for the choice atom — which only stores one value.
+                const rawValues = Array.isArray(nextTraceType.value)
+                    ? nextTraceType.value
+                    : [nextTraceType.value]
+                const values = rawValues.filter(
+                    (entry: unknown): entry is "annotation" | "invocation" =>
+                        entry === "annotation" || entry === "invocation",
+                )
                 const op = nextTraceType.operator
                 const isAffirm = op === "is" || op === "in"
                 const isNeg = op === "is_not" || op === "not_in"
-                const flip = (x: unknown): "annotation" | "invocation" | null =>
-                    x === "annotation" ? "invocation" : x === "invocation" ? "annotation" : null
+                const flip = (x: "annotation" | "invocation"): "annotation" | "invocation" =>
+                    x === "annotation" ? "invocation" : "annotation"
                 let resolved: "annotation" | "invocation" | null = null
-                if (isAffirm) {
-                    resolved =
-                        v === "annotation" ? "annotation" : v === "invocation" ? "invocation" : null
-                } else if (isNeg) {
-                    resolved = flip(v)
+                if (values.length === 1) {
+                    if (isAffirm) resolved = values[0]
+                    else if (isNeg) resolved = flip(values[0])
                 }
                 if (resolved) {
                     set(traceTypeChoiceAtomFamily(tab), {kind: "value", value: resolved})
                 } else {
-                    // Unknown shape (e.g., a future trace_type value we don't
-                    // map). Treat as "cleared" rather than fabricating a value.
+                    // Multi-value selections (e.g., `in: ["annotation",
+                    // "invocation"]` — equivalent to "no filter"), selections
+                    // with no known enum value, or unsupported operators.
+                    // Treat as cleared rather than fabricating a pick.
                     set(traceTypeChoiceAtomFamily(tab), {kind: "cleared"})
                 }
             } else {
diff --git a/web/oss/tests/playwright/acceptance/evaluators/index.ts b/web/oss/tests/playwright/acceptance/evaluators/index.ts
index df0e62bbec..3f43d5748f 100644
--- a/web/oss/tests/playwright/acceptance/evaluators/index.ts
+++ b/web/oss/tests/playwright/acceptance/evaluators/index.ts
@@ -1,3 +1,18 @@
+import {
+    createTagString,
+    TestCoverage,
+    TestPath,
+    TestScope,
+    TestSpeedType,
+    TestLensType,
+    TestCostType,
+    TestLicenseType,
+    TestRoleType,
+    TestcaseType,
+} from "@agenta/web-tests/playwright/config/testTags"
+
+import {buildAcceptanceTags} from "../utils/tags"
+
 import {
     test,
     expect,
@@ -32,19 +47,6 @@ import {
     EVALUATOR_RESULT_CARD_SELECTOR,
     HUMAN_EVALUATOR_CREATE_SUCCESS_MESSAGE,
 } from "./tests"
-import {
-    createTagString,
-    TestCoverage,
-    TestPath,
-    TestScope,
-    TestSpeedType,
-    TestLensType,
-    TestCostType,
-    TestLicenseType,
-    TestRoleType,
-    TestcaseType,
-} from "@agenta/web-tests/playwright/config/testTags"
-import {buildAcceptanceTags} from "../utils/tags"
 
 const testEvaluators = () => {
     test(
@@ -240,26 +242,16 @@ const testEvaluators = () => {
                 page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
             ).toBeVisible({timeout: 10000})
 
-            // Verify the evaluator appears in the table.
-            // Use the search input to narrow results, then poll via [data-row-key].
-            const searchInput2 = page.locator('input[placeholder="Search"]').first()
-            if (await searchInput2.isVisible().catch(() => false)) {
-                await searchInput2.fill(evaluatorName)
-            }
-            await expect
-                .poll(
-                    async () =>
-                        page.locator("[data-row-key]").filter({hasText: evaluatorName}).count(),
-                    {timeout: 15000},
-                )
-                .toBeGreaterThan(0)
-            await expect(
-                page.locator("[data-row-key]").filter({hasText: evaluatorName}).first(),
-            ).toBeVisible({timeout: 5000})
-
-            // Step 2: Post-commit navigates to /apps/<id>/playground (full-page surface,
-            // not the drawer — per the re-enable of EVALUATOR_FULL_PAGE_NAV. Earlier this
-            // test used the drawer flow; rewritten to operate on the new page surface.)
+            // Step 2: Post-commit navigates to `/apps/<id>/playground` — the
+            // full-page surface introduced by the EVALUATOR_FULL_PAGE_NAV
+            // re-enable. Assert the redirect FIRST (no DOM-poll for the
+            // registry table). Earlier this test waited on `[data-row-key]`
+            // entries before the URL check, which raced against the redirect:
+            // once the post-commit navigation won, the table wasn't in the
+            // DOM and the poll timed out. The evaluator's presence in the
+            // registry is exercised by the post-create-row-click test
+            // alongside; here we only care that the create flow leads to
+            // the playground page.
             await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
             const surface = page.locator("body")
 

From 52e4ff4ce4301d010212d0187e9be1f0df5277d5 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Tue, 2 Jun 2026 15:58:24 +0200
Subject: [PATCH 07/36] fix(playground): filter root-node inputs by entity
 schema (#4525)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue: In the LLM-as-a-judge playground, switching the chained app from a
chat application to a completion application kept sending `context` and
`messages` from the previous app in the new request body.

Root cause: At `executionRunner.ts` for depth=0 (the root entity), the
runner spreads the entire row's data into `nodeInputs` (`{...data}`) and
hands it to the stage handle as `inputValues`. The downstream filter in
`resolveVariableValues` / `buildCompletionInputRow` correctly drops keys
that aren't in the entity's input variables — when `variables` is
non-empty. But when the entity's input ports haven't resolved yet (entity
mid-hydration) or genuinely declares no input variables, that filter
falls back to "spread every key from the row", which is exactly the
window in which stale chat-shape keys (`messages`, `context`) leak into
a completion request.

Fix: Filter `data` at the runner against the entity's declared
`inputSchema.properties` BEFORE building `nodeInputs`. This applies to
both the first execution (line ~417) and the repetition retries (line
~689). When the entity has no resolvable input schema, the helper falls
back to `{...data}` so workflows that genuinely depend on free-form
input (e.g. `__rawBody` app workflows whose variables live in
`__meta.variables`) keep working.

The fix is safe for chat mode: chat strips `messages` separately at line
587 of `executionItems.ts` and rebuilds the conversation from
`chatHistory` via `messageIdsAtomFamily(loadableId)` — independent of
`inputValues`.

Defense-in-depth: this complements the existing
`resolveVariableValues` filter rather than replacing it.
---
 .../src/state/execution/executionRunner.ts    | 72 ++++++++++++++++++-
 1 file changed, 70 insertions(+), 2 deletions(-)

diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index 0c2457b050..c67b349158 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -236,6 +236,57 @@ function buildEvaluatorSelfReferences(params: {
     return Object.keys(refs).length > 0 ? refs : undefined
 }
 
+/**
+ * Filter row data to only keys present in the entity's input schema.
+ *
+ * Why this exists: testcase rows live in the local testcase molecule and
+ * preserve every key the user ever ran with (chat apps populate
+ * `messages`/`context`, completion apps populate template variables, etc.).
+ * When the user swaps the primary app — e.g. in the LLM-as-a-judge playground
+ * switching the chained app from chat to completion (issue #4525 /
+ * AGE-3793) — the same row data carries stale chat keys into the completion
+ * request.
+ *
+ * Downstream there's already filtering by `variables` (input ports) in
+ * `resolveVariableValues` / `buildCompletionInputRow`, but it falls back to
+ * "spread all keys" when `variables` is empty (entity still hydrating, or a
+ * workflow with no declared input ports). This helper filters AT THE RUNNER
+ * before the data leaves for stage execution, so stale keys can't slip
+ * through the fallback.
+ *
+ * Fallback contract: when the entity has no resolvable input schema
+ * (`properties` missing / empty), return the data unchanged — preserves the
+ * pre-fix behavior so workflows that genuinely depend on free-form input
+ * (e.g. `__rawBody` app workflows with `__meta.variables`) aren't broken.
+ */
+function filterDataToEntityInputSchema(
+    get: Getter,
+    data: Record<string, unknown>,
+    entityId: string,
+): Record<string, unknown> {
+    const schemas = get(workflowMolecule.selectors.ioSchemas(entityId)) as
+        | {inputSchema?: unknown}
+        | undefined
+    const inputSchema = schemas?.inputSchema as
+        | {properties?: Record<string, unknown>}
+        | undefined
+    const properties = inputSchema?.properties
+    if (!properties || typeof properties !== "object") {
+        return {...data}
+    }
+    const allowedKeys = new Set(Object.keys(properties))
+    if (allowedKeys.size === 0) {
+        return {...data}
+    }
+    const filtered: Record<string, unknown> = {}
+    for (const [key, value] of Object.entries(data)) {
+        if (allowedKeys.has(key)) {
+            filtered[key] = value
+        }
+    }
+    return filtered
+}
+
 function createConcurrencyLimiter(concurrency: number) {
     let active = 0
     const queue: (() => void)[] = []
@@ -434,7 +485,17 @@ export async function executeStepForSessionWithExecutionItems(
 
                     let nodeInputs: Record<string, unknown>
                     if (node.depth === 0) {
-                        nodeInputs = {...data}
+                        // Filter to the root entity's declared input schema so stale
+                        // keys from a previous primary app (e.g. chat `messages` /
+                        // `context` after swapping the upstream app in the
+                        // LLM-as-a-judge playground — issue #4525 / AGE-3793) don't
+                        // leak into the new app's request body via the downstream
+                        // "spread all keys" fallback in resolveVariableValues.
+                        nodeInputs = filterDataToEntityInputSchema(
+                            get,
+                            data,
+                            node.entity.id as string,
+                        )
                     } else {
                         // Check whether the incoming connection has explicit valid mappings.
                         // resolveChainInputs always returns non-empty (fallback spreads testcaseData
@@ -653,7 +714,14 @@ export async function executeStepForSessionWithExecutionItems(
                 if (abortController.signal.aborted) break
 
                 const perSession2 = sessionOptions?.[session.id]
-                const nodeInputs2 = {...data}
+                // Same input-schema filter as the first-run path above —
+                // repetitions hit the same root entity, so stale keys must be
+                // filtered identically (issue #4525 / AGE-3793).
+                const nodeInputs2 = filterDataToEntityInputSchema(
+                    get,
+                    data,
+                    session.runnableId,
+                )
                 const repetitionItem = rootExecutionHandle.retry({
                     get,
                     headers: perSession2?.headers ?? {},

From d925746f9c4f6af55b1cd07e6f2fd46838ba69f4 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Tue, 2 Jun 2026 21:41:35 +0200
Subject: [PATCH 08/36] fix(playground-ui): dark-mode classes on evaluator info
 callout

The evaluator info notice in SingleLayout rendered with hardcoded
light-mode colors (bg-blue-50, text-gray-700) and was unreadable
against the dark UI. Add dark: variants to background, border,
icon, body text, and dismiss button to match the existing
dark:bg-blue-900/* pattern used elsewhere in the app.
---
 .../ExecutionItems/assets/ExecutionRow/SingleLayout.tsx  | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
index 587dfae906..fbc48aa887 100644
--- a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
+++ b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
@@ -828,10 +828,14 @@ const SingleView = ({
                                     className={clsx(
                                         "flex items-start gap-2 px-3 py-2 rounded-md",
                                         "bg-blue-50 border border-solid border-blue-100",
+                                        "dark:bg-blue-900/20 dark:border-blue-900/40",
                                     )}
                                 >
-                                    <Info size={14} className="text-blue-500 mt-0.5 shrink-0" />
-                                    <div className="flex-1 text-xs text-gray-700 leading-relaxed">
+                                    <Info
+                                        size={14}
+                                        className="text-blue-500 dark:text-blue-400 mt-0.5 shrink-0"
+                                    />
+                                    <div className="flex-1 text-xs text-gray-700 dark:text-gray-200 leading-relaxed">
                                         Fill these with the data the application being evaluated
                                         received and produced. The evaluator will judge this pair —
                                         not your own typed values.
@@ -841,6 +845,7 @@ const SingleView = ({
                                         onClick={() => setEvaluatorCalloutDismissed(true)}
                                         className={clsx(
                                             "shrink-0 p-0.5 rounded text-gray-400 hover:text-gray-700 hover:bg-blue-100",
+                                            "dark:text-gray-500 dark:hover:text-gray-200 dark:hover:bg-blue-900/40",
                                             "border-0 bg-transparent cursor-pointer",
                                         )}
                                         aria-label="Dismiss"

From 970c18cd0a0ae9871fb3f0250b66145bd120e84a Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Tue, 2 Jun 2026 21:52:45 +0200
Subject: [PATCH 09/36] fix(playground-ui): readable dark-mode text on
 evaluator callout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous attempt used dark:text-gray-200 which conflicted with the
themeAwareColors CSS-variable layer — the gray scale is role-inverted
in dark mode, so dark:text-gray-200 resolved to a dark shade against
the dark callout background.

Switch overrides to the blue scale (not theme-flipped): dark:text-blue-50
for body text, dark:text-blue-300 for the icon, and dark:text-blue-200
for the dismiss button. All readable against dark:bg-blue-900/20.
---
 .../ExecutionItems/assets/ExecutionRow/SingleLayout.tsx     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
index fbc48aa887..52ecb2901a 100644
--- a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
+++ b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
@@ -833,9 +833,9 @@ const SingleView = ({
                                 >
                                     <Info
                                         size={14}
-                                        className="text-blue-500 dark:text-blue-400 mt-0.5 shrink-0"
+                                        className="text-blue-500 dark:text-blue-300 mt-0.5 shrink-0"
                                     />
-                                    <div className="flex-1 text-xs text-gray-700 dark:text-gray-200 leading-relaxed">
+                                    <div className="flex-1 text-xs text-gray-700 dark:text-blue-50 leading-relaxed">
                                         Fill these with the data the application being evaluated
                                         received and produced. The evaluator will judge this pair —
                                         not your own typed values.
@@ -845,7 +845,7 @@ const SingleView = ({
                                         onClick={() => setEvaluatorCalloutDismissed(true)}
                                         className={clsx(
                                             "shrink-0 p-0.5 rounded text-gray-400 hover:text-gray-700 hover:bg-blue-100",
-                                            "dark:text-gray-500 dark:hover:text-gray-200 dark:hover:bg-blue-900/40",
+                                            "dark:text-blue-200 dark:hover:text-blue-50 dark:hover:bg-blue-900/40",
                                             "border-0 bg-transparent cursor-pointer",
                                         )}
                                         aria-label="Dismiss"

From 1d8cddc6b8d1fa73971b2bc5bb2ea873163a2ec9 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Tue, 2 Jun 2026 22:18:01 +0200
Subject: [PATCH 10/36] fix(playground): close depth>0 leak path for stale chat
 fields (#4525)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first #4525 fix only covered the depth=0 (root entity) path. In
the LLM-as-a-judge evaluator playground the chained app sits at
depth>0, where input construction goes through resolveChainInputs
(spreads testcaseData on the no-mapping branch) or
buildEvaluatorExecutionInputs (spreads testcaseData when the schema
allows additionalProperties). Both paths re-leak the stale
`messages` field from a previous chat app into the current target
entity's request body.

Add stripChatTransportForEntity — a targeted strip of known chat-
transport keys (currently just `messages`) that runs unless the
target entity's input schema explicitly declares them. Applied:

- depth=0 path: as a defense-in-depth pass after the strict
  filterDataToEntityInputSchema, so the spread fallback (taken
  while the new app's schema is mid-hydration) can't leak the
  stale field either.
- depth=0 repetition path: same.
- depth>0 path: pre-filters `data` before chain / evaluator input
  construction. Uses a targeted strip (rather than the strict
  schema filter) so evaluators that legitimately depend on
  additionalProperties: true spread of testcase columns keep
  receiving them.

The helper returns the original input reference when no chat
transport keys are stripped, so callers see an unchanged object in
the common path (a scratch copy is still made internally).
---
 .../src/state/execution/executionRunner.ts    | 105 +++++++++++++++---
 1 file changed, 87 insertions(+), 18 deletions(-)

diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index c67b349158..2a9f2a1915 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -236,6 +236,28 @@ function buildEvaluatorSelfReferences(params: {
     return Object.keys(refs).length > 0 ? refs : undefined
 }
 
+/**
+ * Keys that are chat-conversation transport (not template variables). They
+ * accumulate on the shared testcase row when a chat app runs, and must not
+ * leak into a non-chat entity's request body (issue #4525 / AGE-3793).
+ *
+ * Kept conservative: only `messages`. `chatHistory` is constructed at
+ * runtime from the flat message store, not stored on row data.
+ */
+const CHAT_TRANSPORT_KEYS = ["messages"] as const
+
+function getEntityInputSchema(
+    get: Getter,
+    entityId: string,
+): {properties?: Record<string, unknown>; additionalProperties?: unknown} | undefined {
+    const schemas = get(workflowMolecule.selectors.ioSchemas(entityId)) as
+        | {inputSchema?: unknown}
+        | undefined
+    return schemas?.inputSchema as
+        | {properties?: Record<string, unknown>; additionalProperties?: unknown}
+        | undefined
+}
+
 /**
  * Filter row data to only keys present in the entity's input schema.
  *
@@ -264,12 +286,7 @@ function filterDataToEntityInputSchema(
     data: Record<string, unknown>,
     entityId: string,
 ): Record<string, unknown> {
-    const schemas = get(workflowMolecule.selectors.ioSchemas(entityId)) as
-        | {inputSchema?: unknown}
-        | undefined
-    const inputSchema = schemas?.inputSchema as
-        | {properties?: Record<string, unknown>}
-        | undefined
+    const inputSchema = getEntityInputSchema(get, entityId)
     const properties = inputSchema?.properties
     if (!properties || typeof properties !== "object") {
         return {...data}
@@ -287,6 +304,41 @@ function filterDataToEntityInputSchema(
     return filtered
 }
 
+/**
+ * Strip chat-transport keys from row data unless the target entity
+ * explicitly declares them as inputs. Used as a defensive pre-filter for
+ * the chain / evaluator input-building paths (depth > 0), where the
+ * filter-by-properties pass above isn't applied because chain mappings
+ * and evaluator schema resolution own their own input shaping.
+ *
+ * Difference from `filterDataToEntityInputSchema`: that helper does a
+ * strict allow-list (great for primary/root entities with closed
+ * schemas); this one only strips a known-leaky set so evaluators that
+ * legitimately depend on `additionalProperties: true` spread keep
+ * receiving their extra testcase columns.
+ */
+function stripChatTransportForEntity(
+    get: Getter,
+    data: Record<string, unknown>,
+    entityId: string,
+): Record<string, unknown> {
+    const inputSchema = getEntityInputSchema(get, entityId)
+    const properties = inputSchema?.properties
+    const declared =
+        properties && typeof properties === "object"
+            ? new Set(Object.keys(properties))
+            : new Set<string>()
+    let mutated = false
+    const out: Record<string, unknown> = {...data}
+    for (const key of CHAT_TRANSPORT_KEYS) {
+        if (key in out && !declared.has(key)) {
+            delete out[key]
+            mutated = true
+        }
+    }
+    return mutated ? out : data
+}
+
 function createConcurrencyLimiter(concurrency: number) {
     let active = 0
     const queue: (() => void)[] = []
@@ -491,12 +543,23 @@ export async function executeStepForSessionWithExecutionItems(
                         // LLM-as-a-judge playground — issue #4525 / AGE-3793) don't
                         // leak into the new app's request body via the downstream
                         // "spread all keys" fallback in resolveVariableValues.
-                        nodeInputs = filterDataToEntityInputSchema(
-                            get,
-                            data,
-                            node.entity.id as string,
-                        )
+                        const rootEntityId = node.entity.id as string
+                        const filtered = filterDataToEntityInputSchema(get, data, rootEntityId)
+                        // Defense in depth: if the strict filter fell back to spreading
+                        // all keys (schema not yet resolved), still strip known chat-
+                        // transport keys unless the entity declares them. Without this
+                        // the bug repros while the new app's schema is mid-hydration.
+                        nodeInputs = stripChatTransportForEntity(get, filtered, rootEntityId)
                     } else {
+                        // Strip chat-transport keys from testcase data before chain /
+                        // evaluator input construction, so the downstream "spread all
+                        // keys" fallbacks (resolveChainInputs no-mapping branch and
+                        // buildEvaluatorExecutionInputs additionalProperties spread)
+                        // can't carry stale `messages` from a previous chat app into
+                        // the current target entity (#4525 / AGE-3793).
+                        const targetEntityId = node.entity.id as string
+                        const dataForChain = stripChatTransportForEntity(get, data, targetEntityId)
+
                         // Check whether the incoming connection has explicit valid mappings.
                         // resolveChainInputs always returns non-empty (fallback spreads testcaseData
                         // + prediction), so we can't rely on its result length alone.
@@ -514,7 +577,7 @@ export async function executeStepForSessionWithExecutionItems(
                                 allConnections,
                                 nodeId,
                                 nodeResults,
-                                data,
+                                dataForChain,
                             )
                             nodeInputs = resolved
                         } else {
@@ -530,10 +593,10 @@ export async function executeStepForSessionWithExecutionItems(
 
                             const evalStore = getDefaultStore()
                             const stageConfiguration = evalStore.get(
-                                workflowMolecule.selectors.configuration(node.entity.id as string),
+                                workflowMolecule.selectors.configuration(targetEntityId),
                             )
                             const stageSchemas = evalStore.get(
-                                workflowMolecule.selectors.ioSchemas(node.entity.id as string),
+                                workflowMolecule.selectors.ioSchemas(targetEntityId),
                             )
                             const inputSchema =
                                 (stageSchemas?.inputSchema as
@@ -543,10 +606,15 @@ export async function executeStepForSessionWithExecutionItems(
                                 session.mode === "chat"
                                     ? buildSharedChatInputs(get, loadableId)
                                     : undefined
+                            // Base the evaluator testcase on the stripped
+                            // `dataForChain` (not raw `data`) so stale chat-
+                            // transport keys from a previous chat app can't leak
+                            // in (#4525 / AGE-3793), then layer the current
+                            // shared chat inputs on top for chat-mode runs.
                             const evaluatorTestcaseData =
                                 rootChatInputs && Object.keys(rootChatInputs).length > 0
-                                    ? {...data, ...rootChatInputs}
-                                    : data
+                                    ? {...dataForChain, ...rootChatInputs}
+                                    : dataForChain
 
                             const evaluatorInputContext = {
                                 testcaseData: evaluatorTestcaseData,
@@ -717,9 +785,10 @@ export async function executeStepForSessionWithExecutionItems(
                 // Same input-schema filter as the first-run path above —
                 // repetitions hit the same root entity, so stale keys must be
                 // filtered identically (issue #4525 / AGE-3793).
-                const nodeInputs2 = filterDataToEntityInputSchema(
+                const repFiltered = filterDataToEntityInputSchema(get, data, session.runnableId)
+                const nodeInputs2 = stripChatTransportForEntity(
                     get,
-                    data,
+                    repFiltered,
                     session.runnableId,
                 )
                 const repetitionItem = rootExecutionHandle.retry({

From c4a2f3a8f7a74579aa81429858da0e20b65b9e55 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Tue, 2 Jun 2026 22:27:50 +0200
Subject: [PATCH 11/36] chore(playground): log when input-schema filter falls
 back or strips chat keys
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Diagnostic telemetry for #4525 / AGE-3793 — three console.warn signals
in executionRunner so we can tell which layer is actually rescuing the
request body during a chat→completion swap:

1. filterDataToEntityInputSchema schema-not-resolved fallback — the
   strict allow-list can't run because
   workflowMolecule.selectors.ioSchemas returned no
   inputSchema.properties. Logs the entityId,
   the reason (no-properties vs properties-not-object), the data keys
   present, and whether `messages` is among them.
2. filterDataToEntityInputSchema empty-properties fallback — schema
   resolved but Object.keys(properties).length === 0. Same payload,
   minus the reason field.
3. stripChatTransportForEntity strip — emits only when a chat-transport
   key was actually dropped, with which keys and whether the entity
   schema was resolved at the time of the strip.

All three are warn-level so they're visible in production console
without code changes, and gated to the unusual paths so the happy
path stays quiet.
---
 .../src/state/execution/executionRunner.ts    | 33 +++++++++++++++++--
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index 2a9f2a1915..0deb9de39a 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -289,10 +289,26 @@ function filterDataToEntityInputSchema(
     const inputSchema = getEntityInputSchema(get, entityId)
     const properties = inputSchema?.properties
     if (!properties || typeof properties !== "object") {
+        // Schema not resolved — emits when the strict allow-list filter can't
+        // run and the secondary stripChatTransportForEntity pass becomes the
+        // only line of defense against stale chat keys leaking through.
+        // If this fires on a chat→completion swap repro, the secondary strip
+        // is the one keeping `messages` out of the request body.
+        console.warn("[executionRunner.filter] schema-not-resolved fallback", {
+            entityId,
+            reason: properties === undefined ? "no-properties" : "properties-not-object",
+            dataKeys: Object.keys(data),
+            hasMessagesKey: "messages" in data,
+        })
         return {...data}
     }
     const allowedKeys = new Set(Object.keys(properties))
     if (allowedKeys.size === 0) {
+        console.warn("[executionRunner.filter] empty-properties fallback", {
+            entityId,
+            dataKeys: Object.keys(data),
+            hasMessagesKey: "messages" in data,
+        })
         return {...data}
     }
     const filtered: Record<string, unknown> = {}
@@ -328,15 +344,26 @@ function stripChatTransportForEntity(
         properties && typeof properties === "object"
             ? new Set(Object.keys(properties))
             : new Set<string>()
-    let mutated = false
+    const stripped: string[] = []
     const out: Record<string, unknown> = {...data}
     for (const key of CHAT_TRANSPORT_KEYS) {
         if (key in out && !declared.has(key)) {
             delete out[key]
-            mutated = true
+            stripped.push(key)
         }
     }
-    return mutated ? out : data
+    if (stripped.length > 0) {
+        // Logged only when this strip actually drops a key — i.e. the
+        // entity didn't declare it but the testcase row carried it
+        // (typical signal: a chat→completion swap leaving stale `messages`).
+        console.warn("[executionRunner.filter] stripped chat-transport keys", {
+            entityId,
+            stripped,
+            schemaResolved: declared.size > 0,
+        })
+        return out
+    }
+    return data
 }
 
 function createConcurrencyLimiter(concurrency: number) {

From ea7da193e7fc18d6fd2624eb214b0ae6aaa7d15e Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Wed, 3 Jun 2026 00:11:15 +0200
Subject: [PATCH 12/36] fix(playground): reconcile testcase rows when primary
 entity swaps (#4525)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the stale-key fix from execution-time stripping to the layer
where it belongs: the testcase row store, on swap of the primary
entity. The testcaseMolecule is shared across loadables, so when the
user swaps the chained app in the LLM-as-a-judge playground (anchor
positional swap in setEntityIdsAtom), the same rows now carry every
key the previous primary populated — `messages` from a prior chat
app, completion variables from a prior completion app, etc.

Reconciliation strategy (decided with the user):
- Closed schema (additionalProperties: false): drop any row key not
  declared by the new entity's inputSchema.properties. Drops silently
  — no toast, no confirm modal. Matches what the user typed for the
  new app and nothing more.
- Open schema (additionalProperties not set or true): only strip the
  CHAT_TRANSPORT_KEYS set (currently `messages`). Evaluator
  workflows that legitimately depend on additionalProperties spread
  keep receiving their extra testcase columns.
- Schema not resolved: skip. The execution-time strip in
  executionRunner.ts is the fallback during this hydration window —
  it will be removed in a follow-up commit once the row-layer fix is
  verified end-to-end and a reactive deferred reconciliation handles
  the hydration race.

Mutation goes through testcaseMolecule.actions.batchUpdate with
stale keys set to `undefined` (the store's update reducer
interprets that as a delete). Drafts are created per affected row.

A console.warn is emitted in two cases:
- schema-not-resolved on swap (so we can verify the hydration race
  surface area in practice).
- one summary per swap that lists which keys were dropped per row
  and the schema mode (closed vs open).
---
 .../state/controllers/playgroundController.ts | 111 +++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

diff --git a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
index 86a5394efd..845fe69e0e 100644
--- a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
+++ b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
@@ -24,7 +24,7 @@
 
 import {loadableStateAtomFamily} from "@agenta/entities/loadable"
 import {loadableController, snapshotAdapterRegistry} from "@agenta/entities/runnable"
-import {fetchTestcasesPage} from "@agenta/entities/testcase"
+import {fetchTestcasesPage, testcaseMolecule} from "@agenta/entities/testcase"
 import type {TraceSpan, TraceSpanNode} from "@agenta/entities/trace"
 import {extractAgData, extractInputs, extractOutputs} from "@agenta/entities/trace"
 import {
@@ -1907,6 +1907,13 @@ const setEntityIdsAtom = atom(null, (get, set, next: string[] | ((prev: string[]
                     oldLoadableId,
                     newLoadableId: newAnchorLoadableId,
                 })
+                // After the loadable re-link, the testcase row store is still
+                // carrying every key the *previous* primary populated (chat
+                // `messages`, old completion variables, etc.). Reconcile each
+                // row against the NEW primary's input schema so the UI shows
+                // only the relevant variables and execution doesn't have to
+                // strip them later (#4525 / AGE-3793).
+                pruneTestcaseRowsForEntity(get, set, anchorSwap.newEntityId)
             }
         }
     }
@@ -2125,6 +2132,108 @@ function relinkLoadableSessions(
     }
 }
 
+/**
+ * Chat-conversation transport keys that accumulate on a shared testcase row
+ * when a chat app runs (#4525 / AGE-3793). They are not template variables;
+ * they describe a conversation. Used as the strip set when the new primary's
+ * input schema is open (`additionalProperties !== false`), where the strict
+ * allow-list would over-prune evaluator rows that depend on extra columns.
+ */
+const CHAT_TRANSPORT_KEYS = ["messages"] as const
+
+/**
+ * Reconcile every testcase row against the new primary entity's input
+ * schema.
+ *
+ * Why this exists: the testcase row store (`testcaseMolecule`) is shared
+ * across loadables. When the user swaps the primary app in the LLM-as-a-
+ * judge playground (anchor swap in `setEntityIdsAtom`), the row data keeps
+ * every key the *previous* primary populated — chat `messages`, completion
+ * template variables that the new app doesn't declare, etc. Without
+ * reconciliation, those stale keys leak into the new app's request body
+ * via the downstream "spread all keys" fallbacks.
+ *
+ * Handling at the row layer (here) makes the UI immediately reflect the
+ * new app's variables and removes the need for execution-time stripping.
+ *
+ * Schema cases:
+ *   - Closed schema (`additionalProperties: false`): drop any row key not
+ *     declared by `inputSchema.properties`.
+ *   - Open schema (additionalProperties unset or true): only drop the
+ *     CHAT_TRANSPORT_KEYS set — evaluator workflows that legitimately
+ *     spread testcase columns still receive them.
+ *   - Schema not resolved: skip silently. The execution-time strip in
+ *     `executionRunner.ts` is the fallback during this hydration window.
+ *     A console.warn is emitted so we can see when it happens.
+ *
+ * Mutations go through `testcaseMolecule.actions.batchUpdate` setting
+ * stale keys to `undefined`, which the store's update reducer interprets
+ * as a delete. Drafts are created as needed (one per affected row).
+ */
+function pruneTestcaseRowsForEntity(get: Getter, set: Setter, entityId: string) {
+    const schemas = get(workflowMolecule.selectors.ioSchemas(entityId)) as
+        | {inputSchema?: unknown}
+        | undefined
+    const inputSchema = schemas?.inputSchema as
+        | {properties?: Record<string, unknown>; additionalProperties?: unknown}
+        | undefined
+    const properties = inputSchema?.properties
+    const declared =
+        properties && typeof properties === "object" ? new Set(Object.keys(properties)) : null
+
+    if (!declared) {
+        console.warn("[playgroundController.prune] schema-not-resolved on swap", {
+            entityId,
+            // Without the schema we can't safely decide what to drop. The
+            // execution-time strip catches the leak window for now.
+        })
+        return
+    }
+
+    const isClosedSchema = inputSchema?.additionalProperties === false
+    const displayRowIds = get(testcaseMolecule.atoms.displayRowIds)
+    if (!Array.isArray(displayRowIds) || displayRowIds.length === 0) return
+
+    const updates: {id: string; updates: {data: Record<string, unknown>}}[] = []
+    const droppedPerRow: Record<string, string[]> = {}
+
+    for (const rowId of displayRowIds) {
+        const row = get(testcaseMolecule.data(rowId))
+        const data = (row as {data?: Record<string, unknown>} | null)?.data
+        if (!data || typeof data !== "object") continue
+
+        const keysToDrop: string[] = []
+        if (isClosedSchema) {
+            for (const key of Object.keys(data)) {
+                if (!declared.has(key)) keysToDrop.push(key)
+            }
+        } else {
+            for (const key of CHAT_TRANSPORT_KEYS) {
+                if (key in data && !declared.has(key)) keysToDrop.push(key)
+            }
+        }
+        if (keysToDrop.length === 0) continue
+
+        const undefinedData: Record<string, unknown> = {}
+        for (const key of keysToDrop) {
+            undefinedData[key] = undefined
+        }
+        updates.push({id: rowId, updates: {data: undefinedData}})
+        droppedPerRow[rowId] = keysToDrop
+    }
+
+    if (updates.length === 0) return
+
+    console.warn("[playgroundController.prune] dropped stale keys after primary swap", {
+        entityId,
+        rowsAffected: updates.length,
+        schemaMode: isClosedSchema ? "closed" : "open",
+        droppedPerRow,
+    })
+
+    set(testcaseMolecule.actions.batchUpdate, updates)
+}
+
 /**
  * Switch one entity for another in the displayed selection.
  * Handles both single and comparison mode. The loadable-scoped re-link

From 96b2492825cb328b53c08a1bfb454fdc8857b6f9 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Wed, 3 Jun 2026 02:45:03 +0200
Subject: [PATCH 13/36] fix(playground): resolve input allow-list from
 inputPorts, not inputSchema (#4525)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause of the `context` leak that survived the prior fixes: both
the row prune and the runtime filter read the allow-list from
`workflowMolecule.selectors.ioSchemas(entityId).inputSchema.properties`,
which is EMPTY for completion apps. Completion apps express their
variables as prompt template placeholders surfaced through
`inputPorts`, not through the static input schema. So the filter
degraded to its empty-properties fallback (keep everything) and only
the hardcoded chat-transport strip removed `messages` — `context`
(a real chat template var, stale on the row) sailed through.

Diagnostic confirmation from the repro console:
  [executionRunner.filter] empty-properties fallback
    {entityId, dataKeys: ['messages','context','country'], hasMessagesKey: true}

Fix: new shared helper `state/helpers/entityInputContract.ts` that
resolves the allow-list the SAME way executionItems builds request
`variables`:
  variablesFromInputPorts = inputPorts[].key
  variablesFromPayload    = requestPayload.__meta.variables
                            ?? requestPayload.variables ?? []
  variables = variablesFromInputPorts.length > 0
                ? variablesFromInputPorts
                : variablesFromPayload
  (+ `messages` when executionMode === 'chat')

`reconcileRowDataForEntity` applies the policy:
  - app with resolved contract → strict allow-list (drops context+messages)
  - evaluator → chat-transport-only strip (preserves additionalProperties
    spread of extra testcase columns)
  - unresolved contract → chat-transport-only safety strip

Both consumers now delegate to it:
  - playgroundController.pruneTestcaseRowsForEntity (swap-time, primary fix;
    skips with a console.warn when a non-evaluator contract is unresolved)
  - executionRunner.reconcileEntityInputData (exec-time hydration safety net)

This collapses the three ad-hoc helpers (filterDataToEntityInputSchema,
stripChatTransportForEntity, getEntityInputSchema) into one correct
source-of-truth resolution and removes the now-misleading
empty-properties / schema-not-resolved diagnostics.
---
 .../state/controllers/playgroundController.ts |  85 ++++-----
 .../src/state/execution/executionRunner.ts    | 180 ++++--------------
 .../src/state/helpers/entityInputContract.ts  | 172 +++++++++++++++++
 3 files changed, 244 insertions(+), 193 deletions(-)
 create mode 100644 web/packages/agenta-playground/src/state/helpers/entityInputContract.ts

diff --git a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
index 845fe69e0e..40156efbda 100644
--- a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
+++ b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
@@ -83,6 +83,7 @@ import {
     newTestcaseDataHashAtom,
 } from "../execution/selectors"
 import {pruneDanglingConnections} from "../helpers/connectionGraph"
+import {reconcileRowDataForEntity, resolveEntityInputContract} from "../helpers/entityInputContract"
 import {extractAndLoadChatMessagesAtom} from "../helpers/extractAndLoadChatMessages"
 import {normalizeTestcaseRowsForLoad} from "../helpers/testcaseRowNormalization"
 import type {EntitySelection, PlaygroundNode, RunnableType} from "../types"
@@ -2132,18 +2133,9 @@ function relinkLoadableSessions(
     }
 }
 
-/**
- * Chat-conversation transport keys that accumulate on a shared testcase row
- * when a chat app runs (#4525 / AGE-3793). They are not template variables;
- * they describe a conversation. Used as the strip set when the new primary's
- * input schema is open (`additionalProperties !== false`), where the strict
- * allow-list would over-prune evaluator rows that depend on extra columns.
- */
-const CHAT_TRANSPORT_KEYS = ["messages"] as const
-
 /**
  * Reconcile every testcase row against the new primary entity's input
- * schema.
+ * contract.
  *
  * Why this exists: the testcase row store (`testcaseMolecule`) is shared
  * across loadables. When the user swaps the primary app in the LLM-as-a-
@@ -2153,73 +2145,62 @@ const CHAT_TRANSPORT_KEYS = ["messages"] as const
  * reconciliation, those stale keys leak into the new app's request body
  * via the downstream "spread all keys" fallbacks.
  *
- * Handling at the row layer (here) makes the UI immediately reflect the
- * new app's variables and removes the need for execution-time stripping.
+ * Handling at the row layer (here) makes the UI immediately reflect the new
+ * app's variables and is the primary fix; execution-time reconciliation in
+ * `executionRunner.ts` is only a hydration-window safety net.
  *
- * Schema cases:
- *   - Closed schema (`additionalProperties: false`): drop any row key not
- *     declared by `inputSchema.properties`.
- *   - Open schema (additionalProperties unset or true): only drop the
- *     CHAT_TRANSPORT_KEYS set — evaluator workflows that legitimately
- *     spread testcase columns still receive them.
- *   - Schema not resolved: skip silently. The execution-time strip in
- *     `executionRunner.ts` is the fallback during this hydration window.
- *     A console.warn is emitted so we can see when it happens.
+ * Allow-list source is `inputPorts` (via `resolveEntityInputContract`), NOT
+ * `inputSchema.properties` — completion apps surface their variables as
+ * prompt template placeholders through `inputPorts` and have an EMPTY static
+ * input schema, so schema-based filtering keeps everything. Policy:
+ *   - App with a resolved contract → strict: keep only declared keys.
+ *   - Evaluator → chat-transport only: evaluators spread extra testcase
+ *     columns, so we never strict-filter them.
+ *   - Unresolved contract (ports mid-hydration) → skip; the execution-time
+ *     reconciliation catches it. A console.warn is emitted so we can see
+ *     when the hydration window is hit.
  *
- * Mutations go through `testcaseMolecule.actions.batchUpdate` setting
- * stale keys to `undefined`, which the store's update reducer interprets
- * as a delete. Drafts are created as needed (one per affected row).
+ * Mutations go through `testcaseMolecule.actions.batchUpdate` setting stale
+ * keys to `undefined`, which the store's update reducer interprets as a
+ * delete. Drafts are created as needed (one per affected row).
  */
 function pruneTestcaseRowsForEntity(get: Getter, set: Setter, entityId: string) {
-    const schemas = get(workflowMolecule.selectors.ioSchemas(entityId)) as
-        | {inputSchema?: unknown}
-        | undefined
-    const inputSchema = schemas?.inputSchema as
-        | {properties?: Record<string, unknown>; additionalProperties?: unknown}
-        | undefined
-    const properties = inputSchema?.properties
-    const declared =
-        properties && typeof properties === "object" ? new Set(Object.keys(properties)) : null
+    const contract = resolveEntityInputContract(get, entityId)
 
-    if (!declared) {
-        console.warn("[playgroundController.prune] schema-not-resolved on swap", {
+    // Unresolved, non-evaluator contract → we can't strict-filter safely yet.
+    // The evaluator path is always "resolved enough" (chat-transport strip
+    // works without a variable list), so only bail for non-evaluator apps.
+    if (!contract.isEvaluator && !contract.resolved) {
+        console.warn("[playgroundController.prune] contract-not-resolved on swap", {
             entityId,
-            // Without the schema we can't safely decide what to drop. The
-            // execution-time strip catches the leak window for now.
+            // Without resolved inputPorts we can't decide what to drop. The
+            // execution-time reconciliation catches the leak window for now.
         })
         return
     }
 
-    const isClosedSchema = inputSchema?.additionalProperties === false
     const displayRowIds = get(testcaseMolecule.atoms.displayRowIds)
     if (!Array.isArray(displayRowIds) || displayRowIds.length === 0) return
 
     const updates: {id: string; updates: {data: Record<string, unknown>}}[] = []
     const droppedPerRow: Record<string, string[]> = {}
+    let strategyUsed: string | null = null
 
     for (const rowId of displayRowIds) {
         const row = get(testcaseMolecule.data(rowId))
         const data = (row as {data?: Record<string, unknown>} | null)?.data
         if (!data || typeof data !== "object") continue
 
-        const keysToDrop: string[] = []
-        if (isClosedSchema) {
-            for (const key of Object.keys(data)) {
-                if (!declared.has(key)) keysToDrop.push(key)
-            }
-        } else {
-            for (const key of CHAT_TRANSPORT_KEYS) {
-                if (key in data && !declared.has(key)) keysToDrop.push(key)
-            }
-        }
-        if (keysToDrop.length === 0) continue
+        const {dropped, strategy} = reconcileRowDataForEntity(get, entityId, data)
+        if (dropped.length === 0) continue
+        strategyUsed = strategy
 
         const undefinedData: Record<string, unknown> = {}
-        for (const key of keysToDrop) {
+        for (const key of dropped) {
             undefinedData[key] = undefined
         }
         updates.push({id: rowId, updates: {data: undefinedData}})
-        droppedPerRow[rowId] = keysToDrop
+        droppedPerRow[rowId] = dropped
     }
 
     if (updates.length === 0) return
@@ -2227,7 +2208,7 @@ function pruneTestcaseRowsForEntity(get: Getter, set: Setter, entityId: string)
     console.warn("[playgroundController.prune] dropped stale keys after primary swap", {
         entityId,
         rowsAffected: updates.length,
-        schemaMode: isClosedSchema ? "closed" : "open",
+        strategy: strategyUsed,
         droppedPerRow,
     })
 
diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index 0deb9de39a..d9106e8d8c 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -16,6 +16,7 @@ import {getDefaultStore} from "jotai/vanilla"
 
 import {messageIdsAtomFamily, messagesByIdAtomFamily} from "../chat/messageAtoms"
 import {SHARED_SESSION_ID, type ChatMessage} from "../chat/messageTypes"
+import {reconcileRowDataForEntity} from "../helpers/entityInputContract"
 import type {OutputConnection, PlaygroundNode} from "../types"
 
 import {
@@ -237,133 +238,38 @@ function buildEvaluatorSelfReferences(params: {
 }
 
 /**
- * Keys that are chat-conversation transport (not template variables). They
- * accumulate on the shared testcase row when a chat app runs, and must not
- * leak into a non-chat entity's request body (issue #4525 / AGE-3793).
+ * Reconcile row data to an entity's input contract at execution time.
  *
- * Kept conservative: only `messages`. `chatHistory` is constructed at
- * runtime from the flat message store, not stored on row data.
- */
-const CHAT_TRANSPORT_KEYS = ["messages"] as const
-
-function getEntityInputSchema(
-    get: Getter,
-    entityId: string,
-): {properties?: Record<string, unknown>; additionalProperties?: unknown} | undefined {
-    const schemas = get(workflowMolecule.selectors.ioSchemas(entityId)) as
-        | {inputSchema?: unknown}
-        | undefined
-    return schemas?.inputSchema as
-        | {properties?: Record<string, unknown>; additionalProperties?: unknown}
-        | undefined
-}
-
-/**
- * Filter row data to only keys present in the entity's input schema.
+ * This is the runtime safety net for #4525 / AGE-3793: testcase rows live in
+ * a shared store and preserve every key the user ever ran with (chat apps
+ * populate `messages`, completion apps populate template variables, etc.).
+ * When the user swaps the primary app, the same row carries stale keys.
  *
- * Why this exists: testcase rows live in the local testcase molecule and
- * preserve every key the user ever ran with (chat apps populate
- * `messages`/`context`, completion apps populate template variables, etc.).
- * When the user swaps the primary app — e.g. in the LLM-as-a-judge playground
- * switching the chained app from chat to completion (issue #4525 /
- * AGE-3793) — the same row data carries stale chat keys into the completion
- * request.
- *
- * Downstream there's already filtering by `variables` (input ports) in
- * `resolveVariableValues` / `buildCompletionInputRow`, but it falls back to
- * "spread all keys" when `variables` is empty (entity still hydrating, or a
- * workflow with no declared input ports). This helper filters AT THE RUNNER
- * before the data leaves for stage execution, so stale keys can't slip
- * through the fallback.
- *
- * Fallback contract: when the entity has no resolvable input schema
- * (`properties` missing / empty), return the data unchanged — preserves the
- * pre-fix behavior so workflows that genuinely depend on free-form input
- * (e.g. `__rawBody` app workflows with `__meta.variables`) aren't broken.
- */
-function filterDataToEntityInputSchema(
-    get: Getter,
-    data: Record<string, unknown>,
-    entityId: string,
-): Record<string, unknown> {
-    const inputSchema = getEntityInputSchema(get, entityId)
-    const properties = inputSchema?.properties
-    if (!properties || typeof properties !== "object") {
-        // Schema not resolved — emits when the strict allow-list filter can't
-        // run and the secondary stripChatTransportForEntity pass becomes the
-        // only line of defense against stale chat keys leaking through.
-        // If this fires on a chat→completion swap repro, the secondary strip
-        // is the one keeping `messages` out of the request body.
-        console.warn("[executionRunner.filter] schema-not-resolved fallback", {
-            entityId,
-            reason: properties === undefined ? "no-properties" : "properties-not-object",
-            dataKeys: Object.keys(data),
-            hasMessagesKey: "messages" in data,
-        })
-        return {...data}
-    }
-    const allowedKeys = new Set(Object.keys(properties))
-    if (allowedKeys.size === 0) {
-        console.warn("[executionRunner.filter] empty-properties fallback", {
-            entityId,
-            dataKeys: Object.keys(data),
-            hasMessagesKey: "messages" in data,
-        })
-        return {...data}
-    }
-    const filtered: Record<string, unknown> = {}
-    for (const [key, value] of Object.entries(data)) {
-        if (allowedKeys.has(key)) {
-            filtered[key] = value
-        }
-    }
-    return filtered
-}
-
-/**
- * Strip chat-transport keys from row data unless the target entity
- * explicitly declares them as inputs. Used as a defensive pre-filter for
- * the chain / evaluator input-building paths (depth > 0), where the
- * filter-by-properties pass above isn't applied because chain mappings
- * and evaluator schema resolution own their own input shaping.
+ * Reconciliation primarily happens at swap time in the playground controller
+ * (`pruneTestcaseRowsForEntity`); this pass catches the hydration window
+ * where the new entity's input contract wasn't yet resolved at swap time but
+ * IS resolved by the time the request is built.
  *
- * Difference from `filterDataToEntityInputSchema`: that helper does a
- * strict allow-list (great for primary/root entities with closed
- * schemas); this one only strips a known-leaky set so evaluators that
- * legitimately depend on `additionalProperties: true` spread keep
- * receiving their extra testcase columns.
+ * Delegates to the shared `reconcileRowDataForEntity` — allow-list derived
+ * from `inputPorts` (the same source `executionItems` uses for `variables`),
+ * NOT `inputSchema.properties` (empty for completion apps). Apps get a strict
+ * allow-list; evaluators / unresolved contracts get a chat-transport-only
+ * strip so workflows depending on extra testcase columns keep working.
  */
-function stripChatTransportForEntity(
+function reconcileEntityInputData(
     get: Getter,
     data: Record<string, unknown>,
     entityId: string,
 ): Record<string, unknown> {
-    const inputSchema = getEntityInputSchema(get, entityId)
-    const properties = inputSchema?.properties
-    const declared =
-        properties && typeof properties === "object"
-            ? new Set(Object.keys(properties))
-            : new Set<string>()
-    const stripped: string[] = []
-    const out: Record<string, unknown> = {...data}
-    for (const key of CHAT_TRANSPORT_KEYS) {
-        if (key in out && !declared.has(key)) {
-            delete out[key]
-            stripped.push(key)
-        }
-    }
-    if (stripped.length > 0) {
-        // Logged only when this strip actually drops a key — i.e. the
-        // entity didn't declare it but the testcase row carried it
-        // (typical signal: a chat→completion swap leaving stale `messages`).
-        console.warn("[executionRunner.filter] stripped chat-transport keys", {
+    const {data: next, dropped, strategy} = reconcileRowDataForEntity(get, entityId, data)
+    if (dropped.length > 0) {
+        console.warn("[executionRunner.filter] reconciled stale row keys", {
             entityId,
-            stripped,
-            schemaResolved: declared.size > 0,
+            strategy,
+            dropped,
         })
-        return out
     }
-    return data
+    return next
 }
 
 function createConcurrencyLimiter(concurrency: number) {
@@ -564,28 +470,25 @@ export async function executeStepForSessionWithExecutionItems(
 
                     let nodeInputs: Record<string, unknown>
                     if (node.depth === 0) {
-                        // Filter to the root entity's declared input schema so stale
-                        // keys from a previous primary app (e.g. chat `messages` /
-                        // `context` after swapping the upstream app in the
+                        // Reconcile the row to the root entity's input contract so
+                        // stale keys from a previous primary app (e.g. chat `messages`
+                        // / `context` after swapping the upstream app in the
                         // LLM-as-a-judge playground — issue #4525 / AGE-3793) don't
                         // leak into the new app's request body via the downstream
-                        // "spread all keys" fallback in resolveVariableValues.
+                        // "spread all keys" fallback in resolveVariableValues. Apps
+                        // get a strict allow-list (from inputPorts); evaluators get a
+                        // chat-transport-only strip.
                         const rootEntityId = node.entity.id as string
-                        const filtered = filterDataToEntityInputSchema(get, data, rootEntityId)
-                        // Defense in depth: if the strict filter fell back to spreading
-                        // all keys (schema not yet resolved), still strip known chat-
-                        // transport keys unless the entity declares them. Without this
-                        // the bug repros while the new app's schema is mid-hydration.
-                        nodeInputs = stripChatTransportForEntity(get, filtered, rootEntityId)
+                        nodeInputs = reconcileEntityInputData(get, data, rootEntityId)
                     } else {
-                        // Strip chat-transport keys from testcase data before chain /
-                        // evaluator input construction, so the downstream "spread all
-                        // keys" fallbacks (resolveChainInputs no-mapping branch and
+                        // Reconcile testcase data before chain / evaluator input
+                        // construction, so the downstream "spread all keys" fallbacks
+                        // (resolveChainInputs no-mapping branch and
                         // buildEvaluatorExecutionInputs additionalProperties spread)
-                        // can't carry stale `messages` from a previous chat app into
-                        // the current target entity (#4525 / AGE-3793).
+                        // can't carry stale keys from a previous app into the current
+                        // target entity (#4525 / AGE-3793).
                         const targetEntityId = node.entity.id as string
-                        const dataForChain = stripChatTransportForEntity(get, data, targetEntityId)
+                        const dataForChain = reconcileEntityInputData(get, data, targetEntityId)
 
                         // Check whether the incoming connection has explicit valid mappings.
                         // resolveChainInputs always returns non-empty (fallback spreads testcaseData
@@ -809,15 +712,10 @@ export async function executeStepForSessionWithExecutionItems(
                 if (abortController.signal.aborted) break
 
                 const perSession2 = sessionOptions?.[session.id]
-                // Same input-schema filter as the first-run path above —
-                // repetitions hit the same root entity, so stale keys must be
-                // filtered identically (issue #4525 / AGE-3793).
-                const repFiltered = filterDataToEntityInputSchema(get, data, session.runnableId)
-                const nodeInputs2 = stripChatTransportForEntity(
-                    get,
-                    repFiltered,
-                    session.runnableId,
-                )
+                // Same reconciliation as the first-run path above — repetitions
+                // hit the same root entity, so stale keys must be filtered
+                // identically (issue #4525 / AGE-3793).
+                const nodeInputs2 = reconcileEntityInputData(get, data, session.runnableId)
                 const repetitionItem = rootExecutionHandle.retry({
                     get,
                     headers: perSession2?.headers ?? {},
diff --git a/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts b/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts
new file mode 100644
index 0000000000..dd249ffdd9
--- /dev/null
+++ b/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts
@@ -0,0 +1,172 @@
+/**
+ * Entity input contract resolution.
+ *
+ * Single source of truth for "what testcase row keys does this entity
+ * legitimately consume as inputs". Used to reconcile shared testcase rows
+ * when the primary entity changes (#4525 / AGE-3793) — the testcase store is
+ * shared across loadables, so a row keeps every key the *previous* primary
+ * populated (chat `messages`, a prior completion app's template variables,
+ * etc.). Those stale keys must not leak into the new entity's request body.
+ *
+ * CRITICAL: the allow-list is derived from `inputPorts`, NOT
+ * `inputSchema.properties`. Completion apps express their variables as prompt
+ * template placeholders surfaced through `inputPorts`; their static
+ * `inputSchema.properties` is EMPTY. Reading the schema there yields an empty
+ * allow-list and the filter degrades to "keep everything" — which is exactly
+ * the bug. `inputPorts` is also the same source `executionItems` uses to
+ * build the request `variables`, so filtering against it is guaranteed
+ * consistent with what actually gets sent.
+ */
+import {workflowMolecule} from "@agenta/entities/workflow"
+import type {Getter} from "jotai"
+
+/**
+ * Chat-conversation transport keys. They accumulate on a shared testcase row
+ * when a chat app runs and are not template variables — they describe a
+ * conversation. Stripped from non-chat entities. Kept conservative (only
+ * `messages`); `chatHistory` is rebuilt at runtime from the flat message
+ * store, never stored on row data.
+ */
+export const CHAT_TRANSPORT_KEYS = ["messages"] as const
+
+export interface EntityInputContract {
+    /**
+     * Keys the entity legitimately consumes as testcase inputs. Includes
+     * `messages` for chat apps. Empty when nothing could be resolved.
+     */
+    allowedKeys: Set<string>
+    /**
+     * True when we have a confident allow-list to strict-filter against:
+     * the entity surfaced at least one input variable, or it's a chat app
+     * (an empty-variable chat app is still valid — it consumes `messages`).
+     */
+    resolved: boolean
+    /**
+     * Evaluators get OPEN-schema treatment: they may spread arbitrary extra
+     * testcase columns (`additionalProperties`), so we never strict-filter
+     * their rows — only strip known chat-transport keys.
+     */
+    isEvaluator: boolean
+    /** Chat apps keep `messages`. */
+    isChat: boolean
+}
+
+function isNonEmptyString(value: unknown): value is string {
+    return typeof value === "string" && value.length > 0
+}
+
+/**
+ * Resolve the input contract for an entity, mirroring the variable
+ * resolution in `executionItems.ts` exactly:
+ *
+ *   variablesFromInputPorts = inputPorts[].key
+ *   variablesFromPayload     = requestPayload.__meta.variables
+ *                              ?? requestPayload.variables ?? []
+ *   variables = inputPorts.length > 0 ? inputPorts : payload
+ *
+ * plus `messages` when the entity runs in chat mode.
+ */
+export function resolveEntityInputContract(get: Getter, entityId: string): EntityInputContract {
+    const entity = get(workflowMolecule.selectors.data(entityId)) as
+        | {flags?: Record<string, unknown> | null}
+        | null
+        | undefined
+    const isEvaluator = !!entity?.flags?.is_evaluator
+
+    const mode = get(workflowMolecule.selectors.executionMode(entityId)) as
+        | "chat"
+        | "completion"
+        | undefined
+    const isChat = mode === "chat"
+
+    const inputPorts = (get(workflowMolecule.selectors.inputPorts(entityId)) ?? []) as {
+        key?: unknown
+    }[]
+    const variablesFromInputPorts = Array.from(
+        new Set(inputPorts.map((port) => port?.key).filter(isNonEmptyString)),
+    )
+
+    const requestPayload = get(workflowMolecule.selectors.requestPayload(entityId)) as
+        | {variables?: unknown; __meta?: {variables?: unknown} | null}
+        | null
+        | undefined
+    const metaVariables = requestPayload?.__meta?.variables
+    const payloadVariables = requestPayload?.variables
+    const rawPayloadVariables: unknown[] = Array.isArray(metaVariables)
+        ? metaVariables
+        : Array.isArray(payloadVariables)
+          ? payloadVariables
+          : []
+    const variablesFromPayload = rawPayloadVariables.filter(isNonEmptyString)
+
+    const variables =
+        variablesFromInputPorts.length > 0 ? variablesFromInputPorts : variablesFromPayload
+
+    const allowedKeys = new Set(variables)
+    if (isChat) allowedKeys.add("messages")
+
+    const resolved = variables.length > 0 || isChat
+
+    return {allowedKeys, resolved, isEvaluator, isChat}
+}
+
+export type ReconcileStrategy = "strict" | "chat-transport" | "skip"
+
+export interface ReconcileResult {
+    /** The reconciled data (new object only when keys were dropped). */
+    data: Record<string, unknown>
+    /** Keys that were removed. Empty when nothing changed. */
+    dropped: string[]
+    /** Which policy ran. */
+    strategy: ReconcileStrategy
+}
+
+/**
+ * Reconcile a row's data to an entity's input contract.
+ *
+ * Policy:
+ *  - Evaluator → `chat-transport`: only strip chat-transport keys the entity
+ *    doesn't declare. Preserves evaluators that spread additional testcase
+ *    columns.
+ *  - App with a resolved contract → `strict`: keep only declared keys.
+ *  - Unresolved (schema/ports mid-hydration, non-evaluator) → `chat-transport`
+ *    as a safety net; the caller may choose to defer a strict pass until the
+ *    contract resolves.
+ */
+export function reconcileRowDataForEntity(
+    get: Getter,
+    entityId: string,
+    data: Record<string, unknown>,
+): ReconcileResult {
+    const contract = resolveEntityInputContract(get, entityId)
+
+    const useStrict = !contract.isEvaluator && contract.resolved
+
+    if (useStrict) {
+        const dropped: string[] = []
+        const next: Record<string, unknown> = {}
+        for (const [key, value] of Object.entries(data)) {
+            if (contract.allowedKeys.has(key)) {
+                next[key] = value
+            } else {
+                dropped.push(key)
+            }
+        }
+        return dropped.length > 0
+            ? {data: next, dropped, strategy: "strict"}
+            : {data, dropped, strategy: "strict"}
+    }
+
+    // chat-transport strip (evaluators + unresolved contracts)
+    const dropped: string[] = []
+    const next: Record<string, unknown> = {...data}
+    for (const key of CHAT_TRANSPORT_KEYS) {
+        if (key in next && !contract.allowedKeys.has(key)) {
+            delete next[key]
+            dropped.push(key)
+        }
+    }
+    return dropped.length > 0
+        ? {data: next, dropped, strategy: "chat-transport"}
+        : {data, dropped, strategy: "chat-transport"}
+}

From d0442d3757e189009361b4d31b0b4868818b2cd9 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Wed, 3 Jun 2026 03:34:01 +0200
Subject: [PATCH 14/36] fix(playground): clean shared testcase row before run,
 not just app inputs (#4525)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prior fix stripped stale keys from the APP's request inputs, but the
trace still showed `context` because it surfaces in the downstream
EVALUATOR's {inputs, outputs} envelope. The evaluator reads the SAME
shared testcase row, and the evaluator policy (chat-transport-only)
intentionally preserves non-`messages` keys — so `context` survived
there. The UI row also still showed it.

Also: the swap-time prune in setEntityIdsAtom never fired for this flow —
the evaluator playground selects the app via add/remove node actions in
ConfigureEvaluator, not a setEntityIds positional swap (no
[playgroundController.prune] log appeared in the repro).

Fix: reconcile the shared testcase row against the ROOT entity's input
contract in webWorkerIntegration, right before execution — path-agnostic,
fires on every run regardless of how the app was selected. The cleaned
row is:
  - passed to the runner (so app request AND evaluator envelope are clean),
  - written back via loadableController.actions.updateRow (so the UI and
    future runs reflect it; undefined values delete the keys).

Evaluator-referenced columns are protected: collectDownstreamReferencedColumns
gathers testcase columns named by downstream evaluator `<input>_key`
settings (e.g. correct_answer_key → ground_truth) and passes them as
protectedKeys, so a strict clean against the app contract never drops
intentional evaluation inputs.

reconcileRowDataForEntity gains an optional protectedKeys set; a key
survives strict filtering when it's in the app allow-list OR protected, and
a protected chat-transport key is likewise kept by the chat-transport strip.

Emits [webWorker.reconcile] when keys are dropped, listing the strategy,
dropped keys, and protected columns.
---
 .../state/execution/webWorkerIntegration.ts   | 43 ++++++++++++++-
 .../src/state/helpers/entityInputContract.ts  | 54 +++++++++++++++++--
 2 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts b/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
index 3a4959f6e0..45a4af1777 100644
--- a/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
+++ b/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
@@ -19,6 +19,10 @@ import {queryClientAtom} from "jotai-tanstack-query"
 import {outputConnectionsAtom} from "../atoms/connections"
 import {entityIdsAtom, playgroundNodesAtom} from "../atoms/playground"
 import {clearSessionResponsesAtom, messageIdsAtomFamily, messagesByIdAtomFamily} from "../chat"
+import {
+    collectDownstreamReferencedColumns,
+    reconcileRowDataForEntity,
+} from "../helpers/entityInputContract"
 
 import {executionConcurrencyAtom, repetitionCountAtom} from "./atoms"
 import {handleExecutionResultAtom} from "./executionItems"
@@ -316,7 +320,44 @@ export const triggerExecutionAtom = atom(
         const rowEntry = get(loadableController.selectors.row(loadableId, testcaseRowId)) as {
             data?: Record<string, unknown>
         } | null
-        const testcaseData: Record<string, unknown> = rowEntry?.data ?? {}
+        const rawTestcaseData: Record<string, unknown> = rowEntry?.data ?? {}
+
+        // Reconcile the shared testcase row against the ROOT entity's input
+        // contract before execution (#4525 / AGE-3793). The testcase store is
+        // shared across loadables, so the row keeps every key a previous
+        // primary populated — chat `messages`/`context` after swapping the
+        // upstream app from chat to completion. Cleaning here:
+        //   (a) keeps stale keys out of the app request,
+        //   (b) keeps them out of the downstream evaluator's {inputs, outputs}
+        //       envelope (the evaluator reads this same row), and
+        //   (c) persists the cleaned row so the UI + future runs reflect it.
+        // This is path-agnostic: it fires no matter how the app was selected,
+        // unlike the swap-time prune which only covers setEntityIds positional
+        // swaps. Columns a downstream evaluator references via `<input>_key`
+        // settings (e.g. correct_answer_key → ground_truth) are protected so a
+        // strict clean against the app contract doesn't drop intentional eval
+        // inputs.
+        const protectedColumns = collectDownstreamReferencedColumns(get, nodes)
+        const reconciledRow = reconcileRowDataForEntity(get, rootEntityId, rawTestcaseData, {
+            protectedKeys: protectedColumns,
+        })
+        const testcaseData: Record<string, unknown> = reconciledRow.data
+        if (reconciledRow.dropped.length > 0) {
+            const undefinedPatch: Record<string, unknown> = {}
+            for (const key of reconciledRow.dropped) {
+                undefinedPatch[key] = undefined
+            }
+            // Persist the cleaned row (deletes the dropped keys via the
+            // testcase store's undefined-means-delete semantics).
+            set(loadableController.actions.updateRow, loadableId, logicalRowId, undefinedPatch)
+            console.warn("[webWorker.reconcile] cleaned stale row keys before run", {
+                rootEntityId,
+                rowId: logicalRowId,
+                strategy: reconciledRow.strategy,
+                dropped: reconciledRow.dropped,
+                protectedColumns: Array.from(protectedColumns),
+            })
+        }
 
         // In comparison mode, filter nodes to only include the effective variant's
         // root + downstream nodes. Other depth-0 comparison variants are excluded
diff --git a/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts b/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts
index dd249ffdd9..5209f9b892 100644
--- a/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts
+++ b/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts
@@ -121,6 +121,17 @@ export interface ReconcileResult {
     strategy: ReconcileStrategy
 }
 
+export interface ReconcileOptions {
+    /**
+     * Keys to keep even when they aren't in the entity's allow-list. Used to
+     * protect testcase columns that a DOWNSTREAM evaluator consumes via its
+     * `<input>_key` settings (e.g. `correct_answer_key → ground_truth`). The
+     * primary app doesn't declare them, but they're intentional evaluation
+     * columns — not stale leftovers — so a strict clean must not drop them.
+     */
+    protectedKeys?: ReadonlySet<string>
+}
+
 /**
  * Reconcile a row's data to an entity's input contract.
  *
@@ -128,7 +139,8 @@ export interface ReconcileResult {
  *  - Evaluator → `chat-transport`: only strip chat-transport keys the entity
  *    doesn't declare. Preserves evaluators that spread additional testcase
  *    columns.
- *  - App with a resolved contract → `strict`: keep only declared keys.
+ *  - App with a resolved contract → `strict`: keep only declared (or
+ *    protected) keys.
  *  - Unresolved (schema/ports mid-hydration, non-evaluator) → `chat-transport`
  *    as a safety net; the caller may choose to defer a strict pass until the
  *    contract resolves.
@@ -137,8 +149,10 @@ export function reconcileRowDataForEntity(
     get: Getter,
     entityId: string,
     data: Record<string, unknown>,
+    options?: ReconcileOptions,
 ): ReconcileResult {
     const contract = resolveEntityInputContract(get, entityId)
+    const protectedKeys = options?.protectedKeys
 
     const useStrict = !contract.isEvaluator && contract.resolved
 
@@ -146,7 +160,7 @@ export function reconcileRowDataForEntity(
         const dropped: string[] = []
         const next: Record<string, unknown> = {}
         for (const [key, value] of Object.entries(data)) {
-            if (contract.allowedKeys.has(key)) {
+            if (contract.allowedKeys.has(key) || protectedKeys?.has(key)) {
                 next[key] = value
             } else {
                 dropped.push(key)
@@ -161,7 +175,7 @@ export function reconcileRowDataForEntity(
     const dropped: string[] = []
     const next: Record<string, unknown> = {...data}
     for (const key of CHAT_TRANSPORT_KEYS) {
-        if (key in next && !contract.allowedKeys.has(key)) {
+        if (key in next && !contract.allowedKeys.has(key) && !protectedKeys?.has(key)) {
             delete next[key]
             dropped.push(key)
         }
@@ -170,3 +184,37 @@ export function reconcileRowDataForEntity(
         ? {data: next, dropped, strategy: "chat-transport"}
         : {data, dropped, strategy: "chat-transport"}
 }
+
+/**
+ * Collect testcase column names that downstream evaluator nodes reference via
+ * their `<input>_key` settings (e.g. `correct_answer_key → ground_truth`).
+ *
+ * These columns are intentional evaluation inputs the primary app doesn't
+ * declare, so a strict row clean against the app contract must protect them
+ * (pass the result as `reconcileRowDataForEntity`'s `protectedKeys`).
+ *
+ * Mirrors the `<key>_key` resolution in `buildEvaluatorExecutionInputs`
+ * (`@agenta/entities/runnable`): a setting named `<input>_key` whose string
+ * value names a column, optionally prefixed `testcase.`.
+ */
+export function collectDownstreamReferencedColumns(
+    get: Getter,
+    nodes: readonly {depth: number; entityId: string}[],
+): Set<string> {
+    const columns = new Set<string>()
+    for (const node of nodes) {
+        if (node.depth === 0) continue
+        const settings = get(workflowMolecule.selectors.configuration(node.entityId)) as
+            | Record<string, unknown>
+            | null
+            | undefined
+        if (!settings || typeof settings !== "object") continue
+        for (const [key, value] of Object.entries(settings)) {
+            if (!key.endsWith("_key")) continue
+            if (typeof value !== "string" || value.length === 0) continue
+            const column = value.startsWith("testcase.") ? value.split(".")[1] : value
+            if (column) columns.add(column)
+        }
+    }
+    return columns
+}

From de548da6359366c0c8a8081b43f4f6dce32f625d Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Wed, 3 Jun 2026 10:27:19 +0200
Subject: [PATCH 15/36] fix(playground): clean stale row on app swap + drop
 diagnostic logs (#4525)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups now that the row-reconciliation fix is verified working:

1. Clean-on-swap: the evaluator playground selects an app via
   changePrimaryNode + connectDownstreamNode (ConfigureEvaluator), not a
   setEntityIds positional swap — so the previously-wired swap-time prune
   never fired there. Add playgroundController.actions.reconcileRowsToPrimary
   and call it from connectAppToEvaluatorAtom AFTER connectDownstreamNode, so
   the shared testcase row is cleaned the instant the app changes (not only at
   run time). Running after the downstream connect means the evaluator's
   referenced columns (correct_answer_key → ground_truth, etc.) are protected
   from the strict app-contract clean.

   pruneTestcaseRowsForEntity now:
   - collects downstream-evaluator protected columns,
   - returns a status ('acted' | 'noop' | 'unresolved').

   reconcileRowsToPrimary handles the hydration race: if the new primary's
   inputPorts aren't resolved yet AND the entity isn't loaded, it subscribes
   to inputPorts and re-runs the prune on each change, unsubscribing as soon
   as the prune is no longer 'unresolved' or the entity has loaded. If the
   entity is loaded but genuinely has no variables, it doesn't subscribe (no
   dangling sub). The run-time reconciliation in webWorkerIntegration remains
   the backstop.

2. Remove diagnostic logs added while tracing the bug:
   [executionRunner.filter], [webWorker.reconcile],
   [playgroundController.prune]. The reconcile + writeback logic stays; only
   the console.warn telemetry is dropped.
---
 .../components/ConfigureEvaluator/atoms.ts    |   9 ++
 .../state/controllers/playgroundController.ts | 110 ++++++++++++------
 .../src/state/execution/executionRunner.ts    |  10 +-
 .../state/execution/webWorkerIntegration.ts   |   7 --
 4 files changed, 83 insertions(+), 53 deletions(-)

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
index b0a0ed0426..32fd75511e 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
@@ -188,6 +188,15 @@ export const connectAppToEvaluatorAtom = atom(
             },
         })
 
+        // Clean the shared testcase row against the newly-selected app's input
+        // contract so stale keys from a previously-selected app (e.g. chat
+        // `messages`/`context` after swapping a chat app for a completion app)
+        // are dropped immediately — not only at run time (#4525 / AGE-3793).
+        // Runs AFTER connectDownstreamNode so the evaluator is in the graph and
+        // its referenced columns (correct_answer_key → ground_truth, etc.) are
+        // protected from the strict app-contract clean.
+        set(playgroundController.actions.reconcileRowsToPrimary)
+
         // Persist only after both graph mutations succeeded. The picker
         // display label is derived from the depth-0 node's `label` via
         // `selectedAppLabelAtom`, so no extra write needed here.
diff --git a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
index 40156efbda..d974eb5e0c 100644
--- a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
+++ b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
@@ -83,7 +83,11 @@ import {
     newTestcaseDataHashAtom,
 } from "../execution/selectors"
 import {pruneDanglingConnections} from "../helpers/connectionGraph"
-import {reconcileRowDataForEntity, resolveEntityInputContract} from "../helpers/entityInputContract"
+import {
+    collectDownstreamReferencedColumns,
+    reconcileRowDataForEntity,
+    resolveEntityInputContract,
+} from "../helpers/entityInputContract"
 import {extractAndLoadChatMessagesAtom} from "../helpers/extractAndLoadChatMessages"
 import {normalizeTestcaseRowsForLoad} from "../helpers/testcaseRowNormalization"
 import type {EntitySelection, PlaygroundNode, RunnableType} from "../types"
@@ -2133,88 +2137,113 @@ function relinkLoadableSessions(
     }
 }
 
+type PruneStatus = "acted" | "noop" | "unresolved"
+
 /**
- * Reconcile every testcase row against the new primary entity's input
- * contract.
+ * Reconcile every testcase row against the given entity's input contract.
  *
  * Why this exists: the testcase row store (`testcaseMolecule`) is shared
  * across loadables. When the user swaps the primary app in the LLM-as-a-
- * judge playground (anchor swap in `setEntityIdsAtom`), the row data keeps
- * every key the *previous* primary populated — chat `messages`, completion
- * template variables that the new app doesn't declare, etc. Without
- * reconciliation, those stale keys leak into the new app's request body
- * via the downstream "spread all keys" fallbacks.
- *
- * Handling at the row layer (here) makes the UI immediately reflect the new
- * app's variables and is the primary fix; execution-time reconciliation in
- * `executionRunner.ts` is only a hydration-window safety net.
+ * judge playground, the row data keeps every key the *previous* primary
+ * populated — chat `messages`, completion template variables that the new
+ * app doesn't declare, etc. Without reconciliation, those stale keys leak
+ * into the new app's request body and the downstream evaluator's envelope.
  *
  * Allow-list source is `inputPorts` (via `resolveEntityInputContract`), NOT
  * `inputSchema.properties` — completion apps surface their variables as
  * prompt template placeholders through `inputPorts` and have an EMPTY static
  * input schema, so schema-based filtering keeps everything. Policy:
- *   - App with a resolved contract → strict: keep only declared keys.
+ *   - App with a resolved contract → strict: keep only declared (or
+ *     downstream-evaluator-protected) keys.
  *   - Evaluator → chat-transport only: evaluators spread extra testcase
  *     columns, so we never strict-filter them.
- *   - Unresolved contract (ports mid-hydration) → skip; the execution-time
- *     reconciliation catches it. A console.warn is emitted so we can see
- *     when the hydration window is hit.
+ *   - Unresolved contract (ports mid-hydration) → no-op; returns
+ *     `"unresolved"` so the caller can retry once the contract resolves. The
+ *     run-time reconciliation in `webWorkerIntegration` is the backstop.
+ *
+ * Columns referenced by downstream evaluator `<input>_key` settings (e.g.
+ * `correct_answer_key → ground_truth`) are protected so a strict clean
+ * against the app contract doesn't drop intentional evaluation inputs.
  *
  * Mutations go through `testcaseMolecule.actions.batchUpdate` setting stale
  * keys to `undefined`, which the store's update reducer interprets as a
  * delete. Drafts are created as needed (one per affected row).
  */
-function pruneTestcaseRowsForEntity(get: Getter, set: Setter, entityId: string) {
+function pruneTestcaseRowsForEntity(get: Getter, set: Setter, entityId: string): PruneStatus {
     const contract = resolveEntityInputContract(get, entityId)
 
     // Unresolved, non-evaluator contract → we can't strict-filter safely yet.
     // The evaluator path is always "resolved enough" (chat-transport strip
     // works without a variable list), so only bail for non-evaluator apps.
     if (!contract.isEvaluator && !contract.resolved) {
-        console.warn("[playgroundController.prune] contract-not-resolved on swap", {
-            entityId,
-            // Without resolved inputPorts we can't decide what to drop. The
-            // execution-time reconciliation catches the leak window for now.
-        })
-        return
+        return "unresolved"
     }
 
     const displayRowIds = get(testcaseMolecule.atoms.displayRowIds)
-    if (!Array.isArray(displayRowIds) || displayRowIds.length === 0) return
+    if (!Array.isArray(displayRowIds) || displayRowIds.length === 0) return "noop"
+
+    const protectedColumns = collectDownstreamReferencedColumns(get, get(playgroundNodesAtom))
 
     const updates: {id: string; updates: {data: Record<string, unknown>}}[] = []
-    const droppedPerRow: Record<string, string[]> = {}
-    let strategyUsed: string | null = null
 
     for (const rowId of displayRowIds) {
         const row = get(testcaseMolecule.data(rowId))
         const data = (row as {data?: Record<string, unknown>} | null)?.data
         if (!data || typeof data !== "object") continue
 
-        const {dropped, strategy} = reconcileRowDataForEntity(get, entityId, data)
+        const {dropped} = reconcileRowDataForEntity(get, entityId, data, {
+            protectedKeys: protectedColumns,
+        })
         if (dropped.length === 0) continue
-        strategyUsed = strategy
 
         const undefinedData: Record<string, unknown> = {}
         for (const key of dropped) {
             undefinedData[key] = undefined
         }
         updates.push({id: rowId, updates: {data: undefinedData}})
-        droppedPerRow[rowId] = dropped
     }
 
-    if (updates.length === 0) return
-
-    console.warn("[playgroundController.prune] dropped stale keys after primary swap", {
-        entityId,
-        rowsAffected: updates.length,
-        strategy: strategyUsed,
-        droppedPerRow,
-    })
+    if (updates.length === 0) return "noop"
 
     set(testcaseMolecule.actions.batchUpdate, updates)
+    return "acted"
 }
 
+/**
+ * Reconcile all testcase rows against the CURRENT primary (depth-0) entity's
+ * input contract, on demand — call this right after a primary swap so the
+ * shared row is cleaned the instant the app changes, without waiting for a
+ * run. The run-time reconciliation in `webWorkerIntegration` is the backstop.
+ *
+ * Hydration handling: the new primary's input ports may not be resolved at
+ * call time (the workflow is still loading). When the prune reports
+ * `"unresolved"` AND the entity isn't loaded yet, we subscribe to its
+ * `inputPorts` and retry once they resolve, then unsubscribe. If the entity
+ * is already loaded but has no resolvable variables, there's nothing to wait
+ * for, so we don't subscribe (avoids a dangling subscription).
+ */
+const reconcileRowsToPrimaryAtom = atom(null, (get, set) => {
+    const nodes = get(playgroundNodesAtom)
+    const primary = nodes.find((node) => node.depth === 0)
+    if (!primary) return
+    const entityId = primary.entityId
+
+    const status = pruneTestcaseRowsForEntity(get, set, entityId)
+    if (status !== "unresolved") return
+
+    // Unresolved: either the workflow is still loading, or it's a genuinely
+    // no-variable app. Only wait if it hasn't loaded yet.
+    const entityLoaded = get(workflowMolecule.selectors.data(entityId)) != null
+    if (entityLoaded) return
+
+    const store = getDefaultStore()
+    const unsub = store.sub(workflowMolecule.selectors.inputPorts(entityId), () => {
+        const retryStatus = pruneTestcaseRowsForEntity(store.get, store.set, entityId)
+        const nowLoaded = store.get(workflowMolecule.selectors.data(entityId)) != null
+        if (retryStatus !== "unresolved" || nowLoaded) unsub()
+    })
+})
+
 /**
  * Switch one entity for another in the displayed selection.
  * Handles both single and comparison mode. The loadable-scoped re-link
@@ -2336,6 +2365,13 @@ export const playgroundController = {
         /** Change the primary node */
         changePrimaryNode: changePrimaryNodeAtom,
 
+        /**
+         * Reconcile all testcase rows against the current primary entity's
+         * input contract. Call after a primary swap to clean stale keys from a
+         * previous app off the shared row immediately (#4525 / AGE-3793).
+         */
+        reconcileRowsToPrimary: reconcileRowsToPrimaryAtom,
+
         /** Disconnect from testset and reset to local mode */
         disconnectAndResetToLocal: disconnectAndResetToLocalAtom,
 
diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index d9106e8d8c..0cbaedff3e 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -261,15 +261,7 @@ function reconcileEntityInputData(
     data: Record<string, unknown>,
     entityId: string,
 ): Record<string, unknown> {
-    const {data: next, dropped, strategy} = reconcileRowDataForEntity(get, entityId, data)
-    if (dropped.length > 0) {
-        console.warn("[executionRunner.filter] reconciled stale row keys", {
-            entityId,
-            strategy,
-            dropped,
-        })
-    }
-    return next
+    return reconcileRowDataForEntity(get, entityId, data).data
 }
 
 function createConcurrencyLimiter(concurrency: number) {
diff --git a/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts b/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
index 45a4af1777..c0ffdcd055 100644
--- a/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
+++ b/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
@@ -350,13 +350,6 @@ export const triggerExecutionAtom = atom(
             // Persist the cleaned row (deletes the dropped keys via the
             // testcase store's undefined-means-delete semantics).
             set(loadableController.actions.updateRow, loadableId, logicalRowId, undefinedPatch)
-            console.warn("[webWorker.reconcile] cleaned stale row keys before run", {
-                rootEntityId,
-                rowId: logicalRowId,
-                strategy: reconciledRow.strategy,
-                dropped: reconciledRow.dropped,
-                protectedColumns: Array.from(protectedColumns),
-            })
         }
 
         // In comparison mode, filter nodes to only include the effective variant's

From 07bc27de03d823244a38cb16a2844d1018736103 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Thu, 4 Jun 2026 19:18:07 +0200
Subject: [PATCH 16/36] fix(frontend): render markdown headings by level
 instead of uppercasing

The beautified/markdown view forced H2 headings to uppercase via
text-transform, rewriting the user's own prompt text. H1 was also lighter
than H2, and H3-H6 had no styling. Apply a consistent best-practice scale
(descending sizes, shared weight and spacing; H1-H5 share one color while
H6 uses a muted color) across H1-H6 in both light and dark mode, with no
case transform.
---
 web/oss/src/styles/editor-theme.css | 49 +++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/web/oss/src/styles/editor-theme.css b/web/oss/src/styles/editor-theme.css
index f292a22edb..e3c6b494b5 100644
--- a/web/oss/src/styles/editor-theme.css
+++ b/web/oss/src/styles/editor-theme.css
@@ -187,25 +187,44 @@ h1 {
     margin-bottom: 0;
 }
 
-.editor-heading-h1 {
-    font-size: 18px;
+.editor-heading-h1,
+.editor-heading-h2,
+.editor-heading-h3,
+.editor-heading-h4,
+.editor-heading-h5,
+.editor-heading-h6 {
     color: rgb(5, 5, 5);
-    font-weight: 400;
+    font-weight: 600;
     margin: 0;
-    margin-bottom: 12px;
+    margin-top: 16px;
+    margin-bottom: 8px;
     padding: 0;
-    line-height: 1.5;
+    line-height: 1.3;
+}
+
+.editor-heading-h1 {
+    font-size: 24px;
 }
 
 .editor-heading-h2 {
+    font-size: 20px;
+}
+
+.editor-heading-h3 {
+    font-size: 17px;
+}
+
+.editor-heading-h4 {
     font-size: 15px;
+}
+
+.editor-heading-h5 {
+    font-size: 14px;
+}
+
+.editor-heading-h6 {
+    font-size: 13px;
     color: rgb(101, 103, 107);
-    font-weight: 700;
-    margin: 0;
-    margin-top: 10px;
-    padding: 0;
-    text-transform: uppercase;
-    line-height: 1.35;
 }
 
 .editor-quote {
@@ -686,10 +705,14 @@ pre::-webkit-scrollbar-thumb {
 .dark .other a {
     color: rgba(255, 255, 255, 0.65);
 }
-.dark .editor-heading-h1 {
+.dark .editor-heading-h1,
+.dark .editor-heading-h2,
+.dark .editor-heading-h3,
+.dark .editor-heading-h4,
+.dark .editor-heading-h5 {
     color: rgba(255, 255, 255, 0.85);
 }
-.dark .editor-heading-h2,
+.dark .editor-heading-h6,
 .dark .editor-quote {
     color: rgba(255, 255, 255, 0.65);
 }

From b5d26762db31eada98485a13134ea79474b52d64 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Thu, 4 Jun 2026 20:01:25 +0200
Subject: [PATCH 17/36] fix(frontend): remove copyright and social-links footer
 bar
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the bottom bar that showed the GitHub, LinkedIn, and X icons and the
'Copyright © <year> | Agenta.' line from the platform layout.

Remove the FooterIsland component, its styles, and the footerHeight resize
observer that only existed to size the footer.
---
 .../src/components/Layout/FooterIsland.tsx    | 30 -------------
 web/oss/src/components/Layout/Layout.tsx      | 41 ++---------------
 .../src/components/Layout/assets/styles.ts    | 44 +++----------------
 3 files changed, 8 insertions(+), 107 deletions(-)
 delete mode 100644 web/oss/src/components/Layout/FooterIsland.tsx

diff --git a/web/oss/src/components/Layout/FooterIsland.tsx b/web/oss/src/components/Layout/FooterIsland.tsx
deleted file mode 100644
index b5b335575d..0000000000
--- a/web/oss/src/components/Layout/FooterIsland.tsx
+++ /dev/null
@@ -1,30 +0,0 @@
-import {memo} from "react"
-
-import {GithubFilled, LinkedinFilled, TwitterOutlined} from "@ant-design/icons"
-import {Layout, Space, Typography} from "antd"
-import Link from "next/link"
-
-const {Footer} = Layout
-
-interface FooterIslandProps {
-    className?: string
-}
-
-export const FooterIsland = memo(function FooterIsland({className}: FooterIslandProps) {
-    return (
-        <Footer className={className}>
-            <Space size={10}>
-                <Link href="https://github.com/Agenta-AI/agenta" target="_blank">
-                    <GithubFilled />
-                </Link>
-                <Link href="https://www.linkedin.com/company/agenta-ai/" target="_blank">
-                    <LinkedinFilled />
-                </Link>
-                <Link href="https://twitter.com/agenta_ai" target="_blank">
-                    <TwitterOutlined />
-                </Link>
-            </Space>
-            <Typography.Text>Copyright © {new Date().getFullYear()} | Agenta.</Typography.Text>
-        </Footer>
-    )
-})
diff --git a/web/oss/src/components/Layout/Layout.tsx b/web/oss/src/components/Layout/Layout.tsx
index 8e73230639..3eb5683de7 100644
--- a/web/oss/src/components/Layout/Layout.tsx
+++ b/web/oss/src/components/Layout/Layout.tsx
@@ -1,16 +1,12 @@
-import {memo, useCallback, useEffect, useRef, useState, type ReactNode, type RefObject} from "react"
+import {memo, useCallback, useEffect, useRef, useState, type ReactNode} from "react"
 
-import {GithubFilled, LinkedinFilled, TwitterOutlined} from "@ant-design/icons"
-import {ConfigProvider, Layout, Modal, Space, theme} from "antd"
+import {ConfigProvider, Layout, Modal, theme} from "antd"
 import clsx from "clsx"
 import {atom} from "jotai"
 import {useAtom, useAtomValue, useSetAtom, useStore} from "jotai"
 import {selectAtom} from "jotai/utils"
-import dynamic from "next/dynamic"
-import Link from "next/link"
 import {useRouter} from "next/router"
 import {ErrorBoundary} from "react-error-boundary"
-import {useResizeObserver} from "usehooks-ts"
 
 import useURL from "@/oss/hooks/useURL"
 import {currentAppAtom} from "@/oss/state/app"
@@ -146,11 +142,6 @@ const useCommittedLayoutFlags = (): LayoutRouteFlags => {
     return committedFlags
 }
 
-const FooterIsland = dynamic(() => import("./FooterIsland").then((m) => m.FooterIsland), {
-    ssr: false,
-    loading: () => null,
-})
-
 type StyleClasses = ReturnType<typeof useStyles>
 
 const {Content} = Layout
@@ -169,7 +160,6 @@ const AppWithVariants = memo(
         isEvaluator,
         isFullHeight,
         appTheme,
-        footerHeight,
     }: {
         children: ReactNode
         isAppRoute: boolean
@@ -179,7 +169,6 @@ const AppWithVariants = memo(
         classes: StyleClasses
         appTheme: string
         isPlayground?: boolean
-        footerHeight?: number
     }) => {
         const {baseAppURL} = useURL()
         const appState = useAppState()
@@ -361,24 +350,6 @@ const AppWithVariants = memo(
                                 </Content>
                             )}
                         </div>
-                        <div className="w-full h-[30px]"></div>
-                        <FooterIsland className={classes.footer}>
-                            <Space className={classes.footerLeft} size={10}>
-                                <Link href={"https://github.com/Agenta-AI/agenta"} target="_blank">
-                                    <GithubFilled className={classes.footerLinkIcon} />
-                                </Link>
-                                <Link
-                                    href={"https://www.linkedin.com/company/agenta-ai/"}
-                                    target="_blank"
-                                >
-                                    <LinkedinFilled className={classes.footerLinkIcon} />
-                                </Link>
-                                <Link href={"https://twitter.com/agenta_ai"} target="_blank">
-                                    <TwitterOutlined className={classes.footerLinkIcon} />
-                                </Link>
-                            </Space>
-                            <div>Copyright © {new Date().getFullYear()} | Agenta.</div>
-                        </FooterIsland>
                     </Layout>
                 </Layout>
             </div>
@@ -388,12 +359,7 @@ const AppWithVariants = memo(
 
 const App: React.FC<LayoutProps> = ({children}) => {
     const {appTheme} = useAppTheme()
-    const ref = useRef<HTMLElement | null>(null)
-    const {height: footerHeight} = useResizeObserver({
-        ref: ref as RefObject<HTMLElement>,
-        box: "border-box",
-    })
-    const classes = useStyles({themeMode: appTheme, footerHeight} as StyleProps)
+    const classes = useStyles({themeMode: appTheme} as StyleProps)
     const {isHumanEval, isPlayground, isAppRoute, isAuthRoute, isEvaluator, isFullHeight} =
         useCommittedLayoutFlags()
 
@@ -419,7 +385,6 @@ const App: React.FC<LayoutProps> = ({children}) => {
                         isHumanEval={isHumanEval}
                         isEvaluator={isEvaluator}
                         isFullHeight={isFullHeight}
-                        footerHeight={footerHeight}
                     >
                         {children}
                         {contextHolder}
diff --git a/web/oss/src/components/Layout/assets/styles.ts b/web/oss/src/components/Layout/assets/styles.ts
index cc47034abc..49a9112c52 100644
--- a/web/oss/src/components/Layout/assets/styles.ts
+++ b/web/oss/src/components/Layout/assets/styles.ts
@@ -2,9 +2,7 @@ import {createUseStyles} from "react-jss"
 
 import type {JSSTheme, StyleProps as MainStyleProps} from "@/oss/lib/Types"
 
-export interface StyleProps extends MainStyleProps {
-    footerHeight: number
-}
+export type StyleProps = MainStyleProps
 
 export const useStyles = createUseStyles((theme: JSSTheme) => ({
     layout: ({themeMode}: StyleProps) => ({
@@ -14,15 +12,15 @@ export const useStyles = createUseStyles((theme: JSSTheme) => ({
         minHeight: "100vh",
         position: "relative",
     }),
-    content: ({footerHeight}: StyleProps) => ({
-        height: `calc(100% - ${footerHeight ?? 0}px)`,
+    content: {
+        height: "100%",
         paddingTop: "24px",
         paddingLeft: "1.5rem",
         paddingRight: "1.5rem",
-        marginBottom: `calc(2rem + ${footerHeight ?? 0}px)`,
+        marginBottom: "2rem",
         flex: 1,
         gap: 16,
-    }),
+    },
     breadcrumbContainer: {
         display: "flex",
         alignItems: "center",
@@ -31,38 +29,6 @@ export const useStyles = createUseStyles((theme: JSSTheme) => ({
         padding: "8px 1.5rem",
         borderBottom: `1px solid ${theme.colorBorderSecondary}`,
     },
-    footer: {
-        position: "absolute",
-        bottom: 0,
-        left: 0,
-        right: 0,
-        textAlign: "center",
-        padding: "5px 20px",
-        display: "flex",
-        alignItems: "center",
-        justifyContent: "space-between",
-        // antd's Layout.Footer defaults to colorBgLayout (#000 in dark), which
-        // reads as a mismatched black band against the #141414 content. Blend
-        // with whatever's behind it instead, and add a top border to separate
-        // it from the content above.
-        backgroundColor: "transparent",
-        borderTop: `1px solid ${theme.colorBorderSecondary}`,
-        // The social links are anchors that would otherwise inherit antd's
-        // colorLink (blue in dark). Use neutral text color so they read as icons,
-        // not links — matches the prior navy look in light, flips to light in dark.
-        "& a": {
-            color: theme.colorText,
-        },
-        "& a:hover": {
-            color: theme.colorTextSecondary,
-        },
-    },
-    footerLeft: {
-        fontSize: 18,
-    },
-    footerLinkIcon: ({themeMode}: StyleProps) => ({
-        color: themeMode === "dark" ? "#fff" : "#000",
-    }),
     topRightBar: {
         display: "flex",
         alignItems: "center",

From a8529a4a67659cf73d2b9a720fa445e1b7c41c91 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Fri, 5 Jun 2026 10:49:11 +0200
Subject: [PATCH 18/36] fix(frontend): correct playground message text/markdown
 view and styling

The message editors had the Text/Markdown view inverted: the view-mode
dropdown mapped markdownView to the wrong boolean, so picking 'Markdown'
showed raw source and the default 'Text' showed rendered markdown. Fixed
the mapping across every message editor (chat turns, prompt messages,
variable inputs, the JSON object field) and the live markdown toggle
button, whose icon and tooltip were also inverted.

Also:
- The view mode is now a shared, persisted atom (messageViewModeAtom), so
  switching one message switches all and the choice survives a refresh.
- Text mode renders with the editor's proportional font instead of
  monospace, with spacing that matches the rendered markdown view.
- Message text and placeholder align with the role label above them.
---
 web/oss/src/styles/globals.css                | 31 +++++++++++++++++++
 .../adapters/TurnMessageAdapter.tsx           | 11 ++++---
 .../adapters/VariableControlAdapter.tsx       |  2 +-
 .../components/ChatMessageEditor.tsx          | 12 ++++++-
 .../components/ChatMessageList.tsx            |  9 ++++--
 .../components/MarkdownToggleButton.tsx       |  4 +--
 .../FieldRenderers/JsonObjectField.tsx        |  2 +-
 web/packages/agenta-ui/src/drill-in/index.ts  |  1 +
 .../src/drill-in/state/messageViewModeAtom.ts | 19 ++++++++++++
 9 files changed, 78 insertions(+), 13 deletions(-)
 create mode 100644 web/packages/agenta-ui/src/drill-in/state/messageViewModeAtom.ts

diff --git a/web/oss/src/styles/globals.css b/web/oss/src/styles/globals.css
index db30d4a682..bcf2162872 100644
--- a/web/oss/src/styles/globals.css
+++ b/web/oss/src/styles/globals.css
@@ -184,6 +184,13 @@ body {
 
     > .editor-input.markdown-view > .editor-code {
         background-color: transparent;
+        /* Text mode shows the raw source as plain prose, not code. Use the
+           editor's proportional font instead of the monospace code face, and
+           drop the code-block padding/margins so the text aligns with the
+           rich-text (markdown) view's left edge and top. */
+        font-family: inherit;
+        padding: 0;
+        margin: 0;
     }
     > .editor-input:not(.markdown-view) > .editor-code {
         &:after {
@@ -199,6 +206,30 @@ body {
     }
 }
 
+/* Align the message text with the first character of the role label above it,
+   with the same symmetric horizontal inset on the role, the text, and the
+   placeholder. The single inset value lives in --ag-message-inline-pad.
+
+   Notes:
+   - The role label is an antd Button whose default padding is wider than the
+     inset (Tailwind's px-2 on it loses to antd), so it is pinned with !important.
+   - The text is padded here in CSS rather than via an editor prop because
+     ChatMessageEditor renders the Editor with `noProvider`, a mode where
+     `className`/`editorClassName` is currently dropped (known bug, tracked
+     separately). JSON/code editors are excluded; they have a line-number gutter. */
+.agenta-chat-message-editor {
+    --ag-message-inline-pad: 8px;
+}
+.agenta-chat-message-editor .message-user-select {
+    padding-inline: var(--ag-message-inline-pad) !important;
+}
+.agenta-chat-message-editor .editor-input:not(.code-only) {
+    padding-inline: var(--ag-message-inline-pad);
+}
+.agenta-chat-message-editor .editor-placeholder {
+    left: var(--ag-message-inline-pad);
+}
+
 /** Align the input search with the search box **/
 .ant-input-group-wrapper {
     .ant-input {
diff --git a/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx b/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
index 0aabee0691..0501848a0b 100644
--- a/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
+++ b/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
@@ -21,10 +21,10 @@ import {
     PromptImageUpload,
     PromptDocumentUpload,
 } from "@agenta/ui/components/presentational"
-import type {ViewMode} from "@agenta/ui/drill-in"
+import {messageViewModeAtom} from "@agenta/ui/drill-in"
 import type {UploadFile} from "antd"
 import clsx from "clsx"
-import {useAtomValue, useSetAtom} from "jotai"
+import {useAtom, useAtomValue, useSetAtom} from "jotai"
 import JSON5 from "json5"
 import {v4 as uuidv4} from "uuid"
 
@@ -215,7 +215,8 @@ const TurnMessageAdapter: React.FC<Props> = ({
 
         return fallback
     }, [computedText, msg])
-    const [viewMode, setViewMode] = useState<ViewMode>("text")
+    // Shared + persisted across all message editors (see messageViewModeAtom).
+    const [viewMode, setViewMode] = useAtom(messageViewModeAtom)
     const isCodeMode = viewMode === "json" || viewMode === "yaml"
     const editorLanguage = viewMode === "yaml" ? "yaml" : "json"
 
@@ -656,7 +657,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                             isJSON={isCodeMode}
                             isTool={isCodeMode}
                             language={editorLanguage}
-                            markdownView={viewMode === "markdown"}
+                            markdownView={viewMode === "text"}
                             onFocusChange={handleEditorFocusChange}
                             text={p?.json}
                             enableTokens={messageProps?.enableTokens ?? !isCodeMode}
@@ -750,7 +751,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                         state={editorState}
                         isJSON={isCodeMode}
                         language={editorLanguage}
-                        markdownView={viewMode === "markdown"}
+                        markdownView={viewMode === "text"}
                         enableTokens={messageProps?.enableTokens ?? !isCodeMode}
                         headerRight={
                             <TurnMessageHeaderOptions
diff --git a/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx b/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx
index a1a4640ce9..0a9ce04c2c 100644
--- a/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx
+++ b/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx
@@ -522,7 +522,7 @@ const VariableControlAdapter: React.FC<VariableControlAdapterProps> = ({
                 enableTokens={!editorProps?.codeOnly}
                 disabled={isEffectivelyDisabled}
             >
-                <MarkdownViewSynchronizer enabled={viewMode === "markdown"} />
+                <MarkdownViewSynchronizer enabled={viewMode === "text"} />
                 <SharedEditor
                     id={editorId}
                     noProvider
diff --git a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx
index 864cabe098..85ceca76ef 100644
--- a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx
+++ b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx
@@ -214,7 +214,17 @@ const ChatMessageEditorInner: React.FC<ChatMessageEditorProps> = ({
             placeholder={placeholder}
             disabled={disabled}
             state={disabled ? "readOnly" : state}
-            className={cn("relative", flexLayouts.column, gapClasses.xs, "rounded-md", className)}
+            // `agenta-chat-message-editor` is the styling hook used in globals.css
+            // to align the message text with the role label (see that file). The
+            // padding can't go through `editorClassName` because ChatMessageEditor
+            // renders the Editor with `noProvider`, where `className` is dropped.
+            className={cn(
+                "agenta-chat-message-editor relative",
+                flexLayouts.column,
+                gapClasses.xs,
+                "rounded-md",
+                className,
+            )}
             footer={footer}
             onFocusChange={onFocusChange}
             maxPasteChars={maxPasteChars}
diff --git a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
index f42baa61ce..afb06369e2 100644
--- a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
+++ b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
@@ -12,9 +12,11 @@ import {
 } from "@agenta/shared/utils"
 import {Copy, MinusCircle, Plus} from "@phosphor-icons/react"
 import {Button, Tooltip} from "antd"
+import {useAtom} from "jotai"
 
 import {CollapseToggleButton, getCollapseStyle} from "../../components/presentational/buttons"
 import {ViewModeDropdown} from "../../drill-in/core/ViewModeDropdown"
+import {messageViewModeAtom} from "../../drill-in/state/messageViewModeAtom"
 import {getViewOptions, type ViewMode} from "../../drill-in/utils/getViewOptions"
 import {message, modal} from "../../utils/appMessageContext"
 import {cn, flexLayouts, gapClasses} from "../../utils/styles"
@@ -89,7 +91,8 @@ const ChatMessageItem: React.FC<{
     onToggleMinimize,
 }) => {
     const containerRef = useRef<HTMLDivElement>(null)
-    const [viewMode, setViewMode] = useState<ChatViewMode>("text")
+    // Shared + persisted across all message editors (see messageViewModeAtom).
+    const [viewMode, setViewMode] = useAtom(messageViewModeAtom)
     const isCodeMode = viewMode === "json" || viewMode === "yaml"
     const editorLanguage: "json" | "yaml" = viewMode === "yaml" ? "yaml" : "json"
 
@@ -173,7 +176,7 @@ const ChatMessageItem: React.FC<{
                 onChangeText={(text) => onTextChange(index, text)}
                 isJSON={isCodeMode}
                 language={editorLanguage}
-                markdownView={viewMode === "markdown"}
+                markdownView={viewMode === "text"}
                 enableTokens={enableTokens && !isCodeMode}
                 templateFormat={templateFormat}
                 tokens={tokens}
@@ -196,7 +199,7 @@ const ChatMessageItem: React.FC<{
                         )}
                     >
                         <ViewModeDropdown<ChatViewMode>
-                            value={viewMode}
+                            value={viewMode as ChatViewMode}
                             options={viewOptions}
                             onChange={setViewMode}
                         />
diff --git a/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx b/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx
index c90a8c8204..ea19c0f50e 100644
--- a/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx
+++ b/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx
@@ -22,11 +22,11 @@ const MarkdownToggleButton = ({id}: MarkdownToggleButtonProps) => {
     }, [editor])
 
     return (
-        <Tooltip title={markdownView ? "Preview text" : "Preview markdown"}>
+        <Tooltip title={markdownView ? "Preview markdown" : "Preview text"}>
             <Button
                 type="text"
                 size="small"
-                icon={markdownView ? <TextAa size={14} /> : <MarkdownLogoIcon size={14} />}
+                icon={markdownView ? <MarkdownLogoIcon size={14} /> : <TextAa size={14} />}
                 onClick={onToggleMarkdown}
                 className={cn(flexLayouts.rowCenter, justifyClasses.center)}
             />
diff --git a/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx b/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx
index e46f7bd745..e09857d162 100644
--- a/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx
+++ b/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx
@@ -65,7 +65,7 @@ function ChatMessageObjectField({
             disabled={!editable}
             isJSON={isCodeMode}
             language={editorLanguage}
-            markdownView={viewMode === "markdown"}
+            markdownView={viewMode === "text"}
             enableTokens={!isCodeMode}
             templateFormat="curly"
             onChangeRole={(newRole: string) => {
diff --git a/web/packages/agenta-ui/src/drill-in/index.ts b/web/packages/agenta-ui/src/drill-in/index.ts
index e2081547b1..74ddaefc8c 100644
--- a/web/packages/agenta-ui/src/drill-in/index.ts
+++ b/web/packages/agenta-ui/src/drill-in/index.ts
@@ -128,6 +128,7 @@ export {
 } from "./utils"
 export {getViewOptions} from "./utils/getViewOptions"
 export type {ViewMode, ViewOption} from "./utils/getViewOptions"
+export {messageViewModeAtom} from "./state/messageViewModeAtom"
 
 // ============================================================================
 // FIELD RENDERERS
diff --git a/web/packages/agenta-ui/src/drill-in/state/messageViewModeAtom.ts b/web/packages/agenta-ui/src/drill-in/state/messageViewModeAtom.ts
new file mode 100644
index 0000000000..b3cab4975a
--- /dev/null
+++ b/web/packages/agenta-ui/src/drill-in/state/messageViewModeAtom.ts
@@ -0,0 +1,19 @@
+import {atomWithStorage} from "jotai/utils"
+
+import type {ViewMode} from "../utils/getViewOptions"
+
+/**
+ * Shared, persisted view mode for chat / prompt message editors.
+ *
+ * Replaces the per-message local `useState` so that:
+ *  - switching one message's view (Text / Markdown / JSON / YAML) switches every
+ *    message editor at once, and
+ *  - the choice survives a page refresh (persisted to localStorage).
+ *
+ * Scope note: this is a single app-wide atom, so it is shared by every consumer
+ * of the message editors (playground prompt + chat turns, and also the drill-in
+ * message fields). The key is intentionally not namespaced to "playground".
+ *
+ * Defaults to "text" so messages open as plain, raw text.
+ */
+export const messageViewModeAtom = atomWithStorage<ViewMode>("agenta:message-view-mode", "text")

From f0d60d1f4629f200cca160abdbf8b41a2f46004b Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Fri, 5 Jun 2026 10:44:02 +0200
Subject: [PATCH 19/36] feat(frontend): add Run-on mode selector to the
 evaluator playground

Adds a 'Run on' control to the evaluator (LLM-as-a-judge) playground header
so the first/empty state explains itself instead of leaving the user with two
disconnected loaders. Three modes, each drawing its own data-flow:

- Run directly on a test case  (Data -> Evaluator -> Score)
- Run on an app output         (Data -> App -> Output -> Evaluator -> Score) - default
- Run on a trace               (Trace -> Evaluator -> Score) - disabled for now

The mode is persisted per project; a connected app forces effective 'app' mode.
In app mode with no app connected, the run panel hides the testcases and shows a
centered 'Select an app' empty state (shared with the evaluator-creation drawer).
All colors come from the antd theme token so it follows light/dark mode.

Prompt playground is intentionally untouched.
---
 .../EvaluatorPlaygroundHeader.tsx             |  47 ++-
 .../ConfigureEvaluator/RunOnSelector.tsx      | 295 ++++++++++++++++++
 .../SelectAppEmptyState.tsx                   |  56 ++++
 .../components/ConfigureEvaluator/atoms.ts    |  54 ++++
 .../components/ConfigureEvaluator/index.tsx   |  41 ++-
 .../CreateEvaluatorDrawer/index.tsx           |  18 +-
 6 files changed, 486 insertions(+), 25 deletions(-)
 create mode 100644 web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx
 create mode 100644 web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
index 174f7273f2..1dd5f55414 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
@@ -20,7 +20,14 @@ import {Button, Tooltip, Typography} from "antd"
 import {useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 
-import {disconnectAppFromEvaluatorAtom, selectedAppLabelAtom} from "./atoms"
+import {
+    disconnectAppFromEvaluatorAtom,
+    effectiveRunOnModeAtom,
+    runOnModeAtom,
+    selectedAppLabelAtom,
+    type RunOnMode,
+} from "./atoms"
+import RunOnSelector from "./RunOnSelector"
 
 const TestsetDropdown = dynamic(
     () => import("@/oss/components/Playground/Components/TestsetDropdown"),
@@ -77,6 +84,23 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
         disconnectApp()
     }, [disconnectApp])
 
+    // Run-on mode — drives which loaders are surfaced. A connected app forces
+    // "app" mode (see effectiveRunOnModeAtom); the stored mode only matters when
+    // nothing is connected.
+    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
+    const setRunOnMode = useSetAtom(runOnModeAtom)
+    const handlePickRunOn = useCallback(
+        (next: RunOnMode) => {
+            if (next === "trace") return // disabled, not selectable
+            // Leaving "app" mode means dropping the connected app so the graph
+            // returns to standalone-evaluator shape.
+            if (next === "data") disconnectApp()
+            setRunOnMode(next)
+        },
+        [disconnectApp, setRunOnMode],
+    )
+    const isAppMode = runOnMode === "app"
+
     // Check if we have an app node (depth-0 with a different entity than evaluator)
     const hasAppSelected = nodes.some((n) => n.depth === 0 && n.entityId !== evaluatorEntityId)
 
@@ -100,15 +124,18 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
             </div>
 
             <div className="flex min-w-0 flex-1 items-center justify-end gap-1">
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={onAppSelect}
-                    size="small"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                    popupFooter={popupFooter}
-                />
-                {hasAppSelected && (
+                <RunOnSelector mode={runOnMode} onPick={handlePickRunOn} />
+                {isAppMode && (
+                    <EntityPicker<WorkflowRevisionSelectionResult>
+                        variant="popover-cascader"
+                        adapter={appWorkflowAdapter}
+                        onSelect={onAppSelect}
+                        size="small"
+                        placeholder={selectedAppLabel ?? "Select app"}
+                        popupFooter={popupFooter}
+                    />
+                )}
+                {isAppMode && hasAppSelected && (
                     <Tooltip title="Disconnect app">
                         <Button
                             type="text"
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx
new file mode 100644
index 0000000000..4fd8a2ee61
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx
@@ -0,0 +1,295 @@
+/**
+ * RunOnSelector
+ *
+ * The "Run on" control for the evaluator playground header. A leading dropdown
+ * that names the data source the evaluator runs against and draws the resulting
+ * data-flow, so the empty/first state explains itself instead of leaving the
+ * user with two disconnected loaders.
+ *
+ * Three modes, keyed by `RunOnMode`:
+ *  - "data":  run directly on a test case (Data → Evaluator → Score)
+ *  - "app":   run on an app output (Data → App → Output → Evaluator → Score) — default
+ *  - "trace": run on a trace (Trace → Evaluator → Score) — disabled for now
+ *
+ * All colors come from the live antd token (`theme.useToken()`) so the control
+ * follows light/dark mode automatically.
+ */
+
+import {useState} from "react"
+
+import {AppstoreOutlined} from "@ant-design/icons"
+import {
+    CaretDownIcon,
+    CheckIcon,
+    DatabaseIcon,
+    GavelIcon,
+    TreeViewIcon,
+} from "@phosphor-icons/react"
+import {Button, Dropdown, theme} from "antd"
+import type {GlobalToken} from "antd"
+import clsx from "clsx"
+
+import type {RunOnMode} from "./atoms"
+
+// The app icon used across the product (the sidebar "Prompts" item). Wrapped so
+// it accepts the same `size`/`style` props as the phosphor icons it sits beside.
+const AppIcon = ({size = 16, style}: {size?: number; style?: React.CSSProperties}) => (
+    <AppstoreOutlined style={{fontSize: size, ...style}} />
+)
+
+// ── flow pills ──────────────────────────────────────────────────────────────
+
+type FlowVariant = "data" | "app" | "out" | "eval" | "trace"
+
+interface FlowNode {
+    label: string
+    variant: FlowVariant
+}
+
+const flowStyle = (token: GlobalToken, variant: FlowVariant): React.CSSProperties => {
+    switch (variant) {
+        case "data":
+            return {background: token.blue1, color: token.blue7, borderColor: token.blue2}
+        case "app":
+            return {
+                background: token.colorPrimaryBg,
+                color: token.colorText,
+                borderColor: token.colorPrimaryBorder,
+            }
+        case "out":
+            return {background: token.green1, color: token.green7, borderColor: token.green3}
+        case "eval":
+            // index 7 (not 6) so the text brightens under the dark algorithm —
+            // purple6 lands dark-on-dark and disappears on a dark background.
+            return {background: token.purple1, color: token.purple7, borderColor: token.purple3}
+        case "trace":
+            return {background: token.cyan1, color: token.cyan7, borderColor: token.cyan3}
+    }
+}
+
+const FlowIcon = ({variant}: {variant: FlowVariant}) => {
+    switch (variant) {
+        case "data":
+            return <DatabaseIcon size={12} />
+        case "app":
+            return <AppIcon size={12} />
+        case "eval":
+            return <GavelIcon size={12} />
+        case "trace":
+            return <TreeViewIcon size={12} />
+        default:
+            return null // "out" pills (Output, Score) intentionally render without an icon
+    }
+}
+
+const FlowPills = ({steps, token}: {steps: FlowNode[]; token: GlobalToken}) => (
+    <div className="flex flex-wrap items-center gap-y-1">
+        {steps.map((step, i) => (
+            <span key={`${step.label}-${i}`} className="flex items-center">
+                {i > 0 && (
+                    <span className="px-1.5 text-[12px]" style={{color: token.colorTextQuaternary}}>
+                        →
+                    </span>
+                )}
+                <span
+                    className="inline-flex items-center gap-1 whitespace-nowrap rounded-full border border-solid px-2 py-[3px] text-[11px] leading-none"
+                    style={flowStyle(token, step.variant)}
+                >
+                    <FlowIcon variant={step.variant} />
+                    {step.label}
+                </span>
+            </span>
+        ))}
+    </div>
+)
+
+// ── modes ───────────────────────────────────────────────────────────────────
+
+interface ModeDef {
+    key: RunOnMode
+    /** Full label shown in the dropdown option. */
+    label: string
+    /** Short label shown after "Run on:" in the trigger button. */
+    shortLabel: string
+    Icon: React.ComponentType<{size?: number; style?: React.CSSProperties}>
+    desc: string
+    flow: FlowNode[]
+    badge?: "default" | "soon"
+    disabled?: boolean
+}
+
+const MODES: ModeDef[] = [
+    {
+        key: "data",
+        label: "Run directly on a test case",
+        shortLabel: "Test case",
+        Icon: DatabaseIcon,
+        desc: "Evaluate data you provide. Connect a test set, or type the input and output in by hand.",
+        flow: [
+            {label: "Data", variant: "data"},
+            {label: "Evaluator", variant: "eval"},
+            {label: "Score", variant: "out"},
+        ],
+    },
+    {
+        key: "app",
+        label: "Run on an app output",
+        shortLabel: "App output",
+        Icon: AppIcon,
+        badge: "default",
+        desc: "Run an app over your data, then the evaluator grades its output. The usual evaluation flow.",
+        flow: [
+            {label: "Data", variant: "data"},
+            {label: "App", variant: "app"},
+            {label: "Output", variant: "out"},
+            {label: "Evaluator", variant: "eval"},
+            {label: "Score", variant: "out"},
+        ],
+    },
+    {
+        key: "trace",
+        label: "Run on a trace",
+        shortLabel: "Trace",
+        Icon: TreeViewIcon,
+        badge: "soon",
+        disabled: true,
+        desc: "Pull the input and output straight from a logged trace in Observability.",
+        flow: [
+            {label: "Trace", variant: "trace"},
+            {label: "Evaluator", variant: "eval"},
+            {label: "Score", variant: "out"},
+        ],
+    },
+]
+
+// ── component ───────────────────────────────────────────────────────────────
+
+interface RunOnSelectorProps {
+    mode: RunOnMode
+    onPick: (mode: RunOnMode) => void
+}
+
+const RunOnSelector = ({mode, onPick}: RunOnSelectorProps) => {
+    const {token} = theme.useToken()
+    const [open, setOpen] = useState(false)
+    const [hovered, setHovered] = useState<RunOnMode | null>(null)
+    const current = MODES.find((m) => m.key === mode) ?? MODES.find((m) => m.key === "app")!
+
+    const overlay = (
+        <div
+            className="w-[460px] rounded-lg border border-solid p-1.5"
+            style={{
+                background: token.colorBgElevated,
+                borderColor: token.colorBorderSecondary,
+                boxShadow: token.boxShadowSecondary,
+            }}
+        >
+            <div
+                className="px-2.5 pb-1.5 pt-1 text-[11px] font-semibold uppercase tracking-[0.04em]"
+                style={{color: token.colorTextQuaternary}}
+            >
+                What should the evaluator run on?
+            </div>
+            {MODES.map((m) => {
+                const selected = m.key === mode
+                const isHovered = hovered === m.key
+                const background = selected
+                    ? token.controlItemBgActive
+                    : isHovered && !m.disabled
+                      ? token.colorFillTertiary
+                      : "transparent"
+                return (
+                    <div
+                        key={m.key}
+                        role="button"
+                        aria-disabled={m.disabled}
+                        onMouseEnter={() => setHovered(m.key)}
+                        onMouseLeave={() => setHovered((h) => (h === m.key ? null : h))}
+                        onClick={() => {
+                            if (m.disabled) return
+                            onPick(m.key)
+                            setOpen(false)
+                        }}
+                        className={clsx(
+                            "flex items-start gap-3 rounded-md p-2.5",
+                            m.disabled ? "cursor-default opacity-55" : "cursor-pointer",
+                        )}
+                        style={{background}}
+                    >
+                        <span
+                            className="mt-0.5 flex w-[18px] shrink-0 justify-center"
+                            style={{color: token.colorPrimary}}
+                        >
+                            {selected && <CheckIcon size={16} />}
+                        </span>
+                        <div className="min-w-0 flex-1">
+                            <div
+                                className="flex items-center gap-2 text-[14px] font-medium"
+                                style={{color: token.colorText}}
+                            >
+                                <m.Icon size={15} />
+                                {m.label}
+                                {m.badge === "default" && (
+                                    <span
+                                        className="rounded-full px-[7px] py-px text-[10.5px] font-semibold"
+                                        style={{
+                                            background: token.colorPrimary,
+                                            color: token.colorTextLightSolid,
+                                        }}
+                                    >
+                                        default
+                                    </span>
+                                )}
+                                {m.badge === "soon" && (
+                                    <span
+                                        className="rounded-full px-[7px] py-px text-[10.5px] font-semibold"
+                                        style={{background: token.gold1, color: token.gold8}}
+                                    >
+                                        soon
+                                    </span>
+                                )}
+                            </div>
+                            <div
+                                className="mt-0.5 text-[12.5px] leading-snug"
+                                style={{color: token.colorTextTertiary}}
+                            >
+                                {m.desc}
+                            </div>
+                            <div className="mt-2">
+                                <FlowPills steps={m.flow} token={token} />
+                            </div>
+                        </div>
+                    </div>
+                )
+            })}
+        </div>
+    )
+
+    return (
+        <Dropdown
+            open={open}
+            onOpenChange={setOpen}
+            trigger={["click"]}
+            placement="bottomLeft"
+            popupRender={() => overlay}
+        >
+            <Button
+                size="small"
+                className="flex items-center gap-1.5 font-medium"
+                style={{
+                    background: token.colorPrimaryBg,
+                    borderColor: token.colorPrimaryBorder,
+                }}
+            >
+                <span className="font-normal" style={{color: token.colorTextTertiary}}>
+                    Run on:
+                </span>
+                <current.Icon size={14} style={{color: token.colorText}} />
+                <span className="truncate">{current.shortLabel}</span>
+                <CaretDownIcon size={12} style={{color: token.colorTextTertiary}} />
+            </Button>
+        </Dropdown>
+    )
+}
+
+export default RunOnSelector
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx
new file mode 100644
index 0000000000..6b31aad851
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx
@@ -0,0 +1,56 @@
+/**
+ * SelectAppEmptyState
+ *
+ * Centered empty state shown in the run/generation panel when the evaluator is
+ * in "Run on an app" mode but no app is connected yet. The evaluator can't run
+ * until an app is picked, so this guides the user to the one action that
+ * unblocks them. Shared by the evaluator playground page and the
+ * evaluator-creation drawer so both read identically.
+ */
+
+import {EntityPicker} from "@agenta/entity-ui"
+import type {
+    EntitySelectionAdapter,
+    WorkflowRevisionSelectionResult,
+} from "@agenta/entity-ui/selection"
+import {AppstoreOutlined} from "@ant-design/icons"
+import {Typography, theme} from "antd"
+
+interface SelectAppEmptyStateProps {
+    adapter: EntitySelectionAdapter<WorkflowRevisionSelectionResult>
+    onSelect: (selection: WorkflowRevisionSelectionResult) => void
+    selectedAppLabel?: string | null
+}
+
+const SelectAppEmptyState = ({adapter, onSelect, selectedAppLabel}: SelectAppEmptyStateProps) => {
+    const {token} = theme.useToken()
+
+    return (
+        <div className="flex max-w-[340px] flex-col items-center gap-4">
+            <div
+                className="flex h-14 w-14 items-center justify-center rounded-full"
+                style={{background: token.colorPrimaryBg, color: token.colorPrimary}}
+            >
+                <AppstoreOutlined style={{fontSize: 26}} />
+            </div>
+            <div className="flex flex-col gap-1 text-center">
+                <Typography.Text className="text-[15px] font-semibold">
+                    Select an app to run the evaluator on
+                </Typography.Text>
+                <Typography.Text type="secondary" className="text-[13px] leading-snug">
+                    The evaluator grades this app&apos;s output. Pick which app to run, then fill
+                    its inputs or load a test set.
+                </Typography.Text>
+            </div>
+            <EntityPicker<WorkflowRevisionSelectionResult>
+                variant="popover-cascader"
+                adapter={adapter}
+                onSelect={onSelect}
+                size="middle"
+                placeholder={selectedAppLabel ?? "Select app"}
+            />
+        </div>
+    )
+}
+
+export default SelectAppEmptyState
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
index 32fd75511e..cb2500f622 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
@@ -83,6 +83,53 @@ export const persistedTestsetSelectionAtom = atom(
     },
 )
 
+// ============================================================================
+// RUN-ON MODE
+// ============================================================================
+
+/**
+ * What the evaluator runs on:
+ *  - "data"  → run directly on data you provide (test set or manual input/output)
+ *  - "app"   → run an app over the data, then grade its output (the usual flow)
+ *  - "trace" → grade the input/output of a logged trace (not yet available)
+ *
+ * "app" is the default so a fresh playground guides the user down the most
+ * common path (pick an app → run against it). The "trace" mode is disabled in
+ * the UI for now.
+ */
+export type RunOnMode = "data" | "app" | "trace"
+
+const runOnModeByProjectAtom = atomWithStorage<Record<string, RunOnMode>>(
+    "agenta:evaluator:run-on-mode",
+    {},
+)
+
+/** Read/write the persisted run-on mode for the current project (default "app"). */
+export const runOnModeAtom = atom(
+    (get) => {
+        const projectId = get(projectIdAtom) || "__global__"
+        return get(runOnModeByProjectAtom)[projectId] ?? "app"
+    },
+    (get, set, next: RunOnMode) => {
+        const projectId = get(projectIdAtom) || "__global__"
+        const all = get(runOnModeByProjectAtom)
+        set(runOnModeByProjectAtom, {...all, [projectId]: next})
+    },
+)
+
+/**
+ * The mode actually in effect.
+ *
+ * Any node at depth > 0 (the evaluator sitting downstream of a connected app)
+ * forces "app" mode regardless of the stored preference — the node graph is the
+ * source of truth. Only when no such node exists do we fall back to the stored mode.
+ */
+export const effectiveRunOnModeAtom = atom<RunOnMode>((get) => {
+    const nodes = get(playgroundNodesAtom)
+    if (nodes.some((n) => n.depth > 0)) return "app"
+    return get(runOnModeAtom)
+})
+
 // ============================================================================
 // DERIVED SELECTORS
 // ============================================================================
@@ -201,6 +248,13 @@ export const connectAppToEvaluatorAtom = atom(
         // display label is derived from the depth-0 node's `label` via
         // `selectedAppLabelAtom`, so no extra write needed here.
         set(persistedAppSelectionAtom, {appRevisionId, appLabel})
+
+        // Pin the stored run-on mode to "app" too. While connected,
+        // `effectiveRunOnModeAtom` overrides to "app" regardless, but the
+        // stored mode is what we fall back to on disconnect — without this a
+        // user who connected an app from "data" mode would snap back to the
+        // testcase panel on disconnect instead of the "Select an app" state.
+        set(runOnModeAtom, "app")
     },
 )
 
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
index 3b94c3a163..80b2952e17 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
@@ -30,8 +30,15 @@ import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell
 import SharedGenerationResultUtils from "@/oss/components/SharedGenerationResultUtils"
 import {playgroundSyncAtom} from "@/oss/state/url/playground"
 
-import {connectAppToEvaluatorAtom, evaluatorConfigEntityIdsAtom} from "./atoms"
+import {
+    connectAppToEvaluatorAtom,
+    effectiveRunOnModeAtom,
+    evaluatorConfigEntityIdsAtom,
+    hasAppConnectedAtom,
+    selectedAppLabelAtom,
+} from "./atoms"
 import EvaluatorPlaygroundHeader from "./EvaluatorPlaygroundHeader"
+import SelectAppEmptyState from "./SelectAppEmptyState"
 
 const PlaygroundMainView = dynamic(
     () => import("@/oss/components/Playground/Components/MainLayout"),
@@ -71,6 +78,14 @@ const ConfigureEvaluatorPageInner = () => {
 
     const configEntityIds = useAtomValue(evaluatorConfigEntityIdsAtom)
     const connectApp = useSetAtom(connectAppToEvaluatorAtom)
+    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
+    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
+
+    // In "Run on an app" mode with no app connected yet, the run panel surfaces
+    // the app selector (mirrors the evaluator-creation drawer) so the default
+    // path — pick an app → run against it — is the obvious next step.
+    const runDisabled = runOnMode === "app" && !hasAppConnected
 
     // Read the current evaluator entity from playground nodes
     // Phase 1: evaluator is at depth 0 (primary, standalone run)
@@ -119,6 +134,17 @@ const ConfigureEvaluatorPageInner = () => {
         [connectApp, evaluatorNode],
     )
 
+    const runDisabledContent = useMemo(
+        () => (
+            <SelectAppEmptyState
+                adapter={appWorkflowAdapter}
+                onSelect={handleAppSelect}
+                selectedAppLabel={selectedAppLabel}
+            />
+        ),
+        [appWorkflowAdapter, handleAppSelect, selectedAppLabel],
+    )
+
     const providers = useMemo(
         () =>
             ({
@@ -132,12 +158,21 @@ const ConfigureEvaluatorPageInner = () => {
 
     return (
         <OSSPlaygroundShell providers={providers}>
-            <div className="flex flex-col w-full h-full overflow-hidden">
+            {/* Definite height (viewport minus the app topbar) so the run panel's
+             * `h-full` centering resolves — same pattern as the app playground
+             * (`Playground.tsx`). With a plain `h-full` here the chain collapses
+             * to content height and the empty state sticks to the top. */}
+            <div className="flex flex-col w-full h-[calc(100dvh-75px)] overflow-hidden">
                 <EvaluatorPlaygroundHeader
                     appWorkflowAdapter={appWorkflowAdapter}
                     onAppSelect={handleAppSelect}
                 />
-                <PlaygroundMainView mode="evaluator" configEntityIdsOverride={configEntityIds} />
+                <PlaygroundMainView
+                    mode="evaluator"
+                    configEntityIdsOverride={configEntityIds}
+                    runDisabled={runDisabled}
+                    runDisabledContent={runDisabledContent}
+                />
             </div>
         </OSSPlaygroundShell>
     )
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
index ae366bab51..43d25653a9 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
@@ -40,6 +40,7 @@ import {
     hasAppConnectedAtom,
     selectedAppLabelAtom,
 } from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms"
+import SelectAppEmptyState from "@/oss/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState"
 import {clearEvaluatorWorkflowCache} from "@/oss/components/Evaluators/store/evaluatorsPaginatedStore"
 import PlaygroundTestcaseEditor from "@/oss/components/Playground/Components/PlaygroundTestcaseEditor"
 import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell"
@@ -214,18 +215,11 @@ const DrawerContent = ({
 
     const runDisabledContent = useMemo(
         () => (
-            <>
-                <Typography.Text type="secondary" className="text-sm">
-                    Select an app to run the evaluator chain
-                </Typography.Text>
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="middle"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-            </>
+            <SelectAppEmptyState
+                adapter={appWorkflowAdapter}
+                onSelect={handleAppSelect}
+                selectedAppLabel={selectedAppLabel}
+            />
         ),
         [appWorkflowAdapter, handleAppSelect, selectedAppLabel],
     )

From 8c270b5139e0071e9ae588c974a7ce95030fa631 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Fri, 5 Jun 2026 11:19:52 +0200
Subject: [PATCH 20/36] fix(frontend): drop now-unused StyleProps cast in
 Layout

The useStyles call cast its arg to StyleProps, but StyleProps was never
imported in Layout.tsx (a latent issue, flagged by review). With footerHeight
gone, StyleProps is just {themeMode}, so the cast is unnecessary. Pass the
arg directly; tsc confirms it type-checks.
---
 web/oss/src/components/Layout/Layout.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/oss/src/components/Layout/Layout.tsx b/web/oss/src/components/Layout/Layout.tsx
index 3eb5683de7..eb415201f3 100644
--- a/web/oss/src/components/Layout/Layout.tsx
+++ b/web/oss/src/components/Layout/Layout.tsx
@@ -359,7 +359,7 @@ const AppWithVariants = memo(
 
 const App: React.FC<LayoutProps> = ({children}) => {
     const {appTheme} = useAppTheme()
-    const classes = useStyles({themeMode: appTheme} as StyleProps)
+    const classes = useStyles({themeMode: appTheme})
     const {isHumanEval, isPlayground, isAppRoute, isAuthRoute, isEvaluator, isFullHeight} =
         useCommittedLayoutFlags()
 

From 1b80a4e6180e550918936862b56b902e23f60549 Mon Sep 17 00:00:00 2001
From: Kaosiso Ezealigo <ezealigokosiso@gmail.com>
Date: Fri, 5 Jun 2026 11:56:33 +0200
Subject: [PATCH 21/36] test(@agenta/shared): align template-variable tests
 with permissive JSONPath policy

The release/v0.102.0 merge updated validateTemplateVariable to accept any
well-formed $.x expression without checking against known envelope slots
(post-mustache QA: slot mismatches surface as API errors, not UI errors).
Five tests still asserted the old strict behavior and were failing. Updated
them to match the intentional permissive policy documented in the source.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../tests/unit/template-variable.test.ts      | 36 +++++++++++--------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/web/packages/agenta-shared/tests/unit/template-variable.test.ts b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
index edcee7157d..40075febc9 100644
--- a/web/packages/agenta-shared/tests/unit/template-variable.test.ts
+++ b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
@@ -31,9 +31,9 @@ describe("validateTemplateVariable — empty / malformed", () => {
 // ---------------------------------------------------------------------------
 
 describe("validateTemplateVariable — JSONPath", () => {
-    it("rejects bare '$' (no envelope slot after root)", () => {
-        // On main: tokens after stripping '$.' are empty → invalid
-        expect(validateTemplateVariable("$").valid).toBe(false)
+    it("accepts bare '$' (whole-context compact JSON)", () => {
+        // Bare '$' resolves the whole context object — valid per the runtime contract.
+        expect(validateTemplateVariable("$").valid).toBe(true)
     })
 
     it("accepts a well-formed JSONPath rooted at a known slot", () => {
@@ -41,16 +41,19 @@ describe("validateTemplateVariable — JSONPath", () => {
         expect(validateTemplateVariable("$.outputs.result").valid).toBe(true)
     })
 
-    it("rejects a JSONPath whose root is not a known envelope slot", () => {
+    it("accepts a JSONPath with an unknown root (permissive — root becomes a testcase column)", () => {
+        // Per post-mustache QA: any well-formed '$.x' is valid; slot mismatches
+        // surface as runtime errors from the API, not UI errors.
         const result = validateTemplateVariable("$.arbitrary_column")
-        expect(result.valid).toBe(false)
-        expect(result.reason).toMatch(/unknown envelope slot/i)
+        expect(result.valid).toBe(true)
     })
 
-    it("includes a 'did-you-mean' suggestion for near-miss slot names", () => {
-        const result = validateTemplateVariable("$.input.country") // 'input' ≈ 'inputs'
-        expect(result.valid).toBe(false)
-        expect(result.suggestion).toBe("inputs")
+    it("accepts a near-miss JSONPath without a typo suggestion (permissive)", () => {
+        // The JSONPath branch no longer emits 'did-you-mean' hints; the user's
+        // literal text wins and the root is treated as a testcase column name.
+        const result = validateTemplateVariable("$.input.country")
+        expect(result.valid).toBe(true)
+        expect(result.suggestion).toBeUndefined()
     })
 })
 
@@ -64,13 +67,15 @@ describe("validateTemplateVariable — JSON Pointer", () => {
         expect(validateTemplateVariable("/outputs/result").valid).toBe(true)
     })
 
-    it("rejects a pointer with an unknown root slot", () => {
+    it("accepts a single-segment identifier (may be a mustache section close tag)", () => {
+        // '/identifier' is ambiguous: it could be '{{/close}}' in mustache or a
+        // JSON Pointer to an envelope slot. Single-segment paths are accepted
+        // unconditionally; the runtime is the source of truth.
         const result = validateTemplateVariable("/section")
-        expect(result.valid).toBe(false)
-        expect(result.reason).toMatch(/unknown envelope slot/i)
+        expect(result.valid).toBe(true)
     })
 
-    it("includes a 'did-you-mean' suggestion for near-miss slot names", () => {
+    it("rejects a multi-segment pointer with an unknown root slot", () => {
         const result = validateTemplateVariable("/input/country")
         expect(result.valid).toBe(false)
         expect(result.suggestion).toBe("inputs")
@@ -107,7 +112,8 @@ describe("isValidTemplateVariable", () => {
 
     it("returns false for an invalid expression", () => {
         expect(isValidTemplateVariable("")).toBe(false)
-        expect(isValidTemplateVariable("$.unknown_slot")).toBe(false)
+        expect(isValidTemplateVariable("$foo")).toBe(false) // missing '.' after '$'
+        expect(isValidTemplateVariable("$.")).toBe(false) // trailing dot, no field
     })
 })
 

From 5e2ccc6867ea4e196ce71d11e997f7ab447d4e6a Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Fri, 5 Jun 2026 14:26:56 +0200
Subject: [PATCH 22/36] fix(frontend): coerce shared view mode to a valid
 message mode

The shared messageViewModeAtom is typed ViewMode, which includes "form". Chat
and prompt message editors only handle text/markdown/json/yaml, and a cast
hid the mismatch, so a persisted "form" value could leave the dropdown and
editor inconsistent. Add toMessageViewMode() (form -> text) and use it in
ChatMessageList and TurnMessageAdapter before deriving mode-dependent state,
removing the unsafe cast. Addresses CodeRabbit review on PR #4554.
---
 .../components/adapters/TurnMessageAdapter.tsx  | 17 ++++++++++-------
 .../ChatMessage/components/ChatMessageList.tsx  | 13 ++++++++-----
 web/packages/agenta-ui/src/drill-in/index.ts    |  4 ++--
 .../src/drill-in/utils/getViewOptions.ts        | 12 ++++++++++++
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx b/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
index 0501848a0b..b1921ec305 100644
--- a/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
+++ b/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
@@ -21,7 +21,7 @@ import {
     PromptImageUpload,
     PromptDocumentUpload,
 } from "@agenta/ui/components/presentational"
-import {messageViewModeAtom} from "@agenta/ui/drill-in"
+import {messageViewModeAtom, toMessageViewMode} from "@agenta/ui/drill-in"
 import type {UploadFile} from "antd"
 import clsx from "clsx"
 import {useAtom, useAtomValue, useSetAtom} from "jotai"
@@ -216,9 +216,12 @@ const TurnMessageAdapter: React.FC<Props> = ({
         return fallback
     }, [computedText, msg])
     // Shared + persisted across all message editors (see messageViewModeAtom).
+    // The atom is typed `ViewMode` (can hold "form"), so coerce to a mode this
+    // editor can actually render before deriving any mode-dependent state.
     const [viewMode, setViewMode] = useAtom(messageViewModeAtom)
-    const isCodeMode = viewMode === "json" || viewMode === "yaml"
-    const editorLanguage = viewMode === "yaml" ? "yaml" : "json"
+    const chatViewMode = toMessageViewMode(viewMode)
+    const isCodeMode = chatViewMode === "json" || chatViewMode === "yaml"
+    const editorLanguage = chatViewMode === "yaml" ? "yaml" : "json"
 
     const effectiveDisabled = Boolean(disabled)
     const isUserRole = kind === "user" && !isToolKind
@@ -657,7 +660,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                             isJSON={isCodeMode}
                             isTool={isCodeMode}
                             language={editorLanguage}
-                            markdownView={viewMode === "text"}
+                            markdownView={chatViewMode === "text"}
                             onFocusChange={handleEditorFocusChange}
                             text={p?.json}
                             enableTokens={messageProps?.enableTokens ?? !isCodeMode}
@@ -687,7 +690,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                                     resultHashes={propsResultHashes ?? resultHashes}
                                     results={results}
                                     text={p?.json ?? editorText}
-                                    viewMode={viewMode}
+                                    viewMode={chatViewMode}
                                     onViewModeChange={setViewMode}
                                     collapsed={isMessageCollapsed}
                                     allowFileUpload={isUserRole && !effectiveDisabled}
@@ -751,7 +754,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                         state={editorState}
                         isJSON={isCodeMode}
                         language={editorLanguage}
-                        markdownView={viewMode === "text"}
+                        markdownView={chatViewMode === "text"}
                         enableTokens={messageProps?.enableTokens ?? !isCodeMode}
                         headerRight={
                             <TurnMessageHeaderOptions
@@ -762,7 +765,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                                 resultHashes={propsResultHashes ?? resultHashes}
                                 results={results}
                                 text={editorText}
-                                viewMode={viewMode}
+                                viewMode={chatViewMode}
                                 onViewModeChange={setViewMode}
                                 collapsed={isMessageCollapsed}
                                 allowFileUpload={isUserRole && !effectiveDisabled}
diff --git a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
index afb06369e2..78fe345545 100644
--- a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
+++ b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
@@ -17,7 +17,7 @@ import {useAtom} from "jotai"
 import {CollapseToggleButton, getCollapseStyle} from "../../components/presentational/buttons"
 import {ViewModeDropdown} from "../../drill-in/core/ViewModeDropdown"
 import {messageViewModeAtom} from "../../drill-in/state/messageViewModeAtom"
-import {getViewOptions, type ViewMode} from "../../drill-in/utils/getViewOptions"
+import {getViewOptions, toMessageViewMode, type ViewMode} from "../../drill-in/utils/getViewOptions"
 import {message, modal} from "../../utils/appMessageContext"
 import {cn, flexLayouts, gapClasses} from "../../utils/styles"
 import {createSnippetPdfAttachment} from "../utils/snippetAttachment"
@@ -92,9 +92,12 @@ const ChatMessageItem: React.FC<{
 }) => {
     const containerRef = useRef<HTMLDivElement>(null)
     // Shared + persisted across all message editors (see messageViewModeAtom).
+    // The atom is typed `ViewMode` (can hold "form"), so coerce to a mode this
+    // editor can actually render before deriving any mode-dependent state.
     const [viewMode, setViewMode] = useAtom(messageViewModeAtom)
-    const isCodeMode = viewMode === "json" || viewMode === "yaml"
-    const editorLanguage: "json" | "yaml" = viewMode === "yaml" ? "yaml" : "json"
+    const chatViewMode = toMessageViewMode(viewMode)
+    const isCodeMode = chatViewMode === "json" || chatViewMode === "yaml"
+    const editorLanguage: "json" | "yaml" = chatViewMode === "yaml" ? "yaml" : "json"
 
     const isToolResponse = msg.role === "tool"
     const hasToolCalls = Boolean(msg.tool_calls && msg.tool_calls.length > 0)
@@ -176,7 +179,7 @@ const ChatMessageItem: React.FC<{
                 onChangeText={(text) => onTextChange(index, text)}
                 isJSON={isCodeMode}
                 language={editorLanguage}
-                markdownView={viewMode === "text"}
+                markdownView={chatViewMode === "text"}
                 enableTokens={enableTokens && !isCodeMode}
                 templateFormat={templateFormat}
                 tokens={tokens}
@@ -199,7 +202,7 @@ const ChatMessageItem: React.FC<{
                         )}
                     >
                         <ViewModeDropdown<ChatViewMode>
-                            value={viewMode as ChatViewMode}
+                            value={chatViewMode}
                             options={viewOptions}
                             onChange={setViewMode}
                         />
diff --git a/web/packages/agenta-ui/src/drill-in/index.ts b/web/packages/agenta-ui/src/drill-in/index.ts
index 74ddaefc8c..58ba59db23 100644
--- a/web/packages/agenta-ui/src/drill-in/index.ts
+++ b/web/packages/agenta-ui/src/drill-in/index.ts
@@ -126,8 +126,8 @@ export {
     canToggleRawMode,
     detectDataType,
 } from "./utils"
-export {getViewOptions} from "./utils/getViewOptions"
-export type {ViewMode, ViewOption} from "./utils/getViewOptions"
+export {getViewOptions, toMessageViewMode} from "./utils/getViewOptions"
+export type {ViewMode, MessageViewMode, ViewOption} from "./utils/getViewOptions"
 export {messageViewModeAtom} from "./state/messageViewModeAtom"
 
 // ============================================================================
diff --git a/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts b/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts
index 9a1b28f759..2daae9eb98 100644
--- a/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts
+++ b/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts
@@ -1,5 +1,17 @@
 export type ViewMode = "text" | "markdown" | "json" | "yaml" | "form"
 
+/** The view modes a chat / prompt message editor can render ("form" is for objects). */
+export type MessageViewMode = Exclude<ViewMode, "form">
+
+/**
+ * Coerce a (possibly app-wide / persisted) view mode to one a message editor can
+ * render. The shared `messageViewModeAtom` is typed `ViewMode`, so it can hold
+ * "form"; falling back to "text" keeps the dropdown and editor consistent instead
+ * of silently casting and rendering an unsupported mode.
+ */
+export const toMessageViewMode = (mode: ViewMode): MessageViewMode =>
+    mode === "form" ? "text" : mode
+
 export interface ViewOption {
     value: ViewMode
     label: string

From 140bcb44f621b4642c2dc0a88b3757d4cdf2721e Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Fri, 5 Jun 2026 14:45:38 +0200
Subject: [PATCH 23/36] feat(frontend): bring the Run-on modes to the evaluator
 creation drawer

The Run-on selector (test case / app output / trace) was only wired into the
full-page evaluator playground. The evaluator-creation drawer still hardcoded
`runDisabled={!hasAppConnected}` and only showed the test-set dropdown after an
app was connected, so it forced the user to pick an app even when they wanted to
run the evaluator directly on a test case.

Rather than copy the run-on wiring into the drawer (a fourth duplicate), extract
the shared logic the page and drawer were already duplicating:

- useEvaluatorRunControls(): app adapter, app-select handler, run-on mode +
  handlePickRunOn, and the runDisabled gate (runOnMode === "app" && !hasAppConnected).
- EvaluatorRunControls: the run-on selector + app picker + disconnect + test-set
  cluster, shared by the page header and the drawer header so they can't drift.

For the page, the change is behavior-preserving; the drawer gains all three
run-on modes (trace is still disabled and cannot be picked, as on the page), the
run-on selector, a disconnect affordance, and an always-available test-set dropdown.
This also removes the adapter/handleAppSelect/evaluator-node triplication across
the page body, drawer header, and drawer body.
---
 .../EvaluatorPlaygroundHeader.tsx             | 114 ++----------------
 .../EvaluatorRunControls.tsx                  |  83 +++++++++++++
 .../components/ConfigureEvaluator/index.tsx   |  75 ++----------
 .../useEvaluatorRunControls.ts                | 108 +++++++++++++++++
 .../CreateEvaluatorDrawer/index.tsx           | 112 ++---------------
 5 files changed, 218 insertions(+), 274 deletions(-)
 create mode 100644 web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx
 create mode 100644 web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
index 1dd5f55414..b0b970eb4d 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
@@ -1,48 +1,22 @@
 /**
  * EvaluatorPlaygroundHeader
  *
- * Simplified playground header for the evaluator configuration page.
- * Shows evaluator name, app workflow selector, and testset dropdown.
- * Reads evaluator info from playground nodes (URL-driven, no props needed).
+ * Header for the evaluator configuration page: the evaluator name plus the
+ * shared run controls. The controls (run-on selector, app picker, testset)
+ * live in `EvaluatorRunControls` so the page and the creation drawer share one
+ * implementation. Reads evaluator info from playground nodes (URL-driven).
  */
 
-import {useCallback, useMemo} from "react"
+import {useMemo} from "react"
 
 import {workflowMolecule} from "@agenta/entities/workflow"
-import {EntityPicker} from "@agenta/entity-ui"
-import type {
-    EntitySelectionAdapter,
-    WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
 import {playgroundController} from "@agenta/playground"
-import {X} from "@phosphor-icons/react"
-import {Button, Tooltip, Typography} from "antd"
-import {useAtomValue, useSetAtom} from "jotai"
-import dynamic from "next/dynamic"
+import {Typography} from "antd"
+import {useAtomValue} from "jotai"
 
-import {
-    disconnectAppFromEvaluatorAtom,
-    effectiveRunOnModeAtom,
-    runOnModeAtom,
-    selectedAppLabelAtom,
-    type RunOnMode,
-} from "./atoms"
-import RunOnSelector from "./RunOnSelector"
+import EvaluatorRunControls from "./EvaluatorRunControls"
 
-const TestsetDropdown = dynamic(
-    () => import("@/oss/components/Playground/Components/TestsetDropdown"),
-    {ssr: false},
-)
-
-interface EvaluatorPlaygroundHeaderProps {
-    appWorkflowAdapter: EntitySelectionAdapter<WorkflowRevisionSelectionResult>
-    onAppSelect: (selection: WorkflowRevisionSelectionResult) => void
-}
-
-const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
-    appWorkflowAdapter,
-    onAppSelect,
-}) => {
+const EvaluatorPlaygroundHeader: React.FC = () => {
     // Read evaluator node from playground nodes
     // Phase 1: evaluator is at depth 0 (primary)
     // Phase 2: evaluator is at depth 1 (downstream)
@@ -77,44 +51,6 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
         evaluatorData?.slug?.trim() ||
         "Evaluator"
 
-    // Selected app label for display in the picker trigger
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
-    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom)
-    const handleDisconnect = useCallback(() => {
-        disconnectApp()
-    }, [disconnectApp])
-
-    // Run-on mode — drives which loaders are surfaced. A connected app forces
-    // "app" mode (see effectiveRunOnModeAtom); the stored mode only matters when
-    // nothing is connected.
-    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
-    const setRunOnMode = useSetAtom(runOnModeAtom)
-    const handlePickRunOn = useCallback(
-        (next: RunOnMode) => {
-            if (next === "trace") return // disabled, not selectable
-            // Leaving "app" mode means dropping the connected app so the graph
-            // returns to standalone-evaluator shape.
-            if (next === "data") disconnectApp()
-            setRunOnMode(next)
-        },
-        [disconnectApp, setRunOnMode],
-    )
-    const isAppMode = runOnMode === "app"
-
-    // Check if we have an app node (depth-0 with a different entity than evaluator)
-    const hasAppSelected = nodes.some((n) => n.depth === 0 && n.entityId !== evaluatorEntityId)
-
-    // Footer inside the picker popover — only when an app is currently connected.
-    // Mirrors the "Disconnect all" pattern used by the evaluator picker in
-    // `Playground/Components/PlaygroundHeader/index.tsx`.
-    const popupFooter = hasAppSelected ? (
-        <div className="border-0 border-t border-solid border-[rgba(5,23,41,0.06)] p-2">
-            <Button size="small" danger className="w-full" onClick={handleDisconnect}>
-                Disconnect app
-            </Button>
-        </div>
-    ) : undefined
-
     return (
         <div className="flex items-center justify-between gap-4 px-2.5 py-2 bg-[var(--ag-rgba-000-02)] border-0 border-b border-solid border-[var(--ag-rgba-051729-06)]">
             <div className="flex shrink-0 items-center gap-2 pl-2">
@@ -123,37 +59,7 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
                 </Typography>
             </div>
 
-            <div className="flex min-w-0 flex-1 items-center justify-end gap-1">
-                <RunOnSelector mode={runOnMode} onPick={handlePickRunOn} />
-                {isAppMode && (
-                    <EntityPicker<WorkflowRevisionSelectionResult>
-                        variant="popover-cascader"
-                        adapter={appWorkflowAdapter}
-                        onSelect={onAppSelect}
-                        size="small"
-                        placeholder={selectedAppLabel ?? "Select app"}
-                        popupFooter={popupFooter}
-                    />
-                )}
-                {isAppMode && hasAppSelected && (
-                    <Tooltip title="Disconnect app">
-                        <Button
-                            type="text"
-                            size="small"
-                            icon={<X size={12} />}
-                            onClick={handleDisconnect}
-                            aria-label="Disconnect app"
-                        />
-                    </Tooltip>
-                )}
-                {/* Testset is always connectable, with or without an upstream
-                 * app. The earlier `hasAppSelected` gate matched the
-                 * runDisabled gate we removed in T7 — same regression, same
-                 * fix: standalone evaluator runs need a testset just as much
-                 * as chained ones (the evaluator's prompt template variables
-                 * still come from testcase row fields). */}
-                <TestsetDropdown />
-            </div>
+            <EvaluatorRunControls />
         </div>
     )
 }
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx
new file mode 100644
index 0000000000..b52c0271ac
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx
@@ -0,0 +1,83 @@
+/**
+ * EvaluatorRunControls
+ *
+ * The run-on + app + testset control cluster, shared by the evaluator
+ * playground page header and the evaluator-creation drawer header so the two
+ * stay identical. Reads everything from `useEvaluatorRunControls` (atom-backed),
+ * so it takes no props — drop it next to a title and it works on either surface.
+ *
+ * - Run-on selector (test case / app output / trace).
+ * - App picker — only in "app" mode, with a disconnect affordance once connected.
+ * - Test set dropdown — always available: it's the data source in test-case
+ *   mode and feeds the app in app mode.
+ */
+
+import {EntityPicker} from "@agenta/entity-ui"
+import type {WorkflowRevisionSelectionResult} from "@agenta/entity-ui/selection"
+import {X} from "@phosphor-icons/react"
+import {Button, Tooltip} from "antd"
+import dynamic from "next/dynamic"
+
+import RunOnSelector from "./RunOnSelector"
+import {useEvaluatorRunControls} from "./useEvaluatorRunControls"
+
+const TestsetDropdown = dynamic(
+    () => import("@/oss/components/Playground/Components/TestsetDropdown"),
+    {ssr: false},
+)
+
+const EvaluatorRunControls = () => {
+    const {
+        appWorkflowAdapter,
+        handleAppSelect,
+        disconnectApp,
+        runOnMode,
+        handlePickRunOn,
+        hasAppConnected,
+        selectedAppLabel,
+    } = useEvaluatorRunControls()
+
+    const isAppMode = runOnMode === "app"
+
+    // Footer inside the picker popover — only when an app is currently connected.
+    const popupFooter = hasAppConnected ? (
+        <div className="border-0 border-t border-solid border-[var(--ag-rgba-051729-06)] p-2">
+            <Button size="small" danger className="w-full" onClick={() => disconnectApp()}>
+                Disconnect app
+            </Button>
+        </div>
+    ) : undefined
+
+    return (
+        <div className="flex min-w-0 items-center justify-end gap-1">
+            <RunOnSelector mode={runOnMode} onPick={handlePickRunOn} />
+
+            {isAppMode && (
+                <EntityPicker<WorkflowRevisionSelectionResult>
+                    variant="popover-cascader"
+                    adapter={appWorkflowAdapter}
+                    onSelect={handleAppSelect}
+                    size="small"
+                    placeholder={selectedAppLabel ?? "Select app"}
+                    popupFooter={popupFooter}
+                />
+            )}
+
+            {isAppMode && hasAppConnected && (
+                <Tooltip title="Disconnect app">
+                    <Button
+                        type="text"
+                        size="small"
+                        icon={<X size={12} />}
+                        onClick={() => disconnectApp()}
+                        aria-label="Disconnect app"
+                    />
+                </Tooltip>
+            )}
+
+            <TestsetDropdown />
+        </div>
+    )
+}
+
+export default EvaluatorRunControls
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
index 80b2952e17..f4db283381 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
@@ -14,11 +14,6 @@ import {useCallback, useEffect, useMemo} from "react"
 
 import {loadableController} from "@agenta/entities/loadable"
 import {testcaseMolecule} from "@agenta/entities/testcase"
-import {
-    createWorkflowRevisionAdapter,
-    type WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
-import {playgroundController} from "@agenta/playground"
 import {type PlaygroundUIProviders} from "@agenta/playground-ui"
 import {preloadEditorPlugins, SyncStateTag} from "@agenta/ui"
 import {useAtomValue, useSetAtom} from "jotai"
@@ -30,15 +25,10 @@ import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell
 import SharedGenerationResultUtils from "@/oss/components/SharedGenerationResultUtils"
 import {playgroundSyncAtom} from "@/oss/state/url/playground"
 
-import {
-    connectAppToEvaluatorAtom,
-    effectiveRunOnModeAtom,
-    evaluatorConfigEntityIdsAtom,
-    hasAppConnectedAtom,
-    selectedAppLabelAtom,
-} from "./atoms"
+import {evaluatorConfigEntityIdsAtom} from "./atoms"
 import EvaluatorPlaygroundHeader from "./EvaluatorPlaygroundHeader"
 import SelectAppEmptyState from "./SelectAppEmptyState"
+import {useEvaluatorRunControls} from "./useEvaluatorRunControls"
 
 const PlaygroundMainView = dynamic(
     () => import("@/oss/components/Playground/Components/MainLayout"),
@@ -77,63 +67,17 @@ const ConfigureEvaluatorPageInner = () => {
     useAtomValue(playgroundSyncAtom)
 
     const configEntityIds = useAtomValue(evaluatorConfigEntityIdsAtom)
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
-    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
-
-    // In "Run on an app" mode with no app connected yet, the run panel surfaces
-    // the app selector (mirrors the evaluator-creation drawer) so the default
-    // path — pick an app → run against it — is the obvious next step.
-    const runDisabled = runOnMode === "app" && !hasAppConnected
-
-    // Read the current evaluator entity from playground nodes
-    // Phase 1: evaluator is at depth 0 (primary, standalone run)
-    // Phase 2: evaluator is at depth 1 (downstream of a connected app — chain run)
-    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
+
+    // Shared run controls (app adapter, app-select, run-on mode, run gate) — the
+    // same hook the header and the creation drawer use, so all surfaces agree.
+    const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} =
+        useEvaluatorRunControls()
 
     // Preload editor plugins
     useEffect(() => {
         void preloadEditorPlugins()
     }, [])
 
-    // App workflow picker — opt-in for chain-mode execution. The evaluator can
-    // also run standalone: the user fills the testcase row's template variables
-    // (e.g. `{{inputs}}`, `{{outputs}}` for LLM-as-a-judge) directly. The
-    // header surfaces this picker; we never block the run panel on it.
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-                // The picker on the evaluator playground header is picking an
-                // upstream *app* workflow to connect to — without this the
-                // search bar would say "Search evaluator…" (the adapter's
-                // historical default) while the user is choosing an app.
-                parentLabel: "Application",
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     const runDisabledContent = useMemo(
         () => (
             <SelectAppEmptyState
@@ -163,10 +107,7 @@ const ConfigureEvaluatorPageInner = () => {
              * (`Playground.tsx`). With a plain `h-full` here the chain collapses
              * to content height and the empty state sticks to the top. */}
             <div className="flex flex-col w-full h-[calc(100dvh-75px)] overflow-hidden">
-                <EvaluatorPlaygroundHeader
-                    appWorkflowAdapter={appWorkflowAdapter}
-                    onAppSelect={handleAppSelect}
-                />
+                <EvaluatorPlaygroundHeader />
                 <PlaygroundMainView
                     mode="evaluator"
                     configEntityIdsOverride={configEntityIds}
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
new file mode 100644
index 0000000000..4722dd0daa
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
@@ -0,0 +1,108 @@
+/**
+ * useEvaluatorRunControls
+ *
+ * Single source of truth for the evaluator playground's run controls, shared by
+ * the full-page playground and the evaluator-creation drawer. Before this hook,
+ * the app adapter, app-select handler, evaluator-node lookup, and run-on
+ * wiring were copy-pasted across the page header, page body, drawer header, and
+ * drawer body — which is exactly how the drawer drifted out of sync with the
+ * page (it kept forcing an app even in test-case mode). Centralizing it here
+ * means both surfaces behave identically by construction.
+ */
+
+import {useCallback, useMemo} from "react"
+
+import {
+    createWorkflowRevisionAdapter,
+    type WorkflowRevisionSelectionResult,
+} from "@agenta/entity-ui/selection"
+import {playgroundController} from "@agenta/playground"
+import {useAtomValue, useSetAtom} from "jotai"
+
+import {
+    connectAppToEvaluatorAtom,
+    disconnectAppFromEvaluatorAtom,
+    effectiveRunOnModeAtom,
+    hasAppConnectedAtom,
+    runOnModeAtom,
+    selectedAppLabelAtom,
+    type RunOnMode,
+} from "./atoms"
+
+export function useEvaluatorRunControls() {
+    // Evaluator node — phase 1: evaluator at depth 0 (primary); phase 2:
+    // evaluator at depth 1 (downstream of a connected app).
+    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
+    const evaluatorNode = useMemo(() => {
+        const downstream = nodes.find((n) => n.depth > 0)
+        if (downstream) return downstream
+        return nodes[0] ?? null
+    }, [nodes])
+
+    // App picker — picks an upstream *app* workflow to attach to the evaluator.
+    // `parentLabel: "Application"` keeps the search bar saying "Search app…"
+    // rather than the adapter's historical "Search evaluator…" default.
+    const appWorkflowAdapter = useMemo(
+        () =>
+            createWorkflowRevisionAdapter({
+                skipVariantLevel: true,
+                excludeRevisionZero: true,
+                flags: {is_evaluator: false, is_feedback: false},
+                parentLabel: "Application",
+            }),
+        [],
+    )
+
+    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
+    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom)
+
+    const handleAppSelect = useCallback(
+        (selection: WorkflowRevisionSelectionResult) => {
+            if (!evaluatorNode) return
+            connectApp({
+                appRevisionId: selection.id,
+                appLabel: selection.label,
+                evaluatorRevisionId: evaluatorNode.entityId,
+                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
+            })
+        },
+        [connectApp, evaluatorNode],
+    )
+
+    // Run-on mode. A connected app forces effective "app" mode (the node graph
+    // is the source of truth); the stored preference only applies when nothing
+    // is connected.
+    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
+    const setRunOnMode = useSetAtom(runOnModeAtom)
+    const handlePickRunOn = useCallback(
+        (next: RunOnMode) => {
+            if (next === "trace") return // disabled, not selectable
+            // Leaving "app" mode drops the connected app so the graph returns to
+            // standalone-evaluator shape.
+            if (next === "data") disconnectApp()
+            setRunOnMode(next)
+        },
+        [disconnectApp, setRunOnMode],
+    )
+
+    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
+    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+
+    // In "app" mode with no app connected yet, the evaluator can't run — the run
+    // panel surfaces the app selector instead of the testcase rows. In test-case
+    // mode the evaluator runs standalone, so it's never blocked on an app.
+    // Only takes effect where the run panel renders (the page and the expanded
+    // drawer); the collapsed drawer is config-only and ignores `runDisabled`.
+    const runDisabled = runOnMode === "app" && !hasAppConnected
+
+    return {
+        appWorkflowAdapter,
+        handleAppSelect,
+        disconnectApp,
+        runOnMode,
+        handlePickRunOn,
+        hasAppConnected,
+        selectedAppLabel,
+        runDisabled,
+    }
+}
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
index 43d25653a9..9bb079e6f7 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
@@ -20,12 +20,6 @@ import {
     registerWorkflowCommitCallbacks,
     getWorkflowCommitCallbacks,
 } from "@agenta/entities/workflow"
-import {EntityPicker} from "@agenta/entity-ui"
-import {
-    createWorkflowRevisionAdapter,
-    type WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
-import {playgroundController} from "@agenta/playground"
 import {type PlaygroundUIProviders} from "@agenta/playground-ui"
 import {ArrowsIn, ArrowsOut} from "@phosphor-icons/react"
 import {Button, Typography} from "antd"
@@ -34,13 +28,10 @@ import dynamic from "next/dynamic"
 
 import SimpleSharedEditor from "@/oss/components/EditorViews/SimpleSharedEditor"
 import EnhancedDrawer from "@/oss/components/EnhancedUIs/Drawer"
-import {
-    connectAppToEvaluatorAtom,
-    evaluatorConfigEntityIdsAtom,
-    hasAppConnectedAtom,
-    selectedAppLabelAtom,
-} from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms"
+import {evaluatorConfigEntityIdsAtom} from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms"
+import EvaluatorRunControls from "@/oss/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls"
 import SelectAppEmptyState from "@/oss/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState"
+import {useEvaluatorRunControls} from "@/oss/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls"
 import {clearEvaluatorWorkflowCache} from "@/oss/components/Evaluators/store/evaluatorsPaginatedStore"
 import PlaygroundTestcaseEditor from "@/oss/components/Playground/Components/PlaygroundTestcaseEditor"
 import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell"
@@ -53,11 +44,6 @@ const PlaygroundMainView = dynamic(
     {ssr: false},
 )
 
-const TestsetDropdown = dynamic(
-    () => import("@/oss/components/Playground/Components/TestsetDropdown"),
-    {ssr: false},
-)
-
 interface CreateEvaluatorDrawerProps {
     /** Callback after successful evaluator creation. Called with the new revision ID. */
     onEvaluatorCreated?: (configId?: string) => void
@@ -71,57 +57,11 @@ const DrawerHeader = ({entityId, onClose}: {entityId: string; onClose: () => voi
     )
     const name = entityData?.name?.trim() || entityData?.slug?.trim() || "New Evaluator"
 
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-
-    // Read current evaluator node (same logic as evaluator playground page)
-    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
-
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-                // Picking an *app* to attach to the evaluator — without this
-                // the search bar would say "Search evaluator…" (the adapter's
-                // historical default in skip-variant mode).
-                parentLabel: "Application",
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     return (
         <div className="flex items-center justify-between px-4 py-3 border-0 border-b border-solid border-[var(--ag-rgba-051729-06)]">
             <Typography.Text className="text-base font-semibold">{name}</Typography.Text>
             <div className="flex items-center gap-2">
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="small"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-                {hasAppConnected && <TestsetDropdown />}
+                <EvaluatorRunControls />
                 <Button
                     type="text"
                     size="small"
@@ -146,10 +86,11 @@ const DrawerContent = ({
     onEvaluatorCreated?: (configId?: string) => void
 }) => {
     const isExpanded = useAtomValue(drawerExpandedAtom)
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
     const configEntityIds = useAtomValue(evaluatorConfigEntityIdsAtom)
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+    // Same shared controls the header uses — the run gate now respects the
+    // run-on mode, so test-case mode runs without forcing an app.
+    const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} =
+        useEvaluatorRunControls()
     const onEvaluatorCreatedRef = useRef(onEvaluatorCreated)
     onEvaluatorCreatedRef.current = onEvaluatorCreated
 
@@ -178,41 +119,6 @@ const DrawerContent = ({
         }
     }, [])
 
-    // Read current evaluator node for app selection
-    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
-
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-                // Picking an *app* to attach to the evaluator — without this
-                // the search bar would say "Search evaluator…" (the adapter's
-                // historical default in skip-variant mode).
-                parentLabel: "Application",
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     const runDisabledContent = useMemo(
         () => (
             <SelectAppEmptyState
@@ -242,7 +148,7 @@ const DrawerContent = ({
                     mode="evaluator"
                     viewMode={isExpanded ? "full" : "configOnly"}
                     configEntityIdsOverride={configEntityIds}
-                    runDisabled={!hasAppConnected}
+                    runDisabled={runDisabled}
                     runDisabledContent={runDisabledContent}
                 />
             </div>

From de0fdc8edc7c24e856cb66b3d2f1e04b2c49923c Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Fri, 5 Jun 2026 15:44:49 +0200
Subject: [PATCH 24/36] fix(frontend): bind evaluator run controls to the
 default jotai store
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The creation drawer renders inside EvaluationRunsTableStoreProvider, a scoped
jotai store that mirrors only a handful of global atoms. The playground state,
however, runs on the default store (the playground package uses
getDefaultStore() throughout). So in the drawer the run-on mode was read/written
in the scoped store while the playground lived in the default store — the two
split, and switching to test-case mode never reached the run panel: it stayed
stuck on the 'Select an app' empty state.

Read and write all run-on / playground atoms through getDefaultStore() in
useEvaluatorRunControls, mirroring the existing workaround in
usePreviewVariantConfig and TestsetCells. On the full page (no scoped store)
this is a no-op; in the drawer it aligns run-on state with the playground so
test-case mode shows the inputs/outputs as it does on the page.
---
 .../useEvaluatorRunControls.ts                | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
index 4722dd0daa..83f9b30d21 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
@@ -17,7 +17,7 @@ import {
     type WorkflowRevisionSelectionResult,
 } from "@agenta/entity-ui/selection"
 import {playgroundController} from "@agenta/playground"
-import {useAtomValue, useSetAtom} from "jotai"
+import {getDefaultStore, useAtomValue, useSetAtom} from "jotai"
 
 import {
     connectAppToEvaluatorAtom,
@@ -30,9 +30,23 @@ import {
 } from "./atoms"
 
 export function useEvaluatorRunControls() {
+    // Bind to the default store explicitly. The playground state runs on the
+    // default store (the playground package uses `getDefaultStore()` throughout),
+    // but the evaluator-creation drawer renders inside a scoped Jotai store
+    // (`EvaluationRunsTableStoreProvider`) that doesn't mirror the playground or
+    // run-on atoms. Without this, the drawer would read/write run-on mode in the
+    // scoped store while the playground lives in the default store — the two
+    // split, so switching to test-case mode never reaches the run panel and it
+    // stays stuck on "select an app". On the full page (no scoped store) this is
+    // a no-op. Same pattern as `usePreviewVariantConfig` / `TestsetCells`.
+    const store = getDefaultStore()
+
     // Evaluator node — phase 1: evaluator at depth 0 (primary); phase 2:
     // evaluator at depth 1 (downstream of a connected app).
-    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
+    const nodes = useAtomValue(
+        useMemo(() => playgroundController.selectors.nodes(), []),
+        {store},
+    )
     const evaluatorNode = useMemo(() => {
         const downstream = nodes.find((n) => n.depth > 0)
         if (downstream) return downstream
@@ -53,8 +67,8 @@ export function useEvaluatorRunControls() {
         [],
     )
 
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom)
+    const connectApp = useSetAtom(connectAppToEvaluatorAtom, {store})
+    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom, {store})
 
     const handleAppSelect = useCallback(
         (selection: WorkflowRevisionSelectionResult) => {
@@ -72,8 +86,8 @@ export function useEvaluatorRunControls() {
     // Run-on mode. A connected app forces effective "app" mode (the node graph
     // is the source of truth); the stored preference only applies when nothing
     // is connected.
-    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
-    const setRunOnMode = useSetAtom(runOnModeAtom)
+    const runOnMode = useAtomValue(effectiveRunOnModeAtom, {store})
+    const setRunOnMode = useSetAtom(runOnModeAtom, {store})
     const handlePickRunOn = useCallback(
         (next: RunOnMode) => {
             if (next === "trace") return // disabled, not selectable
@@ -85,8 +99,8 @@ export function useEvaluatorRunControls() {
         [disconnectApp, setRunOnMode],
     )
 
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+    const hasAppConnected = useAtomValue(hasAppConnectedAtom, {store})
+    const selectedAppLabel = useAtomValue(selectedAppLabelAtom, {store})
 
     // In "app" mode with no app connected yet, the evaluator can't run — the run
     // panel surfaces the app selector instead of the testcase rows. In test-case

From 3a7df2c33279642314867a5e5d2bdc96c6c849d0 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Fri, 5 Jun 2026 19:11:26 +0200
Subject: [PATCH 25/36] fix(evaluator): guard local-draft ids in refs + filter
 non-playground evals from switcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

QA round 2 (2026-06-05):

[A] LLM-as-judge chain runs could ship a `local-…` draft id as
references.evaluator_revision.id, which the backend rejects as a non-UUID
(422). buildEvaluatorSelfReferences now drops any id that isLocalDraftId()
(workflow_id, variant_id, revision id); slugs/version are kept. NOTE: the
critical drawer direct-invoke path (depth-0) is a separate reference
builder owned by Mahmoud — the same isLocalDraftId guard still needs to be
applied there.

[C] The sidebar workflow switcher showed human (feedback) evaluators. The
old filter `!w.flags?.is_feedback` ran on nonArchivedEvaluatorsAtom LIST
records, which carry no `data.uri` and no `is_feedback`/`is_llm`/`is_code`
flags — those live on the revision, not the parent artifact — so the filter
never excluded anything. Switch to fullPagePlaygroundEvaluatorsAtom, the
existing drop-in that resolves flags from each evaluator's latest revision
and excludes human AND declarative-classifier evaluators (both route to an
/apps/<id> destination the guard bounces back to /evaluators).
---
 .../Sidebar/components/WorkflowEntityCard.tsx | 28 ++++++++++++-------
 .../src/state/execution/executionRunner.ts    | 18 ++++++++++--
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
index 818db127aa..fcf1c49f06 100644
--- a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
+++ b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
@@ -1,6 +1,7 @@
 import {memo, useCallback, useMemo, useState} from "react"
 
 import {
+    fullPagePlaygroundEvaluatorsAtom,
     nonArchivedAppWorkflowsAtom,
     nonArchivedEvaluatorsAtom,
     parseWorkflowKeyFromUri,
@@ -116,19 +117,26 @@ const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => {
     const ctx = useAtomValue(currentWorkflowContextAtom)
     const apps = useAtomValue(nonArchivedAppWorkflowsAtom) as readonly Workflow[]
     const evaluators = useAtomValue(nonArchivedEvaluatorsAtom) as readonly Workflow[]
+    // Only evaluators with a real full-page playground belong in the switcher.
+    // `fullPagePlaygroundEvaluatorsAtom` resolves the type flags from each
+    // evaluator's LATEST REVISION — the workflow LIST records this card reads
+    // from `nonArchivedEvaluatorsAtom` carry NO `data.uri` and NO
+    // `is_feedback`/`is_llm`/`is_code` flags (those live on the revision, not
+    // the parent artifact). That's why the old `!w.flags?.is_feedback` filter
+    // never excluded anything and human/feedback evaluators leaked into the
+    // switcher (QA 2026-06-05). The atom drops human (`is_feedback`) AND
+    // declarative classifier evaluators (match/exact_match/json_*/etc.) — all
+    // of which route to an `/apps/<id>/*` destination the guard redirects back
+    // to /evaluators, so clicking them would be a dead end.
+    const fullPagePlaygroundEvaluators = useAtomValue(
+        fullPagePlaygroundEvaluatorsAtom,
+    ) as readonly Workflow[]
     // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off, the
-    // switcher dropdown hides the "Evaluators" group entirely. With the flag
-    // on, list every evaluator EXCEPT human/feedback workflows:
-    // `is_feedback` evaluators are drawer-only in /evaluators (they capture
-    // human input, they don't run), so the corresponding `/apps/<id>/*`
-    // surface has no useful UI. PlaygroundRouter falls through to the
-    // generic `<Playground />` for those, which doesn't make sense to
-    // expose via the sidebar switcher — clicking would land on a
-    // run-controls page for a workflow that has nothing to run.
+    // switcher dropdown hides the "Evaluators" group entirely.
     const switcherEvaluators: readonly Workflow[] = useMemo(() => {
         if (!EVALUATOR_FULL_PAGE_NAV_ENABLED) return EMPTY_WORKFLOWS
-        return evaluators.filter((w) => !w.flags?.is_feedback)
-    }, [evaluators])
+        return fullPagePlaygroundEvaluators
+    }, [fullPagePlaygroundEvaluators])
     const recentAppId = useAtomValue(recentAppIdAtom)
     const recentEvaluatorId = useAtomValue(recentEvaluatorIdAtom)
     const navigateToWorkflow = useSetAtom(routerAppNavigationAtom)
diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index 0cbaedff3e..40929d9c1e 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -9,6 +9,7 @@ import {
     type StageExecutionResult,
     type EntitySelection,
 } from "@agenta/entities/runnable"
+import {isLocalDraftId} from "@agenta/entities/shared"
 import {workflowMolecule} from "@agenta/entities/workflow"
 import {generateId} from "@agenta/shared/utils"
 import type {Getter, Setter} from "jotai"
@@ -197,10 +198,21 @@ function buildEvaluatorSelfReferences(params: {
     if (!revision) return undefined
     if (!revision.flags?.is_evaluator) return undefined
 
+    // An evaluator that is still a local draft (opened in the drawer
+    // playground, never saved) only has `local-…` ids, which the backend's
+    // reference validator rejects as non-UUIDs (422, QA 2026-06-05). `realId`
+    // yields the id only when it is not a local draft; slugs and version are
+    // ordinary strings and are read without this filter.
+    const realId = (value: unknown): string | undefined => {
+        const text = readString(value)
+        if (!text || isLocalDraftId(text)) return undefined
+        return text
+    }
+
     const refs: TraceReferenceMap = {}
 
     // evaluator (parent workflow)
-    const workflowId = readString(revision.workflow_id)
+    const workflowId = realId(revision.workflow_id)
     const workflowSlug = readString(revision.workflow_slug)
     if (workflowId || workflowSlug) {
         refs.evaluator = {
@@ -210,7 +222,7 @@ function buildEvaluatorSelfReferences(params: {
     }
 
     // evaluator_variant (parent variant)
-    const variantId = readString(revision.workflow_variant_id) ?? readString(revision.variant_id)
+    const variantId = realId(revision.workflow_variant_id) ?? realId(revision.variant_id)
     const variantSlug = readString(revision.workflow_variant_slug)
     if (variantId || variantSlug) {
         refs.evaluator_variant = {
@@ -220,7 +232,7 @@ function buildEvaluatorSelfReferences(params: {
     }
 
     // evaluator_revision (this revision)
-    const revisionId = readString(revision.id) ?? params.revisionId
+    const revisionId = realId(revision.id) ?? realId(params.revisionId)
     const revisionSlug = readString(revision.slug)
     const revisionVersion =
         typeof revision.version === "number"

From b29b1031a777f59e7a64ed91d600ce6437d1355d Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Fri, 5 Jun 2026 20:15:36 +0200
Subject: [PATCH 26/36] chore(evaluator): TEMP [B-repro] diagnostics for
 re-select-app QA bug

Temporary console.debug instrumentation to pin the QA bug where
re-selecting the same app after disconnect connects nothing. Logs the
connect-flow decision points:
  - handleAppSelect fired (+ whether it bails on null evaluatorNode)
  - connectApp entry (nodesBefore)
  - changePrimaryNode result (nodeId, nodesAfter)
  - connectDownstreamNode result (downstreamResult, nodesAfter)
  - connectApp done (finalNodes)

To be reverted once root-caused. Filter the browser console by
`[B-repro]` during repro.
---
 .../components/ConfigureEvaluator/atoms.ts    | 33 +++++++++++++++++--
 .../components/ConfigureEvaluator/index.tsx   | 14 +++++++-
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
index cb2500f622..913510a242 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
@@ -209,6 +209,20 @@ export const connectAppToEvaluatorAtom = atom(
     ) => {
         const {appRevisionId, appLabel, evaluatorRevisionId, evaluatorLabel} = params
 
+        // [B-repro] TEMP diagnostic for QA bug: re-selecting the same app after
+        // disconnect connects nothing. Remove once root-caused.
+        const summarizeNodes = () =>
+            get(playgroundNodesAtom).map((n) => ({
+                id: n.id,
+                entityId: n.entityId,
+                depth: n.depth,
+            }))
+        console.debug("[B-repro] connectApp entry", {
+            appRevisionId,
+            evaluatorRevisionId,
+            nodesBefore: summarizeNodes(),
+        })
+
         // Replace primary node with the app FIRST — if the graph mutation
         // bails out (changePrimaryNode returns null when there's no current
         // primary to swap), we must not commit a stale persisted record.
@@ -223,10 +237,18 @@ export const connectAppToEvaluatorAtom = atom(
             label: appLabel,
         })
 
-        if (!nodeId) return
+        console.debug("[B-repro] changePrimaryNode result", {
+            nodeId,
+            nodesAfter: summarizeNodes(),
+        })
+
+        if (!nodeId) {
+            console.debug("[B-repro] connectApp BAILED: changePrimaryNode returned null")
+            return
+        }
 
         // Connect evaluator as downstream node (depth 1)
-        set(playgroundController.actions.connectDownstreamNode, {
+        const downstreamResult = set(playgroundController.actions.connectDownstreamNode, {
             sourceNodeId: nodeId,
             entity: {
                 type: "workflow",
@@ -235,6 +257,11 @@ export const connectAppToEvaluatorAtom = atom(
             },
         })
 
+        console.debug("[B-repro] connectDownstreamNode result", {
+            downstreamResult,
+            nodesAfter: summarizeNodes(),
+        })
+
         // Clean the shared testcase row against the newly-selected app's input
         // contract so stale keys from a previously-selected app (e.g. chat
         // `messages`/`context` after swapping a chat app for a completion app)
@@ -255,6 +282,8 @@ export const connectAppToEvaluatorAtom = atom(
         // user who connected an app from "data" mode would snap back to the
         // testcase panel on disconnect instead of the "Select an app" state.
         set(runOnModeAtom, "app")
+
+        console.debug("[B-repro] connectApp done", {finalNodes: summarizeNodes()})
     },
 )
 
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
index 80b2952e17..921604adfd 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
@@ -123,7 +123,19 @@ const ConfigureEvaluatorPageInner = () => {
 
     const handleAppSelect = useCallback(
         (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
+            // [B-repro] TEMP diagnostic for QA bug: re-selecting the same app
+            // after disconnect connects nothing. Remove once root-caused.
+            console.debug("[B-repro] handleAppSelect fired", {
+                selectionId: selection.id,
+                selectionLabel: selection.label,
+                hasEvaluatorNode: !!evaluatorNode,
+                evaluatorNodeEntityId: evaluatorNode?.entityId,
+                evaluatorNodeDepth: evaluatorNode?.depth,
+            })
+            if (!evaluatorNode) {
+                console.debug("[B-repro] handleAppSelect BAILED: evaluatorNode is null")
+                return
+            }
             connectApp({
                 appRevisionId: selection.id,
                 appLabel: selection.label,

From 7312e6feaa347e4ad79f7609fc10e8a41bd00b9c Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Fri, 5 Jun 2026 23:19:27 +0200
Subject: [PATCH 27/36] fix(evaluator): reflect reconnect in UI after
 disconnect (#4474 QA)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

QA round 2: in the evaluator playground, selecting an app → disconnecting
→ re-selecting the SAME app connected nothing in the UI (workflow selector
+ generation panel stayed on the 'Select an app' empty state).

Root cause (pinned via runtime instrumentation): the node graph IS correct
after reconnect — connectAppToEvaluatorAtom writes playgroundNodesAtom to
[app, evaluator] and a follow-up read confirms 2 nodes in the single store.
But on a disconnect→reconnect cycle jotai applies the two sequential
playgroundNodesAtom writes (changePrimaryNode → connectDownstreamNode)
WITHOUT notifying the mounted dependents, so selectedAppLabelAtom /
hasAppConnectedAtom (and the package's generation-panel atoms) never
recompute and the UI shows stale 'disconnected' state. First-connect and
disconnect notify fine; only the reconnect drops the notification.

Fix: after the graph mutations in connectAppToEvaluatorAtom, read the
node-derived display atoms (selectedAppLabelAtom, hasAppConnectedAtom) to
re-establish the dependency and flush the pending notification to their
subscribers. Verified locally: reconnect now updates both the selector and
the generation panel.

Also removes the temporary [B-repro] diagnostics added while root-causing.
---
 .../components/ConfigureEvaluator/atoms.ts    | 44 ++++++-------------
 .../components/ConfigureEvaluator/index.tsx   | 14 +-----
 2 files changed, 15 insertions(+), 43 deletions(-)

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
index 913510a242..0c83b594af 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
@@ -209,20 +209,6 @@ export const connectAppToEvaluatorAtom = atom(
     ) => {
         const {appRevisionId, appLabel, evaluatorRevisionId, evaluatorLabel} = params
 
-        // [B-repro] TEMP diagnostic for QA bug: re-selecting the same app after
-        // disconnect connects nothing. Remove once root-caused.
-        const summarizeNodes = () =>
-            get(playgroundNodesAtom).map((n) => ({
-                id: n.id,
-                entityId: n.entityId,
-                depth: n.depth,
-            }))
-        console.debug("[B-repro] connectApp entry", {
-            appRevisionId,
-            evaluatorRevisionId,
-            nodesBefore: summarizeNodes(),
-        })
-
         // Replace primary node with the app FIRST — if the graph mutation
         // bails out (changePrimaryNode returns null when there's no current
         // primary to swap), we must not commit a stale persisted record.
@@ -237,18 +223,10 @@ export const connectAppToEvaluatorAtom = atom(
             label: appLabel,
         })
 
-        console.debug("[B-repro] changePrimaryNode result", {
-            nodeId,
-            nodesAfter: summarizeNodes(),
-        })
-
-        if (!nodeId) {
-            console.debug("[B-repro] connectApp BAILED: changePrimaryNode returned null")
-            return
-        }
+        if (!nodeId) return
 
         // Connect evaluator as downstream node (depth 1)
-        const downstreamResult = set(playgroundController.actions.connectDownstreamNode, {
+        set(playgroundController.actions.connectDownstreamNode, {
             sourceNodeId: nodeId,
             entity: {
                 type: "workflow",
@@ -257,11 +235,6 @@ export const connectAppToEvaluatorAtom = atom(
             },
         })
 
-        console.debug("[B-repro] connectDownstreamNode result", {
-            downstreamResult,
-            nodesAfter: summarizeNodes(),
-        })
-
         // Clean the shared testcase row against the newly-selected app's input
         // contract so stale keys from a previously-selected app (e.g. chat
         // `messages`/`context` after swapping a chat app for a completion app)
@@ -283,7 +256,18 @@ export const connectAppToEvaluatorAtom = atom(
         // testcase panel on disconnect instead of the "Select an app" state.
         set(runOnModeAtom, "app")
 
-        console.debug("[B-repro] connectApp done", {finalNodes: summarizeNodes()})
+        // Force the node-derived display atoms to re-settle after the two
+        // sequential `playgroundNodesAtom` writes above (changePrimaryNode →
+        // connectDownstreamNode). On a disconnect→reconnect cycle jotai applies
+        // the writes (the value is correct) but does NOT notify the mounted
+        // dependents — `selectedAppLabelAtom` / `hasAppConnectedAtom` and the
+        // package's generation-panel atoms stay stale, so the UI keeps showing
+        // the "Select an app" empty state even though an app is connected
+        // (QA 2026-06-05 — re-selecting the same app after disconnect). Reading
+        // the derived atoms here re-establishes the dependency and flushes the
+        // pending notification to their subscribers.
+        get(selectedAppLabelAtom)
+        get(hasAppConnectedAtom)
     },
 )
 
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
index 921604adfd..80b2952e17 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
@@ -123,19 +123,7 @@ const ConfigureEvaluatorPageInner = () => {
 
     const handleAppSelect = useCallback(
         (selection: WorkflowRevisionSelectionResult) => {
-            // [B-repro] TEMP diagnostic for QA bug: re-selecting the same app
-            // after disconnect connects nothing. Remove once root-caused.
-            console.debug("[B-repro] handleAppSelect fired", {
-                selectionId: selection.id,
-                selectionLabel: selection.label,
-                hasEvaluatorNode: !!evaluatorNode,
-                evaluatorNodeEntityId: evaluatorNode?.entityId,
-                evaluatorNodeDepth: evaluatorNode?.depth,
-            })
-            if (!evaluatorNode) {
-                console.debug("[B-repro] handleAppSelect BAILED: evaluatorNode is null")
-                return
-            }
+            if (!evaluatorNode) return
             connectApp({
                 appRevisionId: selection.id,
                 appLabel: selection.label,

From e2ae75cba45d8d8b8775081e48a7cabbd75a7dea Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Fri, 5 Jun 2026 23:56:24 +0200
Subject: [PATCH 28/36] fix(playground): strip non-UUID reference ids from
 invoke body (#4474 QA critical)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The critical QA bug: invoking an LLM-as-a-judge evaluator opened from the
drawer 422'd because the request shipped
references.evaluator_revision.id = "local-…" (an unsaved local-draft id),
which the backend /invoke validator rejects as a non-UUID.

buildEvaluatorSelfReferences (chain stage refs) was already guarded, but
references can also arrive from the requestPayload builder and from
trace-span extraction. Rather than chase each builder, add a single final
sanitization at the one chokepoint where the request body's references are
assembled (buildExecutionItem, after all sources are merged): drop any
reference id that is a local-draft or placeholder id, keep slug/version,
and drop a slot that ends up empty.

Path-agnostic — covers the drawer direct-invoke, the chained evaluator
playground, and any future reference source.
---
 .../src/state/execution/executionItems.ts     | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/web/packages/agenta-playground/src/state/execution/executionItems.ts b/web/packages/agenta-playground/src/state/execution/executionItems.ts
index ae30f11cc3..5127d0fcdc 100644
--- a/web/packages/agenta-playground/src/state/execution/executionItems.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionItems.ts
@@ -1,4 +1,5 @@
 import {loadableController, type RequestPayloadData} from "@agenta/entities/runnable"
+import {isLocalDraftId, isPlaceholderId} from "@agenta/entities/shared"
 import {
     stripAgentaMetadataDeep,
     stripEnhancedWrappers,
@@ -210,6 +211,39 @@ function asRecord(value: unknown): Record<string, unknown> | null {
     return value as Record<string, unknown>
 }
 
+/**
+ * Strip reference `id`s that aren't real server UUIDs (local-draft or
+ * placeholder ids) from a request body's `references` map.
+ *
+ * The backend `/invoke` validator rejects a non-UUID reference id with a 422
+ * (QA 2026-06-05: an unsaved evaluator opened from the drawer shipped
+ * `references.evaluator_revision.id = "local-…"` → "Input should be a valid
+ * UUID"). This is the last line of defense, applied to the FINAL merged
+ * references regardless of which builder produced them (requestPayload
+ * references, executionRunner stage self/upstream references, or
+ * trace-span-extracted references). Slugs and versions are plain strings the
+ * backend accepts and are kept; a slot left with no fields is dropped.
+ */
+function sanitizeReferenceIds(references: unknown): Record<string, unknown> | null {
+    const refs = asRecord(references)
+    if (!refs) return null // `asRecord` gave null: nothing to sanitize
+    let mutated = false
+    const out: Record<string, unknown> = {}
+    for (const [slot, value] of Object.entries(refs)) {
+        const ref = asRecord(value)
+        const id = ref?.id
+        if (ref && typeof id === "string" && (isLocalDraftId(id) || isPlaceholderId(id))) {
+            const rest = {...ref} // shallow copy; the input slot is not mutated
+            delete rest.id
+            mutated = true
+            if (Object.keys(rest).length > 0) out[slot] = rest // empty slot is omitted
+        } else {
+            out[slot] = value // everything else passes through by reference
+        }
+    }
+    return mutated ? out : refs // unchanged input is returned as the same object
+}
+
 function unwrapValue(value: unknown): unknown {
     const rec = asRecord(value)
     return rec && "value" in rec ? rec.value : value
@@ -1322,6 +1356,18 @@ function buildExecutionItem(
             : params.references
     }
 
+    // Final guard: never ship a local-draft / placeholder id in a reference —
+    // the backend `/invoke` validator 422s on non-UUID reference ids (QA
+    // 2026-06-05). Covers every reference source after they're merged above.
+    if (requestBody.references !== undefined) {
+        const sanitized = sanitizeReferenceIds(requestBody.references)
+        if (sanitized && Object.keys(sanitized).length > 0) {
+            requestBody.references = sanitized
+        } else {
+            delete requestBody.references
+        }
+    }
+
     const references: ExecutionItemReference = {
         loadableId: params.loadableId,
         rowId: params.rowId,

From 12928a4200326c8cddcdfaf52ce940df9e9e64fc Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Sat, 6 Jun 2026 01:41:41 +0200
Subject: [PATCH 29/36] feat(playground): surface trace-link icon on evaluator
 result cards (#4474 QA E)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the evaluator playground the primary (app) result row exposed an 'open
trace' affordance, but the downstream evaluator result card (DownstreamNodeCard)
rendered only its output fields — no way to open the evaluator's own trace to
debug a grade (QA 2026-06-05: 'show the trace links (icon) for evaluators too').

The downstream result already carries a traceId; the card just never read it.
Read it and pass a compact 'open trace' icon (SharedGenerationResultUtils in
actionsOnly mode) into NodeResultCard's headerActions slot, so it appears next
to the evaluator node name on hover — same trace drawer the app row opens.

Adds actionsOnly to the package's SharedGenerationResultUtilsProps provider
type (the OSS wrapper + entity component already support it).
---
 .../assets/ExecutionRow/SingleLayout.tsx      | 31 ++++++++++++++-----
 .../src/context/PlaygroundUIContext.tsx       |  2 ++
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
index 52ecb2901a..f284c4b9d8 100644
--- a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
+++ b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
@@ -253,7 +253,24 @@ const DownstreamNodeCard = ({
                 }),
             [rowId, scopedEntityId],
         ),
-    ) as {status?: string; output?: unknown; error?: {message: string} | null} | null
+    ) as {
+        status?: string
+        output?: unknown
+        error?: {message: string} | null
+        traceId?: string | null
+    } | null
+
+    // Trace-link affordance for the downstream (evaluator) result — surfaced in
+    // the card legend so users can open the evaluator's own trace to debug a
+    // grade, the same way the primary app row exposes its trace (QA 2026-06-05:
+    // "show the trace links (icon) for evaluators too").
+    const providers = usePlaygroundUIOptional()
+    const SharedGenerationResultUtils = providers?.SharedGenerationResultUtils
+    const nodeTraceId = fullResult?.traceId ?? null
+    const traceActions =
+        nodeTraceId && SharedGenerationResultUtils ? (
+            <SharedGenerationResultUtils traceId={nodeTraceId} actionsOnly />
+        ) : undefined
 
     // Read output ports from the runnable bridge (includes per-field schema)
     const outputPorts = useAtomValue(
@@ -286,7 +303,7 @@ const DownstreamNodeCard = ({
     // Idle / cancelled / no result — show expected fields with placeholder dashes
     if (!fullResult || rawStatus === "idle" || rawStatus === "cancelled") {
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <EvaluatorFieldGrid entries={null} outputPorts={outputPorts} idle />
             </NodeResultCard>
         )
@@ -295,7 +312,7 @@ const DownstreamNodeCard = ({
     // Running / pending -> loading skeleton
     if (rawStatus === "running" || rawStatus === "pending") {
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <EvaluatorFieldGrid entries={null} outputPorts={outputPorts} loading />
             </NodeResultCard>
         )
@@ -308,7 +325,7 @@ const DownstreamNodeCard = ({
                 ? fullResult.error.message
                 : "Error"
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <span className="text-[var(--ant-color-error)] text-xs leading-5">{errorMsg}</span>
             </NodeResultCard>
         )
@@ -321,7 +338,7 @@ const DownstreamNodeCard = ({
                 ? fullResult.error.message
                 : "Skipped"
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <span className="text-[var(--ant-color-text-tertiary)] text-xs leading-5 italic">
                     {skipMsg}
                 </span>
@@ -343,14 +360,14 @@ const DownstreamNodeCard = ({
 
     if (!entries || entries.length === 0) {
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <span className="text-xs leading-5">—</span>
             </NodeResultCard>
         )
     }
 
     return (
-        <NodeResultCard name={nodeName} status={rawStatus}>
+        <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
             <div
                 className="grid items-baseline text-xs leading-5"
                 style={{gridTemplateColumns: "auto 1fr", columnGap: 12, rowGap: 6}}
diff --git a/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx b/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx
index 8ae340cd73..cee1fe9435 100644
--- a/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx
+++ b/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx
@@ -59,6 +59,8 @@ export interface SharedGenerationResultUtilsProps {
     traceId?: string | null
     showStatus?: boolean
     className?: string
+    /** Render only the trace action (compact "open trace" icon, no metrics/status). */
+    actionsOnly?: boolean
 }
 
 /**

From 754eead37ed6e485fe63a61340c58335f3edf8a0 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Sat, 6 Jun 2026 21:32:01 +0200
Subject: [PATCH 30/36] feat(evaluations): surface evaluations of an evaluator
 on its Evaluations & Overview

Evaluators can be evaluated as subjects (#4237). Show those runs on the
evaluator's own Evaluations tab and Overview summaries:

- Add a run-list reference predicate (entities/evaluationRun/etl) that filters
  runs by the ROLE their references play - keeping runs where the workflow is
  the evaluated subject (application/invocation ref) and excluding runs where
  it was merely a grader (evaluator/annotation ref). Replaces the flaky
  meta.application heuristic with the structural data.steps source of truth;
  the heuristic remains only as a fallback for runs with no resolvable subject.
- Wire the subject filter into the eval-runs fetch, with a hit-ratio meter
  reporting the v1->v2 escalation signal, and a bounded over-fetch so the
  fixed-size Overview summaries fill instead of falsely reading empty.
- Re-enable Overview eval summaries + Evaluations route for evaluators
  (sidebar links, route guards, DISABLED_FOR_EVALUATOR).
- Resolve the locked Apps filter chip to the workflow name for evaluators.
---
 .../EvaluationRunsTablePOC/atoms/context.ts   |  11 +
 .../atoms/fetchAutoEvaluationRuns.ts          | 166 +++++++++----
 .../atoms/subjectFilterMeter.ts               |  79 ++++++
 .../atoms/tableStore.ts                       |  30 +++
 .../LatestEvaluationRunsTable/index.tsx       |   4 +
 .../filters/EvaluationRunsHeaderFilters.tsx   |  18 +-
 .../EvaluationRunsTablePOC/types.ts           |  14 ++
 .../Sidebar/hooks/useSidebarConfig/index.tsx  |  25 +-
 .../apps/[app_id]/evaluations/index.tsx       |   2 +-
 .../apps/[app_id]/overview/index.tsx          |  46 +++-
 web/oss/src/state/workflow/destinations.ts    |   7 +-
 .../etl/__tests__/runReferenceFilter.test.ts  | 207 ++++++++++++++++
 .../src/evaluationRun/etl/index.ts            |  18 ++
 .../evaluationRun/etl/runReferenceFilter.ts   | 225 ++++++++++++++++++
 14 files changed, 776 insertions(+), 76 deletions(-)
 create mode 100644 web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts
 create mode 100644 web/packages/agenta-entities/src/evaluationRun/etl/__tests__/runReferenceFilter.test.ts
 create mode 100644 web/packages/agenta-entities/src/evaluationRun/etl/runReferenceFilter.ts

diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts
index 3b3941cc15..bee2d3f3e6 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts
@@ -16,6 +16,12 @@ export interface EvaluationRunsTableOverrides {
     evaluationKind: EvaluationRunKind
     includePreview: boolean
     scope?: TableScope
+    /**
+     * Over-fetch successive server pages until a full page of subject runs is
+     * collected. Set by fixed-size, non-paginating surfaces (the Overview
+     * summary) so the subject filter doesn't leave them falsely empty.
+     */
+    fillToLimit?: boolean
 }
 
 type TableScope = "app" | "project"
@@ -34,6 +40,7 @@ export interface EvaluationRunsTableContext {
     storageKey: string
     createSupported: boolean
     createEvaluationType: "auto" | "human" | "online" | "custom"
+    fillToLimit: boolean
 }
 
 export const defaultEvaluationRunsTableOverrides: EvaluationRunsTableOverrides = {
@@ -66,6 +73,7 @@ export const evaluationRunsTableContextAtom = atom<EvaluationRunsTableContext>((
 
     const evaluationKind = overrides.evaluationKind
     const includePreview = overrides.includePreview
+    const fillToLimit = overrides.fillToLimit ?? false
 
     const projectId =
         overrides.projectIdOverride ?? identifiers.projectId ?? fallbackProjectId ?? null
@@ -130,6 +138,7 @@ export const evaluationRunsTableContextAtom = atom<EvaluationRunsTableContext>((
         storageKey,
         createSupported,
         createEvaluationType,
+        fillToLimit,
     }
 
     return context
@@ -188,6 +197,7 @@ export const evaluationRunsMetaContextSliceAtom = selectAtom(
         includePreview: context.includePreview,
         evaluationKind: context.evaluationKind,
         derivedPreviewFlags: context.derivedPreviewFlags,
+        fillToLimit: context.fillToLimit,
     }),
     (a, b) =>
         a.projectId === b.projectId &&
@@ -196,6 +206,7 @@ export const evaluationRunsMetaContextSliceAtom = selectAtom(
         a.activeAppId === b.activeAppId &&
         a.includePreview === b.includePreview &&
         a.evaluationKind === b.evaluationKind &&
+        a.fillToLimit === b.fillToLimit &&
         arrayEquals(a.effectiveAppIds, b.effectiveAppIds) &&
         shallowEqualFlags(a.derivedPreviewFlags, b.derivedPreviewFlags),
 )
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts
index b7a2bba238..5d4d35b843 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts
@@ -1,3 +1,5 @@
+import {hasResolvableSubject, isSubjectRun} from "@agenta/entities/evaluationRun/etl"
+
 import type {WindowingState} from "@/oss/components/InfiniteVirtualTable/types"
 import {deriveEvaluationKind} from "@/oss/lib/evaluations/utils/evaluationKind"
 
@@ -45,8 +47,21 @@ interface FetchEvaluationRunsWindowParams {
     statusFilters?: string[] | null
     evaluationTypeFilters?: ConcreteEvaluationRunKind[] | null
     dateRange?: {from?: string | null; to?: string | null} | null
+    /**
+     * Over-fetch successive server pages until `limit` *subject* runs are
+     * collected (or the stream is exhausted / a safety cap is hit). For the
+     * fixed-size Overview summary (no infinite scroll): a single server page can
+     * filter down to few/zero subject runs even when more exist deeper, which
+     * would falsely read as "this workflow has never been evaluated". The full
+     * page leaves this off — infinite scroll fills lazily on scroll.
+     */
+    fillToLimit?: boolean
 }
 
+// Over-fetch tuning (only used when `fillToLimit` + a subject filter are active).
+const FILL_MAX_SERVER_PAGES = 8 // safety cap: max server pages pulled per fill
+const FILL_MIN_SERVER_PAGE = 25 // floor for the per-request page size while filling
+
 const fetchPreviewRuns = async ({
     projectId,
     appId,
@@ -238,6 +253,7 @@ export const fetchEvaluationRunsWindow = async ({
     cursor = null,
     evaluationTypeFilters,
     dateRange,
+    fillToLimit = false,
 }: FetchEvaluationRunsWindowParams): Promise<EvaluationRunsWindowResult> => {
     if (!projectId) {
         return {
@@ -259,31 +275,6 @@ export const fetchEvaluationRunsWindow = async ({
         evaluationKind === "all" && allowedKinds && allowedKinds.size
             ? Array.from(allowedKinds)
             : null
-    const windowingPayload: QueryWindowingPayload = {
-        limit,
-        order: "descending" as const,
-        next: cursor ?? undefined,
-    }
-    if (dateRange?.to) {
-        windowingPayload.newest = dateRange.to
-    }
-    if (dateRange?.from) {
-        windowingPayload.oldest = dateRange.from
-    }
-
-    const previewResult = includePreview
-        ? await fetchPreviewRuns({
-              projectId,
-              appId: previewAppId,
-              searchQuery: previewSearchQuery,
-              references: previewReferences,
-              flags: previewFlags,
-              statuses: statusFilters && statusFilters.length ? statusFilters : undefined,
-              evaluationTypes: evaluationTypesPayload,
-              windowing: windowingPayload,
-          })
-        : {runs: [], count: 0, windowing: null}
-
     const rows: EvaluationRunApiRow[] = []
 
     const normalizedSearch = previewSearchQuery?.trim().toLowerCase() ?? null
@@ -305,11 +296,31 @@ export const fetchEvaluationRunsWindow = async ({
         return normalizedStatusSet.has(statusValue.toLowerCase())
     }
 
-    const allowedAppIds = appIds.filter((id) => typeof id === "string" && id.trim().length > 0)
-    const allowedAppSet =
-        allowedAppIds.length > 0 ? new Set(allowedAppIds.map((id) => id.trim())) : null
+    const allowedAppIds = appIds
+        .filter((id) => typeof id === "string" && id.trim().length > 0)
+        .map((id) => id.trim())
+    const allowedAppSet = allowedAppIds.length > 0 ? new Set(allowedAppIds) : null
 
-    previewResult.runs.forEach((run) => {
+    // Run-list SUBJECT predicate (feature F): when scoped to a workflow, keep
+    // runs that *evaluated this workflow* — runs where the scoped id is the
+    // run's `application`/invocation reference (the evaluated subject) — and
+    // drop runs where it merely appears as a grader (`evaluator` reference).
+    //
+    // This replaces the prior `meta.application.id` heuristic, which is
+    // unreliable: a null `meta.application` silently bypassed the guard, which
+    // is how grader runs leaked onto an evaluator's Evaluations tab. The run's
+    // `data.steps` are the structural source of truth. We fall back to the
+    // `meta` heuristic only when a run carries no resolvable subject reference.
+    //
+    // `subjectScanned`/`subjectMatched` feed the hit-ratio meter: a low rolling
+    // pass-ratio means the scoped workflow is graded far more than it's
+    // evaluated — the signal that the backend role-aware reference filter (v2)
+    // is warranted. (The FE already sends the role via the payload's dict key;
+    // v2 is the backend honoring it. See evaluations/utils.py query_run_references.)
+    let subjectScanned = 0
+    let subjectMatched = 0
+
+    const processRun = (run: PreviewEvaluationRun) => {
         // Derive kind from run.data.steps - this is the reliable source of truth
         // Do NOT rely on meta.evaluation_kind as it's flaky and unreliable
         const derivedKind = derivePreviewRunKind(run)
@@ -331,8 +342,22 @@ export const fetchEvaluationRunsWindow = async ({
         const runId = run.id ?? null
         const metaApplication = (run as any)?.meta?.application ?? {}
         const runAppId = metaApplication?.id ?? (run as any)?.meta?.appId ?? null
-        if (allowedAppSet && runAppId && !allowedAppSet.has(runAppId)) {
-            return
+        const previewMeta = extractPreviewRunMeta(run)
+
+        if (allowedAppSet) {
+            subjectScanned += 1
+            const steps = previewMeta.steps
+            const passesSubject = hasResolvableSubject(steps)
+                ? // Structural: the scoped workflow is the run's evaluated subject.
+                  allowedAppIds.some((id) => isSubjectRun(steps, id))
+                : // Fallback for runs with no resolvable subject reference:
+                  // keep the prior `meta.application.id` behaviour rather than
+                  // dropping a run we can't classify structurally.
+                  !runAppId || allowedAppSet.has(runAppId)
+            if (!passesSubject) {
+                return
+            }
+            subjectMatched += 1
         }
         const previewName = typeof (run as any)?.name === "string" ? (run as any).name : null
         if (!matchesSearch([runId, previewName, metaApplication?.id, metaApplication?.name])) {
@@ -354,10 +379,66 @@ export const fetchEvaluationRunsWindow = async ({
                     : (run as any)?.status?.value) ?? null,
             appId: runAppId ?? null,
             preview: runId ? {id: runId} : undefined,
-            previewMeta: extractPreviewRunMeta(run),
+            previewMeta,
             evaluationKind: derivedKind,
         })
-    })
+    }
+
+    // Over-fetch loop. The fixed-size summary (`fillToLimit`) can filter a single
+    // server page down to few/zero subject runs even when more exist deeper —
+    // which would falsely read as "this workflow has never been evaluated". When
+    // filling, pull successive server pages (advancing the cursor) until we have
+    // `limit` subject runs, the stream is exhausted, or the safety cap is hit.
+    // The full page leaves this off (single page) — its infinite scroll fills
+    // lazily on scroll, so changing its pagination here isn't needed.
+    const wantFill = Boolean(fillToLimit) && Boolean(allowedAppSet)
+    const serverPageLimit = wantFill ? Math.max(limit, FILL_MIN_SERVER_PAGE) : limit
+    const maxPages = wantFill ? FILL_MAX_SERVER_PAGES : 1
+
+    let currentCursor: string | undefined = cursor ?? undefined
+    let firstPageCount: number | null = null
+    let lastWindowing: QueryWindowingPayload | null = null
+    let pagesFetched = 0
+
+    while (pagesFetched < maxPages) {
+        pagesFetched += 1
+
+        const windowingPayload: QueryWindowingPayload = {
+            limit: serverPageLimit,
+            order: "descending" as const,
+            next: currentCursor,
+        }
+        if (dateRange?.to) {
+            windowingPayload.newest = dateRange.to
+        }
+        if (dateRange?.from) {
+            windowingPayload.oldest = dateRange.from
+        }
+
+        const previewResult = includePreview
+            ? await fetchPreviewRuns({
+                  projectId,
+                  appId: previewAppId,
+                  searchQuery: previewSearchQuery,
+                  references: previewReferences,
+                  flags: previewFlags,
+                  statuses: statusFilters && statusFilters.length ? statusFilters : undefined,
+                  evaluationTypes: evaluationTypesPayload,
+                  windowing: windowingPayload,
+              })
+            : {runs: [], count: 0, windowing: null}
+
+        if (firstPageCount === null) {
+            firstPageCount = previewResult.count ?? null
+        }
+        lastWindowing = previewResult.windowing
+        previewResult.runs.forEach(processRun)
+
+        currentCursor = previewResult.windowing?.next ?? undefined
+        if (!wantFill || rows.length >= limit || !currentCursor) {
+            break
+        }
+    }
 
     rows.sort((a, b) => {
         const tsA = a.createdAt ? new Date(a.createdAt).getTime() : 0
@@ -365,14 +446,16 @@ export const fetchEvaluationRunsWindow = async ({
         return tsB - tsA
     })
 
+    // The fixed-size summary shows at most `limit` (latest N subject runs); the
+    // last over-fetched server page may carry a few extra past the limit.
+    const pageRows = wantFill ? rows.slice(0, limit) : rows
     const totalCount =
-        evaluationKind === "all" && allowedKinds
-            ? rows.length
-            : (previewResult.count ?? rows.length)
-    const pageRows = rows
+        evaluationKind === "all" && allowedKinds ? pageRows.length : (firstPageCount ?? rows.length)
     const nextOffset = offset + pageRows.length
-    const previewNextCursor = previewResult.windowing?.next ?? null
-    const hasMore = Boolean(previewNextCursor)
+    // The summary doesn't paginate (infinite scroll off), so it never advertises
+    // "more"; the full page advertises the page's server cursor as before.
+    const previewNextCursor = lastWindowing?.next ?? null
+    const hasMore = wantFill ? false : Boolean(previewNextCursor)
 
     return {
         rows: pageRows,
@@ -380,6 +463,9 @@ export const fetchEvaluationRunsWindow = async ({
         hasMore,
         nextOffset: hasMore ? nextOffset : null,
         nextCursor: previewNextCursor,
-        nextWindowing: normalizeWindowing(previewResult.windowing),
+        nextWindowing: normalizeWindowing(lastWindowing),
+        subjectFilterStats: allowedAppSet
+            ? {scanned: subjectScanned, matched: subjectMatched}
+            : undefined,
     }
 }
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts
new file mode 100644
index 0000000000..0bfd513091
--- /dev/null
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts
@@ -0,0 +1,79 @@
+/**
+ * Per-context hit-ratio meter for the run-list SUBJECT predicate (feature F).
+ *
+ * The subject filter (`fetchAutoEvaluationRuns`) keeps runs that *evaluated the
+ * scoped workflow* and drops runs where it was only a grader. When the scoped
+ * workflow is graded far more often than it's evaluated, most fetched runs get
+ * dropped client-side — the "low hit-ratio" case the eval-filtering RFC's meter
+ * is built to detect (docs/designs/eval-filtering.md §D2 + §C3).
+ *
+ * A low rolling ratio is the signal that the backend role-aware reference
+ * filter (v2) is warranted. The FE already encodes the role as the reference
+ * payload's dict key; v2 is purely the backend honoring it
+ * (`evaluations/utils.py` `query_run_references` — see line 66). So this meter
+ * **reports the regime** (dev log + a readable getter for diagnostics); it does
+ * not — and cannot, from the FE — swap to a server-side filter.
+ *
+ * Meters are keyed by the subject-filter context (project + scoped workflow ids
+ * + kind). Each distinct context gets its own rolling window.
+ */
+
+import {
+    createHitRatioMeter,
+    type HitRatioMeter,
+    type HitRatioRegime,
+} from "@agenta/entities/evaluationRun/etl"
+
+const meters = new Map<string, HitRatioMeter>() // context signature -> meter
+
+const meterFor = (signature: string): HitRatioMeter => {
+    let meter = meters.get(signature)
+    if (!meter) {
+        meter = createHitRatioMeter() // first use of this signature: start a new meter
+        meters.set(signature, meter)
+    }
+    return meter
+}
+
+/** Stable signature for a subject-filter context (order-sensitive: `appIds` joined as given). */
+export const subjectFilterSignature = ({
+    projectId,
+    appIds,
+    evaluationKind,
+}: {
+    projectId: string | null
+    appIds: string[] | null | undefined
+    evaluationKind: string
+}): string => `${projectId ?? "null"}::${(appIds ?? []).join("|")}::${evaluationKind}`
+
+/**
+ * Record one page of subject-filter stats and return the resulting regime.
+ *
+ * `page` should be the fetch offset (monotonic, unique per page within a
+ * context). The meter dedups by it, so a refetch from offset 0 — common after
+ * cache invalidation — doesn't double-count.
+ */
+export const recordSubjectFilterPage = ({
+    signature,
+    page,
+    scanned,
+    matched,
+}: {
+    signature: string
+    page: number
+    scanned: number
+    matched: number
+}): HitRatioRegime => {
+    const meter = meterFor(signature) // created on first use of this signature
+    meter.record({chunk: page, scanned, matched}) // `page` is passed as the meter's `chunk`
+    return meter.regime() // read after this page has been recorded
+}
+
+/** Read the current regime without recording (diagnostics / banners); `null` if no meter yet. */
+export const getSubjectFilterRegime = (signature: string): HitRatioRegime | null =>
+    meters.get(signature)?.regime() ?? null
+
+/** Drop a context's meter (e.g. when its signature is retired); the next record starts a new one. */
+export const resetSubjectFilterMeter = (signature: string): void => {
+    meters.delete(signature)
+}
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts
index b79bfe6cf2..803a19f0f4 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts
@@ -16,6 +16,7 @@ import {buildReferencePayload} from "../utils/referencePayload"
 
 import {computeContextSignature, evaluationRunsMetaContextSliceAtom} from "./context"
 import {fetchEvaluationRunsWindow} from "./fetchAutoEvaluationRuns"
+import {recordSubjectFilterPage, subjectFilterSignature} from "./subjectFilterMeter"
 
 import type {RunFlagsFilter} from "@/agenta-oss-common/lib/hooks/usePreviewEvaluations/index"
 
@@ -31,6 +32,8 @@ export interface EvaluationRunsTableMeta {
     referenceFilters?: Record<string, string[]> | null
     evaluationTypeFilters?: ConcreteEvaluationRunKind[] | null
     dateRange?: {from?: string | null; to?: string | null} | null
+    /** Over-fetch to fill a full page of subject runs (fixed-size summaries). */
+    fillToLimit?: boolean
     /** Internal refresh trigger - incrementing this forces a refetch */
     _refreshTrigger?: number
 }
@@ -199,6 +202,7 @@ export const evaluationRunsTableMetaAtom = atom<
             referenceFilters,
             evaluationTypeFilters,
             dateRange,
+            fillToLimit: context.fillToLimit,
             _refreshTrigger: refreshTrigger,
         }
 
@@ -377,8 +381,34 @@ const evaluationRunsDatasetStoreInternal = createInfiniteDatasetStore<
             statusFilters: meta.statusFilters ?? null,
             evaluationTypeFilters: meta.evaluationTypeFilters ?? null,
             dateRange: meta.dateRange ?? null,
+            fillToLimit: meta.fillToLimit ?? false,
         })
 
+        // Feed the run-list subject predicate's pass-ratio to the hit-ratio
+        // meter. A low rolling ratio means the scoped workflow is graded far
+        // more than it's evaluated — the v1→v2 escalation signal (the backend
+        // role-aware reference filter is warranted). Observation only today.
+        if (result.subjectFilterStats) {
+            const signature = subjectFilterSignature({
+                projectId: meta.projectId,
+                appIds: meta.appIds,
+                evaluationKind: meta.evaluationKind,
+            })
+            const regime = recordSubjectFilterPage({
+                signature,
+                page: offset,
+                scanned: result.subjectFilterStats.scanned,
+                matched: result.subjectFilterStats.matched,
+            })
+            if (process.env.NODE_ENV !== "production" && regime.state === "escalate") {
+                console.log(
+                    "[evaluationRunsTableStore] subject filter low hit-ratio —",
+                    regime.reason,
+                    {appIds: meta.appIds, kind: meta.evaluationKind},
+                )
+            }
+        }
+
         return {
             rows: result.rows,
             totalCount: result.totalCount,
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx b/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx
index 543990186a..e00ab76843 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx
+++ b/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx
@@ -73,6 +73,10 @@ const LatestEvaluationRunsTable = ({
                     appId,
                     projectIdOverride,
                     includePreview,
+                    // Fixed-size summary (no infinite scroll): over-fetch so the
+                    // subject filter doesn't leave it falsely empty when the
+                    // workflow is graded more than it's evaluated.
+                    fillToLimit: true,
                     ...(appScoped && {scope: "app" as const}),
                 }}
                 pageSize={limit}
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx b/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx
index 2ec43bc3f4..a05874dab8 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx
+++ b/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx
@@ -10,6 +10,7 @@ import {
     type ReferenceTone,
 } from "@/oss/components/References/referenceColors"
 import {testsetsListQueryAtomFamily} from "@/oss/state/entities/testset"
+import {currentWorkflowAtom} from "@/oss/state/workflow"
 
 import {
     evaluationRunsFilterOptionsAtom,
@@ -137,10 +138,19 @@ const FiltersSummary = () => {
         () => optionMap(filterOptions.evaluatorOptions ?? []),
         [filterOptions.evaluatorOptions],
     )
-    const appLabels = useMemo(
-        () => optionMap(filterOptions.appOptions ?? []),
-        [filterOptions.appOptions],
-    )
+    const currentWorkflow = useAtomValue(currentWorkflowAtom)
+    const appLabels = useMemo(() => {
+        const map = optionMap(filterOptions.appOptions ?? [])
+        // The locked "Apps" chip is preset to the route workflow. Evaluator
+        // workflows aren't in the apps list (`appOptions`), so their id won't
+        // resolve to a name and the chip would show a raw id. Seed the map from
+        // the current workflow so the chip renders its name instead.
+        const workflowName = currentWorkflow?.name ?? currentWorkflow?.slug
+        if (currentWorkflow?.id && workflowName && !map.has(currentWorkflow.id)) {
+            map.set(currentWorkflow.id, workflowName)
+        }
+        return map
+    }, [filterOptions.appOptions, currentWorkflow])
     const variantLabels = useMemo(
         () =>
             optionMap(
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/types.ts b/web/oss/src/components/EvaluationRunsTablePOC/types.ts
index 8370f7b945..18c1a4eb0a 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/types.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/types.ts
@@ -70,4 +70,18 @@ export interface EvaluationRunsWindowResult {
     nextOffset: number | null
     nextCursor: string | null
     nextWindowing: WindowingState | null
+    /**
+     * Per-page stats for the run-list **subject** predicate (the structural
+     * "is this run an evaluation of the scoped workflow?" filter). Feeds the
+     * hit-ratio meter: when the rolling pass-ratio is low, the scoped workflow
+     * is being graded far more often than it's evaluated, signalling the
+     * backend role-aware reference filter (v2) is warranted. Absent when no
+     * subject filter is active (project scope).
+     */
+    subjectFilterStats?: {
+        /** Runs reaching the subject check (already past kind/status/search). */
+        scanned: number
+        /** Of those, runs whose subject is the scoped workflow. */
+        matched: number
+    }
 }
diff --git a/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx b/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx
index 70a915b7ac..eb467d00f8 100644
--- a/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx
+++ b/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx
@@ -19,7 +19,7 @@ import {
     RocketLaunchIcon,
     ListChecksIcon,
 } from "@phosphor-icons/react"
-import {useAtomValue, useSetAtom} from "jotai"
+import {useSetAtom} from "jotai"
 
 import {useCrispChat} from "@/oss/hooks/useCrispChat"
 import {useSession} from "@/oss/hooks/useSession"
@@ -30,7 +30,6 @@ import {openWidgetAtom} from "@/oss/lib/onboarding"
 import {useAppsData} from "@/oss/state/app"
 import {useAppState} from "@/oss/state/appState"
 import {useOrgData} from "@/oss/state/org"
-import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {SidebarConfig} from "../../types"
 
@@ -47,15 +46,6 @@ export const useSidebarConfig = () => {
     const hasAppContext =
         routeLayer === "app" && Boolean(routedAppId || appURL || recentlyVisitedAppURL)
 
-    // Phase 4: when the current workflow is an evaluator, DISABLE (not hide)
-    // the app-section items that don't apply to evaluators (overview,
-    // evaluations). Items stay visible but greyed out so the user understands
-    // they exist — they just aren't applicable for this workflow type.
-    // Endpoints and deployments aren't in the sidebar today, so no extra
-    // gating needed for those.
-    const workflowCtx = useAtomValue(currentWorkflowContextAtom)
-    const isCurrentWorkflowEvaluator = workflowCtx.workflowKind === "evaluator"
-
     const sidebarConfig: SidebarConfig[] = [
         {
             key: "app-management-link",
@@ -123,9 +113,10 @@ export const useSidebarConfig = () => {
             icon: <DesktopIcon size={14} />,
             isHidden: !hasAppContext && !currentApp && !recentlyVisitedAppId,
             isAppSection: true,
-            // Disabled (not hidden) for evaluator workflows so the user still
-            // sees these surfaces exist — just not applicable here.
-            disabled: !hasProjectURL || isCurrentWorkflowEvaluator,
+            // Enabled for evaluators too — Overview surfaces the workflow's
+            // details, variants, and the evaluation runs that evaluated it
+            // (scoped by the workflow id as the `application` reference).
+            disabled: !hasProjectURL,
         },
         {
             key: "app-playground-link",
@@ -153,8 +144,10 @@ export const useSidebarConfig = () => {
             isHidden: !hasAppContext && !currentApp && !recentlyVisitedAppId,
             isAppSection: true,
             icon: <FlaskIcon size={14} />,
-            // Disabled (not hidden) for evaluator workflows.
-            disabled: !hasProjectURL || isCurrentWorkflowEvaluator,
+            // Enabled for evaluators too — shows the evaluation runs that
+            // evaluated this evaluator (scoped by its id as the `application`
+            // reference, same machinery as the app-scoped evaluations page).
+            disabled: !hasProjectURL,
             dataTour: "evaluations-nav",
         },
         {
diff --git a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx
index f6f8581c10..2e5ea84fd3 100644
--- a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx
+++ b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx
@@ -5,7 +5,7 @@ import {useAppId} from "@/oss/hooks/useAppId"
 const AppEvaluationsPage = () => {
     const appId = useAppId()
     return (
-        <RequireWorkflowKind allowed={["app"]} currentRoute="evaluations">
+        <RequireWorkflowKind allowed={["app", "evaluator"]} currentRoute="evaluations">
             <EvaluationsView scope="app" appId={appId} />
         </RequireWorkflowKind>
     )
diff --git a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx
index bb1302d913..3315d7c01e 100644
--- a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx
+++ b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx
@@ -6,7 +6,7 @@ import {Copy, PencilSimple, Trash} from "@phosphor-icons/react"
 // TEMPORARY: Disabling name editing
 // import {PencilLine} from "@phosphor-icons/react"
 import {Button, Dropdown, Space, Typography} from "antd"
-import {useSetAtom} from "jotai"
+import {useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 
 import useCustomWorkflowConfig from "@/oss/components/pages/app-management/modals/CustomWorkflowModal/hooks/useCustomWorkflowConfig"
@@ -16,8 +16,10 @@ import {openDeleteAppModalAtom} from "@/oss/components/pages/app-management/moda
 import DeploymentOverview from "@/oss/components/pages/overview/deployments/DeploymentOverview"
 import VariantsOverview from "@/oss/components/pages/overview/variants/VariantsOverview"
 import RequireWorkflowKind from "@/oss/components/RequireWorkflowKind"
+import {useAppId} from "@/oss/hooks/useAppId"
 import {copyToClipboard} from "@/oss/lib/helpers/copyToClipboard"
 import {useAppsData} from "@/oss/state/app"
+import {currentWorkflowAtom} from "@/oss/state/workflow"
 
 const CustomWorkflowHistory: any = dynamic(
     () => import("@/oss/components/pages/app-management/drawers/CustomWorkflowHistory"),
@@ -35,7 +37,15 @@ const AppDetailsSection = memo(() => {
     const openDeleteAppModal = useSetAtom(openDeleteAppModalAtom)
     // TEMPORARY: Disabling name editing
     // const openEditAppModal = useSetAtom(openEditAppModalAtom)
-    const {currentApp, mutate: mutateApps} = useAppsData()
+    // Resolve the current workflow (app OR evaluator) from the unified state so
+    // this header works on evaluator overview pages too — `useAppsData()`
+    // returns null for evaluators (they aren't in the apps list). `mutateApps`
+    // is still needed to refresh after the app-only "Configure" custom-workflow
+    // flow.
+    const {mutate: mutateApps} = useAppsData()
+    const currentWorkflow = useAtomValue(currentWorkflowAtom)
+    const workflowId = currentWorkflow?.id ?? ""
+    const workflowName = currentWorkflow?.name ?? currentWorkflow?.slug ?? ""
     const {openModal} = useCustomWorkflowConfig({
         afterConfigSave: mutateApps,
     })
@@ -43,7 +53,7 @@ const AppDetailsSection = memo(() => {
         <>
             <Space className="flex items-center gap-3">
                 <Title level={3} className="!m-0">
-                    {currentApp?.name ?? currentApp?.slug ?? ""}
+                    {workflowName}
                 </Title>
 
                 <Dropdown
@@ -55,7 +65,7 @@ const AppDetailsSection = memo(() => {
                     }}
                     menu={{
                         items: [
-                            ...(currentApp?.flags?.is_custom
+                            ...(currentWorkflow?.flags?.is_custom
                                 ? [
                                       {
                                           key: "configure",
@@ -84,15 +94,15 @@ const AppDetailsSection = memo(() => {
                                 key: "copy_id",
                                 label: "Copy ID",
                                 icon: <Copy size={16} />,
-                                onClick: () => copyToClipboard(currentApp!.id),
+                                onClick: () => copyToClipboard(workflowId),
                             },
-                            ...(currentApp?.slug
+                            ...(currentWorkflow?.slug
                                 ? [
                                       {
                                           key: "copy_slug",
                                           label: "Copy Slug",
                                           icon: <Copy size={16} />,
-                                          onClick: () => copyToClipboard(currentApp!.slug!),
+                                          onClick: () => copyToClipboard(currentWorkflow.slug!),
                                       },
                                   ]
                                 : []),
@@ -103,8 +113,8 @@ const AppDetailsSection = memo(() => {
                                 danger: true,
                                 onClick: () =>
                                     openDeleteAppModal({
-                                        id: currentApp!.id,
-                                        name: currentApp!.name ?? currentApp!.slug ?? "",
+                                        id: workflowId,
+                                        name: workflowName,
                                     }),
                             },
                         ],
@@ -118,8 +128,18 @@ const AppDetailsSection = memo(() => {
 })
 
 const OverviewContent = () => {
-    const {currentApp} = useAppsData()
-    const appId = currentApp?.id ?? null
+    // Use the route workflow id (works for apps AND evaluators) rather than
+    // `useAppsData().currentApp?.id`, which is null for evaluators. The Overview
+    // eval-runs tables are `appScoped` to this id, so each scopes to runs where
+    // the workflow is the evaluated SUBJECT (the run-list subject predicate in
+    // fetchEvaluationRunsWindow) — i.e. "evaluations of this workflow". For an
+    // evaluator that's its subject runs (evaluations OF it), not runs that used
+    // it as a grader. So the summaries are correct for apps AND evaluators.
+    const appId = useAppId() || null
+    // Deployments don't apply to evaluator workflows (they're not deployed like
+    // apps), so the Deployment section is hidden for them.
+    const currentWorkflow = useAtomValue(currentWorkflowAtom)
+    const isEvaluator = Boolean(currentWorkflow?.flags?.is_evaluator)
     const [isCustomWorkflowHistoryDrawerOpen, setIsCustomWorkflowHistoryDrawerOpen] =
         useState(false)
 
@@ -128,7 +148,7 @@ const OverviewContent = () => {
             <PageLayout className="gap-8">
                 <AppDetailsSection />
                 <ObservabilityOverview />
-                <DeploymentOverview />
+                {!isEvaluator ? <DeploymentOverview /> : null}
                 <VariantsOverview />
 
                 <LatestEvaluationRunsTable
@@ -156,7 +176,7 @@ const OverviewContent = () => {
 }
 
 const OverviewPage = () => (
-    <RequireWorkflowKind allowed={["app"]} currentRoute="overview">
+    <RequireWorkflowKind allowed={["app", "evaluator"]} currentRoute="overview">
         <OverviewContent />
     </RequireWorkflowKind>
 )
diff --git a/web/oss/src/state/workflow/destinations.ts b/web/oss/src/state/workflow/destinations.ts
index 03c588661c..efa212eb1b 100644
--- a/web/oss/src/state/workflow/destinations.ts
+++ b/web/oss/src/state/workflow/destinations.ts
@@ -20,9 +20,12 @@ export type WorkflowRouteSegment =
     | "traces"
 
 const DISABLED_FOR_EVALUATOR: ReadonlySet<WorkflowRouteSegment> = new Set([
-    "overview",
+    // `overview` and `evaluations` are now allowed for evaluators — Overview
+    // shows the evaluator's details/variants and the evaluation runs that
+    // evaluated it; Evaluations shows those same runs (scoped by the evaluator
+    // id as the `application` reference). `endpoints`/`deployments` stay
+    // disabled (no meaningful evaluator surface yet).
     "endpoints",
-    "evaluations",
     "deployments",
 ])
 
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/runReferenceFilter.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/runReferenceFilter.test.ts
new file mode 100644
index 0000000000..6bf46333e1
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/runReferenceFilter.test.ts
@@ -0,0 +1,207 @@
+/**
+ * Run-list reference predicate — the run-level counterpart to the scenario-row
+ * predicate. Covers role resolution off `step.references` (the role-keyed
+ * primary path + the step.type legacy fallback), the subject/grader
+ * distinction (`isSubjectRun`), the `hasResolvableSubject` safety guard, the
+ * `eq`/`ne` ops, multi-predicate AND, and the `makeRunReferenceFilter`
+ * pipeline transform.
+ */
+
+import assert from "node:assert/strict"
+import {describe, it} from "node:test"
+
+import type {Chunk} from "../../../etl/core/types"
+import {
+    collectRoleReferenceKeys,
+    evaluateRunReferencePredicate,
+    hasResolvableSubject,
+    isSubjectRun,
+    makeRunReferenceFilter,
+    matchesRunReferenceFilter,
+    type RunReferenceStep,
+} from "../runReferenceFilter"
+
+const EVALUATOR = "eval-with-reasoning"
+const APP = "app-comp-1"
+const GRADER = "eval-grader-x"
+
+/** A grader run: app `app-comp-1` graded by evaluator `eval-with-reasoning`. */
+const graderRun: RunReferenceStep[] = [
+    {type: "input", references: {testset: {id: "ts-1"}}},
+    {type: "invocation", references: {application: {id: APP}}},
+    {type: "annotation", references: {evaluator: {id: EVALUATOR, slug: "with-reasoning"}}},
+]
+
+/** A subject run: an evaluation run ON `eval-with-reasoning` (the #4237 feature). */
+const subjectRun: RunReferenceStep[] = [
+    {type: "input", references: {testset: {id: "ts-2"}}},
+    {type: "invocation", references: {application: {id: EVALUATOR}}},
+    {type: "annotation", references: {evaluator: {id: GRADER}}},
+]
+
+describe("collectRoleReferenceKeys", () => {
+    it("reads role-keyed references off each step", () => {
+        assert.deepEqual([...collectRoleReferenceKeys(graderRun, "application")], [APP])
+        assert.deepEqual(
+            [...collectRoleReferenceKeys(graderRun, "evaluator")].sort(),
+            [EVALUATOR, "with-reasoning"].sort(),
+        )
+        assert.deepEqual([...collectRoleReferenceKeys(graderRun, "testset")], ["ts-1"])
+    })
+
+    it("includes both id and slug so evaluators match either", () => {
+        const keys = collectRoleReferenceKeys(graderRun, "evaluator")
+        assert.ok(keys.has(EVALUATOR))
+        assert.ok(keys.has("with-reasoning"))
+    })
+
+    it("returns empty for missing/empty steps", () => {
+        assert.equal(collectRoleReferenceKeys(null, "application").size, 0)
+        assert.equal(collectRoleReferenceKeys(undefined, "application").size, 0)
+        assert.equal(collectRoleReferenceKeys([], "application").size, 0)
+        assert.equal(collectRoleReferenceKeys([{type: "invocation"}], "application").size, 0)
+    })
+
+    it("falls back to step.type for a legacy single-reference step", () => {
+        const legacy: RunReferenceStep[] = [{type: "invocation", references: {ref: {id: APP}}}]
+        assert.deepEqual([...collectRoleReferenceKeys(legacy, "application")], [APP])
+    })
+
+    it("does NOT use the legacy fallback when multiple references are present (avoids over-match)", () => {
+        const ambiguous: RunReferenceStep[] = [
+            {type: "invocation", references: {ref: {id: APP}, other: {id: "x"}}},
+        ]
+        assert.equal(collectRoleReferenceKeys(ambiguous, "application").size, 0)
+    })
+})
+
+describe("isSubjectRun / grader distinction", () => {
+    it("subject run: the evaluator is the application/subject", () => {
+        assert.equal(isSubjectRun(subjectRun, EVALUATOR), true)
+    })
+
+    it("grader run: the evaluator is NOT the subject (it's an annotation)", () => {
+        assert.equal(isSubjectRun(graderRun, EVALUATOR), false)
+    })
+
+    it("the app IS the subject of its own grader run", () => {
+        assert.equal(isSubjectRun(graderRun, APP), true)
+    })
+})
+
+describe("evaluateRunReferencePredicate ops", () => {
+    it("eq matches the role's id", () => {
+        assert.equal(
+            evaluateRunReferencePredicate({role: "evaluator", id: EVALUATOR}, graderRun),
+            true,
+        )
+    })
+
+    it("ne is the complement", () => {
+        assert.equal(
+            evaluateRunReferencePredicate(
+                {role: "application", id: EVALUATOR, op: "ne"},
+                graderRun,
+            ),
+            true,
+        )
+        assert.equal(
+            evaluateRunReferencePredicate(
+                {role: "application", id: EVALUATOR, op: "ne"},
+                subjectRun,
+            ),
+            false,
+        )
+    })
+
+    it("matches an evaluator by slug too", () => {
+        assert.equal(
+            evaluateRunReferencePredicate({role: "evaluator", id: "with-reasoning"}, graderRun),
+            true,
+        )
+    })
+})
+
+describe("hasResolvableSubject", () => {
+    it("true when an application reference exists", () => {
+        assert.equal(hasResolvableSubject(graderRun), true)
+        assert.equal(hasResolvableSubject(subjectRun), true)
+    })
+
+    it("false when no application reference can be resolved", () => {
+        assert.equal(
+            hasResolvableSubject([{type: "annotation", references: {evaluator: {id: EVALUATOR}}}]),
+            false,
+        )
+        assert.equal(hasResolvableSubject([]), false)
+        assert.equal(hasResolvableSubject(null), false)
+    })
+})
+
+describe("matchesRunReferenceFilter (AND-join)", () => {
+    it("AND-joins multiple predicates", () => {
+        // subject == evaluator AND grader == GRADER
+        assert.equal(
+            matchesRunReferenceFilter(
+                [
+                    {role: "application", id: EVALUATOR},
+                    {role: "evaluator", id: GRADER},
+                ],
+                subjectRun,
+            ),
+            true,
+        )
+        // subject == evaluator AND grader == (the wrong id) → fails
+        assert.equal(
+            matchesRunReferenceFilter(
+                [
+                    {role: "application", id: EVALUATOR},
+                    {role: "evaluator", id: "nope"},
+                ],
+                subjectRun,
+            ),
+            false,
+        )
+    })
+})
+
+describe("makeRunReferenceFilter (Transform)", () => {
+    it("keeps only subject runs and reports chunk telemetry", () => {
+        interface Row {
+            id: string
+            steps: RunReferenceStep[]
+        }
+        const rows: Row[] = [
+            {id: "subject", steps: subjectRun},
+            {id: "grader", steps: graderRun},
+        ]
+        const seen: {scanned: number; matched: number}[] = []
+        const filter = makeRunReferenceFilter<Row>({
+            predicates: {role: "application", id: EVALUATOR},
+            getSteps: (row) => row.steps,
+            onChunkFiltered: ({scanned, matched}) => seen.push({scanned, matched}),
+        })
+
+        const chunk: Chunk<Row> = {items: rows, cursor: null}
+        const out = filter(chunk) as Chunk<Row>
+
+        assert.deepEqual(
+            out.items.map((r) => r.id),
+            ["subject"],
+        )
+        assert.deepEqual(seen, [{scanned: 2, matched: 1}])
+    })
+
+    it("defaultGetSteps reads row.previewMeta.steps", () => {
+        interface Row {
+            previewMeta: {steps: RunReferenceStep[]}
+        }
+        const rows: Row[] = [{previewMeta: {steps: subjectRun}}, {previewMeta: {steps: graderRun}}]
+        const filter = makeRunReferenceFilter<Row>({
+            predicates: {role: "application", id: EVALUATOR},
+        })
+        const out = filter({items: rows, cursor: null}) as Chunk<Row>
+        assert.equal(out.items.length, 1)
+        assert.equal(out.items[0]!.previewMeta.steps, subjectRun)
+    })
+})
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
index 4bb71e0faf..5e647662fe 100644
--- a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
@@ -114,6 +114,24 @@ export {
     type PredicateGroupFilterOptions,
 } from "./rowPredicateFilter"
 
+// Run-list predicate filter — the run-level counterpart to rowPredicateFilter.
+// Drops whole RUNS from a run list by the ROLE their references play
+// (subject / "application" vs grader / "evaluator"), reusing the same
+// step.type → role convention. Powers "evaluations that evaluated THIS
+// workflow" — the evaluator Evaluations/Overview unification (feature F).
+export {
+    collectRoleReferenceKeys,
+    evaluateRunReferencePredicate,
+    isSubjectRun,
+    hasResolvableSubject,
+    matchesRunReferenceFilter,
+    makeRunReferenceFilter,
+    type RunReferenceStep,
+    type RunReferenceRole,
+    type RunReferencePredicate,
+    type RunReferenceFilterOptions,
+} from "./runReferenceFilter"
+
 // filterSchema — derives the filterable fields (typed + type-matched
 // operators) the Phase 2 filter UI offers. Decision D8 / eval-filtering D4.
 export {
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/runReferenceFilter.ts b/web/packages/agenta-entities/src/evaluationRun/etl/runReferenceFilter.ts
new file mode 100644
index 0000000000..54eae9d7ca
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/runReferenceFilter.ts
@@ -0,0 +1,225 @@
+/**
+ * Run-list predicate filter — the run-level counterpart to rowPredicateFilter.
+ *
+ * # Where this fits
+ *
+ * `rowPredicateFilter` drops scenario ROWS *within a single run* by their
+ * resolved cell values (an evaluator's `success`, a testset column, a metric).
+ * This module drops whole RUNS *from a run list* by the **role** their
+ * references play.
+ *
+ * The canonical use is the unification behind feature "F": an evaluator's
+ * Evaluations / Overview tab should show the evaluations that *evaluated this
+ * workflow* — runs where the visited workflow is the run's **subject** (its
+ * `application` / invocation reference) — NOT runs that merely *used* it as a
+ * grader (where it sits in an `evaluator` / annotation reference).
+ *
+ * The backend reference filter (`references @> [...]`) matches an id in *any*
+ * role, so `application = evaluatorId` over-returns: it also matches runs where
+ * the evaluator was a grader. That's harmless for apps (an app id only ever
+ * occupies the `application` role) but leaks for evaluators (their id occupies
+ * `evaluator` on every run they grade, and `application` on their own subject
+ * runs). This filter resolves the role from the run's structure and keeps the
+ * intended one.
+ *
+ * # Why structural, not `meta.application`
+ *
+ * The run carries a denormalized `meta.application` hint, but it's unreliable
+ * (absent on some runs) — a null hint silently bypasses any `meta`-based
+ * filter, which is exactly how grader runs slip through today. The run's
+ * `data.steps` are the source of truth: the invocation step's `application`
+ * reference is the evaluated/subject workflow, regardless of `meta`.
+ *
+ * # Role convention
+ *
+ * Same `step.type → role` mapping `resolveMappings` / `predicateToEntitySlices`
+ * use on the read side:
+ *
+ *   input       → testset
+ *   invocation  → application   (the evaluated / subject workflow)
+ *   annotation  → evaluator     (the grader)
+ *
+ * References are already role-keyed off each step
+ * (`{application: {id}}`, `{evaluator: {id, slug}}`, …); `step.type` is only a
+ * fallback for legacy steps whose single reference wasn't explicitly keyed.
+ *
+ * @packageDocumentation
+ */
+
+import type {Chunk, Transform} from "../../etl/core/types"
+
+/**
+ * Minimal structural shape of a run step — intentionally looser than
+ * `RunSchema`'s `RunStep` so callers can pass `previewMeta.steps[]`
+ * (whose `references` is typed `Record<string, unknown>`) without a cast.
+ */
+export interface RunReferenceStep {
+    type?: string | null
+    references?: Record<string, unknown> | null
+}
+
+/** A reference role a run step can carry. Open string for forward-compat. */
+export type RunReferenceRole = "application" | "evaluator" | "testset" | "query" | (string & {})
+
+/**
+ * `step.type → canonical role`. Used only as a fallback when a step's
+ * references aren't explicitly role-keyed (legacy single-reference steps).
+ */
+const STEP_TYPE_TO_ROLE: Record<string, RunReferenceRole> = {
+    input: "testset",
+    invocation: "application",
+    annotation: "evaluator",
+}
+
+/**
+ * One run-level clause: the run must (op "eq") or must not (op "ne") carry
+ * `id` in the given `role`.
+ *
+ *   - `role` — which reference slot the id must occupy ("application" = subject).
+ *   - `id` — the id (or slug) to match.
+ *   - `op` — "eq" → run HAS the id in this role; "ne" → run does NOT. Default "eq".
+ */
+export interface RunReferencePredicate {
+    role: RunReferenceRole
+    id: string
+    op?: "eq" | "ne"
+}
+
+function addRefKeys(ref: unknown, into: Set<string>): void {
+    if (!ref || typeof ref !== "object") return
+    const {id, slug} = ref as {id?: unknown; slug?: unknown}
+    if (typeof id === "string" && id) into.add(id)
+    // Evaluators are frequently referenced by slug rather than id, so match both.
+    if (typeof slug === "string" && slug) into.add(slug)
+}
+
+/**
+ * Collect every id/slug a given `role` occupies across a run's steps.
+ *
+ * Primary path: the role-keyed reference on each step (`refs[role]`). Fallback:
+ * a legacy step whose `references` isn't role-keyed but whose `step.type` maps
+ * to `role` and which carries exactly one reference.
+ */
+export function collectRoleReferenceKeys(
+    steps: readonly RunReferenceStep[] | null | undefined,
+    role: RunReferenceRole,
+): Set<string> {
+    const keys = new Set<string>()
+    if (!Array.isArray(steps)) return keys
+
+    for (const step of steps) {
+        const refs = step?.references
+        if (!refs || typeof refs !== "object") continue
+        const map = refs as Record<string, unknown>
+
+        const direct = map[role]
+        if (direct) {
+            addRefKeys(direct, keys)
+            continue
+        }
+
+        // Legacy fallback: references not explicitly role-keyed, but step.type
+        // identifies the role and the step carries a single reference.
+        const inferred = step?.type ? STEP_TYPE_TO_ROLE[String(step.type)] : undefined
+        if (inferred === role) {
+            const values = Object.values(map)
+            if (values.length === 1) addRefKeys(values[0], keys)
+        }
+    }
+
+    return keys
+}
+
+/** Evaluate a single run-reference predicate against a run's steps. */
+export function evaluateRunReferencePredicate(
+    predicate: RunReferencePredicate,
+    steps: readonly RunReferenceStep[] | null | undefined,
+): boolean {
+    const has = collectRoleReferenceKeys(steps, predicate.role).has(predicate.id)
+    return (predicate.op ?? "eq") === "ne" ? !has : has
+}
+
+/**
+ * True when `workflowId` is the run's evaluated / subject workflow — i.e. the
+ * workflow sits in an `application` (invocation) reference. This is the
+ * "evaluations that evaluated THIS workflow" predicate.
+ */
+export function isSubjectRun(
+    steps: readonly RunReferenceStep[] | null | undefined,
+    workflowId: string,
+): boolean {
+    return evaluateRunReferencePredicate({role: "application", id: workflowId}, steps)
+}
+
+/**
+ * Whether a run carries any resolvable `application` (subject) reference at all.
+ *
+ * Used as a safety guard: a run with no resolvable subject can't be classified
+ * structurally, so the caller should fall back to its prior heuristic
+ * (e.g. `meta.application`) rather than silently dropping the run.
+ */
+export function hasResolvableSubject(
+    steps: readonly RunReferenceStep[] | null | undefined,
+): boolean {
+    return collectRoleReferenceKeys(steps, "application").size > 0
+}
+
+// ============================================================================
+// ETL Transform parity
+//
+// The dataset-store fetch path consumes the pure helpers above directly, but
+// for headless / chunked ETL runs we expose a Transform factory mirroring
+// rowPredicateFilter's `makePredicateGroupFilter`. Predicates are AND-joined.
+// ============================================================================
+
+export interface RunReferenceFilterOptions<TRow> {
+    /** One or more predicates, AND-joined. All must hold for the run to pass. */
+    predicates: RunReferencePredicate | RunReferencePredicate[]
+    /** Extract the run's steps. Defaults to `row.steps`, else `row.previewMeta?.steps`. */
+    getSteps?: (row: TRow) => readonly RunReferenceStep[] | null | undefined
+    /** Optional per-chunk telemetry — feeds a hit-ratio meter. */
+    onChunkFiltered?: (info: {chunk: number; scanned: number; matched: number}) => void
+}
+
+function defaultGetSteps(row: unknown): readonly RunReferenceStep[] | null | undefined {
+    if (!row || typeof row !== "object") return null
+    const r = row as {steps?: unknown; previewMeta?: {steps?: unknown}}
+    if (Array.isArray(r.steps)) return r.steps as RunReferenceStep[]
+    if (Array.isArray(r.previewMeta?.steps)) return r.previewMeta!.steps as RunReferenceStep[]
+    return null
+}
+
+/** True when a run's steps satisfy every supplied predicate (logical AND). */
+export function matchesRunReferenceFilter(
+    predicates: RunReferencePredicate | RunReferencePredicate[],
+    steps: readonly RunReferenceStep[] | null | undefined,
+): boolean {
+    const list = Array.isArray(predicates) ? predicates : [predicates]
+    return list.every((p) => evaluateRunReferencePredicate(p, steps))
+}
+
+/**
+ * Build a `Transform<TRow, TRow>` that keeps only runs satisfying every
+ * supplied predicate. Filtering is pure; only the telemetry `chunk` counter persists across reuse.
+ */
+export function makeRunReferenceFilter<TRow>(
+    options: RunReferenceFilterOptions<TRow>,
+): Transform<TRow, TRow> {
+    const predicates = Array.isArray(options.predicates) ? options.predicates : [options.predicates]
+    const getSteps =
+        options.getSteps ?? (defaultGetSteps as RunReferenceFilterOptions<TRow>["getSteps"])!
+    let chunkIdx = 0
+
+    return (chunk: Chunk<TRow>) => {
+        chunkIdx++
+        const passing = chunk.items.filter((row) =>
+            matchesRunReferenceFilter(predicates, getSteps(row)),
+        )
+        options.onChunkFiltered?.({
+            chunk: chunkIdx,
+            scanned: chunk.items.length,
+            matched: passing.length,
+        })
+        return {...chunk, items: passing}
+    }
+}

From 310344f01db221b645b8559000b57a6b538cc1ec Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Sat, 6 Jun 2026 21:32:13 +0200
Subject: [PATCH 31/36] fix(evaluators): hide deploy actions on evaluator
 workflows

Evaluators aren't deployed to environments, but deploy actions leaked onto
their surfaces. Gate at the reusable chokepoints:

- DeployVariantButton self-guards via the workflow-level is_evaluator flag
  (correct even on v0 revisions), covering the revision drawer + every other
  reuse without per-call-site checks.
- Recent Prompts (VariantsOverview) passes hideDeployActions for evaluators,
  matching the variants dashboard.
---
 .../assets/DeployVariantButton/index.tsx                 | 9 +++++++++
 .../pages/overview/variants/VariantsOverview.tsx         | 7 ++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx b/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx
index 6ec0ba479d..315f0f7093 100644
--- a/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx
+++ b/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx
@@ -29,6 +29,9 @@ const DeployVariantButton = ({
 
     const runnableData = useAtomValue(workflowMolecule.selectors.data(revisionId || ""))
     const workflowId = runnableData?.workflow_id || ""
+    // Workflow-level evaluator flag — canonical, unlike the revision-level
+    // `flags.is_evaluator` which is `false` on v0 revisions of evaluators.
+    const isEvaluator = useAtomValue(workflowMolecule.selectors.isEvaluator(workflowId))
     const variants = useAtomValue(workflowVariantsListDataAtomFamily(workflowId))
 
     const {environments, variantName, revision} = useMemo(() => {
@@ -46,6 +49,12 @@ const DeployVariantButton = ({
 
     const handleCloseDeployModal = useCallback(() => setIsDeployModalOpen(false), [])
 
+    // Evaluator workflows aren't deployed to environments — never render a
+    // deploy trigger for them. Central guard so every surface that reuses this
+    // button (registry/overview menus, variant headers, the revision drawer) is
+    // covered without each call site repeating the check.
+    if (isEvaluator) return null
+
     return (
         <>
             {isValidElement(children) ? (
diff --git a/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx b/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx
index a0f0bed285..925d521679 100644
--- a/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx
+++ b/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx
@@ -3,7 +3,7 @@ import {useCallback, useMemo} from "react"
 import {Rocket} from "@phosphor-icons/react"
 import {Button, Typography} from "antd"
 import clsx from "clsx"
-import {useSetAtom} from "jotai"
+import {useAtomValue, useSetAtom} from "jotai"
 import Link from "next/link"
 
 import {openDeployVariantModalAtom} from "@/oss/components/Playground/Components/Modals/DeployVariantModal/store/deployVariantModalStore"
@@ -13,6 +13,7 @@ import RegistryTable from "@/oss/components/VariantsComponents/Table/RegistryTab
 import {usePlaygroundNavigation} from "@/oss/hooks/usePlaygroundNavigation"
 import {useQuery} from "@/oss/hooks/useQuery"
 import useURL from "@/oss/hooks/useURL"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 const {Title} = Typography
 
@@ -21,6 +22,9 @@ const VariantsOverview = () => {
     const {appURL} = useURL()
     const {goToPlayground} = usePlaygroundNavigation()
     const openDeployVariantModal = useSetAtom(openDeployVariantModalAtom)
+    // Evaluator workflows aren't deployed — hide the row "Deploy" action.
+    const isCurrentWorkflowEvaluator =
+        useAtomValue(currentWorkflowContextAtom).workflowKind === "evaluator"
 
     const handleRowClick = useCallback(
         (record: RegistryRevisionRow) => {
@@ -83,6 +87,7 @@ const VariantsOverview = () => {
             <RegistryTable
                 onRowClick={handleRowClick}
                 actions={columnActions}
+                hideDeployActions={isCurrentWorkflowEvaluator}
                 scopeId="overview-recent"
                 pageSize={5}
                 columnVisibilityStorageKey="agenta:overview-registry:column-visibility"

From 455c2b8c9d3b13733f462c01281dc8d108d12c54 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Sat, 6 Jun 2026 21:32:33 +0200
Subject: [PATCH 32/36] fix(apps): align Archived Apps layout with Archived
 Evaluators

Switch ArchivedAppsPage to the shared PageLayout with an inline back-arrow
title and no subtitle, matching the Archived Evaluators page.
---
 .../pages/app-management/ArchivedAppsPage.tsx | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx b/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx
index 8d4ae5b560..d5cf926818 100644
--- a/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx
+++ b/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx
@@ -1,6 +1,8 @@
+import {PageLayout} from "@agenta/ui"
+import {ArrowLeft} from "@phosphor-icons/react"
+import {Button} from "antd"
 import {useRouter} from "next/router"
 
-import ArchivedEntityLayout from "@/oss/components/ArchivedEntityLayout"
 import useURL from "@/oss/hooks/useURL"
 
 import ApplicationManagementSection from "./components/ApplicationManagementSection"
@@ -9,13 +11,26 @@ export default function ArchivedAppsPage() {
     const router = useRouter()
     const {baseAppURL} = useURL()
 
+    // Mirror the Archived Evaluators header: the back arrow sits inline with the
+    // title (no standalone "Back" button, no subtitle) so both archived pages
+    // share one layout via PageLayout.
+    const title = (
+        <span className="inline-flex items-center gap-2">
+            <Button
+                type="text"
+                size="small"
+                icon={<ArrowLeft size={16} />}
+                onClick={() => router.push(baseAppURL)}
+                className="!px-1"
+                aria-label="Back to apps"
+            />
+            <span>Archived Apps</span>
+        </span>
+    )
+
     return (
-        <ArchivedEntityLayout
-            title="Archived Apps"
-            subtitle="Archived apps are hidden from your workspace but can be restored at any time."
-            onBack={() => router.push(baseAppURL)}
-        >
+        <PageLayout title={title} className="grow min-h-0">
             <ApplicationManagementSection mode="archived" />
-        </ArchivedEntityLayout>
+        </PageLayout>
     )
 }

From 9b50b49e2148b53e22fc7bdeab1280f9543ae25e Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Sat, 6 Jun 2026 21:32:33 +0200
Subject: [PATCH 33/36] fix(evaluations): support evaluator workflows in the
 app-scoped New Evaluation modal

When the modal is app-scoped to an evaluator route, resolve the Application
panel label/kind from the evaluators list so it shows the evaluator's name
(not its raw id) - the app-scoped pre-lock never sets selectedWorkflowMeta.

Also drop the full evaluator query-result objects from the derived-evaluators
memo deps (and a dead humanEvaluatorsQuery subscription); they changed
identity every query tick and churned the modal's renders.
---
 .../Components/NewEvaluationModalInner.tsx    | 34 +++++++++++++------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
index 2b4e35f316..25f71b2c2c 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
@@ -14,7 +14,6 @@ import {
     evaluatorsListDataAtom,
     evaluatorsListQueryAtom,
     humanEvaluatorsListDataAtom,
-    humanEvaluatorsListQueryAtom,
     invalidateWorkflowsListCache,
     invalidateEvaluatorsListCache,
 } from "@agenta/entities/workflow"
@@ -128,18 +127,27 @@ const NewEvaluationModalInner = ({
             updatedAt: app.updated_at ?? null,
         }))
         if (selectedAppId && !options.some((opt) => opt.value === selectedAppId)) {
-            // Evaluators (and locally-picked workflows) aren't in useAppsData —
-            // fall back to the captured meta so the tag renders a real name.
+            // Evaluators (and locally-picked workflows) aren't in useAppsData.
+            // When the user picked a row we have `selectedWorkflowMeta`; for an
+            // app-scoped EVALUATOR route there's no meta, so resolve the name
+            // (and kind) from the evaluators list — otherwise the Application
+            // panel renders the raw workflow id instead of its name.
+            const evaluatorWorkflow = evaluatorWorkflows.find((e) => e.id === selectedAppId)
+            const isEvaluator = selectedWorkflowMeta?.isEvaluator ?? Boolean(evaluatorWorkflow)
             options.push({
-                label: selectedWorkflowMeta?.label ?? selectedAppId,
+                label:
+                    selectedWorkflowMeta?.label ??
+                    evaluatorWorkflow?.name ??
+                    evaluatorWorkflow?.slug ??
+                    selectedAppId,
                 value: selectedAppId,
-                type: selectedWorkflowMeta?.isEvaluator ? "evaluator" : null,
+                type: isEvaluator ? "evaluator" : null,
                 createdAt: null,
                 updatedAt: null,
             })
         }
         return options
-    }, [availableApps, selectedAppId, selectedWorkflowMeta])
+    }, [availableApps, selectedAppId, selectedWorkflowMeta, evaluatorWorkflows])
     const router = useRouter()
     const {baseAppURL, projectURL} = useURL()
 
@@ -149,9 +157,11 @@ const NewEvaluationModalInner = ({
     const configsData = useAtomValue(evaluatorConfigsListDataAtom)
     const configsQueryState = useAtomValue(evaluatorConfigsQueryStateAtom)
 
-    // Workflow-based evaluator list atoms (replace legacy useEvaluators hook)
+    // Workflow-based evaluator list atoms (replace legacy useEvaluators hook).
+    // Subscribing to `humanEvaluatorsListDataAtom` already drives the human
+    // evaluators query; its query-state object is deliberately not read here,
+    // since it was unused and only churned the derived-evaluators memo below.
     const humanEvaluatorsList = useAtomValue(humanEvaluatorsListDataAtom)
-    const humanEvaluatorsQuery = useAtomValue(humanEvaluatorsListQueryAtom)
     const evaluatorsList = useAtomValue(evaluatorsListDataAtom)
     const evaluatorsQuery = useAtomValue(evaluatorsListQueryAtom)
 
@@ -174,13 +184,17 @@ const NewEvaluationModalInner = ({
                     loadingEvaluatorConfigs: configsQueryState.isPending ?? false,
                 }
             }
+            // Depend on the *values* the body reads, not the query result
+            // objects — `evaluatorsQuery` and the since-removed
+            // `humanEvaluatorsQuery` (which the body never read) changed identity
+            // on every query tick, which recomputed this memo every render and churned
+            // the derived `evaluators`/`evaluatorConfigs` → `appOptions`/Tabs items downstream.
         }, [
             preview,
             evaluationType,
             humanEvaluatorsList,
-            humanEvaluatorsQuery,
             evaluatorsList,
-            evaluatorsQuery,
+            evaluatorsQuery.isPending,
             templatesData,
             configsData,
             templatesQuery.isPending,

From 4d9b1c010ed457897cf6cc7ed235bb2f1134d300 Mon Sep 17 00:00:00 2001
From: Arda Erzin <ardaerzin@gmail.com>
Date: Mon, 8 Jun 2026 12:48:20 +0200
Subject: [PATCH 34/36] fix(evaluators): list all automatic evaluators in the
 sidebar switcher

The switcher used fullPagePlaygroundEvaluatorsAtom, which narrows to
evaluators that have a full-page playground (LLM, code) and so dropped the
declarative matchers (exact match, regex, similarity, json diff, contains
json, ...). Add nonHumanEvaluatorsAtom - non-archived evaluators with only the
human (is_feedback, resolved from the latest revision) exclusion - and point
the switcher at it, so every automatic evaluator is listed while human ones
stay out.
---
 .../Sidebar/components/WorkflowEntityCard.tsx | 32 +++++++++----------
 .../agenta-entities/src/workflow/index.ts     |  1 +
 .../src/workflow/state/evaluatorUtils.ts      | 23 +++++++++++++
 .../src/workflow/state/index.ts               |  1 +
 4 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
index fcf1c49f06..fd265ce7d1 100644
--- a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
+++ b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
@@ -1,7 +1,7 @@
 import {memo, useCallback, useMemo, useState} from "react"
 
 import {
-    fullPagePlaygroundEvaluatorsAtom,
+    nonHumanEvaluatorsAtom,
     nonArchivedAppWorkflowsAtom,
     nonArchivedEvaluatorsAtom,
     parseWorkflowKeyFromUri,
@@ -117,26 +117,24 @@ const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => {
     const ctx = useAtomValue(currentWorkflowContextAtom)
     const apps = useAtomValue(nonArchivedAppWorkflowsAtom) as readonly Workflow[]
     const evaluators = useAtomValue(nonArchivedEvaluatorsAtom) as readonly Workflow[]
-    // Only evaluators with a real full-page playground belong in the switcher.
-    // `fullPagePlaygroundEvaluatorsAtom` resolves the type flags from each
-    // evaluator's LATEST REVISION — the workflow LIST records this card reads
-    // from `nonArchivedEvaluatorsAtom` carry NO `data.uri` and NO
-    // `is_feedback`/`is_llm`/`is_code` flags (those live on the revision, not
-    // the parent artifact). That's why the old `!w.flags?.is_feedback` filter
-    // never excluded anything and human/feedback evaluators leaked into the
-    // switcher (QA 2026-06-05). The atom drops human (`is_feedback`) AND
-    // declarative classifier evaluators (match/exact_match/json_*/etc.) — all
-    // of which route to an `/apps/<id>/*` destination the guard redirects back
-    // to /evaluators, so clicking them would be a dead end.
-    const fullPagePlaygroundEvaluators = useAtomValue(
-        fullPagePlaygroundEvaluatorsAtom,
-    ) as readonly Workflow[]
+    // The switcher lists every AUTOMATIC evaluator — LLM, code, AND the
+    // declarative classifiers (exact match, regex, similarity / semantic
+    // similarity, json diff, contains json, …). `nonHumanEvaluatorsAtom`
+    // resolves `is_feedback` from each evaluator's LATEST REVISION — the
+    // workflow LIST records this card reads from `nonArchivedEvaluatorsAtom`
+    // carry NO `is_feedback`/`is_llm`/`is_code` flags (those live on the
+    // revision, not the parent artifact), which is why the old
+    // `!w.flags?.is_feedback` filter never excluded anything and human
+    // evaluators leaked in (QA 2026-06-05). It drops ONLY human (`is_feedback`)
+    // evaluators; navigation lands on the workflow's current sub-page (Overview/
+    // Evaluations are valid for every evaluator), so matchers no longer dead-end.
+    const automaticEvaluators = useAtomValue(nonHumanEvaluatorsAtom) as readonly Workflow[]
     // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off, the
     // switcher dropdown hides the "Evaluators" group entirely.
     const switcherEvaluators: readonly Workflow[] = useMemo(() => {
         if (!EVALUATOR_FULL_PAGE_NAV_ENABLED) return EMPTY_WORKFLOWS
-        return fullPagePlaygroundEvaluators
-    }, [fullPagePlaygroundEvaluators])
+        return automaticEvaluators
+    }, [automaticEvaluators])
     const recentAppId = useAtomValue(recentAppIdAtom)
     const recentEvaluatorId = useAtomValue(recentEvaluatorIdAtom)
     const navigateToWorkflow = useSetAtom(routerAppNavigationAtom)
diff --git a/web/packages/agenta-entities/src/workflow/index.ts b/web/packages/agenta-entities/src/workflow/index.ts
index feb975eaf7..b3d5a612bf 100644
--- a/web/packages/agenta-entities/src/workflow/index.ts
+++ b/web/packages/agenta-entities/src/workflow/index.ts
@@ -289,6 +289,7 @@ export {
     evaluatorsListDataAtom,
     nonArchivedEvaluatorsAtom,
     fullPagePlaygroundEvaluatorsAtom,
+    nonHumanEvaluatorsAtom,
     // Templates
     evaluatorTemplatesQueryAtom,
     evaluatorTemplatesDataAtom,
diff --git a/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts b/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts
index 7ea199e121..29d362d1c7 100644
--- a/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts
+++ b/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts
@@ -142,6 +142,29 @@ export const fullPagePlaygroundEvaluatorsAtom = atom<Workflow[]>((get) => {
     })
 })
 
+/**
+ * Non-archived **automatic** evaluators — i.e. all evaluators except human
+ * (`is_feedback`) ones. Unlike `fullPagePlaygroundEvaluatorsAtom`, this does
+ * NOT narrow to evaluators that have a full-page playground, so it includes the
+ * declarative classifiers too (exact match, regex, similarity / semantic
+ * similarity, json diff, contains json, …). This is the right list for the
+ * sidebar workflow switcher, which should surface every automatic evaluator.
+ *
+ * `is_feedback` lives on the revision (not the parent artifact), so it's
+ * resolved from each evaluator's latest revision (batched + cached). An
+ * evaluator whose latest revision hasn't resolved yet is held back until it
+ * does, so a human evaluator never briefly leaks into the list.
+ */
+export const nonHumanEvaluatorsAtom = atom<Workflow[]>((get) => {
+    const evaluators = get(nonArchivedEvaluatorsAtom)
+    return evaluators.filter((evaluator) => {
+        if (!evaluator.id) return false
+        const revision = get(workflowLatestRevisionQueryAtomFamily(evaluator.id)).data
+        if (!revision) return false
+        return !revision.flags?.is_feedback
+    })
+})
+
 /**
  * Invalidate the evaluators list cache.
  * Call after create/update/archive operations on evaluator workflows.
diff --git a/web/packages/agenta-entities/src/workflow/state/index.ts b/web/packages/agenta-entities/src/workflow/state/index.ts
index 0c85a3131d..0cb2d9f94a 100644
--- a/web/packages/agenta-entities/src/workflow/state/index.ts
+++ b/web/packages/agenta-entities/src/workflow/state/index.ts
@@ -159,6 +159,7 @@ export {
     evaluatorsListDataAtom,
     nonArchivedEvaluatorsAtom,
     fullPagePlaygroundEvaluatorsAtom,
+    nonHumanEvaluatorsAtom,
     // Templates
     evaluatorTemplatesQueryAtom,
     evaluatorTemplatesDataAtom,

From a2dc2f3f0dd6b4c88efda45b0e4e8c385317e707 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Mon, 8 Jun 2026 13:56:33 +0200
Subject: [PATCH 35/36] ci: stop PR bot over-closing on missing checklist +
 reopen loop

Two fixes found while auditing the bot against all open PRs:

- Only a non-empty Summary plus a demo (for functional changes) are required.
  Missing Testing/Checklist sections no longer close a PR. The demo is now
  detected anywhere in the body, not just the Demo section. This fixes a PR
  that had a YouTube demo and full testing notes but was closed for lacking
  the checklist section.
- Drop the 'reopened' trigger so a maintainer who manually reopens a flagged
  PR wins, instead of the bot immediately re-closing it. Auto-reopen on a
  fixed description still works via 'edited'/'synchronize'.
---
 .../workflows/13-check-pr-contribution.yml    | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/13-check-pr-contribution.yml b/.github/workflows/13-check-pr-contribution.yml
index c649f7414d..dfd9714b70 100644
--- a/.github/workflows/13-check-pr-contribution.yml
+++ b/.github/workflows/13-check-pr-contribution.yml
@@ -10,8 +10,11 @@ name: "13 - check PR contribution"
 # It never checks out or runs the PR's code, so this is safe.
 
 on:
+  # No 'reopened': a maintainer who manually reopens a flagged PR should win,
+  # otherwise the reopen event would immediately re-close it. Auto-reopen on a
+  # fixed description still works through 'edited' and 'synchronize'.
   pull_request_target:
-    types: [opened, edited, synchronize, reopened, ready_for_review]
+    types: [opened, edited, synchronize, ready_for_review]
   workflow_dispatch:
     inputs:
       pr_number:
@@ -116,27 +119,24 @@ jobs:
 
             const reasons = [];
 
-            // 1) Template is present and filled.
-            const headers = ['Summary', 'Testing', 'Demo', 'Checklist'];
-            const lower = body.toLowerCase();
-            const missing = headers.filter((h) => !lower.includes('## ' + h.toLowerCase()));
+            // 1) The PR is described. We only require a non-empty Summary, not the
+            //    full template. Missing Testing/Checklist sections do not close a PR;
+            //    a thorough PR with a demo should never be closed over a checklist.
             if (!body.trim()) {
               reasons.push('The pull request description is empty. Please fill in the PR template.');
-            } else if (missing.length) {
-              reasons.push('The description is missing required sections (' + missing.join(', ') + '). Please use the PR template without removing its sections.');
             } else if (!section('Summary')) {
-              reasons.push('The **Summary** section is empty. Describe what changed and why.');
+              reasons.push('The **Summary** section is missing or empty. Describe what changed and why using the PR template.');
             }
 
-            // 2) Demo is present for functional changes.
+            // 2) Demo is present for functional changes. Scan the whole body, not
+            //    just the Demo section, so a screenshot or video placed anywhere counts.
             const files = await github.paginate(github.rest.pulls.listFiles, {
               owner, repo, pull_number: number, per_page: 100,
             });
             const functional = files.some((f) => !EXEMPT.some((r) => r.test(f.filename)));
-            const demo = section('Demo') || '';
-            const hasMedia = MEDIA.some((r) => r.test(demo));
+            const hasMedia = MEDIA.some((r) => r.test(body));
             if (functional && !hasMedia) {
-              reasons.push('This PR changes functional code (SDK, API, or frontend) but the **Demo** section has no screenshot or video. A short demo recording is required. Only test-only, docs-only, or chore changes may mark Demo as N/A.');
+              reasons.push('This PR changes functional code (SDK, API, or frontend) but includes no demo. Add a screenshot or short video of the change. Only test-only, docs-only, or chore changes may skip it.');
             }
 
             async function upsertComment(text) {

From 2e835593bf75ee4217e6657df0ffb34abeece85b Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Mon, 8 Jun 2026 14:09:40 +0200
Subject: [PATCH 36/36] fix(frontend): run-on gate for the workflow revision
 evaluator drawer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The evaluator drawer rendered by WorkflowRevisionDrawerWrapper reimplemented
the run panel gate as `runDisabled={!hasAppConnected}`, ignoring the run-on
mode. So switching its Run-on selector to 'test case' updated the header while
the panel kept showing the 'Select an app' empty state and demanding an app —
the page and creation drawer respected the mode, only this third surface didn't.

Route it through the shared useEvaluatorRunControls hook (+ SelectAppEmptyState
and the prop-less EvaluatorPlaygroundHeader), the same wiring the page and the
creation drawer use, so the gate is `runOnMode === 'app' && !hasAppConnected`
everywhere and the three surfaces can't drift again. Removes this drawer's
duplicated app adapter / app-select / run-gate logic.

Also drop the getDefaultStore() patch from useEvaluatorRunControls: runtime
debugging proved these surfaces are not in a scoped store (the drawer that was
broken is WorkflowRevisionDrawerWrapper, not the scoped-store CreateEvaluator
drawer), so the override was a no-op based on a wrong hypothesis.
---
 .../useEvaluatorRunControls.ts                | 47 ++++-------
 .../WorkflowRevisionDrawerWrapper/index.tsx   | 79 ++++---------------
 2 files changed, 33 insertions(+), 93 deletions(-)

diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
index 83f9b30d21..c75dba5a98 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
@@ -2,12 +2,12 @@
  * useEvaluatorRunControls
  *
  * Single source of truth for the evaluator playground's run controls, shared by
- * the full-page playground and the evaluator-creation drawer. Before this hook,
- * the app adapter, app-select handler, evaluator-node lookup, and run-on
- * wiring were copy-pasted across the page header, page body, drawer header, and
- * drawer body — which is exactly how the drawer drifted out of sync with the
- * page (it kept forcing an app even in test-case mode). Centralizing it here
- * means both surfaces behave identically by construction.
+ * the full-page playground, the evaluator-creation drawer, and the workflow
+ * revision drawer. Before this hook, the app adapter, app-select handler,
+ * evaluator-node lookup, and run-on wiring were copy-pasted across every
+ * surface — which is exactly how the drawers drifted out of sync with the page
+ * (they kept forcing an app even in test-case mode). Centralizing it here means
+ * every surface behaves identically by construction.
  */
 
 import {useCallback, useMemo} from "react"
@@ -17,7 +17,7 @@ import {
     type WorkflowRevisionSelectionResult,
 } from "@agenta/entity-ui/selection"
 import {playgroundController} from "@agenta/playground"
-import {getDefaultStore, useAtomValue, useSetAtom} from "jotai"
+import {useAtomValue, useSetAtom} from "jotai"
 
 import {
     connectAppToEvaluatorAtom,
@@ -30,23 +30,9 @@ import {
 } from "./atoms"
 
 export function useEvaluatorRunControls() {
-    // Bind to the default store explicitly. The playground state runs on the
-    // default store (the playground package uses `getDefaultStore()` throughout),
-    // but the evaluator-creation drawer renders inside a scoped Jotai store
-    // (`EvaluationRunsTableStoreProvider`) that doesn't mirror the playground or
-    // run-on atoms. Without this, the drawer would read/write run-on mode in the
-    // scoped store while the playground lives in the default store — the two
-    // split, so switching to test-case mode never reaches the run panel and it
-    // stays stuck on "select an app". On the full page (no scoped store) this is
-    // a no-op. Same pattern as `usePreviewVariantConfig` / `TestsetCells`.
-    const store = getDefaultStore()
-
     // Evaluator node — phase 1: evaluator at depth 0 (primary); phase 2:
     // evaluator at depth 1 (downstream of a connected app).
-    const nodes = useAtomValue(
-        useMemo(() => playgroundController.selectors.nodes(), []),
-        {store},
-    )
+    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
     const evaluatorNode = useMemo(() => {
         const downstream = nodes.find((n) => n.depth > 0)
         if (downstream) return downstream
@@ -67,8 +53,8 @@ export function useEvaluatorRunControls() {
         [],
     )
 
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom, {store})
-    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom, {store})
+    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
+    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom)
 
     const handleAppSelect = useCallback(
         (selection: WorkflowRevisionSelectionResult) => {
@@ -86,8 +72,8 @@ export function useEvaluatorRunControls() {
     // Run-on mode. A connected app forces effective "app" mode (the node graph
     // is the source of truth); the stored preference only applies when nothing
     // is connected.
-    const runOnMode = useAtomValue(effectiveRunOnModeAtom, {store})
-    const setRunOnMode = useSetAtom(runOnModeAtom, {store})
+    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
+    const setRunOnMode = useSetAtom(runOnModeAtom)
     const handlePickRunOn = useCallback(
         (next: RunOnMode) => {
             if (next === "trace") return // disabled, not selectable
@@ -99,14 +85,13 @@ export function useEvaluatorRunControls() {
         [disconnectApp, setRunOnMode],
     )
 
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom, {store})
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom, {store})
+    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
+    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
 
     // In "app" mode with no app connected yet, the evaluator can't run — the run
     // panel surfaces the app selector instead of the testcase rows. In test-case
-    // mode the evaluator runs standalone, so it's never blocked on an app.
-    // Only takes effect where the run panel renders (the page and the expanded
-    // drawer); the collapsed drawer is config-only and ignores `runDisabled`.
+    // mode the evaluator runs standalone, so it's never blocked on an app. Only
+    // takes effect where the run panel renders (the page and expanded drawers).
     const runDisabled = runOnMode === "app" && !hasAppConnected
 
     return {
diff --git a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
index 7349e251d2..2b236c2243 100644
--- a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
+++ b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
@@ -20,12 +20,7 @@ import {
     workflowMolecule,
     discardLocalServerDataAtom,
 } from "@agenta/entities/workflow"
-import {EntityPicker} from "@agenta/entity-ui"
 import {PlaygroundConfigSection} from "@agenta/entity-ui/drill-in"
-import {
-    createWorkflowRevisionAdapter,
-    type WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
 import {VariantDetailsWithStatus, VariantNameCell} from "@agenta/entity-ui/variant"
 import {playgroundController} from "@agenta/playground"
 import {
@@ -52,7 +47,7 @@ import {
 } from "@agenta/playground-ui/workflow-revision-drawer"
 import {EnvironmentTag} from "@agenta/ui"
 import {Rocket} from "@phosphor-icons/react"
-import {Button, Typography, message} from "antd"
+import {Button, message} from "antd"
 import {getDefaultStore, useAtom, useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 import {useRouter} from "next/router"
@@ -63,9 +58,10 @@ import {
     connectAppToEvaluatorAtom,
     persistedAppSelectionAtom,
     persistedTestsetSelectionAtom,
-    selectedAppLabelAtom,
 } from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms"
 import EvaluatorPlaygroundHeader from "@/oss/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader"
+import SelectAppEmptyState from "@/oss/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState"
+import {useEvaluatorRunControls} from "@/oss/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls"
 import {clearEvaluatorWorkflowCache} from "@/oss/components/Evaluators/store/evaluatorsPaginatedStore"
 import {invalidateAppManagementWorkflowQueries} from "@/oss/components/pages/app-management/store"
 import {invalidatePromptsWorkflowQueries} from "@/oss/components/pages/prompts/store"
@@ -311,64 +307,28 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
         })
     }, [connectedTestset, setPersistedTestset])
 
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+    // Shared run controls — the same hook the full page and the creation drawer
+    // use, so every evaluator surface gates runs the same way (respecting the
+    // run-on mode) and cannot drift apart again. This drawer previously
+    // hardcoded `runDisabled={!hasAppConnected}`, which ignored the run-on mode
+    // and required a connected app even in test-case mode.
+    const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} =
+        useEvaluatorRunControls()
 
     const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
-
-    // Derive from nodes directly (single source of truth, no atom indirection)
-    const hasAppConnected = useMemo(() => nodes.some((n) => n.depth > 0), [nodes])
     const configEntityIds = useMemo(() => {
         const downstream = nodes.filter((n) => n.depth > 0)
         if (downstream.length > 0) return downstream.map((n) => n.entityId)
         return nodes.map((n) => n.entityId)
     }, [nodes])
 
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-                // Picking an *app* to connect upstream of the evaluator — the
-                // adapter's default "Evaluator" label would make the search
-                // bar say "Search evaluator…" which is wrong here.
-                parentLabel: "Application",
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     const runDisabledContent = useMemo(
         () => (
-            <>
-                <Typography.Text type="secondary" className="text-sm">
-                    Select an app to run the evaluator chain
-                </Typography.Text>
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="middle"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-            </>
+            <SelectAppEmptyState
+                adapter={appWorkflowAdapter}
+                onSelect={handleAppSelect}
+                selectedAppLabel={selectedAppLabel}
+            />
         ),
         [appWorkflowAdapter, handleAppSelect, selectedAppLabel],
     )
@@ -386,12 +346,7 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
     return (
         <OSSPlaygroundShell providers={providers}>
             <div className="flex flex-col w-full h-full overflow-hidden">
-                {isExpanded && (
-                    <EvaluatorPlaygroundHeader
-                        appWorkflowAdapter={appWorkflowAdapter}
-                        onAppSelect={handleAppSelect}
-                    />
-                )}
+                {isExpanded && <EvaluatorPlaygroundHeader />}
                 <PlaygroundMainView
                     mode="evaluator"
                     viewMode={isExpanded ? "full" : "configOnly"}
@@ -399,7 +354,7 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
                     configViewMode={configViewMode}
                     onConfigViewModeChange={setConfigViewMode}
                     configEntityIdsOverride={configEntityIds}
-                    runDisabled={!hasAppConnected}
+                    runDisabled={runDisabled}
                     runDisabledContent={runDisabledContent}
                 />
             </div>