diff --git a/.github/workflows/13-check-pr-contribution.yml b/.github/workflows/13-check-pr-contribution.yml
index a4217327f7..04da371052 100644
--- a/.github/workflows/13-check-pr-contribution.yml
+++ b/.github/workflows/13-check-pr-contribution.yml
@@ -10,8 +10,11 @@ name: "13 - check PR contribution"
 # It never checks out or runs the PR's code, so this is safe.
 
 on:
+  # No 'reopened': a maintainer who manually reopens a flagged PR should win,
+  # otherwise the reopen event would immediately re-close it. Auto-reopen on a
+  # fixed description still works through 'edited' and 'synchronize'.
   pull_request_target:
-    types: [opened, edited, synchronize, reopened, ready_for_review]
+    types: [opened, edited, synchronize, ready_for_review]
   workflow_dispatch:
     inputs:
       pr_number:
@@ -134,27 +137,24 @@ jobs:
 
             const reasons = [];
 
-            // 1) Template is present and filled.
-            const headers = ['Summary', 'Testing', 'Demo', 'Checklist'];
-            const lower = body.toLowerCase();
-            const missing = headers.filter((h) => !lower.includes('## ' + h.toLowerCase()));
+            // 1) The PR is described. We only require a non-empty Summary, not the
+            //    full template. Missing Testing/Checklist sections do not close a PR;
+            //    a thorough PR with a demo should never be closed over a checklist.
             if (!body.trim()) {
               reasons.push('The pull request description is empty. Please fill in the PR template.');
-            } else if (missing.length) {
-              reasons.push('The description is missing required sections (' + missing.join(', ') + '). Please use the PR template without removing its sections.');
             } else if (!section('Summary')) {
-              reasons.push('The **Summary** section is empty. Describe what changed and why.');
+              reasons.push('The **Summary** section is missing or empty. Describe what changed and why using the PR template.');
             }
 
-            // 2) Demo is present for functional changes.
+            // 2) Demo is present for functional changes. Scan the whole body, not
+            //    just the Demo section, so a screenshot or video placed anywhere counts.
             const files = await github.paginate(github.rest.pulls.listFiles, {
               owner, repo, pull_number: number, per_page: 100,
             });
             const functional = files.some((f) => !EXEMPT.some((r) => r.test(f.filename)));
-            const demo = section('Demo') || '';
-            const hasMedia = MEDIA.some((r) => r.test(demo));
+            const hasMedia = MEDIA.some((r) => r.test(body));
             if (functional && !hasMedia) {
-              reasons.push('This PR changes functional code (SDK, API, or frontend) but the **Demo** section has no screenshot or video. A short demo recording is required. Only test-only, docs-only, or chore changes may mark Demo as N/A.');
+              reasons.push('This PR changes functional code (SDK, API, or frontend) but includes no demo. Add a screenshot or short video of the change. Only test-only, docs-only, or chore changes may skip it.');
             }
 
             async function upsertComment(text) {
diff --git a/api/pyproject.toml b/api/pyproject.toml
index f189225c27..bed728e76f 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "api"
-version = "0.101.1"
+version = "0.102.0"
 description = "Agenta API"
 requires-python = ">=3.11,<3.14"
 authors = [
diff --git a/api/uv.lock b/api/uv.lock
index 8e399a88c5..e03183d4a2 100644
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -8,7 +8,7 @@ resolution-markers = [
 
 [[package]]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../sdks/python" }
 dependencies = [
     { name = "agenta-client" },
@@ -70,7 +70,7 @@ dev = [
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../clients/python" }
 dependencies = [
     { name = "httpx" },
@@ -259,7 +259,7 @@ wheels = [
 
 [[package]]
 name = "api"
-version = "0.101.1"
+version = "0.102.0"
 source = { virtual = "." }
 dependencies = [
     { name = "agenta" },
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index eb0167cfb9..b6b760f4c4 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 description = "Fern-generated Python client for the Agenta API."
 requires-python = ">=3.11,<3.14"
 authors = [
diff --git a/clients/python/uv.lock b/clients/python/uv.lock
index bcf7e18af6..96b0d44d15 100644
--- a/clients/python/uv.lock
+++ b/clients/python/uv.lock
@@ -4,7 +4,7 @@ requires-python = ">=3.11, <3.14"
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },
diff --git a/hosting/kubernetes/helm/Chart.yaml b/hosting/kubernetes/helm/Chart.yaml
index 5a80e7f5c7..3c88f08f85 100644
--- a/hosting/kubernetes/helm/Chart.yaml
+++ b/hosting/kubernetes/helm/Chart.yaml
@@ -2,8 +2,8 @@ apiVersion: v2
 name: agenta
 description: A Helm chart for deploying Agenta (OSS or EE) on Kubernetes
 type: application
-version: 0.101.1
-appVersion: "v0.101.1"
+version: 0.102.0
+appVersion: "v0.102.0"
 keywords:
   - agenta
   - llm
diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml
index 94d30e10eb..f5dffe454d 100644
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 description = "The SDK for agenta is an open-source LLMOps platform."
 readme = "README.md"
 requires-python = ">=3.11,<3.14"
diff --git a/sdks/python/uv.lock b/sdks/python/uv.lock
index 45f9972644..33bb5ed7cd 100644
--- a/sdks/python/uv.lock
+++ b/sdks/python/uv.lock
@@ -4,7 +4,7 @@ requires-python = ">=3.11, <3.14"
 
 [[package]]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "." }
 dependencies = [
     { name = "agenta-client" },
@@ -83,7 +83,7 @@ dev = [
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../../clients/python" }
 dependencies = [
     { name = "httpx" },
diff --git a/services/pyproject.toml b/services/pyproject.toml
index fb3bcc09f2..b29077b98f 100644
--- a/services/pyproject.toml
+++ b/services/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "services"
-version = "0.101.1"
+version = "0.102.0"
 description = "Agenta Services (Chat & Completion)"
 requires-python = ">=3.11,<3.14"
 authors = [
diff --git a/services/uv.lock b/services/uv.lock
index 120dd8a54e..61aea0ab8f 100644
--- a/services/uv.lock
+++ b/services/uv.lock
@@ -8,7 +8,7 @@ resolution-markers = [
 
 [[package]]
 name = "agenta"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../sdks/python" }
 dependencies = [
     { name = "agenta-client" },
@@ -70,7 +70,7 @@ dev = [
 
 [[package]]
 name = "agenta-client"
-version = "0.101.1"
+version = "0.102.0"
 source = { editable = "../clients/python" }
 dependencies = [
     { name = "httpx" },
@@ -2363,7 +2363,7 @@ wheels = [
 
 [[package]]
 name = "services"
-version = "0.101.1"
+version = "0.102.0"
 source = { virtual = "." }
 dependencies = [
     { name = "agenta" },
diff --git a/web/ee/package.json b/web/ee/package.json
index 9b6f2b07a9..9e87a9a17a 100644
--- a/web/ee/package.json
+++ b/web/ee/package.json
@@ -1,6 +1,6 @@
 {
     "name": "@agenta/ee",
-    "version": "0.101.1",
+    "version": "0.102.0",
     "private": true,
     "engines": {
         "node": "24.x"
diff --git a/web/oss/package.json b/web/oss/package.json
index bb9da8b419..29dc761848 100644
--- a/web/oss/package.json
+++ b/web/oss/package.json
@@ -1,6 +1,6 @@
 {
     "name": "@agenta/oss",
-    "version": "0.101.1",
+    "version": "0.102.0",
     "private": true,
     "engines": {
         "node": "24.x"
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts
index 3b3941cc15..bee2d3f3e6 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts
@@ -16,6 +16,12 @@ export interface EvaluationRunsTableOverrides {
     evaluationKind: EvaluationRunKind
     includePreview: boolean
     scope?: TableScope
+    /**
+     * Over-fetch successive server pages until a full page of subject runs is
+     * collected. Set by fixed-size, non-paginating surfaces (the Overview
+     * summary) so the subject filter doesn't leave them falsely empty.
+     */
+    fillToLimit?: boolean
 }
 
 type TableScope = "app" | "project"
@@ -34,6 +40,7 @@ export interface EvaluationRunsTableContext {
     storageKey: string
     createSupported: boolean
     createEvaluationType: "auto" | "human" | "online" | "custom"
+    fillToLimit: boolean
 }
 
 export const defaultEvaluationRunsTableOverrides: EvaluationRunsTableOverrides = {
@@ -66,6 +73,7 @@ export const evaluationRunsTableContextAtom = atom<EvaluationRunsTableContext>((
 
     const evaluationKind = overrides.evaluationKind
     const includePreview = overrides.includePreview
+    const fillToLimit = overrides.fillToLimit ?? false
 
     const projectId =
         overrides.projectIdOverride ?? identifiers.projectId ?? fallbackProjectId ?? null
@@ -130,6 +138,7 @@ export const evaluationRunsTableContextAtom = atom<EvaluationRunsTableContext>((
         storageKey,
         createSupported,
         createEvaluationType,
+        fillToLimit,
     }
 
     return context
@@ -188,6 +197,7 @@ export const evaluationRunsMetaContextSliceAtom = selectAtom(
         includePreview: context.includePreview,
         evaluationKind: context.evaluationKind,
         derivedPreviewFlags: context.derivedPreviewFlags,
+        fillToLimit: context.fillToLimit,
     }),
     (a, b) =>
         a.projectId === b.projectId &&
@@ -196,6 +206,7 @@ export const evaluationRunsMetaContextSliceAtom = selectAtom(
         a.activeAppId === b.activeAppId &&
         a.includePreview === b.includePreview &&
         a.evaluationKind === b.evaluationKind &&
+        a.fillToLimit === b.fillToLimit &&
         arrayEquals(a.effectiveAppIds, b.effectiveAppIds) &&
         shallowEqualFlags(a.derivedPreviewFlags, b.derivedPreviewFlags),
 )
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts
index b7a2bba238..5d4d35b843 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts
@@ -1,3 +1,5 @@
+import {hasResolvableSubject, isSubjectRun} from "@agenta/entities/evaluationRun/etl"
+
 import type {WindowingState} from "@/oss/components/InfiniteVirtualTable/types"
 import {deriveEvaluationKind} from "@/oss/lib/evaluations/utils/evaluationKind"
 
@@ -45,8 +47,21 @@ interface FetchEvaluationRunsWindowParams {
     statusFilters?: string[] | null
     evaluationTypeFilters?: ConcreteEvaluationRunKind[] | null
     dateRange?: {from?: string | null; to?: string | null} | null
+    /**
+     * Over-fetch successive server pages until `limit` *subject* runs are
+     * collected (or the stream is exhausted / a safety cap is hit). For the
+     * fixed-size Overview summary (no infinite scroll): a single server page can
+     * filter down to few/zero subject runs even when more exist deeper, which
+     * would falsely read as "this workflow has never been evaluated". The full
+     * page leaves this off — infinite scroll fills lazily on scroll.
+     */
+    fillToLimit?: boolean
 }
 
+// Over-fetch tuning (only used when `fillToLimit` + a subject filter are active).
+const FILL_MAX_SERVER_PAGES = 8
+const FILL_MIN_SERVER_PAGE = 25
+
 const fetchPreviewRuns = async ({
     projectId,
     appId,
@@ -238,6 +253,7 @@ export const fetchEvaluationRunsWindow = async ({
     cursor = null,
     evaluationTypeFilters,
     dateRange,
+    fillToLimit = false,
 }: FetchEvaluationRunsWindowParams): Promise<EvaluationRunsWindowResult> => {
     if (!projectId) {
         return {
@@ -259,31 +275,6 @@ export const fetchEvaluationRunsWindow = async ({
         evaluationKind === "all" && allowedKinds && allowedKinds.size
             ? Array.from(allowedKinds)
             : null
-    const windowingPayload: QueryWindowingPayload = {
-        limit,
-        order: "descending" as const,
-        next: cursor ?? undefined,
-    }
-    if (dateRange?.to) {
-        windowingPayload.newest = dateRange.to
-    }
-    if (dateRange?.from) {
-        windowingPayload.oldest = dateRange.from
-    }
-
-    const previewResult = includePreview
-        ? await fetchPreviewRuns({
-              projectId,
-              appId: previewAppId,
-              searchQuery: previewSearchQuery,
-              references: previewReferences,
-              flags: previewFlags,
-              statuses: statusFilters && statusFilters.length ? statusFilters : undefined,
-              evaluationTypes: evaluationTypesPayload,
-              windowing: windowingPayload,
-          })
-        : {runs: [], count: 0, windowing: null}
-
     const rows: EvaluationRunApiRow[] = []
 
     const normalizedSearch = previewSearchQuery?.trim().toLowerCase() ?? null
@@ -305,11 +296,31 @@ export const fetchEvaluationRunsWindow = async ({
         return normalizedStatusSet.has(statusValue.toLowerCase())
     }
 
-    const allowedAppIds = appIds.filter((id) => typeof id === "string" && id.trim().length > 0)
-    const allowedAppSet =
-        allowedAppIds.length > 0 ? new Set(allowedAppIds.map((id) => id.trim())) : null
+    const allowedAppIds = appIds
+        .filter((id) => typeof id === "string" && id.trim().length > 0)
+        .map((id) => id.trim())
+    const allowedAppSet = allowedAppIds.length > 0 ? new Set(allowedAppIds) : null
 
-    previewResult.runs.forEach((run) => {
+    // Run-list SUBJECT predicate (feature F): when scoped to a workflow, keep
+    // runs that *evaluated this workflow* — runs where the scoped id is the
+    // run's `application`/invocation reference (the evaluated subject) — and
+    // drop runs where it merely appears as a grader (`evaluator` reference).
+    //
+    // This replaces the prior `meta.application.id` heuristic, which is
+    // unreliable: a null `meta.application` silently bypassed the guard, which
+    // is how grader runs leaked onto an evaluator's Evaluations tab. The run's
+    // `data.steps` are the structural source of truth. We fall back to the
+    // `meta` heuristic only when a run carries no resolvable subject reference.
+    //
+    // `subjectScanned`/`subjectMatched` feed the hit-ratio meter: a low rolling
+    // pass-ratio means the scoped workflow is graded far more than it's
+    // evaluated — the signal that the backend role-aware reference filter (v2)
+    // is warranted. (The FE already sends the role via the payload's dict key;
+    // v2 is the backend honoring it. See evaluations/utils.py query_run_references.)
+    let subjectScanned = 0
+    let subjectMatched = 0
+
+    const processRun = (run: PreviewEvaluationRun) => {
         // Derive kind from run.data.steps - this is the reliable source of truth
         // Do NOT rely on meta.evaluation_kind as it's flaky and unreliable
         const derivedKind = derivePreviewRunKind(run)
@@ -331,8 +342,22 @@ export const fetchEvaluationRunsWindow = async ({
         const runId = run.id ?? null
         const metaApplication = (run as any)?.meta?.application ?? {}
         const runAppId = metaApplication?.id ?? (run as any)?.meta?.appId ?? null
-        if (allowedAppSet && runAppId && !allowedAppSet.has(runAppId)) {
-            return
+        const previewMeta = extractPreviewRunMeta(run)
+
+        if (allowedAppSet) {
+            subjectScanned += 1
+            const steps = previewMeta.steps
+            const passesSubject = hasResolvableSubject(steps)
+                ? // Structural: the scoped workflow is the run's evaluated subject.
+                  allowedAppIds.some((id) => isSubjectRun(steps, id))
+                : // Fallback for runs with no resolvable subject reference:
+                  // keep the prior `meta.application.id` behaviour rather than
+                  // dropping a run we can't classify structurally.
+                  !runAppId || allowedAppSet.has(runAppId)
+            if (!passesSubject) {
+                return
+            }
+            subjectMatched += 1
         }
         const previewName = typeof (run as any)?.name === "string" ? (run as any).name : null
         if (!matchesSearch([runId, previewName, metaApplication?.id, metaApplication?.name])) {
@@ -354,10 +379,66 @@ export const fetchEvaluationRunsWindow = async ({
                     : (run as any)?.status?.value) ?? null,
             appId: runAppId ?? null,
             preview: runId ? {id: runId} : undefined,
-            previewMeta: extractPreviewRunMeta(run),
+            previewMeta,
             evaluationKind: derivedKind,
         })
-    })
+    }
+
+    // Over-fetch loop. The fixed-size summary (`fillToLimit`) can filter a single
+    // server page down to few/zero subject runs even when more exist deeper —
+    // which would falsely read as "this workflow has never been evaluated". When
+    // filling, pull successive server pages (advancing the cursor) until we have
+    // `limit` subject runs, the stream is exhausted, or the safety cap is hit.
+    // The full page leaves this off (single page) — its infinite scroll fills
+    // lazily on scroll, so changing its pagination here isn't needed.
+    const wantFill = Boolean(fillToLimit) && Boolean(allowedAppSet)
+    const serverPageLimit = wantFill ? Math.max(limit, FILL_MIN_SERVER_PAGE) : limit
+    const maxPages = wantFill ? FILL_MAX_SERVER_PAGES : 1
+
+    let currentCursor: string | undefined = cursor ?? undefined
+    let firstPageCount: number | null = null
+    let lastWindowing: QueryWindowingPayload | null = null
+    let pagesFetched = 0
+
+    while (pagesFetched < maxPages) {
+        pagesFetched += 1
+
+        const windowingPayload: QueryWindowingPayload = {
+            limit: serverPageLimit,
+            order: "descending" as const,
+            next: currentCursor,
+        }
+        if (dateRange?.to) {
+            windowingPayload.newest = dateRange.to
+        }
+        if (dateRange?.from) {
+            windowingPayload.oldest = dateRange.from
+        }
+
+        const previewResult = includePreview
+            ? await fetchPreviewRuns({
+                  projectId,
+                  appId: previewAppId,
+                  searchQuery: previewSearchQuery,
+                  references: previewReferences,
+                  flags: previewFlags,
+                  statuses: statusFilters && statusFilters.length ? statusFilters : undefined,
+                  evaluationTypes: evaluationTypesPayload,
+                  windowing: windowingPayload,
+              })
+            : {runs: [], count: 0, windowing: null}
+
+        if (firstPageCount === null) {
+            firstPageCount = previewResult.count ?? null
+        }
+        lastWindowing = previewResult.windowing
+        previewResult.runs.forEach(processRun)
+
+        currentCursor = previewResult.windowing?.next ?? undefined
+        if (!wantFill || rows.length >= limit || !currentCursor) {
+            break
+        }
+    }
 
     rows.sort((a, b) => {
         const tsA = a.createdAt ? new Date(a.createdAt).getTime() : 0
@@ -365,14 +446,16 @@ export const fetchEvaluationRunsWindow = async ({
         return tsB - tsA
     })
 
+    // The fixed-size summary shows at most `limit` (latest N subject runs); the
+    // last over-fetched server page may carry a few extra past the limit.
+    const pageRows = wantFill ? rows.slice(0, limit) : rows
     const totalCount =
-        evaluationKind === "all" && allowedKinds
-            ? rows.length
-            : (previewResult.count ?? rows.length)
-    const pageRows = rows
+        evaluationKind === "all" && allowedKinds ? pageRows.length : (firstPageCount ?? rows.length)
     const nextOffset = offset + pageRows.length
-    const previewNextCursor = previewResult.windowing?.next ?? null
-    const hasMore = Boolean(previewNextCursor)
+    // The summary doesn't paginate (infinite scroll off), so it never advertises
+    // "more"; the full page advertises the page's server cursor as before.
+    const previewNextCursor = lastWindowing?.next ?? null
+    const hasMore = wantFill ? false : Boolean(previewNextCursor)
 
     return {
         rows: pageRows,
@@ -380,6 +463,9 @@ export const fetchEvaluationRunsWindow = async ({
         hasMore,
         nextOffset: hasMore ? nextOffset : null,
         nextCursor: previewNextCursor,
-        nextWindowing: normalizeWindowing(previewResult.windowing),
+        nextWindowing: normalizeWindowing(lastWindowing),
+        subjectFilterStats: allowedAppSet
+            ? {scanned: subjectScanned, matched: subjectMatched}
+            : undefined,
     }
 }
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts
new file mode 100644
index 0000000000..0bfd513091
--- /dev/null
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts
@@ -0,0 +1,79 @@
+/**
+ * Per-context hit-ratio meter for the run-list SUBJECT predicate (feature F).
+ *
+ * The subject filter (`fetchAutoEvaluationRuns`) keeps runs that *evaluated the
+ * scoped workflow* and drops runs where it was only a grader. When the scoped
+ * workflow is graded far more often than it's evaluated, most fetched runs get
+ * dropped client-side — the "low hit-ratio" case the eval-filtering RFC's meter
+ * is built to detect (docs/designs/eval-filtering.md §D2 + §C3).
+ *
+ * A low rolling ratio is the signal that the backend role-aware reference
+ * filter (v2) is warranted. The FE already encodes the role as the reference
+ * payload's dict key; v2 is purely the backend honoring it
+ * (`evaluations/utils.py` `query_run_references` — see line 66). So this meter
+ * **reports the regime** (dev log + a readable getter for diagnostics); it does
+ * not — and cannot, from the FE — swap to a server-side filter.
+ *
+ * Meters are keyed by the subject-filter context (project + scoped workflow ids
+ * + kind). Each distinct context gets its own rolling window.
+ */
+
+import {
+    createHitRatioMeter,
+    type HitRatioMeter,
+    type HitRatioRegime,
+} from "@agenta/entities/evaluationRun/etl"
+
+const meters = new Map<string, HitRatioMeter>() // keyed by subject-filter signature
+
+// Look up the meter for `signature`, creating and caching one on first use.
+const meterFor = (signature: string): HitRatioMeter => {
+    const cached = meters.get(signature)
+    if (cached) return cached
+    const fresh = createHitRatioMeter()
+    meters.set(signature, fresh)
+    return fresh
+}
+
+/** Stable, order-insensitive signature for a subject-filter context (ids are sorted). */
+export const subjectFilterSignature = ({
+    projectId,
+    appIds,
+    evaluationKind,
+}: {
+    projectId: string | null
+    appIds: string[] | null | undefined
+    evaluationKind: string
+}): string => `${projectId ?? "null"}::${[...(appIds ?? [])].sort().join("|")}::${evaluationKind}`
+
+/**
+ * Fold one fetched page of subject-filter counts into the meter for
+ * `signature`, then report the regime that meter is in afterwards.
+ *
+ * `page` is handed to the meter as its `chunk` key, so pass a value that is
+ * unique per page within a context (the fetch offset). The intent is that a
+ * refetch of the same offset, e.g. after cache invalidation, is not counted
+ * twice — NOTE(review): that dedup lives in the meter; confirm there.
+ *
+ * @returns the meter's regime after this page has been recorded
+ */
+export const recordSubjectFilterPage = (entry: {
+    signature: string
+    page: number
+    scanned: number
+    matched: number
+}): HitRatioRegime => {
+    const {signature, page: chunk, scanned, matched} = entry
+    const target = meterFor(signature)
+    target.record({chunk, scanned, matched})
+    return target.regime()
+}
+
+/** Peek at the regime for `signature` without recording; `null` if no meter exists yet. */
+export const getSubjectFilterRegime = (signature: string): HitRatioRegime | null =>
+    meters.get(signature)?.regime() ?? null
+
+/** Remove the meter stored under `signature`, if any (e.g. its filter context is retired). */
+export const resetSubjectFilterMeter = (signature: string): void => {
+    meters.delete(signature)
+}
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts
index b79bfe6cf2..803a19f0f4 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts
@@ -16,6 +16,7 @@ import {buildReferencePayload} from "../utils/referencePayload"
 
 import {computeContextSignature, evaluationRunsMetaContextSliceAtom} from "./context"
 import {fetchEvaluationRunsWindow} from "./fetchAutoEvaluationRuns"
+import {recordSubjectFilterPage, subjectFilterSignature} from "./subjectFilterMeter"
 
 import type {RunFlagsFilter} from "@/agenta-oss-common/lib/hooks/usePreviewEvaluations/index"
 
@@ -31,6 +32,8 @@ export interface EvaluationRunsTableMeta {
     referenceFilters?: Record<string, string[]> | null
     evaluationTypeFilters?: ConcreteEvaluationRunKind[] | null
     dateRange?: {from?: string | null; to?: string | null} | null
+    /** Over-fetch to fill a full page of subject runs (fixed-size summaries). */
+    fillToLimit?: boolean
     /** Internal refresh trigger - incrementing this forces a refetch */
     _refreshTrigger?: number
 }
@@ -199,6 +202,7 @@ export const evaluationRunsTableMetaAtom = atom<
             referenceFilters,
             evaluationTypeFilters,
             dateRange,
+            fillToLimit: context.fillToLimit,
             _refreshTrigger: refreshTrigger,
         }
 
@@ -377,8 +381,34 @@ const evaluationRunsDatasetStoreInternal = createInfiniteDatasetStore<
             statusFilters: meta.statusFilters ?? null,
             evaluationTypeFilters: meta.evaluationTypeFilters ?? null,
             dateRange: meta.dateRange ?? null,
+            fillToLimit: meta.fillToLimit ?? false,
         })
 
+        // Feed the run-list subject predicate's pass-ratio to the hit-ratio
+        // meter. A low rolling ratio means the scoped workflow is graded far
+        // more than it's evaluated — the v1→v2 escalation signal (the backend
+        // role-aware reference filter is warranted). Observation only today.
+        if (result.subjectFilterStats) {
+            const signature = subjectFilterSignature({
+                projectId: meta.projectId,
+                appIds: meta.appIds,
+                evaluationKind: meta.evaluationKind,
+            })
+            const regime = recordSubjectFilterPage({
+                signature,
+                page: offset,
+                scanned: result.subjectFilterStats.scanned,
+                matched: result.subjectFilterStats.matched,
+            })
+            if (process.env.NODE_ENV !== "production" && regime.state === "escalate") {
+                console.log(
+                    "[evaluationRunsTableStore] subject filter low hit-ratio —",
+                    regime.reason,
+                    {appIds: meta.appIds, kind: meta.evaluationKind},
+                )
+            }
+        }
+
         return {
             rows: result.rows,
             totalCount: result.totalCount,
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx b/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx
index 543990186a..e00ab76843 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx
+++ b/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx
@@ -73,6 +73,10 @@ const LatestEvaluationRunsTable = ({
                     appId,
                     projectIdOverride,
                     includePreview,
+                    // Fixed-size summary (no infinite scroll): over-fetch so the
+                    // subject filter doesn't leave it falsely empty when the
+                    // workflow is graded more than it's evaluated.
+                    fillToLimit: true,
                     ...(appScoped && {scope: "app" as const}),
                 }}
                 pageSize={limit}
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx b/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx
index 2ec43bc3f4..a05874dab8 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx
+++ b/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx
@@ -10,6 +10,7 @@ import {
     type ReferenceTone,
 } from "@/oss/components/References/referenceColors"
 import {testsetsListQueryAtomFamily} from "@/oss/state/entities/testset"
+import {currentWorkflowAtom} from "@/oss/state/workflow"
 
 import {
     evaluationRunsFilterOptionsAtom,
@@ -137,10 +138,19 @@ const FiltersSummary = () => {
         () => optionMap(filterOptions.evaluatorOptions ?? []),
         [filterOptions.evaluatorOptions],
     )
-    const appLabels = useMemo(
-        () => optionMap(filterOptions.appOptions ?? []),
-        [filterOptions.appOptions],
-    )
+    const currentWorkflow = useAtomValue(currentWorkflowAtom)
+    const appLabels = useMemo(() => {
+        const map = optionMap(filterOptions.appOptions ?? [])
+        // The locked "Apps" chip is preset to the route workflow. Evaluator
+        // workflows aren't in the apps list (`appOptions`), so their id won't
+        // resolve to a name and the chip would show a raw id. Seed the map from
+        // the current workflow so the chip renders its name instead.
+        const workflowName = currentWorkflow?.name ?? currentWorkflow?.slug
+        if (currentWorkflow?.id && workflowName && !map.has(currentWorkflow.id)) {
+            map.set(currentWorkflow.id, workflowName)
+        }
+        return map
+    }, [filterOptions.appOptions, currentWorkflow])
     const variantLabels = useMemo(
         () =>
             optionMap(
diff --git a/web/oss/src/components/EvaluationRunsTablePOC/types.ts b/web/oss/src/components/EvaluationRunsTablePOC/types.ts
index 8370f7b945..18c1a4eb0a 100644
--- a/web/oss/src/components/EvaluationRunsTablePOC/types.ts
+++ b/web/oss/src/components/EvaluationRunsTablePOC/types.ts
@@ -70,4 +70,18 @@ export interface EvaluationRunsWindowResult {
     nextOffset: number | null
     nextCursor: string | null
     nextWindowing: WindowingState | null
+    /**
+     * Per-page stats for the run-list **subject** predicate (the structural
+     * "is this run an evaluation of the scoped workflow?" filter). Feeds the
+     * hit-ratio meter: when the rolling pass-ratio is low, the scoped workflow
+     * is being graded far more often than it's evaluated, signalling the
+     * backend role-aware reference filter (v2) is warranted. Absent when no
+     * subject filter is active (project scope).
+     */
+    subjectFilterStats?: {
+        /** Runs reaching the subject check (evaluated before the search filter). */
+        scanned: number
+        /** Of those, runs whose subject is the scoped workflow. */
+        matched: number
+    }
 }
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
index 879f13436f..b0b970eb4d 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx
@@ -1,40 +1,22 @@
 /**
  * EvaluatorPlaygroundHeader
  *
- * Simplified playground header for the evaluator configuration page.
- * Shows evaluator name, app workflow selector, and testset dropdown.
- * Reads evaluator info from playground nodes (URL-driven, no props needed).
+ * Header for the evaluator configuration page: the evaluator name plus the
+ * shared run controls. The controls (run-on selector, app picker, testset)
+ * live in `EvaluatorRunControls` so the page and the creation drawer share one
+ * implementation. Reads evaluator info from playground nodes (URL-driven).
  */
 
 import {useMemo} from "react"
 
 import {workflowMolecule} from "@agenta/entities/workflow"
-import {EntityPicker} from "@agenta/entity-ui"
-import type {
-    EntitySelectionAdapter,
-    WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
 import {playgroundController} from "@agenta/playground"
 import {Typography} from "antd"
 import {useAtomValue} from "jotai"
-import dynamic from "next/dynamic"
 
-import {selectedAppLabelAtom} from "./atoms"
+import EvaluatorRunControls from "./EvaluatorRunControls"
 
-const TestsetDropdown = dynamic(
-    () => import("@/oss/components/Playground/Components/TestsetDropdown"),
-    {ssr: false},
-)
-
-interface EvaluatorPlaygroundHeaderProps {
-    appWorkflowAdapter: EntitySelectionAdapter<WorkflowRevisionSelectionResult>
-    onAppSelect: (selection: WorkflowRevisionSelectionResult) => void
-}
-
-const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
-    appWorkflowAdapter,
-    onAppSelect,
-}) => {
+const EvaluatorPlaygroundHeader: React.FC = () => {
     // Read evaluator node from playground nodes
     // Phase 1: evaluator is at depth 0 (primary)
     // Phase 2: evaluator is at depth 1 (downstream)
@@ -69,12 +51,6 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
         evaluatorData?.slug?.trim() ||
         "Evaluator"
 
-    // Selected app label for display in the picker trigger
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
-
-    // Check if we have an app node (depth-0 with a different entity than evaluator)
-    const hasAppSelected = nodes.some((n) => n.depth === 0 && n.entityId !== evaluatorEntityId)
-
     return (
         <div className="flex items-center justify-between gap-4 px-2.5 py-2 bg-[var(--ag-rgba-000-02)] border-0 border-b border-solid border-[var(--ag-rgba-051729-06)]">
             <div className="flex shrink-0 items-center gap-2 pl-2">
@@ -83,16 +59,7 @@ const EvaluatorPlaygroundHeader: React.FC<EvaluatorPlaygroundHeaderProps> = ({
                 </Typography>
             </div>
 
-            <div className="flex min-w-0 flex-1 items-center justify-end gap-2">
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={onAppSelect}
-                    size="small"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-                {hasAppSelected && <TestsetDropdown />}
-            </div>
+            <EvaluatorRunControls />
         </div>
     )
 }
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx
new file mode 100644
index 0000000000..b52c0271ac
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx
@@ -0,0 +1,83 @@
+/**
+ * EvaluatorRunControls
+ *
+ * The run-on + app + testset control cluster, shared by the evaluator
+ * playground page header and the evaluator-creation drawer header so the two
+ * stay identical. Reads everything from `useEvaluatorRunControls` (atom-backed),
+ * so it takes no props — drop it next to a title and it works on either surface.
+ *
+ * - Run-on selector (test case / app output / trace).
+ * - App picker — only in "app" mode, with a disconnect affordance once connected.
+ * - Test set dropdown — always available: it's the data source in test-case
+ *   mode and feeds the app in app mode.
+ */
+
+import {EntityPicker} from "@agenta/entity-ui"
+import type {WorkflowRevisionSelectionResult} from "@agenta/entity-ui/selection"
+import {X} from "@phosphor-icons/react"
+import {Button, Tooltip} from "antd"
+import dynamic from "next/dynamic"
+
+import RunOnSelector from "./RunOnSelector"
+import {useEvaluatorRunControls} from "./useEvaluatorRunControls"
+
+const TestsetDropdown = dynamic(
+    () => import("@/oss/components/Playground/Components/TestsetDropdown"),
+    {ssr: false},
+)
+
+const EvaluatorRunControls = () => {
+    const {
+        appWorkflowAdapter,
+        handleAppSelect,
+        disconnectApp,
+        runOnMode,
+        handlePickRunOn,
+        hasAppConnected,
+        selectedAppLabel,
+    } = useEvaluatorRunControls()
+
+    const isAppMode = runOnMode === "app"
+
+    // Footer inside the picker popover — only when an app is currently connected.
+    const popupFooter = hasAppConnected ? (
+        <div className="border-0 border-t border-solid border-[var(--ag-rgba-051729-06)] p-2">
+            <Button size="small" danger className="w-full" onClick={() => disconnectApp()}>
+                Disconnect app
+            </Button>
+        </div>
+    ) : undefined
+
+    return (
+        <div className="flex min-w-0 items-center justify-end gap-1">
+            <RunOnSelector mode={runOnMode} onPick={handlePickRunOn} />
+
+            {isAppMode && (
+                <EntityPicker<WorkflowRevisionSelectionResult>
+                    variant="popover-cascader"
+                    adapter={appWorkflowAdapter}
+                    onSelect={handleAppSelect}
+                    size="small"
+                    placeholder={selectedAppLabel ?? "Select app"}
+                    popupFooter={popupFooter}
+                />
+            )}
+
+            {isAppMode && hasAppConnected && (
+                <Tooltip title="Disconnect app">
+                    <Button
+                        type="text"
+                        size="small"
+                        icon={<X size={12} />}
+                        onClick={() => disconnectApp()}
+                        aria-label="Disconnect app"
+                    />
+                </Tooltip>
+            )}
+
+            <TestsetDropdown />
+        </div>
+    )
+}
+
+export default EvaluatorRunControls
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx
new file mode 100644
index 0000000000..4fd8a2ee61
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx
@@ -0,0 +1,295 @@
+/**
+ * RunOnSelector
+ *
+ * The "Run on" control for the evaluator playground header. A leading dropdown
+ * that names the data source the evaluator runs against and draws the resulting
+ * data-flow, so the empty/first state explains itself instead of leaving the
+ * user with two disconnected loaders.
+ *
+ * Three modes:
+ *  - Run directly on a test case  (Data → Evaluator → Score)
+ *  - Run on an app output         (Data → App → Output → Evaluator → Score) — default
+ *  - Run on a trace               (Trace → Evaluator → Score) — disabled for now
+ *
+ * All colors come from the live antd token (`theme.useToken()`) so the control
+ * follows light/dark mode automatically.
+ */
+
+import {useState} from "react"
+
+import {AppstoreOutlined} from "@ant-design/icons"
+import {
+    CaretDownIcon,
+    CheckIcon,
+    DatabaseIcon,
+    GavelIcon,
+    TreeViewIcon,
+} from "@phosphor-icons/react"
+import {Button, Dropdown, theme} from "antd"
+import type {GlobalToken} from "antd"
+import clsx from "clsx"
+
+import type {RunOnMode} from "./atoms"
+
+// The app icon used across the product (the sidebar "Prompts" item). Wrapped so
+// it accepts the same `size`/`style` props as the phosphor icons it sits beside.
+const AppIcon = ({size = 16, style}: {size?: number; style?: React.CSSProperties}) => (
+    <AppstoreOutlined style={{fontSize: size, ...style}} />
+)
+
+// ── flow pills ──────────────────────────────────────────────────────────────
+
+type FlowVariant = "data" | "app" | "out" | "eval" | "trace"
+
+interface FlowNode {
+    label: string
+    variant: FlowVariant
+}
+
+const flowStyle = (token: GlobalToken, variant: FlowVariant): React.CSSProperties => {
+    switch (variant) {
+        case "data":
+            return {background: token.blue1, color: token.blue7, borderColor: token.blue2}
+        case "app":
+            return {
+                background: token.colorPrimaryBg,
+                color: token.colorText,
+                borderColor: token.colorPrimaryBorder,
+            }
+        case "out":
+            return {background: token.green1, color: token.green7, borderColor: token.green3}
+        case "eval":
+            // index 7 (not 6) so the text brightens under the dark algorithm —
+            // purple6 lands dark-on-dark and disappears on a dark background.
+            return {background: token.purple1, color: token.purple7, borderColor: token.purple3}
+        case "trace":
+            return {background: token.cyan1, color: token.cyan7, borderColor: token.cyan3}
+    }
+}
+
+const FlowIcon = ({variant}: {variant: FlowVariant}) => {
+    switch (variant) {
+        case "data":
+            return <DatabaseIcon size={12} />
+        case "app":
+            return <AppIcon size={12} />
+        case "eval":
+            return <GavelIcon size={12} />
+        case "trace":
+            return <TreeViewIcon size={12} />
+        default:
+            return null
+    }
+}
+
+const FlowPills = ({steps, token}: {steps: FlowNode[]; token: GlobalToken}) => (
+    <div className="flex flex-wrap items-center gap-y-1">
+        {steps.map((step, i) => (
+            <span key={`${step.label}-${i}`} className="flex items-center">
+                {i > 0 && (
+                    <span className="px-1.5 text-[12px]" style={{color: token.colorTextQuaternary}}>
+                        →
+                    </span>
+                )}
+                <span
+                    className="inline-flex items-center gap-1 whitespace-nowrap rounded-full border border-solid px-2 py-[3px] text-[11px] leading-none"
+                    style={flowStyle(token, step.variant)}
+                >
+                    <FlowIcon variant={step.variant} />
+                    {step.label}
+                </span>
+            </span>
+        ))}
+    </div>
+)
+
+// ── modes ───────────────────────────────────────────────────────────────────
+
+interface ModeDef {
+    key: RunOnMode
+    /** Full label shown in the dropdown option. */
+    label: string
+    /** Short label shown after "Run on:" in the trigger button. */
+    shortLabel: string
+    Icon: React.ComponentType<{size?: number; style?: React.CSSProperties}>
+    desc: string
+    flow: FlowNode[]
+    badge?: "default" | "soon"
+    disabled?: boolean
+}
+
+const MODES: ModeDef[] = [
+    {
+        key: "data",
+        label: "Run directly on a test case",
+        shortLabel: "Test case",
+        Icon: DatabaseIcon,
+        desc: "Evaluate data you provide. Connect a test set, or type the input and output in by hand.",
+        flow: [
+            {label: "Data", variant: "data"},
+            {label: "Evaluator", variant: "eval"},
+            {label: "Score", variant: "out"},
+        ],
+    },
+    {
+        key: "app",
+        label: "Run on an app output",
+        shortLabel: "App output",
+        Icon: AppIcon,
+        badge: "default",
+        desc: "Run an app over your data, then the evaluator grades its output. The usual evaluation flow.",
+        flow: [
+            {label: "Data", variant: "data"},
+            {label: "App", variant: "app"},
+            {label: "Output", variant: "out"},
+            {label: "Evaluator", variant: "eval"},
+            {label: "Score", variant: "out"},
+        ],
+    },
+    {
+        key: "trace",
+        label: "Run on a trace",
+        shortLabel: "Trace",
+        Icon: TreeViewIcon,
+        badge: "soon",
+        disabled: true,
+        desc: "Pull the input and output straight from a logged trace in Observability.",
+        flow: [
+            {label: "Trace", variant: "trace"},
+            {label: "Evaluator", variant: "eval"},
+            {label: "Score", variant: "out"},
+        ],
+    },
+]
+
+// ── component ───────────────────────────────────────────────────────────────
+
+interface RunOnSelectorProps {
+    mode: RunOnMode
+    onPick: (mode: RunOnMode) => void
+}
+
+const RunOnSelector = ({mode, onPick}: RunOnSelectorProps) => {
+    const {token} = theme.useToken()
+    const [open, setOpen] = useState(false)
+    const [hovered, setHovered] = useState<RunOnMode | null>(null)
+    const current = MODES.find((m) => m.key === mode) ?? MODES.find((m) => m.key === "app")!
+
+    const overlay = (
+        <div
+            className="w-[460px] rounded-lg border border-solid p-1.5"
+            style={{
+                background: token.colorBgElevated,
+                borderColor: token.colorBorderSecondary,
+                boxShadow: token.boxShadowSecondary,
+            }}
+        >
+            <div
+                className="px-2.5 pb-1.5 pt-1 text-[11px] font-semibold uppercase tracking-[0.04em]"
+                style={{color: token.colorTextQuaternary}}
+            >
+                What should the evaluator run on?
+            </div>
+            {MODES.map((m) => {
+                const selected = m.key === mode
+                const isHovered = hovered === m.key
+                const background = selected
+                    ? token.controlItemBgActive
+                    : isHovered && !m.disabled
+                      ? token.colorFillTertiary
+                      : "transparent"
+                return (
+                    <div
+                        key={m.key}
+                        role="button"
+                        aria-disabled={m.disabled}
+                        onMouseEnter={() => setHovered(m.key)}
+                        onMouseLeave={() => setHovered((h) => (h === m.key ? null : h))}
+                        onClick={() => {
+                            if (m.disabled) return
+                            onPick(m.key)
+                            setOpen(false)
+                        }}
+                        className={clsx(
+                            "flex items-start gap-3 rounded-md p-2.5",
+                            m.disabled ? "cursor-default opacity-55" : "cursor-pointer",
+                        )}
+                        style={{background}}
+                    >
+                        <span
+                            className="mt-0.5 flex w-[18px] shrink-0 justify-center"
+                            style={{color: token.colorPrimary}}
+                        >
+                            {selected && <CheckIcon size={16} />}
+                        </span>
+                        <div className="min-w-0 flex-1">
+                            <div
+                                className="flex items-center gap-2 text-[14px] font-medium"
+                                style={{color: token.colorText}}
+                            >
+                                <m.Icon size={15} />
+                                {m.label}
+                                {m.badge === "default" && (
+                                    <span
+                                        className="rounded-full px-[7px] py-px text-[10.5px] font-semibold"
+                                        style={{
+                                            background: token.colorPrimary,
+                                            color: token.colorTextLightSolid,
+                                        }}
+                                    >
+                                        default
+                                    </span>
+                                )}
+                                {m.badge === "soon" && (
+                                    <span
+                                        className="rounded-full px-[7px] py-px text-[10.5px] font-semibold"
+                                        style={{background: token.gold1, color: token.gold8}}
+                                    >
+                                        soon
+                                    </span>
+                                )}
+                            </div>
+                            <div
+                                className="mt-0.5 text-[12.5px] leading-snug"
+                                style={{color: token.colorTextTertiary}}
+                            >
+                                {m.desc}
+                            </div>
+                            <div className="mt-2">
+                                <FlowPills steps={m.flow} token={token} />
+                            </div>
+                        </div>
+                    </div>
+                )
+            })}
+        </div>
+    )
+
+    return (
+        <Dropdown
+            open={open}
+            onOpenChange={setOpen}
+            trigger={["click"]}
+            placement="bottomLeft"
+            popupRender={() => overlay}
+        >
+            <Button
+                size="small"
+                className="flex items-center gap-1.5 font-medium"
+                style={{
+                    background: token.colorPrimaryBg,
+                    borderColor: token.colorPrimaryBorder,
+                }}
+            >
+                <span className="font-normal" style={{color: token.colorTextTertiary}}>
+                    Run on:
+                </span>
+                <current.Icon size={14} style={{color: token.colorText}} />
+                <span className="truncate">{current.shortLabel}</span>
+                <CaretDownIcon size={12} style={{color: token.colorTextTertiary}} />
+            </Button>
+        </Dropdown>
+    )
+}
+
+export default RunOnSelector
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx
new file mode 100644
index 0000000000..6b31aad851
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx
@@ -0,0 +1,56 @@
+/**
+ * SelectAppEmptyState
+ *
+ * Centered empty state shown in the run/generation panel when the evaluator is
+ * in "Run on an app" mode but no app is connected yet. The evaluator can't run
+ * until an app is picked, so this guides the user to the one action that
+ * unblocks them. Shared by the evaluator playground page and the
+ * evaluator-creation drawer so both read identically.
+ */
+
+import {EntityPicker} from "@agenta/entity-ui"
+import type {
+    EntitySelectionAdapter,
+    WorkflowRevisionSelectionResult,
+} from "@agenta/entity-ui/selection"
+import {AppstoreOutlined} from "@ant-design/icons"
+import {Typography, theme} from "antd"
+
+interface SelectAppEmptyStateProps {
+    adapter: EntitySelectionAdapter<WorkflowRevisionSelectionResult>
+    onSelect: (selection: WorkflowRevisionSelectionResult) => void
+    selectedAppLabel?: string | null
+}
+
+const SelectAppEmptyState = ({adapter, onSelect, selectedAppLabel}: SelectAppEmptyStateProps) => {
+    const {token} = theme.useToken()
+
+    return (
+        <div className="flex max-w-[340px] flex-col items-center gap-4">
+            <div
+                className="flex h-14 w-14 items-center justify-center rounded-full"
+                style={{background: token.colorPrimaryBg, color: token.colorPrimary}}
+            >
+                <AppstoreOutlined style={{fontSize: 26}} />
+            </div>
+            <div className="flex flex-col gap-1 text-center">
+                <Typography.Text className="text-[15px] font-semibold">
+                    Select an app to run the evaluator on
+                </Typography.Text>
+                <Typography.Text type="secondary" className="text-[13px] leading-snug">
+                    The evaluator grades this app&apos;s output. Pick which app to run, then fill
+                    its inputs or load a test set.
+                </Typography.Text>
+            </div>
+            <EntityPicker<WorkflowRevisionSelectionResult>
+                variant="popover-cascader"
+                adapter={adapter}
+                onSelect={onSelect}
+                size="middle"
+                placeholder={selectedAppLabel ?? "Select app"}
+            />
+        </div>
+    )
+}
+
+export default SelectAppEmptyState
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
index fdbd5d271b..0c83b594af 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts
@@ -83,6 +83,53 @@ export const persistedTestsetSelectionAtom = atom(
     },
 )
 
+// ============================================================================
+// RUN-ON MODE
+// ============================================================================
+
+/**
+ * What the evaluator runs on:
+ *  - "data"  → run directly on data you provide (test set or manual input/output)
+ *  - "app"   → run an app over the data, then grade its output (the usual flow)
+ *  - "trace" → grade the input/output of a logged trace (not yet available)
+ *
+ * "app" is the default so a fresh playground guides the user down the most
+ * common path (pick an app → run against it). The "trace" mode is disabled in
+ * the UI for now.
+ */
+export type RunOnMode = "data" | "app" | "trace"
+
+const runOnModeByProjectAtom = atomWithStorage<Record<string, RunOnMode>>(
+    "agenta:evaluator:run-on-mode",
+    {},
+)
+
+/** Read/write the persisted run-on mode for the current project (default "app"). */
+export const runOnModeAtom = atom(
+    (get) => {
+        const projectId = get(projectIdAtom) || "__global__"
+        return get(runOnModeByProjectAtom)[projectId] ?? "app"
+    },
+    (get, set, next: RunOnMode) => {
+        const projectId = get(projectIdAtom) || "__global__"
+        const all = get(runOnModeByProjectAtom)
+        set(runOnModeByProjectAtom, {...all, [projectId]: next})
+    },
+)
+
+/**
+ * The mode actually in effect.
+ *
+ * A connected app (downstream evaluator node) always means we're in "app" mode,
+ * regardless of the stored preference — the node graph is the source of truth.
+ * Only when nothing is connected do we fall back to the stored mode.
+ */
+export const effectiveRunOnModeAtom = atom<RunOnMode>((get) => {
+    const nodes = get(playgroundNodesAtom)
+    if (nodes.some((n) => n.depth > 0)) return "app"
+    return get(runOnModeAtom)
+})
+
 // ============================================================================
 // DERIVED SELECTORS
 // ============================================================================
@@ -115,8 +162,27 @@ export const hasAppConnectedAtom = atom((get) => {
     return nodes.some((n) => n.depth > 0)
 })
 
-/** Label of the currently selected app workflow (for display in header picker). */
-export const selectedAppLabelAtom = atom<string | null>(null)
+/**
+ * Label of the currently selected app workflow (for display in header picker).
+ *
+ * Derived from the node graph: when an evaluator-as-downstream (depth > 0)
+ * exists, the primary (depth-0) node is the connected app, and its `label`
+ * is what we want to show. Returns `null` in standalone mode (no downstream).
+ *
+ * Derived (not a primitive atom) so URL-hydration of the snapshot — which
+ * restores `playgroundNodesAtom` along with each node's `label` — automatically
+ * surfaces the right label without any explicit re-seeding from the page.
+ * Previously the atom was a primitive `atom<string | null>(null)`, which left
+ * the picker placeholder empty after reload while the disconnect button and
+ * testset dropdown (both gated on the node graph) showed normally.
+ */
+export const selectedAppLabelAtom = atom<string | null>((get) => {
+    const nodes = get(playgroundNodesAtom)
+    const hasDownstream = nodes.some((n) => n.depth > 0)
+    if (!hasDownstream) return null
+    const primary = nodes.find((n) => n.depth === 0)
+    return primary?.label ?? null
+})
 
 // ============================================================================
 // CONNECT APP (on app select)
@@ -143,11 +209,14 @@ export const connectAppToEvaluatorAtom = atom(
     ) => {
         const {appRevisionId, appLabel, evaluatorRevisionId, evaluatorLabel} = params
 
-        // Track selected app label for display + persist across sessions
-        set(selectedAppLabelAtom, appLabel)
-        set(persistedAppSelectionAtom, {appRevisionId, appLabel})
-
-        // Replace primary node with app
+        // Replace primary node with the app FIRST — if the graph mutation
+        // bails out (changePrimaryNode returns null when there's no current
+        // primary to swap), we must not commit a stale persisted record.
+        // Previously the persist ran before this call, which could leave
+        // an `{appRevisionId, appLabel}` entry in localStorage referring to
+        // a connection that never actually formed; the next mount would
+        // re-hydrate from that record and the picker would show "connected"
+        // for an app the playground never linked.
         const nodeId = set(playgroundController.actions.changePrimaryNode, {
             type: "workflow",
             id: appRevisionId,
@@ -165,5 +234,84 @@ export const connectAppToEvaluatorAtom = atom(
                 label: evaluatorLabel,
             },
         })
+
+        // Clean the shared testcase row against the newly-selected app's input
+        // contract so stale keys from a previously-selected app (e.g. chat
+        // `messages`/`context` after swapping a chat app for a completion app)
+        // are dropped immediately — not only at run time (#4525 / AGE-3793).
+        // Runs AFTER connectDownstreamNode so the evaluator is in the graph and
+        // its referenced columns (correct_answer_key → ground_truth, etc.) are
+        // protected from the strict app-contract clean.
+        set(playgroundController.actions.reconcileRowsToPrimary)
+
+        // Persist only after both graph mutations succeeded. The picker
+        // display label is derived from the depth-0 node's `label` via
+        // `selectedAppLabelAtom`, so no extra write needed here.
+        set(persistedAppSelectionAtom, {appRevisionId, appLabel})
+
+        // Pin the stored run-on mode to "app" too. While connected,
+        // `effectiveRunOnModeAtom` overrides to "app" regardless, but the
+        // stored mode is what we fall back to on disconnect — without this a
+        // user who connected an app from "data" mode would snap back to the
+        // testcase panel on disconnect instead of the "Select an app" state.
+        set(runOnModeAtom, "app")
+
+        // Force the node-derived display atoms to re-settle after the two
+        // sequential `playgroundNodesAtom` writes above (changePrimaryNode →
+        // connectDownstreamNode). On a disconnect→reconnect cycle jotai applies
+        // the writes (the value is correct) but does NOT notify the mounted
+        // dependents — `selectedAppLabelAtom` / `hasAppConnectedAtom` and the
+        // package's generation-panel atoms stay stale, so the UI keeps showing
+        // the "Select an app" empty state even though an app is connected
+        // (QA 2026-06-05 — re-selecting the same app after disconnect). Reading
+        // the derived atoms here re-establishes the dependency and flushes the
+        // pending notification to their subscribers.
+        get(selectedAppLabelAtom)
+        get(hasAppConnectedAtom)
     },
 )
+
+// ============================================================================
+// DISCONNECT APP (reverse the connect)
+// ============================================================================
+
+/**
+ * Disconnect the upstream app and return to standalone evaluator mode.
+ *
+ * Reverse of `connectAppToEvaluatorAtom`:
+ * 1. Capture the downstream evaluator's identity (we need it after removal).
+ *    With no downstream node we only clear the persisted selection and return.
+ * 2. Remove the downstream evaluator node (`removeNodeAtom` keeps primary if
+ *    target is depth > 0).
+ * 3. Swap the primary node back to the evaluator. `changePrimaryNodeAtom`
+ *    clears `outputConnectionsAtom` for us as a side-effect.
+ * 4. Clear the persisted app selection. The display label is derived from the
+ *    node graph, so the picker placeholder reverts to "Select app" by itself.
+ */
+export const disconnectAppFromEvaluatorAtom = atom(null, (get, set) => {
+    const nodes = get(playgroundController.selectors.nodes())
+    const downstreamEvaluator = nodes.find((n) => n.depth > 0)
+    if (!downstreamEvaluator) {
+        // No downstream node means the graph is already in the
+        // standalone-evaluator shape, but a stale `persistedAppSelectionAtom`
+        // entry could still be on disk (e.g., from a previous session where
+        // `connectAppToEvaluatorAtom` persisted before its swap silently
+        // failed mid-mutation). Clear it on this path too so the next mount
+        // doesn't re-hydrate a phantom "connected" app.
+        set(persistedAppSelectionAtom, null)
+        return
+    }
+
+    const evaluatorEntity = {
+        type: downstreamEvaluator.entityType,
+        id: downstreamEvaluator.entityId,
+        label: downstreamEvaluator.label ?? "Evaluator",
+    }
+
+    set(playgroundController.actions.removeNode, downstreamEvaluator.id)
+    set(playgroundController.actions.changePrimaryNode, evaluatorEntity)
+    // `selectedAppLabelAtom` is derived from the node graph — clearing the
+    // downstream above is what flips it back to `null`. Only the persisted
+    // localStorage cache needs an explicit clear.
+    set(persistedAppSelectionAtom, null)
+})
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
index 35ff909bbd..f4db283381 100644
--- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx
@@ -14,15 +14,8 @@ import {useCallback, useEffect, useMemo} from "react"
 
 import {loadableController} from "@agenta/entities/loadable"
 import {testcaseMolecule} from "@agenta/entities/testcase"
-import {EntityPicker} from "@agenta/entity-ui"
-import {
-    createWorkflowRevisionAdapter,
-    type WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
-import {playgroundController} from "@agenta/playground"
 import {type PlaygroundUIProviders} from "@agenta/playground-ui"
 import {preloadEditorPlugins, SyncStateTag} from "@agenta/ui"
-import {Typography} from "antd"
 import {useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 
@@ -32,13 +25,10 @@ import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell
 import SharedGenerationResultUtils from "@/oss/components/SharedGenerationResultUtils"
 import {playgroundSyncAtom} from "@/oss/state/url/playground"
 
-import {
-    connectAppToEvaluatorAtom,
-    evaluatorConfigEntityIdsAtom,
-    hasAppConnectedAtom,
-    selectedAppLabelAtom,
-} from "./atoms"
+import {evaluatorConfigEntityIdsAtom} from "./atoms"
 import EvaluatorPlaygroundHeader from "./EvaluatorPlaygroundHeader"
+import SelectAppEmptyState from "./SelectAppEmptyState"
+import {useEvaluatorRunControls} from "./useEvaluatorRunControls"
 
 const PlaygroundMainView = dynamic(
     () => import("@/oss/components/Playground/Components/MainLayout"),
@@ -77,63 +67,24 @@ const ConfigureEvaluatorPageInner = () => {
     useAtomValue(playgroundSyncAtom)
 
     const configEntityIds = useAtomValue(evaluatorConfigEntityIdsAtom)
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
-
-    // Read the current evaluator entity from playground nodes
-    // Phase 1: evaluator is at depth 0 (primary)
-    // Phase 2: evaluator is at depth 1 (downstream)
-    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
+
+    // Shared run controls (app adapter, app-select, run-on mode, run gate) — the
+    // same hook the header and the creation drawer use, so all surfaces agree.
+    const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} =
+        useEvaluatorRunControls()
 
     // Preload editor plugins
     useEffect(() => {
         void preloadEditorPlugins()
     }, [])
 
-    // App workflow picker (shared between header and empty state)
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     const runDisabledContent = useMemo(
         () => (
-            <>
-                <Typography.Text type="secondary" className="text-sm">
-                    Select an app to run the evaluator chain
-                </Typography.Text>
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="middle"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-            </>
+            <SelectAppEmptyState
+                adapter={appWorkflowAdapter}
+                onSelect={handleAppSelect}
+                selectedAppLabel={selectedAppLabel}
+            />
         ),
         [appWorkflowAdapter, handleAppSelect, selectedAppLabel],
     )
@@ -151,15 +102,16 @@ const ConfigureEvaluatorPageInner = () => {
 
     return (
         <OSSPlaygroundShell providers={providers}>
-            <div className="flex flex-col w-full h-full overflow-hidden">
-                <EvaluatorPlaygroundHeader
-                    appWorkflowAdapter={appWorkflowAdapter}
-                    onAppSelect={handleAppSelect}
-                />
+            {/* Definite height (viewport minus the app topbar) so the run panel's
+             * `h-full` centering resolves — same pattern as the app playground
+             * (`Playground.tsx`). With a plain `h-full` here the chain collapses
+             * to content height and the empty state sticks to the top. */}
+            <div className="flex flex-col w-full h-[calc(100dvh-75px)] overflow-hidden">
+                <EvaluatorPlaygroundHeader />
                 <PlaygroundMainView
                     mode="evaluator"
                     configEntityIdsOverride={configEntityIds}
-                    runDisabled={!hasAppConnected}
+                    runDisabled={runDisabled}
                     runDisabledContent={runDisabledContent}
                 />
             </div>
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
new file mode 100644
index 0000000000..c75dba5a98
--- /dev/null
+++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts
@@ -0,0 +1,107 @@
+/**
+ * useEvaluatorRunControls
+ *
+ * Single source of truth for the evaluator playground's run controls, shared by
+ * the full-page playground, the evaluator-creation drawer, and the workflow
+ * revision drawer. Before this hook, the app adapter, app-select handler,
+ * evaluator-node lookup, and run-on wiring were copy-pasted across every
+ * surface — which is exactly how the drawers drifted out of sync with the page
+ * (they kept forcing an app even in test-case mode). Centralizing it here means
+ * every surface behaves identically by construction.
+ */
+
+import {useCallback, useMemo} from "react"
+
+import {
+    createWorkflowRevisionAdapter,
+    type WorkflowRevisionSelectionResult,
+} from "@agenta/entity-ui/selection"
+import {playgroundController} from "@agenta/playground"
+import {useAtomValue, useSetAtom} from "jotai"
+
+import {
+    connectAppToEvaluatorAtom,
+    disconnectAppFromEvaluatorAtom,
+    effectiveRunOnModeAtom,
+    hasAppConnectedAtom,
+    runOnModeAtom,
+    selectedAppLabelAtom,
+    type RunOnMode,
+} from "./atoms"
+
+export function useEvaluatorRunControls() {
+    // Evaluator node — the first node with depth > 0 (phase 2: evaluator is
+    // downstream of a connected app), else the first node (phase 1), else null.
+    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
+    const evaluatorNode = useMemo(() => {
+        const downstream = nodes.find((n) => n.depth > 0)
+        if (downstream) return downstream
+        return nodes[0] ?? null
+    }, [nodes])
+
+    // App picker — picks an upstream *app* workflow to attach to the evaluator.
+    // `parentLabel: "Application"` keeps the search bar saying "Search app…"
+    // rather than the adapter's historical "Search evaluator…" default.
+    const appWorkflowAdapter = useMemo(
+        () =>
+            createWorkflowRevisionAdapter({
+                skipVariantLevel: true,
+                excludeRevisionZero: true,
+                flags: {is_evaluator: false, is_feedback: false},
+                parentLabel: "Application",
+            }),
+        [],
+    )
+
+    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
+    const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom)
+
+    const handleAppSelect = useCallback(
+        (selection: WorkflowRevisionSelectionResult) => {
+            if (!evaluatorNode) return // no evaluator node yet — nothing to connect to
+            connectApp({
+                appRevisionId: selection.id,
+                appLabel: selection.label,
+                evaluatorRevisionId: evaluatorNode.entityId,
+                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
+            })
+        },
+        [connectApp, evaluatorNode],
+    )
+
+    // Run-on mode. A connected app forces effective "app" mode (the node graph
+    // is the source of truth); the stored preference only applies when nothing
+    // is connected.
+    const runOnMode = useAtomValue(effectiveRunOnModeAtom)
+    const setRunOnMode = useSetAtom(runOnModeAtom)
+    const handlePickRunOn = useCallback(
+        (next: RunOnMode) => {
+            if (next === "trace") return // disabled, not selectable
+            // Leaving "app" mode drops the connected app so the graph returns to
+            // standalone-evaluator shape.
+            if (next === "data") disconnectApp()
+            setRunOnMode(next)
+        },
+        [disconnectApp, setRunOnMode],
+    )
+
+    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
+    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+
+    // In "app" mode with no app connected yet, the evaluator can't run — the run
+    // panel surfaces the app selector instead of the testcase rows. In test-case
+    // mode the evaluator runs standalone, so it's never blocked on an app. Only
+    // takes effect where the run panel renders (the page and expanded drawers).
+    const runDisabled = runOnMode === "app" && !hasAppConnected
+
+    return {
+        appWorkflowAdapter,
+        handleAppSelect,
+        disconnectApp,
+        runOnMode,
+        handlePickRunOn,
+        hasAppConnected,
+        selectedAppLabel,
+        runDisabled,
+    }
+}
diff --git a/web/oss/src/components/Evaluators/index.tsx b/web/oss/src/components/Evaluators/index.tsx
index ed318a1469..00e8737b30 100644
--- a/web/oss/src/components/Evaluators/index.tsx
+++ b/web/oss/src/components/Evaluators/index.tsx
@@ -3,7 +3,6 @@ import {memo, useCallback, useEffect, useMemo, useState} from "react"
 import {
     createEvaluatorFromTemplate,
     type EvaluatorCatalogTemplate,
-    hasFullPagePlaygroundUX,
     invalidateEvaluatorsListCache,
     workflowMolecule,
 } from "@agenta/entities/workflow"
@@ -260,22 +259,18 @@ const EvaluatorsRegistry = ({scope = "project", mode = "active"}: EvaluatorsRegi
                 return
             }
 
-            // Only prompt/code-authored evaluators open in the full-page
-            // playground. Declarative classifiers (match, contains, regex,
-            // json_multi_field_match, …) fall back to the drawer-edit flow —
-            // their config is a handful of form fields and the playground
-            // page would surface misleading envelope variable inputs.
+            // All non-archived automatic evaluators open in the full-page
+            // playground. Earlier this was gated on classifier type
+            // (`hasFullPagePlaygroundUX`) so declarative classifiers stayed in
+            // the drawer-edit flow, but in practice that meant whole evaluator
+            // types had no UI path into the per-evaluator pages (variants,
+            // traces). Drawer stays available as a secondary affordance via
+            // the row context menu's Configure action.
             //
             // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is
-            // off, every row click resolves to the drawer regardless of the
-            // evaluator's classifier (the new flow stays code-complete but
-            // hidden until follow-up fixes land).
-            const entity = record.revisionId ? workflowMolecule.get.data(record.revisionId) : null
+            // off, every row click resolves to the drawer.
             const shouldNavigateToFullPage = Boolean(
-                EVALUATOR_FULL_PAGE_NAV_ENABLED &&
-                record.workflowId &&
-                entity &&
-                hasFullPagePlaygroundUX(entity as Parameters<typeof hasFullPagePlaygroundUX>[0]),
+                EVALUATOR_FULL_PAGE_NAV_ENABLED && record.workflowId,
             )
 
             const navigated =
diff --git a/web/oss/src/components/Filters/Filters.tsx b/web/oss/src/components/Filters/Filters.tsx
index b1dd2d8385..42a53d736b 100644
--- a/web/oss/src/components/Filters/Filters.tsx
+++ b/web/oss/src/components/Filters/Filters.tsx
@@ -283,6 +283,7 @@ const Filters: React.FC<Props> = ({
     onApplyFilter,
     onClearFilter,
     buttonProps,
+    reconcileFilterRows,
 }) => {
     const evaluatorPreviews = useAtomValue(evaluatorsListDataAtom)
 
@@ -358,6 +359,37 @@ const Filters: React.FC<Props> = ({
                             : item.value == null
                               ? []
                               : [item.value]
+
+                        // Prefer a candidate whose `referenceCategory` matches
+                        // the entry's `"attributes.key"`. This disambiguates
+                        // the `references` family — application.id /
+                        // evaluator.id / environment.id all share
+                        // `baseField: "references"` and
+                        // `referenceProperty: "id"`, so without this check the
+                        // first match (application.id) always wins, mislabelling
+                        // an evaluator-scoped filter as "Application ID".
+                        const attributesKey = (() => {
+                            for (const entry of valuesArray) {
+                                if (entry && typeof entry === "object") {
+                                    const ak = (entry as Record<string, unknown>)["attributes.key"]
+                                    if (typeof ak === "string") return ak
+                                }
+                            }
+                            return undefined
+                        })()
+                        if (attributesKey) {
+                            for (const candidate of matches) {
+                                if (candidate.referenceCategory !== attributesKey) continue
+                                if (!candidate.referenceProperty) continue
+                                const refProp = candidate.referenceProperty
+                                const hasMatch = valuesArray.some(
+                                    (entry) =>
+                                        entry && typeof entry === "object" && refProp in entry,
+                                )
+                                if (hasMatch) return candidate
+                            }
+                        }
+
                         for (const candidate of matches) {
                             if (!candidate.referenceProperty) continue
                             const refProp = candidate.referenceProperty
@@ -511,6 +543,22 @@ const Filters: React.FC<Props> = ({
     const [isFilterOpen, setIsFilterOpen] = useState(false)
     const [keySearchTerms, setKeySearchTerms] = useState<Record<number, string>>({})
 
+    /**
+     * Display-only projection of `filter`. The reconciler is opt-in (passed by
+     * the parent) and may rewrite *cosmetic* row fields like `selectedField` /
+     * `selectedLabel` so the UI reflects an in-flight choice (e.g.,
+     * observability flipping the references row's label between "Application
+     * ID" / "Evaluator ID" as the user picks a trace_type, before Apply).
+     *
+     * Mutations still call `setFilter(filter)` by index, so the reconciler is
+     * required to preserve array length and per-index order — that contract
+     * is documented on the prop.
+     */
+    const displayedFilter = useMemo(
+        () => (reconcileFilterRows ? reconcileFilterRows(filter) : filter),
+        [filter, reconcileFilterRows],
+    )
+
     const sanitizedFilters = useMemo(() => {
         return sanitizeFilterItems(
             filter.filter(({field, operator, isPermanent, isCustomField}) => {
@@ -816,7 +864,7 @@ const Filters: React.FC<Props> = ({
                     </div>
 
                     <div className={filterContainerClass}>
-                        {filter.map((item, idx) => {
+                        {displayedFilter.map((item, idx) => {
                             const uiKey = item.selectedField || item.field || ""
                             const baseFieldCfg = getField(uiKey)
                             const field = effectiveFieldForRow(baseFieldCfg, item)
diff --git a/web/oss/src/components/Filters/types.d.ts b/web/oss/src/components/Filters/types.d.ts
index b25512a170..03f00d6a63 100644
--- a/web/oss/src/components/Filters/types.d.ts
+++ b/web/oss/src/components/Filters/types.d.ts
@@ -8,6 +8,20 @@ export interface Props {
     onApplyFilter: (filters: Filter[]) => void
     onClearFilter: (filters: Filter[]) => void
     buttonProps?: ButtonProps
+    /**
+     * Optional callback deriving a *display-only* view of the local filter
+     * rows. Filters.tsx memoizes it on `[filter, reconcileFilterRows]`, so pass
+     * a stable (memoized) function. The dialog renders from the returned
+     * array, but mutations still target the underlying `filter` state by
+     * index, so the reconciler MUST preserve array length and per-index order.
+     *
+     * Used by observability to keep the permanent references row's label
+     * ("Application ID" vs "Evaluator ID") in sync with the dialog's local
+     * `trace_type` selection *before* the user clicks Apply — without the
+     * reconciler, the label only refreshes after Apply when the atom
+     * re-derives the permanent row.
+     */
+    reconcileFilterRows?: (rows: FilterItem[]) => FilterItem[]
 }
 
 export type CustomValueType = "string" | "number" | "boolean"
diff --git a/web/oss/src/components/Layout/FooterIsland.tsx b/web/oss/src/components/Layout/FooterIsland.tsx
deleted file mode 100644
index b5b335575d..0000000000
--- a/web/oss/src/components/Layout/FooterIsland.tsx
+++ /dev/null
@@ -1,30 +0,0 @@
-import {memo} from "react"
-
-import {GithubFilled, LinkedinFilled, TwitterOutlined} from "@ant-design/icons"
-import {Layout, Space, Typography} from "antd"
-import Link from "next/link"
-
-const {Footer} = Layout
-
-interface FooterIslandProps {
-    className?: string
-}
-
-export const FooterIsland = memo(function FooterIsland({className}: FooterIslandProps) {
-    return (
-        <Footer className={className}>
-            <Space size={10}>
-                <Link href="https://github.com/Agenta-AI/agenta" target="_blank">
-                    <GithubFilled />
-                </Link>
-                <Link href="https://www.linkedin.com/company/agenta-ai/" target="_blank">
-                    <LinkedinFilled />
-                </Link>
-                <Link href="https://twitter.com/agenta_ai" target="_blank">
-                    <TwitterOutlined />
-                </Link>
-            </Space>
-            <Typography.Text>Copyright © {new Date().getFullYear()} | Agenta.</Typography.Text>
-        </Footer>
-    )
-})
diff --git a/web/oss/src/components/Layout/Layout.tsx b/web/oss/src/components/Layout/Layout.tsx
index 8e73230639..eb415201f3 100644
--- a/web/oss/src/components/Layout/Layout.tsx
+++ b/web/oss/src/components/Layout/Layout.tsx
@@ -1,16 +1,12 @@
-import {memo, useCallback, useEffect, useRef, useState, type ReactNode, type RefObject} from "react"
+import {memo, useCallback, useEffect, useRef, useState, type ReactNode} from "react"
 
-import {GithubFilled, LinkedinFilled, TwitterOutlined} from "@ant-design/icons"
-import {ConfigProvider, Layout, Modal, Space, theme} from "antd"
+import {ConfigProvider, Layout, Modal, theme} from "antd"
 import clsx from "clsx"
 import {atom} from "jotai"
 import {useAtom, useAtomValue, useSetAtom, useStore} from "jotai"
 import {selectAtom} from "jotai/utils"
-import dynamic from "next/dynamic"
-import Link from "next/link"
 import {useRouter} from "next/router"
 import {ErrorBoundary} from "react-error-boundary"
-import {useResizeObserver} from "usehooks-ts"
 
 import useURL from "@/oss/hooks/useURL"
 import {currentAppAtom} from "@/oss/state/app"
@@ -146,11 +142,6 @@ const useCommittedLayoutFlags = (): LayoutRouteFlags => {
     return committedFlags
 }
 
-const FooterIsland = dynamic(() => import("./FooterIsland").then((m) => m.FooterIsland), {
-    ssr: false,
-    loading: () => null,
-})
-
 type StyleClasses = ReturnType<typeof useStyles>
 
 const {Content} = Layout
@@ -169,7 +160,6 @@ const AppWithVariants = memo(
         isEvaluator,
         isFullHeight,
         appTheme,
-        footerHeight,
     }: {
         children: ReactNode
         isAppRoute: boolean
@@ -179,7 +169,6 @@ const AppWithVariants = memo(
         classes: StyleClasses
         appTheme: string
         isPlayground?: boolean
-        footerHeight?: number
     }) => {
         const {baseAppURL} = useURL()
         const appState = useAppState()
@@ -361,24 +350,6 @@ const AppWithVariants = memo(
                                 </Content>
                             )}
                         </div>
-                        <div className="w-full h-[30px]"></div>
-                        <FooterIsland className={classes.footer}>
-                            <Space className={classes.footerLeft} size={10}>
-                                <Link href={"https://github.com/Agenta-AI/agenta"} target="_blank">
-                                    <GithubFilled className={classes.footerLinkIcon} />
-                                </Link>
-                                <Link
-                                    href={"https://www.linkedin.com/company/agenta-ai/"}
-                                    target="_blank"
-                                >
-                                    <LinkedinFilled className={classes.footerLinkIcon} />
-                                </Link>
-                                <Link href={"https://twitter.com/agenta_ai"} target="_blank">
-                                    <TwitterOutlined className={classes.footerLinkIcon} />
-                                </Link>
-                            </Space>
-                            <div>Copyright © {new Date().getFullYear()} | Agenta.</div>
-                        </FooterIsland>
                     </Layout>
                 </Layout>
             </div>
@@ -388,12 +359,7 @@ const AppWithVariants = memo(
 
 const App: React.FC<LayoutProps> = ({children}) => {
     const {appTheme} = useAppTheme()
-    const ref = useRef<HTMLElement | null>(null)
-    const {height: footerHeight} = useResizeObserver({
-        ref: ref as RefObject<HTMLElement>,
-        box: "border-box",
-    })
-    const classes = useStyles({themeMode: appTheme, footerHeight} as StyleProps)
+    const classes = useStyles({themeMode: appTheme})
     const {isHumanEval, isPlayground, isAppRoute, isAuthRoute, isEvaluator, isFullHeight} =
         useCommittedLayoutFlags()
 
@@ -419,7 +385,6 @@ const App: React.FC<LayoutProps> = ({children}) => {
                         isHumanEval={isHumanEval}
                         isEvaluator={isEvaluator}
                         isFullHeight={isFullHeight}
-                        footerHeight={footerHeight}
                     >
                         {children}
                         {contextHolder}
diff --git a/web/oss/src/components/Layout/assets/styles.ts b/web/oss/src/components/Layout/assets/styles.ts
index cc47034abc..49a9112c52 100644
--- a/web/oss/src/components/Layout/assets/styles.ts
+++ b/web/oss/src/components/Layout/assets/styles.ts
@@ -2,9 +2,7 @@ import {createUseStyles} from "react-jss"
 
 import type {JSSTheme, StyleProps as MainStyleProps} from "@/oss/lib/Types"
 
-export interface StyleProps extends MainStyleProps {
-    footerHeight: number
-}
+export type StyleProps = MainStyleProps
 
 export const useStyles = createUseStyles((theme: JSSTheme) => ({
     layout: ({themeMode}: StyleProps) => ({
@@ -14,15 +12,15 @@ export const useStyles = createUseStyles((theme: JSSTheme) => ({
         minHeight: "100vh",
         position: "relative",
     }),
-    content: ({footerHeight}: StyleProps) => ({
-        height: `calc(100% - ${footerHeight ?? 0}px)`,
+    content: {
+        height: "100%",
         paddingTop: "24px",
         paddingLeft: "1.5rem",
         paddingRight: "1.5rem",
-        marginBottom: `calc(2rem + ${footerHeight ?? 0}px)`,
+        marginBottom: "2rem",
         flex: 1,
         gap: 16,
-    }),
+    },
     breadcrumbContainer: {
         display: "flex",
         alignItems: "center",
@@ -31,38 +29,6 @@ export const useStyles = createUseStyles((theme: JSSTheme) => ({
         padding: "8px 1.5rem",
         borderBottom: `1px solid ${theme.colorBorderSecondary}`,
     },
-    footer: {
-        position: "absolute",
-        bottom: 0,
-        left: 0,
-        right: 0,
-        textAlign: "center",
-        padding: "5px 20px",
-        display: "flex",
-        alignItems: "center",
-        justifyContent: "space-between",
-        // antd's Layout.Footer defaults to colorBgLayout (#000 in dark), which
-        // reads as a mismatched black band against the #141414 content. Blend
-        // with whatever's behind it instead, and add a top border to separate
-        // it from the content above.
-        backgroundColor: "transparent",
-        borderTop: `1px solid ${theme.colorBorderSecondary}`,
-        // The social links are anchors that would otherwise inherit antd's
-        // colorLink (blue in dark). Use neutral text color so they read as icons,
-        // not links — matches the prior navy look in light, flips to light in dark.
-        "& a": {
-            color: theme.colorText,
-        },
-        "& a:hover": {
-            color: theme.colorTextSecondary,
-        },
-    },
-    footerLeft: {
-        fontSize: 18,
-    },
-    footerLinkIcon: ({themeMode}: StyleProps) => ({
-        color: themeMode === "dark" ? "#fff" : "#000",
-    }),
     topRightBar: {
         display: "flex",
         alignItems: "center",
diff --git a/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx b/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx
index 6ec0ba479d..315f0f7093 100644
--- a/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx
+++ b/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx
@@ -29,6 +29,9 @@ const DeployVariantButton = ({
 
     const runnableData = useAtomValue(workflowMolecule.selectors.data(revisionId || ""))
     const workflowId = runnableData?.workflow_id || ""
+    // Workflow-level evaluator flag — canonical, unlike the revision-level
+    // `flags.is_evaluator` which is `false` on v0 revisions of evaluators.
+    const isEvaluator = useAtomValue(workflowMolecule.selectors.isEvaluator(workflowId))
     const variants = useAtomValue(workflowVariantsListDataAtomFamily(workflowId))
 
     const {environments, variantName, revision} = useMemo(() => {
@@ -46,6 +49,12 @@ const DeployVariantButton = ({
 
     const handleCloseDeployModal = useCallback(() => setIsDeployModalOpen(false), [])
 
+    // Evaluator workflows aren't deployed to environments — never render a
+    // deploy trigger for them. Central guard so every surface that reuses this
+    // button (registry/overview menus, variant headers, the revision drawer) is
+    // covered without each call site repeating the check.
+    if (isEvaluator) return null
+
     return (
         <>
             {isValidElement(children) ? (
diff --git a/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx b/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx
index 864f2d938d..94e2478278 100644
--- a/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx
+++ b/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx
@@ -72,6 +72,10 @@ const PlaygroundVariantConfigHeader = ({
                 skipVariantLevel: true,
                 excludeRevisionZero: true,
                 flags: {is_evaluator: false, is_feedback: false},
+                // App browse picker — without this the search bar would say
+                // "Search evaluator…" (the adapter's default in skip-variant
+                // mode) while the user is browsing apps.
+                parentLabel: "Application",
             }),
         [],
     )
diff --git a/web/oss/src/components/PlaygroundRouter/index.tsx b/web/oss/src/components/PlaygroundRouter/index.tsx
index bd983f1461..4abb241beb 100644
--- a/web/oss/src/components/PlaygroundRouter/index.tsx
+++ b/web/oss/src/components/PlaygroundRouter/index.tsx
@@ -1,24 +1,13 @@
-import {memo, useEffect, useMemo, useRef} from "react"
+import {memo} from "react"
 
-import {
-    hasFullPagePlaygroundUX,
-    workflowLatestRevisionIdAtomFamily,
-    workflowMolecule,
-} from "@agenta/entities/workflow"
 import {bgColors} from "@agenta/ui"
 import {DownOutlined} from "@ant-design/icons"
 import {Flask, Plus} from "@phosphor-icons/react"
 import {Button, Space, Typography} from "antd"
 import {useAtomValue} from "jotai"
 import dynamic from "next/dynamic"
-import {useRouter} from "next/router"
 
-import {appIdentifiersAtom} from "@/oss/state/appState"
-import {
-    currentWorkflowAtom,
-    currentWorkflowContextAtom,
-    EVALUATOR_FULL_PAGE_NAV_ENABLED,
-} from "@/oss/state/workflow"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 const PlaygroundLoadingShell = () => {
     return (
@@ -60,92 +49,40 @@ const Playground = dynamic(() => import("../Playground/Playground"), {
     loading: PlaygroundLoadingShell,
 })
 
-/**
- * Stale-URL guard for evaluator playgrounds. Most evaluators (classifiers,
- * matchers, JSON validators, …) have no meaningful full-page playground UX —
- * just a handful of form fields the drawer already renders. When the
- * resolved workflow is one of those evaluators, redirect to the evaluators
- * registry with the revision pre-selected so the drawer opens automatically.
- * Prompt/code-authored evaluators (auto_ai_critique, llm, code) are kept on
- * the playground page.
- *
- * Classification source: the workflow LIST entry has no `data.uri` (data is
- * only populated on revision-detail responses), so we resolve the latest
- * revision via `workflowLatestRevisionIdAtomFamily` and read its seeded
- * entity from the molecule to get the URI. Without this, every evaluator
- * playground briefly looks "unknown" and the guard would mis-redirect
- * prompt-based evaluators like LLM-as-a-judge.
- */
-const useEvaluatorPlaygroundGuard = () => {
-    const ctx = useAtomValue(currentWorkflowContextAtom)
-    const workflow = useAtomValue(currentWorkflowAtom)
-    const {workspaceId, projectId} = useAtomValue(appIdentifiersAtom)
-    const router = useRouter()
-    const redirectedFor = useRef<string | null>(null)
-
-    const workflowId = ctx.workflowId ?? ""
-    const latestRevisionId = useAtomValue(
-        useMemo(() => workflowLatestRevisionIdAtomFamily(workflowId), [workflowId]),
-    )
-
-    useEffect(() => {
-        if (ctx.isResolving || ctx.isError || ctx.isNotFound) return
-        if (ctx.workflowKind !== "evaluator") return
-        if (!workflow || !ctx.workflowId) return
-        if (!workspaceId || !projectId) return
-        if (redirectedFor.current === ctx.workflowId) return
-
-        // Resolve the latest revision data — it carries `data.uri` and the
-        // URI-derived flags (`is_llm`, `is_code`) that classifier vs prompt
-        // evaluators differ on. The workflow list entry has neither.
-        const latestRevision = latestRevisionId
-            ? (workflowMolecule.get.data(latestRevisionId) as
-                  | Parameters<typeof hasFullPagePlaygroundUX>[0]
-                  | null)
-            : null
-
-        // Bail until we have a classifiable record. Redirecting on a half-
-        // loaded workflow would bounce prompt-based evaluators (whose URI
-        // hasn't been seeded yet) into the drawer mid-load.
-        const hasUri = Boolean(latestRevision?.data?.uri)
-        const hasTypeFlag = Boolean(
-            latestRevision?.flags?.is_llm ||
-            latestRevision?.flags?.is_code ||
-            workflow.flags?.is_llm ||
-            workflow.flags?.is_code,
-        )
-        if (!hasUri && !hasTypeFlag) return
-
-        // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off,
-        // skip the "stay on /playground" early return so every evaluator URL
-        // (including direct visits / bookmarks) bounces back to /evaluators
-        // and opens the drawer.
-        const classifyTarget = latestRevision ?? workflow
-        if (EVALUATOR_FULL_PAGE_NAV_ENABLED && hasFullPagePlaygroundUX(classifyTarget)) return
-
-        const base = `/w/${encodeURIComponent(workspaceId)}/p/${encodeURIComponent(projectId)}`
-        const target = latestRevisionId
-            ? `${base}/evaluators?revisionId=${encodeURIComponent(latestRevisionId)}`
-            : `${base}/evaluators`
-
-        redirectedFor.current = ctx.workflowId
-        router.replace(target)
-    }, [
-        ctx.isResolving,
-        ctx.isError,
-        ctx.isNotFound,
-        ctx.workflowKind,
-        ctx.workflowId,
-        workflow,
-        latestRevisionId,
-        workspaceId,
-        projectId,
-        router,
-    ])
-}
+// When the current workflow is an evaluator we render the evaluator-flavored
+// page (with `EvaluatorPlaygroundHeader` + `connectAppToEvaluatorAtom`) instead
+// of the generic app `<Playground />`. Same code path that powers
+// `/evaluators/playground` today — `playgroundSyncAtom` matches `/playground`
+// anywhere in the pathname so hydration works at both URLs unchanged.
+const ConfigureEvaluatorPage = dynamic(
+    () => import("@/oss/components/Evaluators/components/ConfigureEvaluator"),
+    {ssr: false, loading: PlaygroundLoadingShell},
+)
 
 const PlaygroundRouter = () => {
-    useEvaluatorPlaygroundGuard()
+    const ctx = useAtomValue(currentWorkflowContextAtom)
+
+    // Evaluators get the evaluator-flavored page so the upstream-app picker
+    // is visible (the generic header only exposes the reverse direction —
+    // app-needs-evaluator — not evaluator-needs-app). All evaluator kinds
+    // (LLM/code, declarative classifiers, custom hooks, …) land here on
+    // direct URL visits + sidebar switcher clicks; simple classifiers get the
+    // same few form fields the drawer would render.
+    //
+    // Exception: `is_feedback` evaluators (human-annotation workflows) are
+    // intentionally drawer-only in /evaluators — they capture human input
+    // rather than run — so they fall through to the generic `<Playground />`.
+    //
+    // NOTE(review): confirm two assumptions behind that exception.
+    // (1) WorkflowEntityCard's comment says workflow LIST records carry no
+    //     `is_feedback` flag (it lives on the revision) — verify `ctx.workflow`
+    //     actually has it populated, or this check is never true.
+    // (2) This change deletes `useEvaluatorPlaygroundGuard`, the in-file
+    //     redirect to /evaluators — verify something still redirects them.
+    const isFeedbackEvaluator = ctx.workflow?.flags?.is_feedback === true
+    if (ctx.workflowKind === "evaluator" && !isFeedbackEvaluator) {
+        return <ConfigureEvaluatorPage />
+    }
     return <Playground />
 }
 
diff --git a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
index 734ddbd2b4..fd265ce7d1 100644
--- a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
+++ b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx
@@ -1,7 +1,7 @@
 import {memo, useCallback, useMemo, useState} from "react"
 
 import {
-    fullPagePlaygroundEvaluatorsAtom,
+    nonHumanEvaluatorsAtom,
     nonArchivedAppWorkflowsAtom,
     nonArchivedEvaluatorsAtom,
     parseWorkflowKeyFromUri,
@@ -116,25 +116,25 @@ const SWITCHER_MENU_CLASS = clsx(
 const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => {
     const ctx = useAtomValue(currentWorkflowContextAtom)
     const apps = useAtomValue(nonArchivedAppWorkflowsAtom) as readonly Workflow[]
-    // Full set of evaluators — used for resolving the *active* workflow (the
-    // user may be inside a drawer-only evaluator currently). The switcher
-    // dropdown below uses `fullPagePlaygroundEvaluators` instead so it only
-    // lists evaluators whose destination is /apps/[id]/playground — clicking
-    // a declarative classifier or human evaluator from the sidebar would
-    // route through the route guard and bounce back to /evaluators, which is
-    // confusing.
     const evaluators = useAtomValue(nonArchivedEvaluatorsAtom) as readonly Workflow[]
+    // The switcher lists every AUTOMATIC evaluator — LLM, code, AND the
+    // declarative classifiers (exact match, regex, similarity / semantic
+    // similarity, json diff, contains json, …). `nonHumanEvaluatorsAtom`
+    // resolves `is_feedback` from each evaluator's LATEST REVISION — the
+    // workflow LIST records this card reads from `nonArchivedEvaluatorsAtom`
+    // carry NO `is_feedback`/`is_llm`/`is_code` flags (those live on the
+    // revision, not the parent artifact), which is why the old
+    // `!w.flags?.is_feedback` filter never excluded anything and human
+    // evaluators leaked in (QA 2026-06-05). It drops ONLY human (`is_feedback`)
+    // evaluators; navigation lands on the workflow's current sub-page (Overview/
+    // Evaluations are valid for every evaluator), so matchers no longer dead-end.
+    const automaticEvaluators = useAtomValue(nonHumanEvaluatorsAtom) as readonly Workflow[]
     // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off, the
-    // switcher dropdown hides the "Evaluators" group entirely. Clicking an
-    // entry would route to `/apps/<evaluatorId>/playground`, which the
-    // (also-gated) `PlaygroundRouter` guard would immediately bounce back to
-    // `/evaluators` — exposing the entry would just produce a flicker.
-    const fullPagePlaygroundEvaluatorsRaw = useAtomValue(
-        fullPagePlaygroundEvaluatorsAtom,
-    ) as readonly Workflow[]
-    const fullPagePlaygroundEvaluators: readonly Workflow[] = EVALUATOR_FULL_PAGE_NAV_ENABLED
-        ? fullPagePlaygroundEvaluatorsRaw
-        : EMPTY_WORKFLOWS
+    // switcher dropdown hides the "Evaluators" group entirely.
+    const switcherEvaluators: readonly Workflow[] = useMemo(() => {
+        if (!EVALUATOR_FULL_PAGE_NAV_ENABLED) return EMPTY_WORKFLOWS
+        return automaticEvaluators
+    }, [automaticEvaluators])
     const recentAppId = useAtomValue(recentAppIdAtom)
     const recentEvaluatorId = useAtomValue(recentEvaluatorIdAtom)
     const navigateToWorkflow = useSetAtom(routerAppNavigationAtom)
@@ -192,16 +192,16 @@ const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => {
                 children: apps.map((w) => toMenuItem(w, false)),
             })
         }
-        if (fullPagePlaygroundEvaluators.length) {
+        if (switcherEvaluators.length) {
             items.push({
                 key: "evaluators-header",
                 type: "group",
                 label: "Evaluators",
-                children: fullPagePlaygroundEvaluators.map((w) => toMenuItem(w, true)),
+                children: switcherEvaluators.map((w) => toMenuItem(w, true)),
             })
         }
         return items
-    }, [apps, fullPagePlaygroundEvaluators])
+    }, [apps, switcherEvaluators])
 
     const handleSwitcherClick = useCallback<NonNullable<MenuProps["onClick"]>>(
         ({key}) => {
diff --git a/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx b/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx
index 70a915b7ac..eb467d00f8 100644
--- a/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx
+++ b/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx
@@ -19,7 +19,7 @@ import {
     RocketLaunchIcon,
     ListChecksIcon,
 } from "@phosphor-icons/react"
-import {useAtomValue, useSetAtom} from "jotai"
+import {useSetAtom} from "jotai"
 
 import {useCrispChat} from "@/oss/hooks/useCrispChat"
 import {useSession} from "@/oss/hooks/useSession"
@@ -30,7 +30,6 @@ import {openWidgetAtom} from "@/oss/lib/onboarding"
 import {useAppsData} from "@/oss/state/app"
 import {useAppState} from "@/oss/state/appState"
 import {useOrgData} from "@/oss/state/org"
-import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {SidebarConfig} from "../../types"
 
@@ -47,15 +46,6 @@ export const useSidebarConfig = () => {
     const hasAppContext =
         routeLayer === "app" && Boolean(routedAppId || appURL || recentlyVisitedAppURL)
 
-    // Phase 4: when the current workflow is an evaluator, DISABLE (not hide)
-    // the app-section items that don't apply to evaluators (overview,
-    // evaluations). Items stay visible but greyed out so the user understands
-    // they exist — they just aren't applicable for this workflow type.
-    // Endpoints and deployments aren't in the sidebar today, so no extra
-    // gating needed for those.
-    const workflowCtx = useAtomValue(currentWorkflowContextAtom)
-    const isCurrentWorkflowEvaluator = workflowCtx.workflowKind === "evaluator"
-
     const sidebarConfig: SidebarConfig[] = [
         {
             key: "app-management-link",
@@ -123,9 +113,10 @@ export const useSidebarConfig = () => {
             icon: <DesktopIcon size={14} />,
             isHidden: !hasAppContext && !currentApp && !recentlyVisitedAppId,
             isAppSection: true,
-            // Disabled (not hidden) for evaluator workflows so the user still
-            // sees these surfaces exist — just not applicable here.
-            disabled: !hasProjectURL || isCurrentWorkflowEvaluator,
+            // Enabled for evaluators too — Overview surfaces the workflow's
+            // details, variants, and the evaluation runs that evaluated it
+            // (scoped by the workflow id as the `application` reference).
+            disabled: !hasProjectURL,
         },
         {
             key: "app-playground-link",
@@ -153,8 +144,10 @@ export const useSidebarConfig = () => {
             isHidden: !hasAppContext && !currentApp && !recentlyVisitedAppId,
             isAppSection: true,
             icon: <FlaskIcon size={14} />,
-            // Disabled (not hidden) for evaluator workflows.
-            disabled: !hasProjectURL || isCurrentWorkflowEvaluator,
+            // Enabled for evaluators too — shows the evaluation runs that
+            // evaluated this evaluator (scoped by its id as the `application`
+            // reference, same machinery as the app-scoped evaluations page).
+            disabled: !hasProjectURL,
             dataTour: "evaluations-nav",
         },
         {
diff --git a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
index 6726f2423c..2b236c2243 100644
--- a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
+++ b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx
@@ -15,18 +15,12 @@ import {testcaseMolecule} from "@agenta/entities/testcase"
 import {
     registerWorkflowCommitCallbacks,
     getWorkflowCommitCallbacks,
-    hasFullPagePlaygroundUX,
     parseEvaluatorKeyFromUri,
     evaluatorTemplatesMapAtom,
     workflowMolecule,
     discardLocalServerDataAtom,
 } from "@agenta/entities/workflow"
-import {EntityPicker} from "@agenta/entity-ui"
 import {PlaygroundConfigSection} from "@agenta/entity-ui/drill-in"
-import {
-    createWorkflowRevisionAdapter,
-    type WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
 import {VariantDetailsWithStatus, VariantNameCell} from "@agenta/entity-ui/variant"
 import {playgroundController} from "@agenta/playground"
 import {
@@ -53,7 +47,7 @@ import {
 } from "@agenta/playground-ui/workflow-revision-drawer"
 import {EnvironmentTag} from "@agenta/ui"
 import {Rocket} from "@phosphor-icons/react"
-import {Button, Typography, message} from "antd"
+import {Button, message} from "antd"
 import {getDefaultStore, useAtom, useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 import {useRouter} from "next/router"
@@ -64,9 +58,10 @@ import {
     connectAppToEvaluatorAtom,
     persistedAppSelectionAtom,
     persistedTestsetSelectionAtom,
-    selectedAppLabelAtom,
 } from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms"
 import EvaluatorPlaygroundHeader from "@/oss/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader"
+import SelectAppEmptyState from "@/oss/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState"
+import {useEvaluatorRunControls} from "@/oss/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls"
 import {clearEvaluatorWorkflowCache} from "@/oss/components/Evaluators/store/evaluatorsPaginatedStore"
 import {invalidateAppManagementWorkflowQueries} from "@/oss/components/pages/app-management/store"
 import {invalidatePromptsWorkflowQueries} from "@/oss/components/pages/prompts/store"
@@ -200,7 +195,6 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
     const resetAll = useSetAtom(playgroundController.actions.resetAll)
     const clearAllRuns = useSetAtom(clearAllRunsMutationAtom)
     const setInitialized = useSetAtom(playgroundInitializedAtom)
-    const setSelectedAppLabel = useSetAtom(selectedAppLabelAtom)
     const setConnectedTestset = useSetAtom(connectedTestsetAtom)
     const connectApp = useSetAtom(connectAppToEvaluatorAtom)
     const setPersistedTestset = useSetAtom(persistedTestsetSelectionAtom)
@@ -211,10 +205,12 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
 
             const store = getDefaultStore()
 
-            // Restore persisted app selection (survives drawer close/reopen and commits)
+            // Restore persisted app selection (survives drawer close/reopen and commits).
+            // `selectedAppLabelAtom` is derived from the node graph now — the
+            // `connectApp` call below seeds the depth-0 node with the persisted
+            // label, which the derived atom picks up automatically.
             const persisted = store.get(persistedAppSelectionAtom)
             if (persisted) {
-                setSelectedAppLabel(persisted.appLabel)
                 connectApp({
                     appRevisionId: persisted.appRevisionId,
                     appLabel: persisted.appLabel,
@@ -272,7 +268,8 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
 
             resetAll()
             setInitialized(false)
-            setSelectedAppLabel(null)
+            // `selectedAppLabelAtom` is derived from the node graph — `resetAll`
+            // above clears the nodes, which flips the label back to `null`.
             setConnectedTestset(null)
         }
     }, [
@@ -281,7 +278,6 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
         resetAll,
         clearAllRuns,
         setInitialized,
-        setSelectedAppLabel,
         setConnectedTestset,
         connectApp,
     ])
@@ -311,60 +307,28 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
         })
     }, [connectedTestset, setPersistedTestset])
 
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+    // Shared run controls — the same hook the full page and the creation drawer
+    // use, so every evaluator surface gates runs identically (run-on aware) and
+    // can't drift apart again. (This drawer previously hardcoded
+    // `runDisabled={!hasAppConnected}`, which ignored the run-on mode and forced
+    // an app even in test-case mode.)
+    const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} =
+        useEvaluatorRunControls()
 
     const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
-
-    // Derive from nodes directly (single source of truth, no atom indirection)
-    const hasAppConnected = useMemo(() => nodes.some((n) => n.depth > 0), [nodes])
     const configEntityIds = useMemo(() => {
         const downstream = nodes.filter((n) => n.depth > 0)
         if (downstream.length > 0) return downstream.map((n) => n.entityId)
         return nodes.map((n) => n.entityId)
     }, [nodes])
 
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     const runDisabledContent = useMemo(
         () => (
-            <>
-                <Typography.Text type="secondary" className="text-sm">
-                    Select an app to run the evaluator chain
-                </Typography.Text>
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="middle"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-            </>
+            <SelectAppEmptyState
+                adapter={appWorkflowAdapter}
+                onSelect={handleAppSelect}
+                selectedAppLabel={selectedAppLabel}
+            />
         ),
         [appWorkflowAdapter, handleAppSelect, selectedAppLabel],
     )
@@ -382,12 +346,7 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
     return (
         <OSSPlaygroundShell providers={providers}>
             <div className="flex flex-col w-full h-full overflow-hidden">
-                {isExpanded && (
-                    <EvaluatorPlaygroundHeader
-                        appWorkflowAdapter={appWorkflowAdapter}
-                        onAppSelect={handleAppSelect}
-                    />
-                )}
+                {isExpanded && <EvaluatorPlaygroundHeader />}
                 <PlaygroundMainView
                     mode="evaluator"
                     viewMode={isExpanded ? "full" : "configOnly"}
@@ -395,7 +354,7 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => {
                     configViewMode={configViewMode}
                     onConfigViewModeChange={setConfigViewMode}
                     configEntityIdsOverride={configEntityIds}
-                    runDisabled={!hasAppConnected}
+                    runDisabled={runDisabled}
                     runDisabledContent={runDisabledContent}
                 />
             </div>
@@ -492,23 +451,18 @@ const useDrawerCreateCommitCallback = () => {
                     // (`Router.pathname` only flips on `routeChangeComplete`,
                     // so a synchronous close after `router.push` would patch
                     // the still-current `/evaluators` URL and push back to it.)
+                    //
                     // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the
-                    // flag is off, post-create stays in the drawer flow even
-                    // for evaluators whose classifier supports full-page UX.
-                    let eligibleForPlayground = false
-                    if (
-                        EVALUATOR_FULL_PAGE_NAV_ENABLED &&
-                        newAppId &&
-                        newRevisionId &&
-                        newWorkflow
-                    ) {
-                        eligibleForPlayground = hasFullPagePlaygroundUX({
-                            flags: newWorkflow.flags ?? null,
-                            data: newWorkflow.data ?? null,
-                            meta: newWorkflow.meta ?? null,
-                            slug: newWorkflow.slug ?? null,
-                        })
-                    }
+                    // flag is off, post-create stays in the drawer flow. When
+                    // on, every freshly committed evaluator (regardless of
+                    // template type) lands on `/apps/<id>/playground` —
+                    // mirroring app-create's post-commit navigation. The
+                    // earlier classifier-only gate was removed so declarative
+                    // evaluators get the same surface (variants, traces,
+                    // sidebar context) as LLM/code ones.
+                    const eligibleForPlayground = Boolean(
+                        EVALUATOR_FULL_PAGE_NAV_ENABLED && newAppId && newRevisionId,
+                    )
 
                     if (eligibleForPlayground && newAppId && newRevisionId) {
                         const url = `${baseAppURLRef.current}/${encodeURIComponent(
diff --git a/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx b/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx
index 8d4ae5b560..d5cf926818 100644
--- a/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx
+++ b/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx
@@ -1,6 +1,8 @@
+import {PageLayout} from "@agenta/ui"
+import {ArrowLeft} from "@phosphor-icons/react"
+import {Button} from "antd"
 import {useRouter} from "next/router"
 
-import ArchivedEntityLayout from "@/oss/components/ArchivedEntityLayout"
 import useURL from "@/oss/hooks/useURL"
 
 import ApplicationManagementSection from "./components/ApplicationManagementSection"
@@ -9,13 +11,26 @@ export default function ArchivedAppsPage() {
     const router = useRouter()
     const {baseAppURL} = useURL()
 
+    // Mirror the Archived Evaluators header: the back arrow sits inline with the
+    // title (no standalone "Back" button, no subtitle) so both archived pages
+    // share one layout via PageLayout.
+    const title = (
+        <span className="inline-flex items-center gap-2">
+            <Button
+                type="text"
+                size="small"
+                icon={<ArrowLeft size={16} />}
+                onClick={() => router.push(baseAppURL)}
+                className="!px-1"
+                aria-label="Back to apps"
+            />
+            <span>Archived Apps</span>
+        </span>
+    )
+
     return (
-        <ArchivedEntityLayout
-            title="Archived Apps"
-            subtitle="Archived apps are hidden from your workspace but can be restored at any time."
-            onBack={() => router.push(baseAppURL)}
-        >
+        <PageLayout title={title} className="grow min-h-0">
             <ApplicationManagementSection mode="archived" />
-        </ArchivedEntityLayout>
+        </PageLayout>
     )
 }
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
index eaad8e3de3..9bb079e6f7 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/CreateEvaluatorDrawer/index.tsx
@@ -20,12 +20,6 @@ import {
     registerWorkflowCommitCallbacks,
     getWorkflowCommitCallbacks,
 } from "@agenta/entities/workflow"
-import {EntityPicker} from "@agenta/entity-ui"
-import {
-    createWorkflowRevisionAdapter,
-    type WorkflowRevisionSelectionResult,
-} from "@agenta/entity-ui/selection"
-import {playgroundController} from "@agenta/playground"
 import {type PlaygroundUIProviders} from "@agenta/playground-ui"
 import {ArrowsIn, ArrowsOut} from "@phosphor-icons/react"
 import {Button, Typography} from "antd"
@@ -34,12 +28,10 @@ import dynamic from "next/dynamic"
 
 import SimpleSharedEditor from "@/oss/components/EditorViews/SimpleSharedEditor"
 import EnhancedDrawer from "@/oss/components/EnhancedUIs/Drawer"
-import {
-    connectAppToEvaluatorAtom,
-    evaluatorConfigEntityIdsAtom,
-    hasAppConnectedAtom,
-    selectedAppLabelAtom,
-} from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms"
+import {evaluatorConfigEntityIdsAtom} from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms"
+import EvaluatorRunControls from "@/oss/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls"
+import SelectAppEmptyState from "@/oss/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState"
+import {useEvaluatorRunControls} from "@/oss/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls"
 import {clearEvaluatorWorkflowCache} from "@/oss/components/Evaluators/store/evaluatorsPaginatedStore"
 import PlaygroundTestcaseEditor from "@/oss/components/Playground/Components/PlaygroundTestcaseEditor"
 import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell"
@@ -52,11 +44,6 @@ const PlaygroundMainView = dynamic(
     {ssr: false},
 )
 
-const TestsetDropdown = dynamic(
-    () => import("@/oss/components/Playground/Components/TestsetDropdown"),
-    {ssr: false},
-)
-
 interface CreateEvaluatorDrawerProps {
     /** Callback after successful evaluator creation. Called with the new revision ID. */
     onEvaluatorCreated?: (configId?: string) => void
@@ -70,53 +57,11 @@ const DrawerHeader = ({entityId, onClose}: {entityId: string; onClose: () => voi
     )
     const name = entityData?.name?.trim() || entityData?.slug?.trim() || "New Evaluator"
 
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-
-    // Read current evaluator node (same logic as evaluator playground page)
-    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
-
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     return (
         <div className="flex items-center justify-between px-4 py-3 border-0 border-b border-solid border-[var(--ag-rgba-051729-06)]">
             <Typography.Text className="text-base font-semibold">{name}</Typography.Text>
             <div className="flex items-center gap-2">
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="small"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-                {hasAppConnected && <TestsetDropdown />}
+                <EvaluatorRunControls />
                 <Button
                     type="text"
                     size="small"
@@ -141,10 +86,11 @@ const DrawerContent = ({
     onEvaluatorCreated?: (configId?: string) => void
 }) => {
     const isExpanded = useAtomValue(drawerExpandedAtom)
-    const hasAppConnected = useAtomValue(hasAppConnectedAtom)
     const configEntityIds = useAtomValue(evaluatorConfigEntityIdsAtom)
-    const connectApp = useSetAtom(connectAppToEvaluatorAtom)
-    const selectedAppLabel = useAtomValue(selectedAppLabelAtom)
+    // Same shared controls the header uses — the run gate now respects the
+    // run-on mode, so test-case mode runs without forcing an app.
+    const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} =
+        useEvaluatorRunControls()
     const onEvaluatorCreatedRef = useRef(onEvaluatorCreated)
     onEvaluatorCreatedRef.current = onEvaluatorCreated
 
@@ -173,51 +119,13 @@ const DrawerContent = ({
         }
     }, [])
 
-    // Read current evaluator node for app selection
-    const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), []))
-    const evaluatorNode = useMemo(() => {
-        const downstream = nodes.find((n) => n.depth > 0)
-        if (downstream) return downstream
-        return nodes[0] ?? null
-    }, [nodes])
-
-    const appWorkflowAdapter = useMemo(
-        () =>
-            createWorkflowRevisionAdapter({
-                skipVariantLevel: true,
-                excludeRevisionZero: true,
-                flags: {is_evaluator: false, is_feedback: false},
-            }),
-        [],
-    )
-
-    const handleAppSelect = useCallback(
-        (selection: WorkflowRevisionSelectionResult) => {
-            if (!evaluatorNode) return
-            connectApp({
-                appRevisionId: selection.id,
-                appLabel: selection.label,
-                evaluatorRevisionId: evaluatorNode.entityId,
-                evaluatorLabel: evaluatorNode.label ?? "Evaluator",
-            })
-        },
-        [connectApp, evaluatorNode],
-    )
-
     const runDisabledContent = useMemo(
         () => (
-            <>
-                <Typography.Text type="secondary" className="text-sm">
-                    Select an app to run the evaluator chain
-                </Typography.Text>
-                <EntityPicker<WorkflowRevisionSelectionResult>
-                    variant="popover-cascader"
-                    adapter={appWorkflowAdapter}
-                    onSelect={handleAppSelect}
-                    size="middle"
-                    placeholder={selectedAppLabel ?? "Select app"}
-                />
-            </>
+            <SelectAppEmptyState
+                adapter={appWorkflowAdapter}
+                onSelect={handleAppSelect}
+                selectedAppLabel={selectedAppLabel}
+            />
         ),
         [appWorkflowAdapter, handleAppSelect, selectedAppLabel],
     )
@@ -240,7 +148,7 @@ const DrawerContent = ({
                     mode="evaluator"
                     viewMode={isExpanded ? "full" : "configOnly"}
                     configEntityIdsOverride={configEntityIds}
-                    runDisabled={!hasAppConnected}
+                    runDisabled={runDisabled}
                     runDisabledContent={runDisabledContent}
                 />
             </div>
diff --git a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
index 2b4e35f316..25f71b2c2c 100644
--- a/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
+++ b/web/oss/src/components/pages/evaluations/NewEvaluation/Components/NewEvaluationModalInner.tsx
@@ -14,7 +14,6 @@ import {
     evaluatorsListDataAtom,
     evaluatorsListQueryAtom,
     humanEvaluatorsListDataAtom,
-    humanEvaluatorsListQueryAtom,
     invalidateWorkflowsListCache,
     invalidateEvaluatorsListCache,
 } from "@agenta/entities/workflow"
@@ -128,18 +127,27 @@ const NewEvaluationModalInner = ({
             updatedAt: app.updated_at ?? null,
         }))
         if (selectedAppId && !options.some((opt) => opt.value === selectedAppId)) {
-            // Evaluators (and locally-picked workflows) aren't in useAppsData —
-            // fall back to the captured meta so the tag renders a real name.
+            // Evaluators (and locally-picked workflows) aren't in useAppsData.
+            // When the user picked a row we have `selectedWorkflowMeta`; for an
+            // app-scoped EVALUATOR route there's no meta, so resolve the name
+            // (and kind) from the evaluators list — otherwise the Application
+            // panel renders the raw workflow id instead of its name.
+            const evaluatorWorkflow = evaluatorWorkflows.find((e) => e.id === selectedAppId)
+            const isEvaluator = selectedWorkflowMeta?.isEvaluator ?? Boolean(evaluatorWorkflow)
             options.push({
-                label: selectedWorkflowMeta?.label ?? selectedAppId,
+                label:
+                    selectedWorkflowMeta?.label ??
+                    evaluatorWorkflow?.name ??
+                    evaluatorWorkflow?.slug ??
+                    selectedAppId,
                 value: selectedAppId,
-                type: selectedWorkflowMeta?.isEvaluator ? "evaluator" : null,
+                type: isEvaluator ? "evaluator" : null,
                 createdAt: null,
                 updatedAt: null,
             })
         }
         return options
-    }, [availableApps, selectedAppId, selectedWorkflowMeta])
+    }, [availableApps, selectedAppId, selectedWorkflowMeta, evaluatorWorkflows])
     const router = useRouter()
     const {baseAppURL, projectURL} = useURL()
 
@@ -149,9 +157,11 @@ const NewEvaluationModalInner = ({
     const configsData = useAtomValue(evaluatorConfigsListDataAtom)
     const configsQueryState = useAtomValue(evaluatorConfigsQueryStateAtom)
 
-    // Workflow-based evaluator list atoms (replace legacy useEvaluators hook)
+    // Workflow-based evaluator list atoms (replace legacy useEvaluators hook).
+    // The `humanEvaluatorsListDataAtom` subscription already drives the human
+    // evaluators query; we don't separately read its query-state object (doing
+    // so only churned the derived-evaluators memo below), so it's not read here.
     const humanEvaluatorsList = useAtomValue(humanEvaluatorsListDataAtom)
-    const humanEvaluatorsQuery = useAtomValue(humanEvaluatorsListQueryAtom)
     const evaluatorsList = useAtomValue(evaluatorsListDataAtom)
     const evaluatorsQuery = useAtomValue(evaluatorsListQueryAtom)
 
@@ -174,13 +184,17 @@ const NewEvaluationModalInner = ({
                     loadingEvaluatorConfigs: configsQueryState.isPending ?? false,
                 }
             }
+            // Depend on the *values* the body reads, not the query result
+            // objects — `evaluatorsQuery`/`humanEvaluatorsQuery` change identity
+            // on every query tick (and `humanEvaluatorsQuery` isn't read at all),
+            // which recomputed this memo every render and churned the derived
+            // `evaluators`/`evaluatorConfigs` → `appOptions`/Tabs items downstream.
         }, [
             preview,
             evaluationType,
             humanEvaluatorsList,
-            humanEvaluatorsQuery,
             evaluatorsList,
-            evaluatorsQuery,
+            evaluatorsQuery.isPending,
             templatesData,
             configsData,
             templatesQuery.isPending,
diff --git a/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts b/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts
index 243a71097a..15b68bdf68 100644
--- a/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts
+++ b/web/oss/src/components/pages/observability/assets/filters/fieldAdapter.ts
@@ -25,6 +25,13 @@ export interface FieldConfig {
     valueDisplayText?: string
     queryKey?: string
     referenceProperty?: string
+    /**
+     * Category for the `references` family (application / evaluator /
+     * application_variant / environment). Used by `mapFilterData` to
+     * disambiguate which sub-column an incoming filter row maps to when
+     * multiple share `baseField: "references"` and `referenceProperty: "id"`.
+     */
+    referenceCategory?: string
     // reference/application/evaluator transforms
     toExternal?: (normalized: any) => any
     toUI?: (external: any) => any
@@ -86,6 +93,7 @@ const walk = (nodes: FilterMenuNode[], acc: FieldConfig[]) => {
             valueDisplayText: leaf.valueDisplayText,
             queryKey: leaf.queryKey,
             referenceProperty: leaf.referenceProperty,
+            referenceCategory: leaf.referenceCategory,
         }
 
         // references/application/evaluator → keep simple mapper
@@ -112,9 +120,23 @@ const walk = (nodes: FilterMenuNode[], acc: FieldConfig[]) => {
             }
             cfg.toUI = (external: any) => {
                 const arr = Array.isArray(external) ? external : external ? [external] : []
-                return arr.map((e: any) =>
+                // De-dup by extracted value. References can OR-match across
+                // slots in a single condition (e.g.,
+                // `[{id:X, key:eval}, {id:X, key:app}]` for "match this entity
+                // in either slot"). Without de-dup the UI shows the same id
+                // twice. The backend keeps the rich shape via `toExternal`.
+                const mapped = arr.map((e: any) =>
                     e && typeof e === "object" ? (e[leaf.referenceProperty!] ?? "") : e,
                 )
+                const seen = new Set<string>()
+                const out: any[] = []
+                for (const v of mapped) {
+                    const key = typeof v === "string" ? v : JSON.stringify(v)
+                    if (seen.has(key)) continue
+                    seen.add(key)
+                    out.push(v)
+                }
+                return out
             }
         }
 
diff --git a/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx b/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx
index 3b4d9db56a..be55e65fa3 100644
--- a/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx
+++ b/web/oss/src/components/pages/observability/components/ObservabilityHeader/index.tsx
@@ -13,6 +13,8 @@ import Papa from "papaparse"
 
 import EnhancedButton from "@/oss/components/EnhancedUIs/Button"
 import {SortResult} from "@/oss/components/Filters/Sort"
+import type {FilterItem} from "@/oss/components/Filters/types"
+import {fieldConfigByOptionKey} from "@/oss/components/pages/observability/assets/filters/fieldAdapter"
 import AddActionsDropdown from "@/oss/components/SharedActions/AddActionsDropdown"
 import {deleteTraceModalAtom} from "@/oss/components/SharedDrawers/TraceDrawer/components/DeleteTraceModal/store/atom"
 import useLazyEffect from "@/oss/hooks/useLazyEffect"
@@ -25,6 +27,7 @@ import {buildTraceQueryParams} from "@/oss/state/newObservability/atoms/queryHel
 import {createAdaptiveTracePageFetcher} from "@/oss/state/newObservability/etl/adaptiveTracePageFetcher"
 import {createExportWriter, PICKER_CANCELLED} from "@/oss/state/newObservability/etl/exportWriter"
 import {getAgData} from "@/oss/state/newObservability/selectors/tracing"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {createTraceObject, DEFAULT_TRACE_EXPORT_HEADERS} from "../../assets/exportUtils"
 import {buildAttributeKeyTreeOptions} from "../../assets/filters/attributeKeyOptions"
@@ -146,6 +149,92 @@ const ObservabilityHeader = ({
         () => getFilterColumns(attributeKeyOptions),
         [attributeKeyOptions],
     )
+
+    // --- Live label flip for the permanent references row in the dialog -----
+    //
+    // After Apply, the atom regenerates the references row's `attributes.key`
+    // from the effective trace_type (annotation → evaluator, invocation →
+    // application). That's what makes the label switch between "Evaluator ID"
+    // and "Application ID" in the chip outside the dialog. But while the user
+    // is still editing in the dialog, the row sits in local state — changing
+    // the trace_type dropdown there has no visual effect on the references
+    // row's label, which feels broken.
+    //
+    // The reconciler below produces a *display-only* projection of the local
+    // filter rows: if a trace_type row is present, it re-derives the permanent
+    // references row's `selectedField` / `selectedLabel` to match. The
+    // underlying `filter` state is untouched (the reconciler only runs in a
+    // `useMemo` inside the dialog) and the Apply path is unchanged — on
+    // Apply, the atom still strips and re-derives the permanent row, so the
+    // backend value matches the displayed label.
+    //
+    // Skipped for non-evaluator workflows: the references row is always pinned
+    // to `application` there, so flipping the label on trace_type changes
+    // would be misleading.
+    const workflowKind = useAtomValue(currentWorkflowContextAtom).workflowKind
+    const filterFieldMap = useMemo(() => fieldConfigByOptionKey(filterColumns), [filterColumns])
+    const reconcileFilterRows = useCallback(
+        (rows: FilterItem[]): FilterItem[] => {
+            if (workflowKind !== "evaluator") return rows
+
+            const tt = rows.find(
+                (r) => r.selectedField === "trace_type" || r.field === "trace_type",
+            )
+            // Mirror the atom's trace_type intent resolution (controls.ts):
+            // honour `is_not`/`not_in` against the 2-value enum by flipping.
+            const op = tt?.operator
+            const rawValue = Array.isArray(tt?.value) ? tt?.value[0] : tt?.value
+            const isAffirm = op === "is" || op === "in"
+            const isNeg = op === "is_not" || op === "not_in"
+            const normalize = (x: unknown): "annotation" | "invocation" | null =>
+                x === "annotation" ? "annotation" : x === "invocation" ? "invocation" : null
+            const flip = (x: unknown): "annotation" | "invocation" | null =>
+                x === "annotation" ? "invocation" : x === "invocation" ? "annotation" : null
+            let effective: "annotation" | "invocation" | null = null
+            if (tt && isAffirm) effective = normalize(rawValue)
+            else if (tt && isNeg) effective = flip(rawValue)
+
+            // No usable trace_type (row absent, unsupported operator, or a value
+            // outside the two-value enum) means "no opinion" — keep whatever the
+            // row currently shows (the atom's default for this workflow kind).
+            if (!effective) return rows
+
+            const targetCategory = effective === "invocation" ? "application" : "evaluator"
+
+            return rows.map((row) => {
+                if (!row.isPermanent) return row
+                const optionKey = row.selectedField || row.field
+                if (!optionKey) return row
+                const fc = filterFieldMap.get(optionKey)
+                if (!fc?.referenceCategory) return row
+                if (
+                    fc.referenceCategory !== "application" &&
+                    fc.referenceCategory !== "evaluator"
+                ) {
+                    return row
+                }
+                if (fc.referenceCategory === targetCategory) return row
+                // Find the corresponding FieldConfig for the target category
+                // with the same referenceProperty (id / slug).
+                let target: typeof fc | undefined
+                for (const candidate of filterFieldMap.values()) {
+                    if (candidate.referenceCategory !== targetCategory) continue
+                    if (candidate.referenceProperty !== fc.referenceProperty) continue
+                    target = candidate
+                    break
+                }
+                if (!target) return row
+                return {
+                    ...row,
+                    field: target.optionKey,
+                    selectedField: target.optionKey,
+                    selectedLabel: target.label,
+                    baseField: target.baseField,
+                }
+            })
+        },
+        [workflowKind, filterFieldMap],
+    )
     const selectedTraceIds = useMemo(
         () =>
             Array.from(
@@ -579,6 +668,7 @@ const ObservabilityHeader = ({
                             columns={filterColumns}
                             onApplyFilter={onApplyFilter}
                             onClearFilter={onClearFilter}
+                            reconcileFilterRows={reconcileFilterRows}
                         />
 
                         <Sort onSortApply={onSortApply} defaultSortValue="24 hours" />
diff --git a/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx b/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx
index a0f0bed285..925d521679 100644
--- a/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx
+++ b/web/oss/src/components/pages/overview/variants/VariantsOverview.tsx
@@ -3,7 +3,7 @@ import {useCallback, useMemo} from "react"
 import {Rocket} from "@phosphor-icons/react"
 import {Button, Typography} from "antd"
 import clsx from "clsx"
-import {useSetAtom} from "jotai"
+import {useAtomValue, useSetAtom} from "jotai"
 import Link from "next/link"
 
 import {openDeployVariantModalAtom} from "@/oss/components/Playground/Components/Modals/DeployVariantModal/store/deployVariantModalStore"
@@ -13,6 +13,7 @@ import RegistryTable from "@/oss/components/VariantsComponents/Table/RegistryTab
 import {usePlaygroundNavigation} from "@/oss/hooks/usePlaygroundNavigation"
 import {useQuery} from "@/oss/hooks/useQuery"
 import useURL from "@/oss/hooks/useURL"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 const {Title} = Typography
 
@@ -21,6 +22,9 @@ const VariantsOverview = () => {
     const {appURL} = useURL()
     const {goToPlayground} = usePlaygroundNavigation()
     const openDeployVariantModal = useSetAtom(openDeployVariantModalAtom)
+    // Evaluator workflows aren't deployed — hide the row "Deploy" action.
+    const isCurrentWorkflowEvaluator =
+        useAtomValue(currentWorkflowContextAtom).workflowKind === "evaluator"
 
     const handleRowClick = useCallback(
         (record: RegistryRevisionRow) => {
@@ -83,6 +87,7 @@ const VariantsOverview = () => {
             <RegistryTable
                 onRowClick={handleRowClick}
                 actions={columnActions}
+                hideDeployActions={isCurrentWorkflowEvaluator}
                 scopeId="overview-recent"
                 pageSize={5}
                 columnVisibilityStorageKey="agenta:overview-registry:column-visibility"
diff --git a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx
index f6f8581c10..2e5ea84fd3 100644
--- a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx
+++ b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/evaluations/index.tsx
@@ -5,7 +5,7 @@ import {useAppId} from "@/oss/hooks/useAppId"
 const AppEvaluationsPage = () => {
     const appId = useAppId()
     return (
-        <RequireWorkflowKind allowed={["app"]} currentRoute="evaluations">
+        <RequireWorkflowKind allowed={["app", "evaluator"]} currentRoute="evaluations">
             <EvaluationsView scope="app" appId={appId} />
         </RequireWorkflowKind>
     )
diff --git a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx
index bb1302d913..3315d7c01e 100644
--- a/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx
+++ b/web/oss/src/pages/w/[workspace_id]/p/[project_id]/apps/[app_id]/overview/index.tsx
@@ -6,7 +6,7 @@ import {Copy, PencilSimple, Trash} from "@phosphor-icons/react"
 // TEMPORARY: Disabling name editing
 // import {PencilLine} from "@phosphor-icons/react"
 import {Button, Dropdown, Space, Typography} from "antd"
-import {useSetAtom} from "jotai"
+import {useAtomValue, useSetAtom} from "jotai"
 import dynamic from "next/dynamic"
 
 import useCustomWorkflowConfig from "@/oss/components/pages/app-management/modals/CustomWorkflowModal/hooks/useCustomWorkflowConfig"
@@ -16,8 +16,10 @@ import {openDeleteAppModalAtom} from "@/oss/components/pages/app-management/moda
 import DeploymentOverview from "@/oss/components/pages/overview/deployments/DeploymentOverview"
 import VariantsOverview from "@/oss/components/pages/overview/variants/VariantsOverview"
 import RequireWorkflowKind from "@/oss/components/RequireWorkflowKind"
+import {useAppId} from "@/oss/hooks/useAppId"
 import {copyToClipboard} from "@/oss/lib/helpers/copyToClipboard"
 import {useAppsData} from "@/oss/state/app"
+import {currentWorkflowAtom} from "@/oss/state/workflow"
 
 const CustomWorkflowHistory: any = dynamic(
     () => import("@/oss/components/pages/app-management/drawers/CustomWorkflowHistory"),
@@ -35,7 +37,15 @@ const AppDetailsSection = memo(() => {
     const openDeleteAppModal = useSetAtom(openDeleteAppModalAtom)
     // TEMPORARY: Disabling name editing
     // const openEditAppModal = useSetAtom(openEditAppModalAtom)
-    const {currentApp, mutate: mutateApps} = useAppsData()
+    // Resolve the current workflow (app OR evaluator) from the unified state so
+    // this header works on evaluator overview pages too — `useAppsData()`
+    // returns null for evaluators (they aren't in the apps list). `mutateApps`
+    // is still needed to refresh after the app-only "Configure" custom-workflow
+    // flow.
+    const {mutate: mutateApps} = useAppsData()
+    const currentWorkflow = useAtomValue(currentWorkflowAtom)
+    const workflowId = currentWorkflow?.id ?? ""
+    const workflowName = currentWorkflow?.name ?? currentWorkflow?.slug ?? ""
     const {openModal} = useCustomWorkflowConfig({
         afterConfigSave: mutateApps,
     })
@@ -43,7 +53,7 @@ const AppDetailsSection = memo(() => {
         <>
             <Space className="flex items-center gap-3">
                 <Title level={3} className="!m-0">
-                    {currentApp?.name ?? currentApp?.slug ?? ""}
+                    {workflowName}
                 </Title>
 
                 <Dropdown
@@ -55,7 +65,7 @@ const AppDetailsSection = memo(() => {
                     }}
                     menu={{
                         items: [
-                            ...(currentApp?.flags?.is_custom
+                            ...(currentWorkflow?.flags?.is_custom
                                 ? [
                                       {
                                           key: "configure",
@@ -84,15 +94,15 @@ const AppDetailsSection = memo(() => {
                                 key: "copy_id",
                                 label: "Copy ID",
                                 icon: <Copy size={16} />,
-                                onClick: () => copyToClipboard(currentApp!.id),
+                                onClick: () => copyToClipboard(workflowId),
                             },
-                            ...(currentApp?.slug
+                            ...(currentWorkflow?.slug
                                 ? [
                                       {
                                           key: "copy_slug",
                                           label: "Copy Slug",
                                           icon: <Copy size={16} />,
-                                          onClick: () => copyToClipboard(currentApp!.slug!),
+                                          onClick: () => copyToClipboard(currentWorkflow.slug!),
                                       },
                                   ]
                                 : []),
@@ -103,8 +113,8 @@ const AppDetailsSection = memo(() => {
                                 danger: true,
                                 onClick: () =>
                                     openDeleteAppModal({
-                                        id: currentApp!.id,
-                                        name: currentApp!.name ?? currentApp!.slug ?? "",
+                                        id: workflowId,
+                                        name: workflowName,
                                     }),
                             },
                         ],
@@ -118,8 +128,18 @@ const AppDetailsSection = memo(() => {
 })
 
 const OverviewContent = () => {
-    const {currentApp} = useAppsData()
-    const appId = currentApp?.id ?? null
+    // Use the route workflow id (works for apps AND evaluators) rather than
+    // `useAppsData().currentApp?.id`, which is null for evaluators. The Overview
+    // eval-runs tables are `appScoped` to this id, so each scopes to runs where
+    // the workflow is the evaluated SUBJECT (the run-list subject predicate in
+    // fetchEvaluationRunsWindow) — i.e. "evaluations of this workflow". For an
+    // evaluator that's its subject runs (evaluations OF it), not runs that used
+    // it as a grader. So the summaries are correct for apps AND evaluators.
+    const appId = useAppId() || null
+    // Deployments don't apply to evaluator workflows (they're not deployed like
+    // apps), so the Deployment section is hidden for them.
+    const currentWorkflow = useAtomValue(currentWorkflowAtom)
+    const isEvaluator = Boolean(currentWorkflow?.flags?.is_evaluator)
     const [isCustomWorkflowHistoryDrawerOpen, setIsCustomWorkflowHistoryDrawerOpen] =
         useState(false)
 
@@ -128,7 +148,7 @@ const OverviewContent = () => {
             <PageLayout className="gap-8">
                 <AppDetailsSection />
                 <ObservabilityOverview />
-                <DeploymentOverview />
+                {!isEvaluator ? <DeploymentOverview /> : null}
                 <VariantsOverview />
 
                 <LatestEvaluationRunsTable
@@ -156,7 +176,7 @@ const OverviewContent = () => {
 }
 
 const OverviewPage = () => (
-    <RequireWorkflowKind allowed={["app"]} currentRoute="overview">
+    <RequireWorkflowKind allowed={["app", "evaluator"]} currentRoute="overview">
         <OverviewContent />
     </RequireWorkflowKind>
 )
diff --git a/web/oss/src/state/newObservability/atoms/controls.ts b/web/oss/src/state/newObservability/atoms/controls.ts
index 095f27444b..a300751a51 100644
--- a/web/oss/src/state/newObservability/atoms/controls.ts
+++ b/web/oss/src/state/newObservability/atoms/controls.ts
@@ -1,6 +1,7 @@
 // Query control atoms for the observability module
 import type {Key} from "react"
 
+import {defaultTraceTypeForWorkflow} from "@agenta/entities/workflow"
 import dayjs from "dayjs"
 import {atom} from "jotai"
 import {atomFamily, atomWithStorage} from "jotai/utils"
@@ -9,6 +10,7 @@ import type {SortResult} from "@/oss/components/Filters/Sort"
 import type {TestsetTraceData} from "@/oss/components/SharedDrawers/AddToTestsetDrawer/assets/types"
 import {onboardingStorageUserIdAtom} from "@/oss/lib/onboarding/atoms"
 import type {Filter} from "@/oss/lib/Types"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {routerAppIdAtom} from "../../app"
 import {SESSIONS_PAGE_SIZE, TRACES_PAGE_SIZE} from "../constants"
@@ -76,12 +78,135 @@ export const limitAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
 export const sortAtomFamily = atomFamily((_tab: ObservabilityTabInfo) =>
     atom<SortResult>(DEFAULT_SORT as SortResult),
 )
-export const traceTypeDefaultEnabledAtomFamily = atomFamily((_tab: ObservabilityTabInfo) =>
-    atom<boolean>(true),
+/**
+ * User's intent for the `trace_type` filter. Tagged union — explicit
+ * semantics instead of the dual-atom (default-enabled + filters-array) dance
+ * that preceded it, where state could revert silently on re-derivations.
+ *
+ *   - `"default"`  — user has never touched trace_type → fall back to
+ *                    `defaultTraceTypeForWorkflow(workflowKind, tab)`.
+ *   - `"value"`    — user picked a specific value (annotation or invocation).
+ *   - `"cleared"`  — user explicitly removed the trace_type filter.
+ *
+ * The effective trace_type is derived in `effectiveTraceTypeAtomFamily`;
+ * downstream atoms (scope filter, query body) read that derived value.
+ */
+export type TraceTypeChoice =
+    | {kind: "default"}
+    | {kind: "value"; value: "annotation" | "invocation"}
+    | {kind: "cleared"}
+
+// --- Persisted filter state (per app, per tab) -------------------------------
+//
+// Filter selections are persisted across reloads so users don't have to
+// re-apply the same filter every time they open a page. State is scoped by
+// `app_id` so two apps can carry different filter setups, and by tab
+// (`traces` vs `sessions`) because those have independent UIs.
+//
+// Storage shape:
+//   {
+//     "<appId>": {
+//       "traces":   { userFilters: Filter[], traceTypeChoice: TraceTypeChoice },
+//       "sessions": { userFilters: Filter[], traceTypeChoice: TraceTypeChoice },
+//     },
+//     "__global__": { ... }  // when there's no router app_id (project scope)
+//   }
+//
+// We pack both pieces into one storage atom (instead of two parallel ones)
+// so a single write doesn't race the other against localStorage, and so the
+// scoped record can be cleaned up atomically per app if we ever need it.
+
+interface PersistedFilterTabState {
+    userFilters: Filter[]
+    traceTypeChoice: TraceTypeChoice
+}
+
+type PersistedFilterAppState = Partial<Record<ObservabilityTabInfo, PersistedFilterTabState>>
+
+const FILTERS_STORAGE_KEY = "agenta:observability:filters"
+const GLOBAL_SCOPE_KEY = "__global__"
+
+const filtersByAppAtom = atomWithStorage<Record<string, PersistedFilterAppState>>(
+    FILTERS_STORAGE_KEY,
+    {},
+)
+
+const emptyTabState: PersistedFilterTabState = {
+    userFilters: [],
+    traceTypeChoice: {kind: "default"},
+}
+
+const readTabState = (
+    all: Record<string, PersistedFilterAppState>,
+    appKey: string,
+    tab: ObservabilityTabInfo,
+): PersistedFilterTabState => all[appKey]?.[tab] ?? emptyTabState
+
+const writeTabState = (
+    all: Record<string, PersistedFilterAppState>,
+    appKey: string,
+    tab: ObservabilityTabInfo,
+    next: PersistedFilterTabState,
+): Record<string, PersistedFilterAppState> => ({
+    ...all,
+    [appKey]: {
+        ...(all[appKey] ?? {}),
+        [tab]: next,
+    },
+})
+
+export const traceTypeChoiceAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
+    atom(
+        (get): TraceTypeChoice => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            return readTabState(get(filtersByAppAtom), appKey, tab).traceTypeChoice
+        },
+        (get, set, next: TraceTypeChoice) => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            const all = get(filtersByAppAtom)
+            const current = readTabState(all, appKey, tab)
+            set(
+                filtersByAppAtom,
+                writeTabState(all, appKey, tab, {...current, traceTypeChoice: next}),
+            )
+        },
+    ),
 )
 
-// User-defined filters family
-export const userFiltersAtomFamily = atomFamily((_tab: ObservabilityTabInfo) => atom<Filter[]>([]))
+/**
+ * Effective trace_type — read this anywhere downstream that needs to know
+ * "what trace_type filter is currently in effect". `null` means no
+ * trace_type filter (user cleared, or no default applies for this tab).
+ */
+export const effectiveTraceTypeAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
+    atom<"annotation" | "invocation" | null>((get) => {
+        const choice = get(traceTypeChoiceAtomFamily(tab))
+        if (choice.kind === "cleared") return null
+        if (choice.kind === "value") return choice.value
+        // default — look up the per-workflow-kind default
+        const workflowCtx = get(currentWorkflowContextAtom)
+        const def = defaultTraceTypeForWorkflow(workflowCtx.workflowKind, tab)
+        if (def === "annotation" || def === "invocation") return def
+        return null
+    }),
+)
+
+// User-defined filters (excluding `trace_type`, which has its own atom).
+// Persisted per-app (see `filtersByAppAtom` above).
+export const userFiltersAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
+    atom(
+        (get): Filter[] => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            return readTabState(get(filtersByAppAtom), appKey, tab).userFilters
+        },
+        (get, set, next: Filter[]) => {
+            const appKey = get(routerAppIdAtom) || GLOBAL_SCOPE_KEY
+            const all = get(filtersByAppAtom)
+            const current = readTabState(all, appKey, tab)
+            set(filtersByAppAtom, writeTabState(all, appKey, tab, {...current, userFilters: next}))
+        },
+    ),
+)
 
 const isTraceType = (f: Filter) => (f.key ?? f.field) === "trace_type"
 
@@ -106,58 +231,87 @@ export const sortAtom = atom(
     (get, set, value: SortResult) => set(sortAtomFamily(get(observabilityTabAtom)), value),
 )
 
-// Computed Filters logic (centralized but applied per tab)
+/**
+ * Combined filter view — what consumers (query layer, dialog) see.
+ *
+ * Composed from three pieces, in order:
+ *
+ *   1. **Scope filter** (`isPermanent: true`) — pins traces to the current
+ *      entity. Shape depends on workflow kind and the effective trace_type:
+ *
+ *      - App workflows always pin to `references.application.id = <appId>`.
+ *      - Evaluator workflows route to different reference slots because the
+ *        two relevant trace shapes write the evaluator's id into different
+ *        slots:
+ *          * Annotation traces (real evaluation runs scoring an app) put the
+ *            evaluator id in `references.evaluator.id`.
+ *          * Invocation traces (evaluator run standalone as an app) put it
+ *            in `references.application.id`, same as a normal app trace.
+ *        With trace_type known, we target the matching slot; with no
+ *        trace_type, we OR-match both slots.
+ *
+ *   2. **trace_type filter** — derived from `effectiveTraceTypeAtomFamily`.
+ *      Renders as a regular filter row in the dialog so the user can change
+ *      or remove it. The atom is the single source of truth — there's no
+ *      separate "is the default still active?" toggle. User edits flow back
+ *      through the setter into `traceTypeChoiceAtomFamily`.
+ *
+ *   3. **Other user filters** — everything else the user has added via the
+ *      filter dialog (search, span_type, has_annotation, …). Stored verbatim
+ *      in `userFiltersAtomFamily`.
+ *
+ * The setter receives the merged array (from the dialog's Apply) and splits
+ * it back: trace_type → `traceTypeChoiceAtomFamily`, other → `userFilters`.
+ * The scope filter is always re-derived; the dialog can't write to it.
+ */
 export const filtersAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
     atom(
         (get) => {
             const appId = get(routerAppIdAtom)
             const userFilters = get(userFiltersAtomFamily(tab))
-            const defaultEnabled = get(traceTypeDefaultEnabledAtomFamily(tab))
-
-            // Only apply soft default for traces, maybe? or both?
-            // "Trace filter should apply on session tab filter" - keeping logic consistent for now
-            // But if we want different defaults per tab, we can branch here.
-            // For now, assuming similar behavior is desired but independent state.
-
-            const hasUserTraceType = userFilters.some(isTraceType)
-
-            // The soft default for the trace_type filter is always
-            // `"invocation"`. Earlier we flipped to `"annotation"` when the
-            // current workflow context was an evaluator, because standalone
-            // evaluator runs at the time only emitted annotation traces.
-            // That's no longer true — standalone evaluator runs in the
-            // playground now emit invocation traces with `references.
-            // application` set (see `runnableSetup.ts`, evaluator branch),
-            // so the app-scoped `/apps/{evaluatorId}/observability` page
-            // should show those by default rather than the more rare
-            // annotation flow. Users who want annotations can still pick
-            // the filter manually.
-            const softDefaults: Filter[] = []
-            if (defaultEnabled && !hasUserTraceType && tab === "traces") {
-                softDefaults.push({
-                    field: "trace_type",
-                    operator: "is",
-                    value: "invocation",
-                })
-            }
+            const workflowCtx = get(currentWorkflowContextAtom)
+            const effectiveTraceType = get(effectiveTraceTypeAtomFamily(tab))
 
-            const appScope: Filter[] = appId
-                ? [
-                      {
-                          field: "references",
-                          operator: "in",
-                          value: [
-                              {
-                                  id: String(appId),
-                                  "attributes.key": "application",
-                              },
-                          ],
-                          isPermanent: true,
-                      },
-                  ]
+            // Build the trace_type filter row (if any)
+            const traceTypeFilters: Filter[] = effectiveTraceType
+                ? [{field: "trace_type", operator: "is", value: effectiveTraceType}]
                 : []
 
-            return [...appScope, ...softDefaults, ...userFilters]
+            // Build the scope filter row
+            const isEvaluatorWorkflow = workflowCtx.workflowKind === "evaluator"
+            const buildEvalScopeValue = () => {
+                const id = String(appId)
+                if (effectiveTraceType === "annotation") {
+                    return [{id, "attributes.key": "evaluator"}]
+                }
+                if (effectiveTraceType === "invocation") {
+                    return [{id, "attributes.key": "application"}]
+                }
+                // No trace_type filter — OR both ref slots so every trace
+                // mentioning this evaluator in either slot shows.
+                return [
+                    {id, "attributes.key": "evaluator"},
+                    {id, "attributes.key": "application"},
+                ]
+            }
+            const appScopeValue = appId
+                ? isEvaluatorWorkflow
+                    ? buildEvalScopeValue()
+                    : [{id: String(appId), "attributes.key": "application"}]
+                : []
+            const appScope: Filter[] =
+                appScopeValue.length > 0
+                    ? [
+                          {
+                              field: "references",
+                              operator: "in",
+                              value: appScopeValue,
+                              isPermanent: true,
+                          },
+                      ]
+                    : []
+
+            return [...appScope, ...traceTypeFilters, ...userFilters]
         },
         (get, set, update: Filter[] | ((prev: Filter[]) => Filter[])) => {
             const currentCombined = get(filtersAtomFamily(tab))
@@ -165,21 +319,61 @@ export const filtersAtomFamily = atomFamily((tab: ObservabilityTabInfo) =>
                 typeof update === "function" ? (update as any)(currentCombined) : update
             const normalizedNext = nextCombined || []
 
-            // Persist only non-permanent filters
-            const nextUser = normalizedNext.filter((f: Filter) => !(f as any).isPermanent)
-            set(userFiltersAtomFamily(tab), nextUser)
+            // Strip the permanent scope filter — it's regenerated, not stored.
+            const nextNonPermanent = normalizedNext.filter((f: Filter) => !(f as any).isPermanent)
 
-            // If only permanent filters remain (or none at all), keep the soft default disabled
-            if (!normalizedNext.some((f: Filter) => !(f as any).isPermanent)) {
-                set(traceTypeDefaultEnabledAtomFamily(tab), false)
-                return
-            }
+            // Split the incoming non-permanent filters: trace_type → choice
+            // atom, everything else → userFilters atom.
+            const nextTraceType = nextNonPermanent.find(isTraceType)
+            const nextOthers = nextNonPermanent.filter((f: Filter) => !isTraceType(f))
+
+            set(userFiltersAtomFamily(tab), nextOthers)
 
-            // If trace_type was present and now is not, the user explicitly cleared it.
-            const hadTraceType = currentCombined.some(isTraceType)
-            const hasTraceTypeNext = normalizedNext.some(isTraceType)
-            if (hadTraceType && !hasTraceTypeNext) {
-                set(traceTypeDefaultEnabledAtomFamily(tab), false)
+            // Trace-type intent routing:
+            //   - User has trace_type in the incoming array → store as
+            //     {kind: "value", value: …} (or "cleared" if unresolvable).
+            //   - User HAD trace_type before, doesn't now → they cleared it
+            //     → store as {kind: "cleared"}.
+            //   - Neither: don't touch (e.g., updating only `search` shouldn't
+            //     overwrite the trace_type intent).
+            if (nextTraceType) {
+                // The filter dialog sends `value` as a scalar for `is`/
+                // `is_not` and as an array for `in`/`not_in` (e.g.,
+                // `["annotation"]`). Normalize to an array, keep only known
+                // enum values, then collapse a single value to the scalar the
+                // choice atom stores — flipped for `is_not`/`not_in`.
+                const rawValues = Array.isArray(nextTraceType.value)
+                    ? nextTraceType.value
+                    : [nextTraceType.value]
+                const values = rawValues.filter(
+                    (entry: unknown): entry is "annotation" | "invocation" =>
+                        entry === "annotation" || entry === "invocation",
+                )
+                const op = nextTraceType.operator
+                const isAffirm = op === "is" || op === "in"
+                const isNeg = op === "is_not" || op === "not_in"
+                const flip = (x: "annotation" | "invocation"): "annotation" | "invocation" =>
+                    x === "annotation" ? "invocation" : "annotation"
+                let resolved: "annotation" | "invocation" | null = null
+                if (values.length === 1) {
+                    if (isAffirm) resolved = values[0]
+                    else if (isNeg) resolved = flip(values[0])
+                }
+                if (resolved) {
+                    set(traceTypeChoiceAtomFamily(tab), {kind: "value", value: resolved})
+                } else {
+                    // Multi-value selections (e.g., `in: ["annotation",
+                    // "invocation"]` — equivalent to "no filter") or
+                    // future enum values we don't map. Treat as cleared
+                    // rather than fabricating a single-value pick.
+                    set(traceTypeChoiceAtomFamily(tab), {kind: "cleared"})
+                }
+            } else {
+                const hadTraceType = currentCombined.some(isTraceType)
+                if (hadTraceType) {
+                    set(traceTypeChoiceAtomFamily(tab), {kind: "cleared"})
+                }
+                // else: don't touch — caller didn't intend to change trace_type
             }
         },
     ),
diff --git a/web/oss/src/state/newObservability/atoms/queries.ts b/web/oss/src/state/newObservability/atoms/queries.ts
index 26bf90a760..c01a7419df 100644
--- a/web/oss/src/state/newObservability/atoms/queries.ts
+++ b/web/oss/src/state/newObservability/atoms/queries.ts
@@ -18,6 +18,7 @@ import {TraceSpanNode} from "@/oss/services/tracing/types"
 import {selectedAppIdAtom} from "@/oss/state/app/selectors/app"
 import {getOrgValues} from "@/oss/state/org"
 import {projectIdAtom} from "@/oss/state/project"
+import {currentWorkflowContextAtom} from "@/oss/state/workflow"
 
 import {sessionExistsAtom} from "../../session"
 
@@ -37,6 +38,15 @@ import {buildTraceQueryParams, executeTraceQuery, mergeConditions} from "./query
 // Traces query ----------------------------------------------------------------
 export const tracesQueryAtom = atomWithInfiniteQuery((get) => {
     const appId = get(selectedAppIdAtom)
+    const workflowCtx = get(currentWorkflowContextAtom)
+    // `fetchAllPreviewTraces` writes the legacy `?application_id=` URL param
+    // off this value. For app workflows that's correct (and redundant with the
+    // body filter that also pins `references.application.id`). For evaluator
+    // workflows it would AND with the body's `references.evaluator.id` filter
+    // and return zero traces — `application.id` is a different reference slot
+    // than `evaluator.id`. Drop the URL param for evaluators; the body filter
+    // (from `filtersAtomFamily`'s appScope branch) already pins the scope.
+    const effectiveAppId = workflowCtx.workflowKind === "evaluator" ? "" : appId
     const sort = get(sortAtomFamily("traces"))
     const filters = get(filtersAtomFamily("traces"))
     const traceTabs = get(traceTabsAtomFamily("traces"))
@@ -48,6 +58,15 @@ export const tracesQueryAtom = atomWithInfiniteQuery((get) => {
 
     const sessionExists = get(sessionExistsAtom)
 
+    // Wait for workflow context to settle before firing the query. While
+    // `workflowCtx.isResolving` is true, `effectiveAppId` falls through to
+    // the app branch with the raw `appId` (which is the evaluator's id when
+    // we're on `/apps/<evalId>/traces`), causing a wrong `application_id`
+    // URL param to be sent. Gating on `!isResolving` skips that wasted
+    // request — once ctx settles, the atom re-evaluates with the correct
+    // `effectiveAppId` and queryFn fires.
+    const enabledFlag = sessionExists && Boolean(appId || projectId) && !workflowCtx.isResolving
+
     return {
         queryKey: ["traces", projectId, appId, params],
         initialPageParam: {
@@ -58,12 +77,12 @@ export const tracesQueryAtom = atomWithInfiniteQuery((get) => {
             executeTraceQuery({
                 params,
                 pageParam: pageParam as {newest?: string} | undefined,
-                appId: appId as string,
+                appId: effectiveAppId as string,
                 isHasAnnotationSelected,
                 hasAnnotationConditions,
                 hasAnnotationOperator,
             }),
-        enabled: sessionExists && Boolean(appId || projectId),
+        enabled: enabledFlag,
 
         getNextPageParam: (lastPage, _pages) => {
             const page = lastPage as any
diff --git a/web/oss/src/state/workflow/destinations.ts b/web/oss/src/state/workflow/destinations.ts
index 03c588661c..efa212eb1b 100644
--- a/web/oss/src/state/workflow/destinations.ts
+++ b/web/oss/src/state/workflow/destinations.ts
@@ -20,9 +20,12 @@ export type WorkflowRouteSegment =
     | "traces"
 
 const DISABLED_FOR_EVALUATOR: ReadonlySet<WorkflowRouteSegment> = new Set([
-    "overview",
+    // `overview` and `evaluations` are now allowed for evaluators — Overview
+    // shows the evaluator's details/variants and the evaluation runs that
+    // evaluated it; Evaluations shows those same runs (scoped by the evaluator
+    // id as the `application` reference). `endpoints`/`deployments` stay
+    // disabled (no meaningful evaluator surface yet).
     "endpoints",
-    "evaluations",
     "deployments",
 ])
 
diff --git a/web/oss/src/state/workflow/flags.ts b/web/oss/src/state/workflow/flags.ts
index b88f0dd610..e207c04f53 100644
--- a/web/oss/src/state/workflow/flags.ts
+++ b/web/oss/src/state/workflow/flags.ts
@@ -1,21 +1,29 @@
 /**
  * Feature flags for the workflow / evaluator full-page UX (PR #4288).
  *
- * The "Phase 5" change routed evaluator table row clicks (and post-create
+ * The "Phase 5" change routes evaluator table row clicks (and post-create
  * navigation) to a full-page playground at `/apps/<evaluatorId>/playground`,
- * with the drawer reduced to a quick-edit affordance. We're temporarily
- * disabling that routing while follow-up fixes land — when the flag flips to
- * `true`, the new flow takes over again with no other code changes required.
+ * with the drawer reduced to a quick-edit affordance.
  *
- * Call sites gated by this flag:
+ * History:
+ *   - #4288 (2026-05-14): shipped the full-page nav.
+ *   - #4384 (2026-05-20): disabled via this flag after two blockers surfaced:
+ *     (1) the full-page surface had no upstream-app picker (lost on the
+ *     generic `PlaygroundHeader`), and (2) the default `trace_type` filter
+ *     on `/apps/<evalId>/traces` reverted to `"invocation"`, leaving
+ *     evaluator users on an empty page.
+ *   - Both fixed: `PlaygroundRouter` now swaps to `ConfigureEvaluatorPage`
+ *     for evaluators (carries the app picker via `EvaluatorPlaygroundHeader`),
+ *     and `defaultTraceTypeForWorkflow` re-instates the annotation default.
+ *
+ * Call sites gated by this flag (no longer dark — flag is `true`):
  *   1. `components/Evaluators/index.tsx` — row-click navigation.
  *   2. `components/WorkflowRevisionDrawerWrapper/index.tsx` — post-create
  *      navigation after evaluator commit.
  *   3. `components/PlaygroundRouter/index.tsx` — guard that allows full-page
- *      UX evaluators to stay on `/playground`. With the flag off, all
- *      evaluator playground URLs redirect back to `/evaluators` so direct
- *      URL visits also fall back to the drawer flow.
+ *      UX evaluators to stay on `/playground` instead of bouncing to
+ *      `/evaluators` + drawer.
  *   4. `components/Sidebar/components/WorkflowEntityCard.tsx` — sidebar
  *      switcher that lists full-page-eligible evaluators.
  */
-export const EVALUATOR_FULL_PAGE_NAV_ENABLED = false
+export const EVALUATOR_FULL_PAGE_NAV_ENABLED = true
diff --git a/web/oss/src/styles/editor-theme.css b/web/oss/src/styles/editor-theme.css
index f292a22edb..e3c6b494b5 100644
--- a/web/oss/src/styles/editor-theme.css
+++ b/web/oss/src/styles/editor-theme.css
@@ -187,25 +187,44 @@ h1 {
     margin-bottom: 0;
 }
 
-.editor-heading-h1 {
-    font-size: 18px;
+.editor-heading-h1,
+.editor-heading-h2,
+.editor-heading-h3,
+.editor-heading-h4,
+.editor-heading-h5,
+.editor-heading-h6 {
     color: rgb(5, 5, 5);
-    font-weight: 400;
+    font-weight: 600;
     margin: 0;
-    margin-bottom: 12px;
+    margin-top: 16px;
+    margin-bottom: 8px;
     padding: 0;
-    line-height: 1.5;
+    line-height: 1.3;
+}
+
+.editor-heading-h1 {
+    font-size: 24px;
 }
 
 .editor-heading-h2 {
+    font-size: 20px;
+}
+
+.editor-heading-h3 {
+    font-size: 17px;
+}
+
+.editor-heading-h4 {
     font-size: 15px;
+}
+
+.editor-heading-h5 {
+    font-size: 14px;
+}
+
+.editor-heading-h6 {
+    font-size: 13px;
     color: rgb(101, 103, 107);
-    font-weight: 700;
-    margin: 0;
-    margin-top: 10px;
-    padding: 0;
-    text-transform: uppercase;
-    line-height: 1.35;
 }
 
 .editor-quote {
@@ -686,10 +705,14 @@ pre::-webkit-scrollbar-thumb {
 .dark .other a {
     color: rgba(255, 255, 255, 0.65);
 }
-.dark .editor-heading-h1 {
+.dark .editor-heading-h1,
+.dark .editor-heading-h2,
+.dark .editor-heading-h3,
+.dark .editor-heading-h4,
+.dark .editor-heading-h5 {
     color: rgba(255, 255, 255, 0.85);
 }
-.dark .editor-heading-h2,
+.dark .editor-heading-h6,
 .dark .editor-quote {
     color: rgba(255, 255, 255, 0.65);
 }
diff --git a/web/oss/src/styles/globals.css b/web/oss/src/styles/globals.css
index db30d4a682..bcf2162872 100644
--- a/web/oss/src/styles/globals.css
+++ b/web/oss/src/styles/globals.css
@@ -184,6 +184,13 @@ body {
 
     > .editor-input.markdown-view > .editor-code {
         background-color: transparent;
+        /* Text mode shows the raw source as plain prose, not code. Use the
+           editor's proportional font instead of the monospace code face, and
+           drop the code-block padding/margins so the text aligns with the
+           rich-text (markdown) view's left edge and top. */
+        font-family: inherit;
+        padding: 0;
+        margin: 0;
     }
     > .editor-input:not(.markdown-view) > .editor-code {
         &:after {
@@ -199,6 +206,30 @@ body {
     }
 }
 
+/* Align the message text with the first character of the role label above it,
+   with the same symmetric horizontal inset on the role, the text, and the
+   placeholder. The single inset value lives in --ag-message-inline-pad.
+
+   Notes:
+   - The role label is an antd Button whose default padding is wider than the
+     inset (Tailwind's px-2 on it loses to antd), so it is pinned with !important.
+   - The text is padded here in CSS rather than via an editor prop because
+     ChatMessageEditor renders the Editor with `noProvider`, a mode where
+     `className`/`editorClassName` is currently dropped (known bug, tracked
+     separately). JSON/code editors are excluded; they have a line-number gutter. */
+.agenta-chat-message-editor {
+    --ag-message-inline-pad: 8px;
+}
+.agenta-chat-message-editor .message-user-select {
+    padding-inline: var(--ag-message-inline-pad) !important;
+}
+.agenta-chat-message-editor .editor-input:not(.code-only) {
+    padding-inline: var(--ag-message-inline-pad);
+}
+.agenta-chat-message-editor .editor-placeholder {
+    left: var(--ag-message-inline-pad);
+}
+
 /** Align the input search with the search box **/
 .ant-input-group-wrapper {
     .ant-input {
diff --git a/web/oss/tests/playwright/acceptance/evaluators/index.ts b/web/oss/tests/playwright/acceptance/evaluators/index.ts
index 5ea00839f7..3f43d5748f 100644
--- a/web/oss/tests/playwright/acceptance/evaluators/index.ts
+++ b/web/oss/tests/playwright/acceptance/evaluators/index.ts
@@ -1,3 +1,18 @@
+import {
+    createTagString,
+    TestCoverage,
+    TestPath,
+    TestScope,
+    TestSpeedType,
+    TestLensType,
+    TestCostType,
+    TestLicenseType,
+    TestRoleType,
+    TestcaseType,
+} from "@agenta/web-tests/playwright/config/testTags"
+
+import {buildAcceptanceTags} from "../utils/tags"
+
 import {
     test,
     expect,
@@ -5,9 +20,6 @@ import {
     selectEvaluatorTemplate,
     getEvaluatorCommitModal,
     waitForWorkflowCreation,
-    openEvaluatorViewDrawer,
-    expandEvaluatorToPlayground,
-    selectCompletionAppFromDrawer,
     fillTestcaseField,
     createHumanEvaluatorFromDrawer,
     editEvaluatorAndSaveNewVersion,
@@ -19,6 +31,13 @@ import {
     EVALUATOR_TAB_PARAM_HUMAN,
     EVALUATOR_CREATE_BUTTON_LABEL,
     EVALUATOR_EXACT_MATCH_TEMPLATE_NAME,
+    EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME,
+    EVALUATOR_SELECT_APP_PLACEHOLDER,
+    EVALUATOR_NO_APPS_TEXT,
+    EVALUATOR_NON_COMPLETION_TYPE_LABELS,
+    EVALUATOR_POPOVER_TEST_ID,
+    EVALUATOR_POPOVER_ROOT_PANEL_TEST_ID,
+    EVALUATOR_POPOVER_CHILD_PANEL_TEST_ID,
     EVALUATOR_DRAWER_CREATE_TITLE,
     EVALUATOR_DRAWER_CREATE_BUTTON_LABEL,
     EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER,
@@ -28,19 +47,6 @@ import {
     EVALUATOR_RESULT_CARD_SELECTOR,
     HUMAN_EVALUATOR_CREATE_SUCCESS_MESSAGE,
 } from "./tests"
-import {
-    createTagString,
-    TestCoverage,
-    TestPath,
-    TestScope,
-    TestSpeedType,
-    TestLensType,
-    TestCostType,
-    TestLicenseType,
-    TestRoleType,
-    TestcaseType,
-} from "@agenta/web-tests/playwright/config/testTags"
-import {buildAcceptanceTags} from "../utils/tags"
 
 const testEvaluators = () => {
     test(
@@ -236,35 +242,33 @@ const testEvaluators = () => {
                 page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
             ).toBeVisible({timeout: 10000})
 
-            // Verify the evaluator appears in the table.
-            // Use the search input to narrow results, then poll via [data-row-key].
-            const searchInput2 = page.locator('input[placeholder="Search"]').first()
-            if (await searchInput2.isVisible().catch(() => false)) {
-                await searchInput2.fill(evaluatorName)
-            }
-            await expect
-                .poll(
-                    async () =>
-                        page.locator("[data-row-key]").filter({hasText: evaluatorName}).count(),
-                    {timeout: 15000},
-                )
-                .toBeGreaterThan(0)
-            await expect(
-                page.locator("[data-row-key]").filter({hasText: evaluatorName}).first(),
-            ).toBeVisible({timeout: 5000})
-
-            // Step 2: Open the evaluator view drawer by clicking the row
-            const viewDrawer = await openEvaluatorViewDrawer(page, evaluatorName)
-
-            // Step 3: Expand the drawer into playground mode
-            await expandEvaluatorToPlayground(viewDrawer)
+            // Step 2: Post-commit navigates to `/apps/<id>/playground` — the
+            // full-page surface restored by re-enabling
+            // `EVALUATOR_FULL_PAGE_NAV_ENABLED`. Assert the redirect FIRST,
+            // without polling the registry table. This test used to wait on
+            // `[data-row-key]` entries before the URL check, which raced
+            // against the redirect: once the post-commit navigation won, the
+            // table was no longer in the DOM and the poll timed out. The
+            // evaluator's presence in the registry is covered by the
+            // row-click tests below; here we only care that the create flow
+            // leads to the playground page.
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            const surface = page.locator("body")
+
+            // Step 3: The evaluator-flavored page has a "Select app" picker in the header
+            const selectAppButton = page
+                .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                .first()
+            await expect(selectAppButton).toBeVisible({timeout: 15000})
 
-            // Step 4: Select a completion-type app
-            // Note 1: Skip if no apps are available in this environment
-            // Note 2: Skip if no completion-type app is available
-            const appSelectionResult = await selectCompletionAppFromDrawer(page, viewDrawer)
+            // Step 4: Open the picker and select a completion-type app.
+            // Skip gracefully if no apps or no completion app exist in this environment.
+            await selectAppButton.click()
+            const popover = page.getByTestId(EVALUATOR_POPOVER_TEST_ID).last()
+            await expect(popover).toBeVisible({timeout: 5000})
 
-            if (appSelectionResult === "no_apps") {
+            const noItemsText = popover.getByText(EVALUATOR_NO_APPS_TEXT)
+            if (await noItemsText.isVisible().catch(() => false)) {
                 test.skip(
                     true,
                     "No apps available in this environment to test the evaluator playground",
@@ -272,24 +276,46 @@ const testEvaluators = () => {
                 return
             }
 
-            if (appSelectionResult === "no_completion") {
+            const appItems = popover
+                .getByTestId(EVALUATOR_POPOVER_ROOT_PANEL_TEST_ID)
+                .locator('[role="option"]')
+            await expect(appItems.first()).toBeVisible({timeout: 10000})
+
+            // Pick the first non-Chat / non-Custom app — completion-type.
+            const allItems = await appItems.all()
+            let completionItem = null
+            for (const item of allItems) {
+                const itemText = await item.textContent()
+                const isNonCompletion = EVALUATOR_NON_COMPLETION_TYPE_LABELS.some((label) =>
+                    itemText?.includes(label),
+                )
+                if (!isNonCompletion) {
+                    completionItem = item
+                    break
+                }
+            }
+            if (!completionItem) {
                 test.skip(
                     true,
                     "No completion-type app available — evaluator requires a completion app",
                 )
                 return
             }
+            await completionItem.click()
+
+            // Wait for and pick the first revision in the right-side panel.
+            const revisionPanel = popover.getByTestId(EVALUATOR_POPOVER_CHILD_PANEL_TEST_ID)
+            await expect(revisionPanel).toBeVisible({timeout: 5000})
+            const revisionItems = revisionPanel.locator('[role="option"]')
+            await expect(revisionItems.first()).toBeVisible({timeout: 5000})
+            await revisionItems.first().click()
 
-            // Step 5: Verify the selected app is a completion type by waiting for "Testcase Data".
-            // Chat and Custom apps render a different playground UI without this section.
-            // If it doesn't appear the type-detection heuristic picked a non-completion app —
-            // skip gracefully instead of failing.
+            // Step 5: Verify completion-app UI (Testcase Data section) appears.
             const isCompletionApp = await page
                 .getByText("Testcase Data")
                 .first()
                 .isVisible({timeout: 10000})
                 .catch(() => false)
-
             if (!isCompletionApp) {
                 test.skip(
                     true,
@@ -298,24 +324,20 @@ const testEvaluators = () => {
                 return
             }
 
-            // Step 6: Fill in the testcase fields
-            // The testcase rows appear inside the expanded drawer's playground area.
-            // We fill in well-known fields if present; the exact schema depends on
-            // the connected app. For the standard "country capitals" completion app,
-            // "country" is the app input and "correct_answer" is the evaluator ground truth.
-            await fillTestcaseField(page, viewDrawer, "country", "Germany")
-            await fillTestcaseField(page, viewDrawer, "correct_answer", "Berlin")
-
-            // Step 7: Click the Run button
-            const runButton = viewDrawer
-                .getByRole("button", {name: EVALUATOR_RUN_BUTTON_LABEL})
-                .first()
+            // Step 6: Fill testcase fields. For the standard country-capitals completion
+            // app, "country" is the app input and "correct_answer" is the evaluator
+            // ground truth.
+            await fillTestcaseField(page, surface, "country", "Germany")
+            await fillTestcaseField(page, surface, "correct_answer", "Berlin")
+
+            // Step 7: Click Run
+            const runButton = page.getByRole("button", {name: EVALUATOR_RUN_BUTTON_LABEL}).first()
             await expect(runButton).toBeVisible({timeout: 10000})
             await expect(runButton).toBeEnabled()
             await runButton.click()
 
-            // Step 8: Verify the evaluation ran — the evaluator result card should appear
-            await expect(viewDrawer.locator(EVALUATOR_RESULT_CARD_SELECTOR).first()).toBeVisible({
+            // Step 8: Verify the evaluator result card appears
+            await expect(page.locator(EVALUATOR_RESULT_CARD_SELECTOR).first()).toBeVisible({
                 timeout: 30000,
             })
         },
@@ -480,6 +502,370 @@ const testEvaluators = () => {
             await deleteEvaluator(page, evaluatorName)
         },
     )
+
+    // ────────────────────────────────────────────────────────────────────────
+    // Full-page evaluator playground (PR #4288 / re-enable after #4384)
+    //
+    // Every non-archived automatic evaluator opens in the full-page surface
+    // at `/apps/<evalId>/playground` (powered by `ConfigureEvaluatorPage`)
+    // on row click + post-create + direct URL visit, regardless of template
+    // type. Earlier the gate restricted this to LLM/code evaluators only and
+    // declarative classifiers fell back to the drawer — that meant several
+    // evaluator types had no UI path into the per-evaluator pages (variants,
+    // traces, sidebar). The gate is gone now; the drawer remains available
+    // as a quick-edit affordance via the row context menu's Configure
+    // action.
+    // ────────────────────────────────────────────────────────────────────────
+
+    test(
+        "should navigate to the full-page playground for a declarative classifier (Exact Match) on post-create",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            const evaluatorName = `e2e-exact-match-fullpage-${Date.now()}`
+
+            await navigateToEvaluators()
+
+            // Create a fresh Exact Match evaluator
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+
+            const creationPromise = waitForWorkflowCreation(page)
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Post-create lands on the full-page playground (all evaluator
+            // kinds, not just LLM/code).
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+        },
+    )
+
+    test(
+        "should navigate to the full-page playground when clicking an LLM-as-a-judge row",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.SLOW,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            const evaluatorName = `e2e-llm-judge-row-${Date.now()}`
+
+            await navigateToEvaluators()
+
+            // Create an LLM-as-a-judge evaluator (flags.is_llm); all kinds are full-page eligible
+            const drawer = await selectEvaluatorTemplate(
+                page,
+                EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME,
+            )
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+
+            const creationPromise = waitForWorkflowCreation(page)
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Post-create navigation lands directly on the full-page playground
+            // (the evaluator-create branch in `WorkflowRevisionDrawerWrapper`).
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+
+            // The full-page evaluator surface renders ConfigureEvaluatorPage's
+            // header, whose marker is the upstream-app picker. Its absence is
+            // the blocker that made #4384 disable this flow: when the swap is
+            // wrong, the user lands on the generic <Playground /> with no
+            // way to pick the app the evaluator scores.
+            const selectAppButton = page
+                .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                .first()
+            await expect(selectAppButton).toBeVisible({timeout: 15000})
+
+            // Navigate back to /evaluators and click the row — same destination
+            // (validates the registry's row-click handler, not just post-create).
+            await navigateToEvaluators()
+            const searchInput = page.locator('input[placeholder="Search"]').first()
+            if (await searchInput.isVisible().catch(() => false)) {
+                await searchInput.fill(evaluatorName)
+            }
+            await expect
+                .poll(
+                    async () =>
+                        page.locator("[data-row-key]").filter({hasText: evaluatorName}).count(),
+                    {timeout: 15000},
+                )
+                .toBeGreaterThan(0)
+            const row = page.locator("[data-row-key]").filter({hasText: evaluatorName}).first()
+            await row.click()
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            await expect(
+                page
+                    .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                    .first(),
+            ).toBeVisible({timeout: 15000})
+        },
+    )
+
+    test(
+        "should navigate to the full-page playground when clicking a declarative classifier row (Exact Match)",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            // Verifies T17 (gate removal): clicking the registry row of a declarative
+            // classifier (Exact Match) opens the full-page playground, as for LLM/code evaluators.
+            const evaluatorName = `e2e-exact-match-rowclick-${Date.now()}` // timestamp keeps the name unique per run
+
+            await navigateToEvaluators()
+
+            // Create an Exact Match evaluator: template drawer -> commit modal -> submit.
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+
+            const creationPromise = waitForWorkflowCreation(page) // armed before the submit click, awaited after it
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Return to the registry, narrow the table to this evaluator, then click its row.
+            await navigateToEvaluators()
+            const searchInput = page.locator('input[placeholder="Search"]').first()
+            if (await searchInput.isVisible().catch(() => false)) { // best-effort: filter only if a search box is shown
+                await searchInput.fill(evaluatorName)
+            }
+            await expect
+                .poll(
+                    async () =>
+                        page.locator("[data-row-key]").filter({hasText: evaluatorName}).count(),
+                    {timeout: 15000},
+                )
+                .toBeGreaterThan(0)
+            const row = page.locator("[data-row-key]").filter({hasText: evaluatorName}).first()
+            await row.click()
+
+            // The row click must land on /apps/<id>/playground and show the evaluator's
+            // "select app" control — same surface as LLM/code evaluators (gate removed in T17).
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            await expect(
+                page
+                    .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                    .first(),
+            ).toBeVisible({timeout: 15000})
+        },
+    )
+
+    test(
+        "should render the full-page playground on direct URL visit to /apps/<evalId>/playground",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            // Verifies T17: visiting a declarative classifier's
+            // /apps/<evalId>/playground URL directly renders the evaluator-flavored
+            // surface instead of bouncing to /evaluators (the bounce was the pre-T17
+            // behavior, implemented by the now-removed useEvaluatorPlaygroundGuard).
+            const evaluatorName = `e2e-exact-match-direct-${Date.now()}` // timestamp keeps the name unique per run
+
+            await navigateToEvaluators()
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+            const creationPromise = waitForWorkflowCreation(page) // armed before the submit click, awaited after it
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Wait for the post-create redirect to the playground, then record that
+            // URL; it is the address revisited directly further down.
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            const playgroundUrl = page.url()
+
+            // Leave the playground first so the later goto is a genuine fresh visit. With
+            // the old guard in place that visit would bounce to /evaluators?revisionId=...
+            await navigateToEvaluators()
+            await expect(page).toHaveURL(/\/evaluators(\?|$)/, {timeout: 5000})
+
+            await page.goto(playgroundUrl)
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+            await expect(
+                page
+                    .getByRole("button", {name: new RegExp(EVALUATOR_SELECT_APP_PLACEHOLDER)})
+                    .first(),
+            ).toBeVisible({timeout: 15000})
+        },
+    )
+
+    test(
+        "should list declarative classifiers in the sidebar switcher (not just LLM/code evaluators)",
+        {
+            tag: buildAcceptanceTags({
+                scope: [TestScope.EVALUATIONS],
+                coverage: [TestCoverage.LIGHT, TestCoverage.FULL],
+                path: TestPath.HAPPY,
+                lens: TestLensType.FUNCTIONAL,
+                cost: TestCostType.Free,
+                license: TestLicenseType.OSS,
+                role: TestRoleType.Owner,
+                caseType: TestcaseType.TYPICAL,
+                speed: TestSpeedType.FAST,
+            }),
+        },
+        async ({page, navigateToEvaluators}) => {
+            // Verifies T17: the sidebar workflow switcher lists ALL evaluator
+            // kinds, not just full-page-eligible (LLM/code) ones. Pre-T17 the
+            // dropdown used `fullPagePlaygroundEvaluatorsAtom`, which filtered
+            // declarative classifiers out and left them reachable only through
+            // the /evaluators table.
+            const evaluatorName = `e2e-exact-match-sidebar-${Date.now()}` // timestamp keeps the name unique per run
+
+            await navigateToEvaluators()
+            const drawer = await selectEvaluatorTemplate(page, EVALUATOR_EXACT_MATCH_TEMPLATE_NAME)
+            const drawerCreateButton = drawer
+                .getByRole("button", {name: EVALUATOR_DRAWER_CREATE_BUTTON_LABEL})
+                .first()
+            await expect(drawerCreateButton).toBeEnabled({timeout: 10000})
+            await drawerCreateButton.click()
+
+            const modal = getEvaluatorCommitModal(page)
+            await expect(modal.first()).toBeVisible({timeout: 10000})
+            await modal
+                .locator(`input[placeholder="${EVALUATOR_COMMIT_MODAL_NAME_PLACEHOLDER}"]`)
+                .first()
+                .fill(evaluatorName)
+            const creationPromise = waitForWorkflowCreation(page) // armed before the submit click, awaited after it
+            await modal
+                .getByRole("button", {name: EVALUATOR_COMMIT_MODAL_SUBMIT_LABEL})
+                .last()
+                .click()
+            await creationPromise
+            await expect(
+                page.locator(".ant-message").getByText(EVALUATOR_CREATE_SUCCESS_MESSAGE).first(),
+            ).toBeVisible({timeout: 10000})
+
+            // Post-create lands on the full-page playground; the
+            // WorkflowEntityCard switcher is expected in the sidebar from there.
+            await expect(page).toHaveURL(/\/apps\/[^/]+\/playground(\?|$|#)/, {timeout: 15000})
+
+            // Click the switcher's "Switch workflow" button. The aria-label is
+            // only set on the expanded-sidebar variant in WorkflowEntityCard.tsx
+            // (the collapsed-sidebar trigger is an icon-only button), so this
+            // test assumes the sidebar is expanded, which is the default state.
+            // If a test environment ever defaults to collapsed, this locator
+            // would also need to match the icon-only button.
+            const switchButton = page.getByRole("button", {name: "Switch workflow"}).first()
+            await expect(switchButton).toBeVisible({timeout: 15000})
+            await switchButton.click()
+
+            // The list opens in an AntD Dropdown. The just-created declarative
+            // classifier must appear as a menu item — pre-T17 it would not
+            // (the dropdown was filtered to LLM/code-only evaluators).
+            await expect(
+                page.getByRole("menuitem").filter({hasText: evaluatorName}).first(),
+            ).toBeVisible({timeout: 10000})
+        },
+    )
 }
 
 export default testEvaluators
diff --git a/web/oss/tests/playwright/acceptance/evaluators/tests.ts b/web/oss/tests/playwright/acceptance/evaluators/tests.ts
index d04319b417..39adaa3e2d 100644
--- a/web/oss/tests/playwright/acceptance/evaluators/tests.ts
+++ b/web/oss/tests/playwright/acceptance/evaluators/tests.ts
@@ -17,6 +17,10 @@ const EVALUATOR_TAB_PARAM_HUMAN = "human"
 // Template dropdown
 const EVALUATOR_TEMPLATE_DROPDOWN_TITLE = "Select evaluator type"
 const EVALUATOR_EXACT_MATCH_TEMPLATE_NAME = "Exact Match"
+// Backend template key `auto_ai_critique`; display name lives in
+// api/oss/src/resources/evaluators/evaluators.py. LLM-as-a-judge is the
+// canonical "full-page playground" evaluator (has flags.is_llm).
+const EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME = "LLM-as-a-judge"
 
 // Drawer (create)
 const EVALUATOR_DRAWER_CREATE_TITLE = "New Evaluator"
@@ -563,6 +567,7 @@ export {
     EVALUATOR_SEARCH_PLACEHOLDER,
     EVALUATOR_TEMPLATE_DROPDOWN_TITLE,
     EVALUATOR_EXACT_MATCH_TEMPLATE_NAME,
+    EVALUATOR_LLM_AS_A_JUDGE_TEMPLATE_NAME,
     EVALUATOR_DRAWER_CREATE_TITLE,
     EVALUATOR_CORRECT_ANSWER_PROP,
     EVALUATOR_DRAWER_CREATE_BUTTON_LABEL,
@@ -573,6 +578,9 @@ export {
     EVALUATOR_SELECT_APP_PLACEHOLDER,
     EVALUATOR_NO_APPS_TEXT,
     EVALUATOR_NON_COMPLETION_TYPE_LABELS,
+    EVALUATOR_POPOVER_TEST_ID,
+    EVALUATOR_POPOVER_ROOT_PANEL_TEST_ID,
+    EVALUATOR_POPOVER_CHILD_PANEL_TEST_ID,
     EVALUATOR_RUN_BUTTON_LABEL,
     EVALUATOR_RESULT_CARD_SELECTOR,
     createHumanEvaluatorFromDrawer,
diff --git a/web/package.json b/web/package.json
index 023d05a446..e9e9a989de 100644
--- a/web/package.json
+++ b/web/package.json
@@ -1,6 +1,6 @@
 {
     "name": "agenta-web",
-    "version": "0.101.1",
+    "version": "0.102.0",
     "workspaces": [
         "ee",
         "oss",
diff --git a/web/packages/agenta-annotation/.gitignore b/web/packages/agenta-annotation/.gitignore
new file mode 100644
index 0000000000..96d253c48e
--- /dev/null
+++ b/web/packages/agenta-annotation/.gitignore
@@ -0,0 +1,3 @@
+# Generated by Vitest — do not commit
+test-results/
+coverage/
diff --git a/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts b/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
new file mode 100644
index 0000000000..f796a10c8c
--- /dev/null
+++ b/web/packages/agenta-annotation/tests/unit/annotation-form-helpers.test.ts
@@ -0,0 +1,378 @@
+/**
+ * Unit tests for pure helper functions exported from annotationFormController.ts:
+ *   - isEmptyValue
+ *   - getOutputsSchema
+ *   - getMetricFieldsFromEvaluator
+ *   - getMetricsFromAnnotation
+ *
+ * The module has many heavy imports (Jotai atoms, entity API calls, session
+ * controller). We mock the external packages so no network or Jotai store
+ * is touched during tests.
+ */
+
+import {beforeEach, describe, expect, it, vi} from "vitest"
+
+// ---------------------------------------------------------------------------
+// Module-level mocks — Vitest hoists vi.mock() calls above the imports
+// ---------------------------------------------------------------------------
+
+const mockResolveOutputSchema = vi.fn() // shared spy; individual tests override what it returns
+
+vi.mock("@agenta/entities/workflow", () => ({
+    resolveOutputSchema: (data: unknown) => mockResolveOutputSchema(data), // forwards to the spy at call time
+    workflowQueryAtomFamily: () => ({isPending: false, data: null}),
+    workflowLatestRevisionQueryAtomFamily: () => ({isPending: false, data: null}),
+}))
+
+vi.mock("@agenta/entities/annotation", () => ({
+    createAnnotation: vi.fn(),
+    updateAnnotation: vi.fn(),
+    invalidateAnnotationCacheByLink: vi.fn(),
+}))
+
+vi.mock("@agenta/entities/evaluationRun", () => ({
+    evaluationRunMolecule: {selectors: {annotationSteps: vi.fn(), scenarioSteps: vi.fn()}},
+    queryEvaluationResults: vi.fn(),
+}))
+
+vi.mock("@agenta/entities/simpleQueue", () => ({
+    invalidateScenarioProgressCache: vi.fn(),
+    invalidateSimpleQueueCache: vi.fn(),
+    invalidateSimpleQueuesListCache: vi.fn(),
+    simpleQueuePaginatedStore: {refreshAtom: {}},
+}))
+
+vi.mock("@agenta/entities/trace", () => ({
+    fetchPreviewTrace: vi.fn(),
+}))
+
+vi.mock("@agenta/shared/api", () => ({
+    axios: {patch: vi.fn(), post: vi.fn()}, // stubbed HTTP verbs: nothing reaches the network
+    getAgentaApiUrl: () => "http://localhost",
+    queryClient: {invalidateQueries: vi.fn()},
+}))
+
+vi.mock("@agenta/shared/state", () => ({
+    projectIdAtom: {},
+}))
+
+vi.mock("../../src/state/controllers/annotationSessionController", () => ({
+    annotationSessionController: {
+        selectors: {
+            evaluatorStepRefs: () => ({}),
+            scenarioAnnotations: () => ({}),
+            scenarioStatuses: () => ({}),
+            activeRunId: () => ({}),
+            focusAutoNext: () => ({}),
+        },
+        set: {markCompleted: vi.fn(), navigateNext: vi.fn()},
+        cache: {invalidateScenarioAnnotations: vi.fn()},
+    },
+}))
+
+// Import the functions AFTER all vi.mock() declarations
+import {
+    getMetricFieldsFromEvaluator,
+    getMetricsFromAnnotation,
+    getOutputsSchema,
+    isEmptyValue,
+} from "../../src/state/controllers/annotationFormController"
+import type {Annotation} from "@agenta/entities/annotation"
+import type {Workflow} from "@agenta/entities/workflow"
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function makeWorkflow(schemaProperties: Record<string, unknown> = {}): Workflow {
+    // The resolveOutputSchema mock hands back whatever it is given, so the
+    // schema shape is placed straight onto `data`.
+    const schemaShape = {properties: schemaProperties}
+    const fixture = {data: schemaShape, slug: "test-evaluator", id: "wf-1"}
+    // Only the fields above are populated; the double cast quiets the
+    // compiler about the rest of the Workflow type.
+    return fixture as unknown as Workflow
+}
+
+function makeAnnotation(
+    outputs: Record<string, unknown>,
+    references?: {evaluator?: {slug?: string}},
+): Annotation {
+    // Bare-bones annotation fixture: fixed trace/span ids, empty meta, and the
+    // caller-supplied outputs plus (optional) references.
+    const ids = {trace_id: "trace-1", span_id: "span-1"}
+    const payload = {data: {outputs}}
+    const fixture = {...ids, ...payload, references, meta: {}}
+    // Partial object, hence the double cast.
+    return fixture as unknown as Annotation
+}
+
+beforeEach(() => {
+    // Drop leftover *Once values and call history, then default to pass-through (returns its input)
+    mockResolveOutputSchema.mockReset().mockImplementation((data: unknown) => data)
+})
+
+// ---------------------------------------------------------------------------
+// isEmptyValue
+// ---------------------------------------------------------------------------
+
+describe("isEmptyValue", () => {
+    it.each([
+        [null, true],
+        [undefined, true],
+        ["", true],
+        [[], true],
+    ])("returns true for %j", (value, expected) => { // %j: "" and [] get distinct, non-blank titles
+        expect(isEmptyValue(value)).toBe(expected)
+    })
+
+    it.each([
+        [0, false],
+        [false, false],
+        ["0", false],
+        [[null], false],
+        [{}, false],
+        [" ", false],
+    ])("returns false for %j", (value, expected) => { // %j: 0 vs "0" no longer share one title
+        expect(isEmptyValue(value)).toBe(expected)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getOutputsSchema
+// ---------------------------------------------------------------------------
+
+describe("getOutputsSchema", () => {
+    it("returns the schema from resolveOutputSchema", () => {
+        // Default mock is pass-through, so the properties given to makeWorkflow should surface.
+        const scoreProperty = {type: "number"}
+        const resolved = getOutputsSchema(makeWorkflow({score: scoreProperty}))
+        expect(resolved).toMatchObject({properties: {score: {type: "number"}}})
+    })
+
+    it("returns empty object when resolveOutputSchema returns null", () => {
+        // One-shot null resolution: the helper is expected to answer with {}.
+        mockResolveOutputSchema.mockReturnValueOnce(null)
+        expect(getOutputsSchema(makeWorkflow())).toEqual({})
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricFieldsFromEvaluator — scalar types
+// ---------------------------------------------------------------------------
+
+describe("getMetricFieldsFromEvaluator — scalar types", () => {
+    it("produces a number field with null default", () => {
+        const scoreSchema = {type: "number", minimum: 0, maximum: 10}
+        const {score} = getMetricFieldsFromEvaluator(makeWorkflow({score: scoreSchema}))
+        expect(score).toMatchObject({value: null, type: "number", minimum: 0, maximum: 10})
+    })
+
+    it("produces an integer field with null default", () => {
+        const {count} = getMetricFieldsFromEvaluator(makeWorkflow({count: {type: "integer"}}))
+        expect(count).toMatchObject({value: null, type: "integer"})
+    })
+
+    it("produces a boolean field with null default", () => {
+        // Asserts the boolean field starts as null rather than false.
+        const boolWorkflow = makeWorkflow({approved: {type: "boolean"}})
+        const {approved} = getMetricFieldsFromEvaluator(boolWorkflow)
+        const expectedField = {value: null, type: "boolean"}
+        expect(approved).toMatchObject(expectedField)
+    })
+
+    it("produces a string field with empty-string default", () => {
+        const {notes} = getMetricFieldsFromEvaluator(makeWorkflow({notes: {type: "string"}}))
+        expect(notes).toMatchObject({value: "", type: "string"})
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — array type", () => {
+    it("produces an array field with item schema", () => {
+        // The item schema (type + enum) is expected on the resulting field,
+        // whose initial value is an empty array.
+        const labelsSchema = {
+            type: "array",
+            items: {type: "string", enum: ["good", "bad"]},
+        }
+        const {labels} = getMetricFieldsFromEvaluator(makeWorkflow({labels: labelsSchema}))
+        expect(labels).toMatchObject({
+            value: [],
+            type: "array",
+            items: {type: "string", enum: ["good", "bad"]},
+        })
+    })
+
+    it("defaults item type to string when items is missing", () => {
+        // Schema has no `items`: the field should still expose an item
+        // descriptor, typed as string with an empty enum.
+        const {tags} = getMetricFieldsFromEvaluator(makeWorkflow({tags: {type: "array"}}))
+        const expectedItems = {type: "string", enum: []}
+        expect(tags.items).toMatchObject(expectedItems)
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — anyOf schema", () => {
+    it("unwraps the first anyOf entry to get the real type", () => {
+        // Nullable number written as anyOf [number, null].
+        const nullableNumber = {anyOf: [{type: "number", minimum: 0}, {type: "null"}]}
+        const {score} = getMetricFieldsFromEvaluator(makeWorkflow({score: nullableNumber}))
+        expect(score).toMatchObject({value: null, type: "number"})
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — array-of-types", () => {
+    it("filters 'null' from the type array and uses the remaining types", () => {
+        const nullableString = {type: ["string", "null"]}
+        const {status} = getMetricFieldsFromEvaluator(makeWorkflow({status: nullableString}))
+        expect(status.type).toEqual(["string"])
+        expect(status.value).toBe("")
+    })
+
+    it("skips the property when only 'null' type remains after filtering", () => {
+        const fields = getMetricFieldsFromEvaluator(makeWorkflow({x: {type: ["null"]}}))
+        expect(fields).not.toHaveProperty("x")
+    })
+
+    it("includes non-null enum values and strips null/empty entries", () => {
+        // The null and "" entries are expected to be absent from the field's enum.
+        const choiceSchema = {type: ["string", "null"], enum: ["a", null, "", "b"]}
+        const {choice} = getMetricFieldsFromEvaluator(makeWorkflow({choice: choiceSchema}))
+        const remainingChoices = choice.enum
+        expect(remainingChoices).toEqual(["a", "b"])
+    })
+})
+
+describe("getMetricFieldsFromEvaluator — edge cases", () => {
+    it("returns empty object for an empty schema", () => {
+        mockResolveOutputSchema.mockReturnValueOnce(null) // one-shot: nothing resolved for this call
+        expect(getMetricFieldsFromEvaluator(makeWorkflow())).toEqual({})
+    })
+
+    it("skips unsupported types (e.g. 'object')", () => {
+        const fields = getMetricFieldsFromEvaluator(makeWorkflow({meta: {type: "object"}}))
+        expect(fields).not.toHaveProperty("meta")
+    })
+
+    it("skips properties with no type field", () => {
+        const untyped = {weird: {description: "no type here"}}
+        expect(getMetricFieldsFromEvaluator(makeWorkflow(untyped))).not.toHaveProperty("weird")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricsFromAnnotation — flat outputs
+// ---------------------------------------------------------------------------
+
+describe("getMetricsFromAnnotation — flat outputs matching schema", () => {
+    it("fills a number field from flat outputs", () => {
+        const evaluator = makeWorkflow({score: {type: "number"}})
+        const annotation = makeAnnotation({score: 8.5})
+        const {score} = getMetricsFromAnnotation(annotation, evaluator)
+        expect(score).toMatchObject({value: 8.5, type: "number"})
+    })
+
+    it("fills a string field from flat outputs", () => {
+        // The field is deliberately not called "notes": that key is reserved
+        // for flattening nested outputs.
+        const evaluator = makeWorkflow({label: {type: "string"}})
+        const annotation = makeAnnotation({label: "looks good"})
+        const {label} = getMetricsFromAnnotation(annotation, evaluator)
+        const expectedField = {value: "looks good", type: "string"}
+        expect(label).toMatchObject(expectedField)
+    })
+
+    it("uses schema default when key is absent in outputs", () => {
+        const evaluator = makeWorkflow({score: {type: "number"}})
+        const {score} = getMetricsFromAnnotation(makeAnnotation({}), evaluator)
+        expect(score).toMatchObject({value: null, type: "number"})
+    })
+
+    it("uses '' as default for a missing string field", () => {
+        const evaluator = makeWorkflow({label: {type: "string"}})
+        const {label} = getMetricsFromAnnotation(makeAnnotation({}), evaluator)
+        expect(label.value).toBe("")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricsFromAnnotation — nested output structures
+// ---------------------------------------------------------------------------
+
+describe("getMetricsFromAnnotation — nested outputs", () => {
+    it("flattens metrics nested under 'metrics' key", () => {
+        const nested = makeAnnotation({metrics: {score: 9}})
+        const fields = getMetricsFromAnnotation(nested, makeWorkflow({score: {type: "number"}}))
+        expect(fields.score.value).toBe(9)
+    })
+
+    it("flattens fields nested under 'notes' key", () => {
+        const nested = makeAnnotation({notes: {comment: "great"}})
+        const fields = getMetricsFromAnnotation(nested, makeWorkflow({comment: {type: "string"}}))
+        expect(fields.comment.value).toBe("great")
+    })
+
+    it("flattens fields nested under 'extra' key", () => {
+        const nested = makeAnnotation({extra: {custom: "value"}})
+        const fields = getMetricsFromAnnotation(nested, makeWorkflow({custom: {type: "string"}}))
+        expect(fields.custom.value).toBe("value")
+    })
+
+    it("flat keys outside of metrics/notes/extra are preserved directly", () => {
+        const flat = makeAnnotation({direct: 42})
+        const fields = getMetricsFromAnnotation(flat, makeWorkflow({direct: {type: "number"}}))
+        expect(fields.direct.value).toBe(42)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getMetricsFromAnnotation — schema-free (infer from outputs)
+// ---------------------------------------------------------------------------
+
+describe("getMetricsFromAnnotation — schema-free inference", () => {
+    beforeEach(() => {
+        // No schema resolves for any call in this suite, so fields must be inferred from outputs
+        mockResolveOutputSchema.mockReturnValue(null)
+    })
+
+    it("infers an integer field from an integer output value", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({score: 7})
+        const fields = getMetricsFromAnnotation(ann, wf)
+        expect(fields.score.type).toBe("integer") // 7 is a whole number, hence "integer" not "number"
+        expect(fields.score.value).toBe(7)
+    })
+
+    it("infers a boolean field from a boolean output value", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({approved: true})
+        expect(getMetricsFromAnnotation(ann, wf).approved).toMatchObject({
+            value: true,
+            type: "boolean",
+        })
+    })
+
+    it("infers a string field from a string output value", () => {
+        // "notes" is a reserved flattening key — use a plain field name
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({comment: "hello"})
+        expect(getMetricsFromAnnotation(ann, wf).comment).toMatchObject({
+            value: "hello",
+            type: "string",
+        })
+    })
+
+    it("serialises an object output to a JSON string field", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({meta: {key: "val"}})
+        const field = getMetricsFromAnnotation(ann, wf).meta
+        expect(field.type).toBe("string")
+        expect(field.value).toBe(JSON.stringify({key: "val"})) // object outputs surface as their JSON text
+    })
+
+    it("returns empty object when annotation outputs are empty", () => {
+        const wf = makeWorkflow()
+        const ann = makeAnnotation({})
+        expect(getMetricsFromAnnotation(ann, wf)).toEqual({})
+    })
+})
diff --git a/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts b/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
new file mode 100644
index 0000000000..4c7ce5c783
--- /dev/null
+++ b/web/packages/agenta-annotation/tests/unit/testset-sync.test.ts
@@ -0,0 +1,660 @@
+/**
+ * Unit tests for pure functions in src/state/testsetSync.ts.
+ *
+ * All functions under test are pure data transformations with no side effects.
+ * The entity imports in testsetSync.ts are type-only, so no mocking is needed.
+ */
+
+import {describe, expect, it} from "vitest"
+
+import type {Annotation} from "@agenta/entities/annotation"
+import type {Testcase} from "@agenta/entities/testcase"
+import {
+    buildTestcaseExportRows,
+    buildTestsetSyncOperations,
+    buildTestsetSyncPreview,
+    buildTraceTestsetRows,
+    getQueueAnnotationTag,
+    getTestsetSyncEvaluatorColumnKey,
+    mergeTestcaseAnnotationTags,
+    remapTargetRowsToBaseRevision,
+    selectQueueScopedAnnotation,
+    TESTCASE_QUEUE_KIND_TAG,
+} from "../../src/state/testsetSync"
+
+// ---------------------------------------------------------------------------
+// Minimal fixture builders
+// ---------------------------------------------------------------------------
+
+function makeAnnotation(
+    overrides: {
+        evaluatorSlug?: string
+        evaluatorId?: string
+        tags?: string[]
+        outputs?: Record<string, unknown>
+        traceId?: string
+        spanId?: string
+    } = {},
+): Annotation {
+    return {
+        trace_id: overrides.traceId ?? "trace-1",
+        span_id: overrides.spanId ?? "span-1",
+        meta: {tags: overrides.tags ?? []},
+        references: {
+            evaluator: {
+                id: overrides.evaluatorId,
+                slug: overrides.evaluatorSlug,
+            },
+        },
+        data: {outputs: overrides.outputs ?? {}},
+    } as unknown as Annotation
+}
+
+function queueTag(queueId: string) {
+    return `agenta:queue:${queueId}`
+}
+
+// ---------------------------------------------------------------------------
+// getQueueAnnotationTag — formats a queue ID as "agenta:queue:<queueId>"
+// ---------------------------------------------------------------------------
+
+describe("getQueueAnnotationTag", () => {
+    it("formats queue ID into tag", () => {
+        expect(getQueueAnnotationTag("q-abc")).toBe("agenta:queue:q-abc")
+    })
+
+    it("handles arbitrary queue IDs", () => {
+        expect(getQueueAnnotationTag("123-456-789")).toBe("agenta:queue:123-456-789")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// mergeTestcaseAnnotationTags — queue/kind tags + existing tags + output keys, deduplicated
+// ---------------------------------------------------------------------------
+
+describe("mergeTestcaseAnnotationTags", () => {
+    it("always includes the queue tag and kind tag", () => {
+        const tags = mergeTestcaseAnnotationTags({queueId: "q-1"})
+        expect(tags).toContain(queueTag("q-1"))
+        expect(tags).toContain(TESTCASE_QUEUE_KIND_TAG)
+    })
+
+    it("merges existing tags without duplicates", () => {
+        const tags = mergeTestcaseAnnotationTags({
+            queueId: "q-1",
+            existingTags: ["score", "notes", queueTag("q-1")],
+            outputKeys: ["score"],
+        })
+        expect(tags.filter((t) => t === "score")).toHaveLength(1)
+        expect(tags.filter((t) => t === queueTag("q-1"))).toHaveLength(1)
+        expect(tags).toContain("notes")
+    })
+
+    it("adds output keys as tags", () => {
+        const tags = mergeTestcaseAnnotationTags({
+            queueId: "q-1",
+            outputKeys: ["relevance", "fluency"],
+        })
+        expect(tags).toContain("relevance")
+        expect(tags).toContain("fluency")
+    })
+
+    it("handles null existingTags gracefully", () => {
+        const tags = mergeTestcaseAnnotationTags({queueId: "q-1", existingTags: null})
+        expect(tags).toContain(queueTag("q-1"))
+        expect(tags).toContain(TESTCASE_QUEUE_KIND_TAG)
+    })
+
+    it("filters out falsy tags from existingTags", () => {
+        const tags = mergeTestcaseAnnotationTags({
+            queueId: "q-1",
+            existingTags: ["", null as unknown as string, "valid-tag"],
+        })
+        expect(tags).not.toContain("")
+        expect(tags).not.toContain(null)
+        expect(tags).toContain("valid-tag")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// selectQueueScopedAnnotation — queue-scoped match, legacy fallback, duplicate conflicts
+// ---------------------------------------------------------------------------
+
+describe("selectQueueScopedAnnotation — no match", () => {
+    it("returns null annotation when list is empty", () => {
+        const result = selectQueueScopedAnnotation({
+            annotations: [],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: null})
+    })
+
+    it("returns null annotation when no annotation matches the evaluator slug", () => {
+        const ann = makeAnnotation({evaluatorSlug: "other-evaluator"})
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: null})
+    })
+})
+
+describe("selectQueueScopedAnnotation — queue-scoped matching", () => {
+    it("returns the annotation when exactly one queue-scoped match exists", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: ann, conflictCode: null})
+    })
+
+    it("returns duplicate_queue_annotations when multiple queue-scoped annotations match", () => {
+        const ann1 = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            traceId: "trace-1",
+        })
+        const ann2 = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            traceId: "trace-2",
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann1, ann2],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: "duplicate_queue_annotations"})
+    })
+
+    it("ignores annotations scoped to a different queue", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [queueTag("q-OTHER"), TESTCASE_QUEUE_KIND_TAG],
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        // Not a queue-scoped match for q-1, and it has a queue tag → not legacy either
+        expect(result.annotation).toBeNull()
+        expect(result.conflictCode).toBeNull()
+    })
+})
+
+describe("selectQueueScopedAnnotation — legacy fallback", () => {
+    it("falls back to a legacy annotation (no queue tags) when no queue-scoped match", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "relevance",
+            tags: [], // no queue tags → legacy
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: ann, conflictCode: null})
+    })
+
+    it("returns duplicate_legacy_annotations when multiple legacy annotations match", () => {
+        const ann1 = makeAnnotation({evaluatorSlug: "relevance", tags: [], traceId: "trace-1"})
+        const ann2 = makeAnnotation({evaluatorSlug: "relevance", tags: [], traceId: "trace-2"})
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann1, ann2],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+        })
+        expect(result).toEqual({annotation: null, conflictCode: "duplicate_legacy_annotations"})
+    })
+})
+
+describe("selectQueueScopedAnnotation — evaluatorWorkflowId matching", () => {
+    it("matches annotation by evaluator workflow ID", () => {
+        const ann = makeAnnotation({
+            evaluatorId: "wf-abc",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+        })
+        const result = selectQueueScopedAnnotation({
+            annotations: [ann],
+            queueId: "q-1",
+            evaluatorSlug: "relevance",
+            evaluatorWorkflowId: "wf-abc",
+        })
+        expect(result).toEqual({annotation: ann, conflictCode: null})
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getTestsetSyncEvaluatorColumnKey — annotation slug, else evaluator slug, else workflowId
+// ---------------------------------------------------------------------------
+
+describe("getTestsetSyncEvaluatorColumnKey", () => {
+    const evaluator = {slug: "relevance", workflowId: "wf-1"}
+
+    it("returns evaluator slug when no annotation supplied", () => {
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator})).toBe("relevance")
+    })
+
+    it("prefers annotation's evaluator slug over evaluator.slug", () => {
+        const ann = makeAnnotation({evaluatorSlug: "resolved-slug"})
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator, annotation: ann})).toBe("resolved-slug")
+    })
+
+    it("falls back to evaluator.workflowId when slug is empty", () => {
+        const noSlugEval = {slug: "", workflowId: "wf-fallback"}
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator: noSlugEval})).toBe("wf-fallback")
+    })
+
+    it("returns empty string when evaluator has no slug or workflowId", () => {
+        expect(getTestsetSyncEvaluatorColumnKey({evaluator: {slug: "", workflowId: ""}})).toBe("")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTestsetSyncOperations — target rows → {rows: {replace: [{id, data}]}}
+// ---------------------------------------------------------------------------
+
+describe("buildTestsetSyncOperations", () => {
+    it("maps target rows to replace operations", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 2,
+            rows: [
+                {
+                    scenarioId: "s-1",
+                    testcaseId: "tc-1",
+                    testsetId: "ts-1",
+                    rowId: "r-1",
+                    data: {x: 1},
+                },
+                {
+                    scenarioId: "s-2",
+                    testcaseId: "tc-2",
+                    testsetId: "ts-1",
+                    rowId: "r-2",
+                    data: {x: 2},
+                },
+            ],
+        }
+
+        const ops = buildTestsetSyncOperations(target)
+        expect(ops).toEqual({
+            rows: {
+                replace: [
+                    {id: "r-1", data: {x: 1}},
+                    {id: "r-2", data: {x: 2}},
+                ],
+            },
+        })
+    })
+
+    it("produces an empty replace list for a target with no rows", () => {
+        const ops = buildTestsetSyncOperations({
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 0,
+            rows: [],
+        })
+        expect(ops.rows.replace).toHaveLength(0)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// remapTargetRowsToBaseRevision — match by rowId, remap by dedup id, drop unmatched rows
+// ---------------------------------------------------------------------------
+
+describe("remapTargetRowsToBaseRevision", () => {
+    it("keeps rows whose rowId exists directly in baseRows", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {scenarioId: "s-1", testcaseId: "tc-1", testsetId: "ts-1", rowId: "r-1", data: {}},
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "r-1"}],
+        })
+        expect(result.rows).toHaveLength(1)
+        expect(result.rows[0].rowId).toBe("r-1")
+        expect(droppedRowCount).toBe(0)
+    })
+
+    it("remaps a row using testcase_dedup_id when rowId is not in baseRows", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {
+                    scenarioId: "s-1",
+                    testcaseId: "tc-1",
+                    testsetId: "ts-1",
+                    rowId: "old-id",
+                    data: {testcase_dedup_id: "dedup-abc"},
+                },
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "new-id", data: {testcase_dedup_id: "dedup-abc"}}],
+        })
+        expect(result.rows[0].rowId).toBe("new-id")
+        expect(droppedRowCount).toBe(0)
+    })
+
+    it("also remaps using legacy __dedup_id__ key", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {
+                    scenarioId: "s-1",
+                    testcaseId: "tc-1",
+                    testsetId: "ts-1",
+                    rowId: "old-id",
+                    data: {__dedup_id__: "dedup-xyz"},
+                },
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "mapped-id", data: {__dedup_id__: "dedup-xyz"}}],
+        })
+        expect(result.rows[0].rowId).toBe("mapped-id")
+        expect(droppedRowCount).toBe(0)
+    })
+
+    it("drops rows with no matching rowId and no dedup key", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 1,
+            rows: [
+                {scenarioId: "s-1", testcaseId: "tc-1", testsetId: "ts-1", rowId: "gone", data: {}},
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "other-id"}],
+        })
+        expect(result.rows).toHaveLength(0)
+        expect(droppedRowCount).toBe(1)
+    })
+
+    it("updates rowCount to reflect mapped rows only", () => {
+        const target = {
+            testsetId: "ts-1",
+            baseRevisionId: "rev-1",
+            rowCount: 2,
+            rows: [
+                {scenarioId: "s-1", testcaseId: "tc-1", testsetId: "ts-1", rowId: "r-1", data: {}},
+                {scenarioId: "s-2", testcaseId: "tc-2", testsetId: "ts-1", rowId: "gone", data: {}},
+            ],
+        }
+        const {target: result, droppedRowCount} = remapTargetRowsToBaseRevision({
+            target,
+            baseRows: [{id: "r-1"}],
+        })
+        expect(result.rowCount).toBe(1)
+        expect(droppedRowCount).toBe(1)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTraceTestsetRows — one row per scenario from trace inputs, output and annotations
+// ---------------------------------------------------------------------------
+
+describe("buildTraceTestsetRows", () => {
+    it("builds a row per scenario with trace inputs and output", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-1"],
+            traceInputsByScenario: new Map([["s-1", {question: "What is AI?"}]]),
+            traceOutputsByScenario: new Map([["s-1", "AI is..."]]),
+            annotationsByScenario: new Map(),
+            outputColumnName: "answer",
+        })
+        expect(rows).toHaveLength(1)
+        expect(rows[0].scenarioId).toBe("s-1")
+        expect(rows[0].data.question).toBe("What is AI?")
+        expect(rows[0].data.answer).toBe("AI is...")
+    })
+
+    it("expands a nested 'inputs' key into top-level columns", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-1"],
+            traceInputsByScenario: new Map([["s-1", {inputs: {a: 1, b: 2}}]]),
+            traceOutputsByScenario: new Map(),
+            annotationsByScenario: new Map(),
+            outputColumnName: "output",
+        })
+        expect(rows[0].data.a).toBe(1)
+        expect(rows[0].data.b).toBe(2)
+        expect(rows[0].data).not.toHaveProperty("inputs")
+    })
+
+    it("merges annotation outputs into the row", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-1"],
+            traceInputsByScenario: new Map([["s-1", {q: "hi"}]]),
+            traceOutputsByScenario: new Map([["s-1", "hello"]]),
+            annotationsByScenario: new Map([["s-1", {relevance: {score: 5}}]]),
+            outputColumnName: "output",
+        })
+        expect(rows[0].data.relevance).toMatchObject({score: 5})
+    })
+
+    it("handles a missing scenario gracefully (uses empty defaults)", () => {
+        const rows = buildTraceTestsetRows({
+            scenarioIds: ["s-missing"],
+            traceInputsByScenario: new Map(),
+            traceOutputsByScenario: new Map(),
+            annotationsByScenario: new Map(),
+            outputColumnName: "output",
+        })
+        expect(rows).toHaveLength(1)
+        expect(rows[0].data.output).toBeUndefined()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTestcaseExportRows — one row per annotated testcase; unmapped/unannotated skipped
+// ---------------------------------------------------------------------------
+
+describe("buildTestcaseExportRows", () => {
+    const evaluator = {slug: "quality", workflowId: "wf-q"}
+
+    function makeTestcase(id: string, testsetId: string): Testcase {
+        return {id, testset_id: testsetId, data: {prompt: "hello"}} as unknown as Testcase
+    }
+
+    it("builds a row when annotation data exists for the testcase", () => {
+        const ann = makeAnnotation({
+            evaluatorSlug: "quality",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            outputs: {score: 8},
+        })
+        const rows = buildTestcaseExportRows({
+            scenarioIds: ["s-1"],
+            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1")]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
+            evaluators: [evaluator],
+            queueId: "q-1",
+        })
+        expect(rows).toHaveLength(1)
+        expect(rows[0].testcaseId).toBe("tc-1")
+        expect(rows[0].testsetId).toBe("ts-1")
+        expect((rows[0].data as Record<string, unknown>).quality).toMatchObject({score: 8})
+    })
+
+    it("skips a scenario with no testcase mapping", () => {
+        const rows = buildTestcaseExportRows({
+            scenarioIds: ["s-missing"],
+            testcasesByScenarioId: new Map(),
+            annotationsByTestcaseId: new Map(),
+            evaluators: [evaluator],
+            queueId: "q-1",
+        })
+        expect(rows).toHaveLength(0)
+    })
+
+    it("skips a testcase with no annotations", () => {
+        const rows = buildTestcaseExportRows({
+            scenarioIds: ["s-1"],
+            testcasesByScenarioId: new Map([["s-1", makeTestcase("tc-1", "ts-1")]]),
+            annotationsByTestcaseId: new Map([["tc-1", []]]),
+            evaluators: [evaluator],
+            queueId: "q-1",
+        })
+        expect(rows).toHaveLength(0)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildTestsetSyncPreview — groups exportable rows per testset, reports blocking conflicts
+// ---------------------------------------------------------------------------
+
+describe("buildTestsetSyncPreview", () => {
+    const evaluator = {slug: "quality", workflowId: "wf-q"}
+
+    function makeTestcase(id: string, testsetId: string): Testcase {
+        return {id, testset_id: testsetId, data: {}} as unknown as Testcase
+    }
+
+    function makeQueueAnn(traceId = "trace-1") {
+        return makeAnnotation({
+            evaluatorSlug: "quality",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            outputs: {score: 7},
+            traceId,
+        })
+    }
+
+    it("returns a missing_testcase conflict when testcase not found", () => {
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-missing"}],
+            testcasesById: new Map(),
+            annotationsByTestcaseId: new Map(),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map(),
+        })
+        expect(preview.conflicts).toHaveLength(1)
+        expect(preview.conflicts[0].code).toBe("missing_testcase")
+        expect(preview.hasBlockingConflicts).toBe(true)
+    })
+
+    it("returns a missing_testset conflict when testcase has no testset_id", () => {
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", {id: "tc-1", data: {}} as unknown as Testcase]]),
+            annotationsByTestcaseId: new Map(),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map(),
+        })
+        expect(preview.conflicts[0].code).toBe("missing_testset")
+    })
+
+    it("returns a missing_latest_revision conflict when no revision for testset", () => {
+        const ann = makeQueueAnn()
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map(), // ts-1 has no revision
+        })
+        expect(preview.conflicts.some((c) => c.code === "missing_latest_revision")).toBe(true)
+    })
+
+    it("produces a clean target when everything is resolved", () => {
+        const ann = makeQueueAnn()
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.conflicts).toHaveLength(0)
+        expect(preview.targets).toHaveLength(1)
+        expect(preview.targets[0].testsetId).toBe("ts-1")
+        expect(preview.targets[0].baseRevisionId).toBe("rev-1")
+        expect(preview.exportableRows).toBe(1)
+        expect(preview.hasBlockingConflicts).toBe(false)
+    })
+
+    it("records duplicate_queue_annotations conflict and skips the row", () => {
+        const ann1 = makeQueueAnn("trace-1")
+        const ann2 = makeQueueAnn("trace-2")
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
+            annotationsByTestcaseId: new Map([["tc-1", [ann1, ann2]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.conflicts[0].code).toBe("duplicate_queue_annotations")
+        expect(preview.exportableRows).toBe(0)
+        expect(preview.hasBlockingConflicts).toBe(true)
+    })
+
+    it("groups rows from different scenarios under the same testset target", () => {
+        const ann1 = makeQueueAnn("trace-1")
+        const ann2 = makeQueueAnn("trace-2")
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [
+                {scenarioId: "s-1", testcaseId: "tc-1"},
+                {scenarioId: "s-2", testcaseId: "tc-2"},
+            ],
+            testcasesById: new Map([
+                ["tc-1", makeTestcase("tc-1", "ts-1")],
+                ["tc-2", makeTestcase("tc-2", "ts-1")],
+            ]),
+            annotationsByTestcaseId: new Map([
+                ["tc-1", [ann1]],
+                ["tc-2", [ann2]],
+            ]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.targets).toHaveLength(1)
+        expect(preview.targets[0].rowCount).toBe(2)
+        expect(preview.exportableRows).toBe(2)
+    })
+
+    it("skips rows with no annotation data and does not add them as conflicts", () => {
+        const annNoOutputs = makeAnnotation({
+            evaluatorSlug: "quality",
+            tags: [queueTag("q-1"), TESTCASE_QUEUE_KIND_TAG],
+            outputs: {}, // empty
+        })
+        const preview = buildTestsetSyncPreview({
+            queueId: "q-1",
+            completedScenarios: [{scenarioId: "s-1", testcaseId: "tc-1"}],
+            testcasesById: new Map([["tc-1", makeTestcase("tc-1", "ts-1")]]),
+            annotationsByTestcaseId: new Map([["tc-1", [annNoOutputs]]]),
+            evaluators: [evaluator],
+            latestRevisionIdsByTestsetId: new Map([["ts-1", "rev-1"]]),
+        })
+        expect(preview.conflicts).toHaveLength(0)
+        expect(preview.exportableRows).toBe(0)
+    })
+})
diff --git a/web/packages/agenta-annotation/vitest.config.ts b/web/packages/agenta-annotation/vitest.config.ts
index a9a2cfed1d..92bca1ab9d 100644
--- a/web/packages/agenta-annotation/vitest.config.ts
+++ b/web/packages/agenta-annotation/vitest.config.ts
@@ -1,6 +1,15 @@
+import path from "path"
+
 import {defineConfig} from "vitest/config"
 
 export default defineConfig({
+    resolve: {
+        alias: {
+            // Stub @agenta/ui to avoid pulling in the full antd tree.
+            // Annotation tests only exercise pure functions — no React rendering.
+            "@agenta/ui": path.resolve(__dirname, "tests/__mocks__/agenta-ui.ts"),
+        },
+    },
     test: {
         include: ["tests/unit/**/*.test.ts"],
         environment: "node",
diff --git a/web/packages/agenta-api-client/package.json b/web/packages/agenta-api-client/package.json
index 93f9d20f6b..ef00623602 100644
--- a/web/packages/agenta-api-client/package.json
+++ b/web/packages/agenta-api-client/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agentaai/api-client",
-  "version": "0.101.1",
+  "version": "0.102.0",
   "private": true,
   "type": "module",
   "main": "./dist/index.js",
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/runReferenceFilter.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/runReferenceFilter.test.ts
new file mode 100644
index 0000000000..6bf46333e1
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/runReferenceFilter.test.ts
@@ -0,0 +1,207 @@
+/**
+ * Run-list reference predicate — the run-level counterpart to the scenario-row
+ * predicate. Covers role resolution off `step.references` (the role-keyed
+ * primary path + the step.type legacy fallback), the subject/grader
+ * distinction (`isSubjectRun`), the `hasResolvableSubject` safety guard, the
+ * `eq`/`ne` ops, multi-predicate AND, and the `makeRunReferenceFilter`
+ * pipeline transform.
+ */
+
+import assert from "node:assert/strict"
+import {describe, it} from "node:test"
+
+import type {Chunk} from "../../../etl/core/types"
+import {
+    collectRoleReferenceKeys,
+    evaluateRunReferencePredicate,
+    hasResolvableSubject,
+    isSubjectRun,
+    makeRunReferenceFilter,
+    matchesRunReferenceFilter,
+    type RunReferenceStep,
+} from "../runReferenceFilter"
+
+const EVALUATOR = "eval-with-reasoning"
+const APP = "app-comp-1"
+const GRADER = "eval-grader-x"
+
+/** A grader run: app `app-comp-1` graded by evaluator `eval-with-reasoning`. */
+const graderRun: RunReferenceStep[] = [
+    {type: "input", references: {testset: {id: "ts-1"}}},
+    {type: "invocation", references: {application: {id: APP}}},
+    {type: "annotation", references: {evaluator: {id: EVALUATOR, slug: "with-reasoning"}}},
+]
+
+/** A subject run: `eval-with-reasoning` is the subject, graded by `eval-grader-x` (#4237). */
+const subjectRun: RunReferenceStep[] = [
+    {type: "input", references: {testset: {id: "ts-2"}}},
+    {type: "invocation", references: {application: {id: EVALUATOR}}},
+    {type: "annotation", references: {evaluator: {id: GRADER}}},
+]
+
+describe("collectRoleReferenceKeys", () => {
+    it("reads role-keyed references off each step", () => {
+        assert.deepEqual([...collectRoleReferenceKeys(graderRun, "application")], [APP])
+        assert.deepEqual(
+            [...collectRoleReferenceKeys(graderRun, "evaluator")].sort(),
+            [EVALUATOR, "with-reasoning"].sort(),
+        )
+        assert.deepEqual([...collectRoleReferenceKeys(graderRun, "testset")], ["ts-1"])
+    })
+
+    it("includes both id and slug so evaluators match either", () => {
+        const keys = collectRoleReferenceKeys(graderRun, "evaluator")
+        assert.ok(keys.has(EVALUATOR))
+        assert.ok(keys.has("with-reasoning"))
+    })
+
+    it("returns empty for missing/empty steps", () => {
+        assert.equal(collectRoleReferenceKeys(null, "application").size, 0)
+        assert.equal(collectRoleReferenceKeys(undefined, "application").size, 0)
+        assert.equal(collectRoleReferenceKeys([], "application").size, 0)
+        assert.equal(collectRoleReferenceKeys([{type: "invocation"}], "application").size, 0)
+    })
+
+    it("falls back to step.type for a legacy single-reference step", () => {
+        const legacy: RunReferenceStep[] = [{type: "invocation", references: {ref: {id: APP}}}]
+        assert.deepEqual([...collectRoleReferenceKeys(legacy, "application")], [APP])
+    })
+
+    it("does NOT use the legacy fallback when multiple references are present (avoids over-match)", () => {
+        const ambiguous: RunReferenceStep[] = [
+            {type: "invocation", references: {ref: {id: APP}, other: {id: "x"}}},
+        ]
+        assert.equal(collectRoleReferenceKeys(ambiguous, "application").size, 0)
+    })
+})
+
+describe("isSubjectRun / grader distinction", () => {
+    it("subject run: the evaluator is the application/subject", () => {
+        assert.equal(isSubjectRun(subjectRun, EVALUATOR), true)
+    })
+
+    it("grader run: the evaluator is NOT the subject (it's an annotation)", () => {
+        assert.equal(isSubjectRun(graderRun, EVALUATOR), false)
+    })
+
+    it("the app IS the subject of its own grader run", () => {
+        assert.equal(isSubjectRun(graderRun, APP), true)
+    })
+})
+
+describe("evaluateRunReferencePredicate ops", () => {
+    it("eq matches the role's id", () => {
+        assert.equal(
+            evaluateRunReferencePredicate({role: "evaluator", id: EVALUATOR}, graderRun),
+            true,
+        )
+    })
+
+    it("ne is the complement", () => {
+        assert.equal(
+            evaluateRunReferencePredicate(
+                {role: "application", id: EVALUATOR, op: "ne"},
+                graderRun,
+            ),
+            true,
+        )
+        assert.equal(
+            evaluateRunReferencePredicate(
+                {role: "application", id: EVALUATOR, op: "ne"},
+                subjectRun,
+            ),
+            false,
+        )
+    })
+
+    it("matches an evaluator by slug too", () => {
+        assert.equal(
+            evaluateRunReferencePredicate({role: "evaluator", id: "with-reasoning"}, graderRun),
+            true,
+        )
+    })
+})
+
+describe("hasResolvableSubject", () => {
+    it("true when an application reference exists", () => {
+        assert.equal(hasResolvableSubject(graderRun), true)
+        assert.equal(hasResolvableSubject(subjectRun), true)
+    })
+
+    it("false when no application reference can be resolved", () => {
+        assert.equal(
+            hasResolvableSubject([{type: "annotation", references: {evaluator: {id: EVALUATOR}}}]),
+            false,
+        )
+        assert.equal(hasResolvableSubject([]), false)
+        assert.equal(hasResolvableSubject(null), false)
+    })
+})
+
+describe("matchesRunReferenceFilter (AND-join)", () => {
+    it("AND-joins multiple predicates", () => {
+        // subject == evaluator AND grader == GRADER
+        assert.equal(
+            matchesRunReferenceFilter(
+                [
+                    {role: "application", id: EVALUATOR},
+                    {role: "evaluator", id: GRADER},
+                ],
+                subjectRun,
+            ),
+            true,
+        )
+        // subject == evaluator AND grader == (the wrong id) → fails
+        assert.equal(
+            matchesRunReferenceFilter(
+                [
+                    {role: "application", id: EVALUATOR},
+                    {role: "evaluator", id: "nope"},
+                ],
+                subjectRun,
+            ),
+            false,
+        )
+    })
+})
+
+describe("makeRunReferenceFilter (Transform)", () => {
+    it("keeps only subject runs and reports chunk telemetry", () => {
+        interface Row {
+            id: string
+            steps: RunReferenceStep[]
+        }
+        const rows: Row[] = [
+            {id: "subject", steps: subjectRun},
+            {id: "grader", steps: graderRun},
+        ]
+        const seen: {scanned: number; matched: number}[] = []
+        const filter = makeRunReferenceFilter<Row>({
+            predicates: {role: "application", id: EVALUATOR},
+            getSteps: (row) => row.steps,
+            onChunkFiltered: ({scanned, matched}) => seen.push({scanned, matched}),
+        })
+
+        const chunk: Chunk<Row> = {items: rows, cursor: null}
+        const out = filter(chunk) as Chunk<Row>
+
+        assert.deepEqual(
+            out.items.map((r) => r.id),
+            ["subject"],
+        )
+        assert.deepEqual(seen, [{scanned: 2, matched: 1}])
+    })
+
+    it("defaultGetSteps reads row.previewMeta.steps", () => {
+        interface Row {
+            previewMeta: {steps: RunReferenceStep[]}
+        }
+        const rows: Row[] = [{previewMeta: {steps: subjectRun}}, {previewMeta: {steps: graderRun}}]
+        const filter = makeRunReferenceFilter<Row>({
+            predicates: {role: "application", id: EVALUATOR},
+        })
+        const out = filter({items: rows, cursor: null}) as Chunk<Row>
+        assert.equal(out.items.length, 1)
+        assert.equal(out.items[0]!.previewMeta.steps, subjectRun)
+    })
+})
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
index 4bb71e0faf..5e647662fe 100644
--- a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
@@ -114,6 +114,24 @@ export {
     type PredicateGroupFilterOptions,
 } from "./rowPredicateFilter"
 
+// Run-list predicate filter — the run-level counterpart to rowPredicateFilter.
+// Drops whole RUNS from a run list by the ROLE their references play
+// (subject / "application" vs grader / "evaluator"), reusing the same
+// step.type → role convention. Powers "evaluations that evaluated THIS
+// workflow" — the evaluator Evaluations/Overview unification (feature F).
+export {
+    collectRoleReferenceKeys,
+    evaluateRunReferencePredicate,
+    isSubjectRun,
+    hasResolvableSubject,
+    matchesRunReferenceFilter,
+    makeRunReferenceFilter,
+    type RunReferenceStep,
+    type RunReferenceRole,
+    type RunReferencePredicate,
+    type RunReferenceFilterOptions,
+} from "./runReferenceFilter"
+
 // filterSchema — derives the filterable fields (typed + type-matched
 // operators) the Phase 2 filter UI offers. Decision D8 / eval-filtering D4.
 export {
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/runReferenceFilter.ts b/web/packages/agenta-entities/src/evaluationRun/etl/runReferenceFilter.ts
new file mode 100644
index 0000000000..54eae9d7ca
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/runReferenceFilter.ts
@@ -0,0 +1,225 @@
+/**
+ * Run-list predicate filter — the run-level counterpart to rowPredicateFilter.
+ *
+ * # Where this fits
+ *
+ * `rowPredicateFilter` drops scenario ROWS *within a single run* by their
+ * resolved cell values (an evaluator's `success`, a testset column, a metric).
+ * This module drops whole RUNS *from a run list* by the **role** their
+ * references play.
+ *
+ * The canonical use is the unification behind feature "F": an evaluator's
+ * Evaluations / Overview tab should show the evaluations that *evaluated this
+ * workflow* — runs where the visited workflow is the run's **subject** (its
+ * `application` / invocation reference) — NOT runs that merely *used* it as a
+ * grader (where it sits in an `evaluator` / annotation reference).
+ *
+ * The backend reference filter (`references @> [...]`) matches an id in *any*
+ * role, so `application = evaluatorId` over-returns: it also matches runs where
+ * the evaluator was a grader. That's harmless for apps (an app id only ever
+ * occupies the `application` role) but leaks for evaluators (their id occupies
+ * `evaluator` on every run they grade, and `application` on their own subject
+ * runs). This filter resolves the role from the run's structure and keeps the
+ * intended one.
+ *
+ * # Why structural, not `meta.application`
+ *
+ * The run carries a denormalized `meta.application` hint, but it's unreliable
+ * (absent on some runs) — a null hint silently bypasses any `meta`-based
+ * filter, which is exactly how grader runs slip through today. The run's
+ * `data.steps` are the source of truth: the invocation step's `application`
+ * reference is the evaluated/subject workflow, regardless of `meta`.
+ *
+ * # Role convention
+ *
+ * Same `step.type → role` mapping `resolveMappings` / `predicateToEntitySlices`
+ * use on the read side:
+ *
+ *   input       → testset
+ *   invocation  → application   (the evaluated / subject workflow)
+ *   annotation  → evaluator     (the grader)
+ *
+ * References are already role-keyed off each step
+ * (`{application: {id}}`, `{evaluator: {id, slug}}`, …); `step.type` is only a
+ * fallback for legacy steps whose single reference wasn't explicitly keyed.
+ *
+ * @packageDocumentation
+ */
+
+import type {Chunk, Transform} from "../../etl/core/types"
+
+/**
+ * Minimal structural shape of a run step — intentionally looser than
+ * `RunSchema`'s `RunStep` so callers can pass `previewMeta.steps[]`
+ * (whose `references` is typed `Record<string, unknown>`) without a cast.
+ */
+export interface RunReferenceStep {
+    type?: string | null // step kind; consulted only by the legacy role fallback
+    references?: Record<string, unknown> | null // role → reference; values narrowed when read
+}
+
+/** Role a run-step reference can play; `(string & {})` keeps it open yet autocompletes literals. */
+export type RunReferenceRole = "application" | "evaluator" | "testset" | "query" | (string & {})
+
+/**
+ * `step.type → canonical role`. Used only as a fallback when a step's
+ * references aren't explicitly role-keyed (legacy single-reference steps).
+ */
+const STEP_TYPE_TO_ROLE: Record<string, RunReferenceRole> = {
+    input: "testset",
+    invocation: "application", // the evaluated / subject workflow
+    annotation: "evaluator", // the grader
+}
+
+/**
+ * One run-level clause: the run must (op "eq") or must not (op "ne") carry
+ * `id` in the given `role`.
+ *
+ *   - `role` — which reference slot the id must occupy ("application" = subject).
+ *   - `id` — the id (or slug) to match, compared by exact string equality.
+ *   - `op` — "eq" → run HAS the id in this role; "ne" → run does NOT. Default "eq".
+ */
+export interface RunReferencePredicate {
+    role: RunReferenceRole
+    id: string
+    op?: "eq" | "ne"
+}
+
+function addRefKeys(ref: unknown, into: Set<string>): void {
+    if (!ref || typeof ref !== "object") return // non-object references contribute nothing
+    const {id, slug} = ref as {id?: unknown; slug?: unknown}
+    if (typeof id === "string" && id) into.add(id) // empty strings are skipped
+    // Evaluators are frequently referenced by slug rather than id, so match both.
+    if (typeof slug === "string" && slug) into.add(slug)
+}
+
+/**
+ * Collect every id/slug a given `role` occupies across a run's steps.
+ *
+ * Primary path: the role-keyed reference on each step (`refs[role]`). Fallback:
+ * a legacy step whose `references` isn't role-keyed but whose `step.type` maps
+ * to `role` and which carries exactly one reference.
+ */
+export function collectRoleReferenceKeys(
+    steps: readonly RunReferenceStep[] | null | undefined,
+    role: RunReferenceRole,
+): Set<string> {
+    const keys = new Set<string>()
+    if (!Array.isArray(steps)) return keys // null / undefined / non-array → empty set
+
+    for (const step of steps) {
+        const refs = step?.references
+        if (!refs || typeof refs !== "object") continue
+        const map = refs as Record<string, unknown>
+
+        const direct = map[role]
+        if (direct) {
+            addRefKeys(direct, keys)
+            continue // an explicit role key wins; the fallback is never tried for this step
+        }
+
+        // Legacy fallback: no `refs[role]`, but step.type maps to `role` and there is one reference.
+        // NOTE(review): that lone reference's key is never inspected — confirm this is intended.
+        const inferred = step?.type ? STEP_TYPE_TO_ROLE[String(step.type)] : undefined
+        if (inferred === role) {
+            const values = Object.values(map)
+            if (values.length === 1) addRefKeys(values[0], keys)
+        }
+    }
+
+    return keys
+}
+
+/** Evaluate one predicate against a run's steps; "ne" inverts the membership test (default "eq"). */
+export function evaluateRunReferencePredicate(
+    predicate: RunReferencePredicate,
+    steps: readonly RunReferenceStep[] | null | undefined,
+): boolean {
+    const has = collectRoleReferenceKeys(steps, predicate.role).has(predicate.id)
+    return (predicate.op ?? "eq") === "ne" ? !has : has
+}
+
+/**
+ * True when `workflowId` is the run's evaluated / subject workflow — i.e. it
+ * matches the id or slug of an `application` (invocation) reference. This is
+ * the "evaluations that evaluated THIS workflow" predicate.
+ */
+export function isSubjectRun(
+    steps: readonly RunReferenceStep[] | null | undefined,
+    workflowId: string,
+): boolean {
+    return evaluateRunReferencePredicate({role: "application", id: workflowId}, steps)
+}
+
+/**
+ * Whether a run carries any resolvable `application` (subject) reference at
+ * all, i.e. at least one non-empty string id or slug collected for that role.
+ * Used as a safety guard: a run with no resolvable subject can't be classified
+ * structurally, so the caller should fall back to its prior heuristic
+ * (e.g. `meta.application`) rather than silently dropping the run.
+ */
+export function hasResolvableSubject(
+    steps: readonly RunReferenceStep[] | null | undefined,
+): boolean {
+    return collectRoleReferenceKeys(steps, "application").size > 0
+}
+
+// ============================================================================
+// ETL Transform parity
+//
+// The dataset-store fetch path consumes the pure helpers above directly, but
+// for headless / chunked ETL runs we expose a Transform factory mirroring
+// rowPredicateFilter's `makePredicateGroupFilter`. Predicates are AND-joined.
+// ============================================================================
+
+export interface RunReferenceFilterOptions<TRow> {
+    /** One or more predicates, AND-joined. All must hold for the run to pass. */
+    predicates: RunReferencePredicate | RunReferencePredicate[]
+    /** Extract the run's steps from a row. Defaults to `row.steps`, then `row.previewMeta?.steps`. */
+    getSteps?: (row: TRow) => readonly RunReferenceStep[] | null | undefined
+    /** Optional per-chunk telemetry — feeds a hit-ratio meter. `chunk` is a 1-based call count. */
+    onChunkFiltered?: (info: {chunk: number; scanned: number; matched: number}) => void
+}
+
+function defaultGetSteps(row: unknown): readonly RunReferenceStep[] | null | undefined {
+    if (!row || typeof row !== "object") return null
+    const r = row as {steps?: unknown; previewMeta?: {steps?: unknown}}
+    if (Array.isArray(r.steps)) return r.steps as RunReferenceStep[] // top-level wins
+    if (Array.isArray(r.previewMeta?.steps)) return r.previewMeta!.steps as RunReferenceStep[]
+    return null // neither location holds an array
+}
+
+/** True when the steps satisfy every predicate (logical AND); an empty list matches any run. */
+export function matchesRunReferenceFilter(
+    predicates: RunReferencePredicate | RunReferencePredicate[],
+    steps: readonly RunReferenceStep[] | null | undefined,
+): boolean {
+    const list = Array.isArray(predicates) ? predicates : [predicates]
+    return list.every((p) => evaluateRunReferencePredicate(p, steps))
+}
+
+/**
+ * Build a `Transform<TRow, TRow>` that keeps only runs satisfying every supplied
+ * predicate. Holds one piece of state: a chunk counter that never resets on reuse.
+ */
+export function makeRunReferenceFilter<TRow>(
+    options: RunReferenceFilterOptions<TRow>,
+): Transform<TRow, TRow> {
+    const predicates = Array.isArray(options.predicates) ? options.predicates : [options.predicates]
+    const getSteps =
+        options.getSteps ?? (defaultGetSteps as RunReferenceFilterOptions<TRow>["getSteps"])!
+    let chunkIdx = 0 // shared by every call made on this filter instance
+
+    return (chunk: Chunk<TRow>) => {
+        chunkIdx++ // incremented before use, so the reported `chunk` is 1-based
+        const passing = chunk.items.filter((row) =>
+            matchesRunReferenceFilter(predicates, getSteps(row)),
+        )
+        options.onChunkFiltered?.({
+            chunk: chunkIdx,
+            scanned: chunk.items.length,
+            matched: passing.length,
+        })
+        return {...chunk, items: passing} // other chunk fields are copied through unchanged
+    }
+}
diff --git a/web/packages/agenta-entities/src/workflow/core/index.ts b/web/packages/agenta-entities/src/workflow/core/index.ts
index dd0aa8b618..0f5e52659f 100644
--- a/web/packages/agenta-entities/src/workflow/core/index.ts
+++ b/web/packages/agenta-entities/src/workflow/core/index.ts
@@ -86,3 +86,11 @@ export {
     type EvaluatorDefinition,
     type MetricColumnDefinition,
 } from "./evaluatorResolution"
+
+// Observability defaults (kept pure for unit-testability)
+export {
+    defaultTraceTypeForWorkflow,
+    type TraceTypeDefault,
+    type ObservabilityTab,
+    type WorkflowKindForTraceDefault,
+} from "./traceTypeDefault"
diff --git a/web/packages/agenta-entities/src/workflow/core/schema.ts b/web/packages/agenta-entities/src/workflow/core/schema.ts
index 56aba6b29d..f1cb140e86 100644
--- a/web/packages/agenta-entities/src/workflow/core/schema.ts
+++ b/web/packages/agenta-entities/src/workflow/core/schema.ts
@@ -293,6 +293,12 @@ export const workflowSchema = z
         // Parent slugs (from revision responses; backend returns artifact_slug
         // and variant_slug alongside the IDs so callers can verify which
         // workflow/variant the revision belongs to without a second lookup).
+        //
+        // workflow_slug / workflow_variant_slug are also required for emitting
+        // evaluator references on playground chain runs — the trace storage
+        // layer identifies evaluator runs by slug (via
+        // `references.evaluator.slug`), and we want to write the parent
+        // workflow's slug, not the revision's.
         workflow_slug: z.string().nullable().optional(),
         workflow_variant_slug: z.string().nullable().optional(),
         artifact_slug: z.string().nullable().optional(),
diff --git a/web/packages/agenta-entities/src/workflow/core/traceTypeDefault.ts b/web/packages/agenta-entities/src/workflow/core/traceTypeDefault.ts
new file mode 100644
index 0000000000..b8ec1d880d
--- /dev/null
+++ b/web/packages/agenta-entities/src/workflow/core/traceTypeDefault.ts
@@ -0,0 +1,41 @@
+/**
+ * Soft default for the `trace_type` filter on the app-scoped observability
+ * page (`/apps/<entityId>/traces`).
+ *
+ * Lives in entities (not in OSS) so the truth table can be unit-tested with
+ * vitest. The OSS atom in `state/newObservability/atoms/controls.ts` calls
+ * this helper and applies the result as a filter when no user override is
+ * present.
+ *
+ * - `tab !== "traces"` (i.e. `"sessions"`) → no default (Sessions tab is
+ *   app-only; evaluators don't emit them — the tab is hidden for evaluator
+ *   workflows per Phase 6.3.3, but a stale `?tab=sessions` URL still hits this code).
+ * - `workflowKind === "evaluator"` → `"annotation"`. Production evaluators
+ *   score app traces and emit annotation-type traces. The playground-
+ *   triggered standalone evaluator runs (which emit invocation traces with
+ *   `references.application` set) are the edge case, not the default.
+ * - everything else (`"app"`, `"snippet"`, `null`) → `"invocation"`. Apps
+ *   invoke models; the app-scoped observability page should default to
+ *   those.
+ *
+ * Returns `null` when no soft default should apply.
+ */
+export type TraceTypeDefault = "invocation" | "annotation"
+export type ObservabilityTab = "traces" | "sessions"
+
+/**
+ * Workflow role kind, mirrored locally to keep this helper free of OSS imports
+ * (an unresolved kind is passed as `null` by callers, not modelled here). OSS'
+ * canonical type lives at `web/oss/src/state/workflow/destinations.ts:11` (same shape); the
+ * compiler will catch any drift at the wire-up site in `controls.ts`.
+ */
+export type WorkflowKindForTraceDefault = "app" | "evaluator" | "snippet"
+
+export function defaultTraceTypeForWorkflow(
+    workflowKind: WorkflowKindForTraceDefault | null,
+    tab: ObservabilityTab,
+): TraceTypeDefault | null {
+    if (tab !== "traces") return null // only the traces tab gets a soft default
+    if (workflowKind === "evaluator") return "annotation"
+    return "invocation" // "app", "snippet" and null (kind not resolved yet)
+}
diff --git a/web/packages/agenta-entities/src/workflow/index.ts b/web/packages/agenta-entities/src/workflow/index.ts
index 464f92f1dc..b3d5a612bf 100644
--- a/web/packages/agenta-entities/src/workflow/index.ts
+++ b/web/packages/agenta-entities/src/workflow/index.ts
@@ -95,6 +95,11 @@ export {
     isOnlineCapableEvaluator,
     hasFullPagePlaygroundUX,
     collectEvaluatorCandidates,
+    // Observability defaults
+    defaultTraceTypeForWorkflow,
+    type TraceTypeDefault,
+    type ObservabilityTab,
+    type WorkflowKindForTraceDefault,
     // Output schema utilities
     resolveInputSchema,
     resolveOutputSchema,
@@ -284,6 +289,7 @@ export {
     evaluatorsListDataAtom,
     nonArchivedEvaluatorsAtom,
     fullPagePlaygroundEvaluatorsAtom,
+    nonHumanEvaluatorsAtom,
     // Templates
     evaluatorTemplatesQueryAtom,
     evaluatorTemplatesDataAtom,
diff --git a/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts b/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts
index 7ea199e121..29d362d1c7 100644
--- a/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts
+++ b/web/packages/agenta-entities/src/workflow/state/evaluatorUtils.ts
@@ -142,6 +142,29 @@ export const fullPagePlaygroundEvaluatorsAtom = atom<Workflow[]>((get) => {
     })
 })
 
+/**
+ * Non-archived **automatic** evaluators — i.e. all evaluators except human
+ * (`is_feedback`) ones. Unlike `fullPagePlaygroundEvaluatorsAtom`, this does
+ * NOT narrow to evaluators that have a full-page playground, so it includes the
+ * declarative classifiers too (exact match, regex, similarity / semantic
+ * similarity, json diff, contains json, …). This is the right list for the
+ * sidebar workflow switcher, which should surface every automatic evaluator.
+ *
+ * `is_feedback` lives on the revision (not the parent artifact), so it's
+ * resolved from each evaluator's latest revision (batched + cached). An
+ * evaluator whose latest revision hasn't resolved yet is held back until it
+ * does, so a human evaluator never briefly leaks into the list.
+ */
+export const nonHumanEvaluatorsAtom = atom<Workflow[]>((get) => {
+    const evaluators = get(nonArchivedEvaluatorsAtom)
+    return evaluators.filter((evaluator) => {
+        if (!evaluator.id) return false // no id → no revision lookup possible; excluded
+        const revision = get(workflowLatestRevisionQueryAtomFamily(evaluator.id)).data
+        if (!revision) return false // revision not available yet → held back
+        return !revision.flags?.is_feedback // missing `flags` counts as automatic
+    })
+})
+
 /**
  * Invalidate the evaluators list cache.
  * Call after create/update/archive operations on evaluator workflows.
diff --git a/web/packages/agenta-entities/src/workflow/state/index.ts b/web/packages/agenta-entities/src/workflow/state/index.ts
index 0c85a3131d..0cb2d9f94a 100644
--- a/web/packages/agenta-entities/src/workflow/state/index.ts
+++ b/web/packages/agenta-entities/src/workflow/state/index.ts
@@ -159,6 +159,7 @@ export {
     evaluatorsListDataAtom,
     nonArchivedEvaluatorsAtom,
     fullPagePlaygroundEvaluatorsAtom,
+    nonHumanEvaluatorsAtom,
     // Templates
     evaluatorTemplatesQueryAtom,
     evaluatorTemplatesDataAtom,
diff --git a/web/packages/agenta-entities/tests/unit/traceTypeDefault.test.ts b/web/packages/agenta-entities/tests/unit/traceTypeDefault.test.ts
new file mode 100644
index 0000000000..4333d4b19c
--- /dev/null
+++ b/web/packages/agenta-entities/tests/unit/traceTypeDefault.test.ts
@@ -0,0 +1,59 @@
+/**
+ * Unit tests for defaultTraceTypeForWorkflow.
+ *
+ * The helper drives the soft-default `trace_type` filter on
+ * `/apps/<entityId>/traces` (see `web/oss/src/state/newObservability/atoms/
+ * controls.ts:filtersAtomFamily`). The truth table matters because getting
+ * this wrong means evaluator users land on an empty page by default — the
+ * regression over which #4384 disabled the whole evaluator full-page flow.
+ */
+
+import {describe, it, expect} from "vitest"
+
+import {defaultTraceTypeForWorkflow} from "../../src/workflow/core/traceTypeDefault"
+
+describe("defaultTraceTypeForWorkflow", () => {
+    describe("sessions tab", () => {
+        it("returns null for app workflow", () => {
+            expect(defaultTraceTypeForWorkflow("app", "sessions")).toBeNull()
+        })
+
+        it("returns null for evaluator workflow", () => {
+            expect(defaultTraceTypeForWorkflow("evaluator", "sessions")).toBeNull()
+        })
+
+        it("returns null for snippet workflow", () => {
+            expect(defaultTraceTypeForWorkflow("snippet", "sessions")).toBeNull()
+        })
+
+        it("returns null when workflow kind is unknown", () => {
+            expect(defaultTraceTypeForWorkflow(null, "sessions")).toBeNull()
+        })
+    })
+
+    describe("traces tab", () => {
+        it("defaults to annotation for evaluator workflows", () => {
+            // Production evaluators score app traces and emit annotation-type
+            // traces — that's the more common case for the per-evaluator
+            // observability view, not playground-triggered standalone runs.
+            expect(defaultTraceTypeForWorkflow("evaluator", "traces")).toBe("annotation")
+        })
+
+        it("defaults to invocation for app workflows", () => {
+            expect(defaultTraceTypeForWorkflow("app", "traces")).toBe("invocation")
+        })
+
+        it("defaults to invocation for snippet workflows", () => {
+            // Snippets behave like apps from an invocation perspective (they invoke models,
+            // no annotations), so they take the helper's non-evaluator branch → "invocation".
+            expect(defaultTraceTypeForWorkflow("snippet", "traces")).toBe("invocation")
+        })
+
+        it("defaults to invocation when workflow kind is unknown (resolving)", () => {
+            // Cold-load fallback: when `currentWorkflowContextAtom` is still
+            // resolving, the kind comes through as `null`. Picking invocation
+            // is the safest default since most users land on app pages.
+            expect(defaultTraceTypeForWorkflow(null, "traces")).toBe("invocation")
+        })
+    })
+})
diff --git a/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts b/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts
index be063d1483..00d196dc69 100644
--- a/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts
+++ b/web/packages/agenta-entity-ui/src/selection/adapters/workflowRevisionRelationAdapter.ts
@@ -327,12 +327,34 @@ export interface CreateWorkflowRevisionAdapterOptions {
     ) => WorkflowRevisionSelectionResult
 
     /**
-     * Empty state message.
+     * Display label for the parent (workflow) level. Drives the picker's
+     * search placeholder ("Search {parentLabel}…"), the empty-list "No
+     * {parentLabel} found" copy, and similar UI strings.
+     *
+     * Defaults to `"Evaluator"` when used in skip-variant mode (the adapter's
+     * original primary use case was evaluator selection), but consumers
+     * picking app workflows — e.g., `EvaluatorPlaygroundHeader` — should pass
+     * `"Application"` so the search bar doesn't say "Search evaluator…" while
+     * the user is actually picking an app.
+     *
+     * @example
+     * ```typescript
+     * createWorkflowRevisionAdapter({
+     *     skipVariantLevel: true,
+     *     flags: {is_evaluator: false},
+     *     parentLabel: "Application",
+     * })
+     * ```
+     */
+    parentLabel?: string
+
+    /**
+     * Empty state message. Defaults to "No {parentLabel}s found".
      */
     emptyMessage?: string
 
     /**
-     * Loading state message.
+     * Loading state message. Defaults to "Loading {parentLabel}s...".
      */
     loadingMessage?: string
 
@@ -421,12 +443,19 @@ export function createWorkflowRevisionAdapter(
         toSelection,
         emptyMessage,
         loadingMessage,
+        parentLabel = "Evaluator",
         flags,
         filterWorkflows,
         skipVariantLevel = false,
         workflowListAtom,
     } = options
 
+    // Derive empty/loading defaults from the parent label so callers picking
+    // app workflows don't see "No evaluators found" in an app picker.
+    const lowerParent = parentLabel.toLowerCase()
+    const resolvedEmptyMessage = emptyMessage ?? `No ${lowerParent}s found`
+    const resolvedLoadingMessage = loadingMessage ?? `Loading ${lowerParent}s...`
+
     const emptyListState: ListQueryState<unknown> = {
         data: [],
         isPending: false,
@@ -467,7 +496,7 @@ export function createWorkflowRevisionAdapter(
         return createTwoLevelAdapter<WorkflowRevisionSelectionResult>({
             name: "workflowRevision",
             parentType: "workflow",
-            parentLabel: "Evaluator",
+            parentLabel,
             parentListAtom: resolvedWorkflowsListAtom,
             parentOverrides: {
                 getId: (entity: unknown) => (entity as {id: string}).id,
@@ -502,7 +531,7 @@ export function createWorkflowRevisionAdapter(
                     return {
                         type: "workflowRevision",
                         id: revision.id,
-                        label: `${workflow?.label ?? "Evaluator"} / v${revision.version ?? 0}`,
+                        label: `${workflow?.label ?? parentLabel} / v${revision.version ?? 0}`,
                         path,
                         metadata: {
                             workflowId: workflow?.id ?? "",
@@ -513,8 +542,8 @@ export function createWorkflowRevisionAdapter(
                         },
                     }
                 }),
-            emptyMessage: emptyMessage ?? "No evaluators found",
-            loadingMessage: loadingMessage ?? "Loading evaluators...",
+            emptyMessage: resolvedEmptyMessage,
+            loadingMessage: resolvedLoadingMessage,
         })
     }
 
diff --git a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
index 587dfae906..f284c4b9d8 100644
--- a/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
+++ b/web/packages/agenta-playground-ui/src/components/ExecutionItems/assets/ExecutionRow/SingleLayout.tsx
@@ -253,7 +253,24 @@ const DownstreamNodeCard = ({
                 }),
             [rowId, scopedEntityId],
         ),
-    ) as {status?: string; output?: unknown; error?: {message: string} | null} | null
+    ) as {
+        status?: string
+        output?: unknown
+        error?: {message: string} | null
+        traceId?: string | null
+    } | null
+
+    // Trace-link affordance for the downstream (evaluator) result — surfaced in
+    // the card legend so users can open the evaluator's own trace to debug a
+    // grade, the same way the primary app row exposes its trace (QA 2026-06-05:
+    // "show the trace links (icon) for evaluators too").
+    const providers = usePlaygroundUIOptional()
+    const SharedGenerationResultUtils = providers?.SharedGenerationResultUtils
+    const nodeTraceId = fullResult?.traceId ?? null
+    const traceActions =
+        nodeTraceId && SharedGenerationResultUtils ? (
+            <SharedGenerationResultUtils traceId={nodeTraceId} actionsOnly />
+        ) : undefined
 
     // Read output ports from the runnable bridge (includes per-field schema)
     const outputPorts = useAtomValue(
@@ -286,7 +303,7 @@ const DownstreamNodeCard = ({
     // Idle / cancelled / no result — show expected fields with placeholder dashes
     if (!fullResult || rawStatus === "idle" || rawStatus === "cancelled") {
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <EvaluatorFieldGrid entries={null} outputPorts={outputPorts} idle />
             </NodeResultCard>
         )
@@ -295,7 +312,7 @@ const DownstreamNodeCard = ({
     // Running / pending -> loading skeleton
     if (rawStatus === "running" || rawStatus === "pending") {
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <EvaluatorFieldGrid entries={null} outputPorts={outputPorts} loading />
             </NodeResultCard>
         )
@@ -308,7 +325,7 @@ const DownstreamNodeCard = ({
                 ? fullResult.error.message
                 : "Error"
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <span className="text-[var(--ant-color-error)] text-xs leading-5">{errorMsg}</span>
             </NodeResultCard>
         )
@@ -321,7 +338,7 @@ const DownstreamNodeCard = ({
                 ? fullResult.error.message
                 : "Skipped"
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <span className="text-[var(--ant-color-text-tertiary)] text-xs leading-5 italic">
                     {skipMsg}
                 </span>
@@ -343,14 +360,14 @@ const DownstreamNodeCard = ({
 
     if (!entries || entries.length === 0) {
         return (
-            <NodeResultCard name={nodeName} status={rawStatus}>
+            <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
                 <span className="text-xs leading-5">—</span>
             </NodeResultCard>
         )
     }
 
     return (
-        <NodeResultCard name={nodeName} status={rawStatus}>
+        <NodeResultCard name={nodeName} status={rawStatus} headerActions={traceActions}>
             <div
                 className="grid items-baseline text-xs leading-5"
                 style={{gridTemplateColumns: "auto 1fr", columnGap: 12, rowGap: 6}}
@@ -828,10 +845,14 @@ const SingleView = ({
                                     className={clsx(
                                         "flex items-start gap-2 px-3 py-2 rounded-md",
                                         "bg-blue-50 border border-solid border-blue-100",
+                                        "dark:bg-blue-900/20 dark:border-blue-900/40",
                                     )}
                                 >
-                                    <Info size={14} className="text-blue-500 mt-0.5 shrink-0" />
-                                    <div className="flex-1 text-xs text-gray-700 leading-relaxed">
+                                    <Info
+                                        size={14}
+                                        className="text-blue-500 dark:text-blue-300 mt-0.5 shrink-0"
+                                    />
+                                    <div className="flex-1 text-xs text-gray-700 dark:text-blue-50 leading-relaxed">
                                         Fill these with the data the application being evaluated
                                         received and produced. The evaluator will judge this pair —
                                         not your own typed values.
@@ -841,6 +862,7 @@ const SingleView = ({
                                         onClick={() => setEvaluatorCalloutDismissed(true)}
                                         className={clsx(
                                             "shrink-0 p-0.5 rounded text-gray-400 hover:text-gray-700 hover:bg-blue-100",
+                                            "dark:text-blue-200 dark:hover:text-blue-50 dark:hover:bg-blue-900/40",
                                             "border-0 bg-transparent cursor-pointer",
                                         )}
                                         aria-label="Dismiss"
diff --git a/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx b/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
index 0aabee0691..b1921ec305 100644
--- a/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
+++ b/web/packages/agenta-playground-ui/src/components/adapters/TurnMessageAdapter.tsx
@@ -21,10 +21,10 @@ import {
     PromptImageUpload,
     PromptDocumentUpload,
 } from "@agenta/ui/components/presentational"
-import type {ViewMode} from "@agenta/ui/drill-in"
+import {messageViewModeAtom, toMessageViewMode} from "@agenta/ui/drill-in"
 import type {UploadFile} from "antd"
 import clsx from "clsx"
-import {useAtomValue, useSetAtom} from "jotai"
+import {useAtom, useAtomValue, useSetAtom} from "jotai"
 import JSON5 from "json5"
 import {v4 as uuidv4} from "uuid"
 
@@ -215,9 +215,13 @@ const TurnMessageAdapter: React.FC<Props> = ({
 
         return fallback
     }, [computedText, msg])
-    const [viewMode, setViewMode] = useState<ViewMode>("text")
-    const isCodeMode = viewMode === "json" || viewMode === "yaml"
-    const editorLanguage = viewMode === "yaml" ? "yaml" : "json"
+    // Shared + persisted across all message editors (see messageViewModeAtom).
+    // The atom is typed `ViewMode` (can hold "form"), so coerce to a mode this
+    // editor can actually render before deriving any mode-dependent state.
+    const [viewMode, setViewMode] = useAtom(messageViewModeAtom)
+    const chatViewMode = toMessageViewMode(viewMode)
+    const isCodeMode = chatViewMode === "json" || chatViewMode === "yaml"
+    const editorLanguage = chatViewMode === "yaml" ? "yaml" : "json"
 
     const effectiveDisabled = Boolean(disabled)
     const isUserRole = kind === "user" && !isToolKind
@@ -656,7 +660,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                             isJSON={isCodeMode}
                             isTool={isCodeMode}
                             language={editorLanguage}
-                            markdownView={viewMode === "markdown"}
+                            markdownView={chatViewMode === "text"}
                             onFocusChange={handleEditorFocusChange}
                             text={p?.json}
                             enableTokens={messageProps?.enableTokens ?? !isCodeMode}
@@ -686,7 +690,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                                     resultHashes={propsResultHashes ?? resultHashes}
                                     results={results}
                                     text={p?.json ?? editorText}
-                                    viewMode={viewMode}
+                                    viewMode={chatViewMode}
                                     onViewModeChange={setViewMode}
                                     collapsed={isMessageCollapsed}
                                     allowFileUpload={isUserRole && !effectiveDisabled}
@@ -750,7 +754,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                         state={editorState}
                         isJSON={isCodeMode}
                         language={editorLanguage}
-                        markdownView={viewMode === "markdown"}
+                        markdownView={chatViewMode === "text"}
                         enableTokens={messageProps?.enableTokens ?? !isCodeMode}
                         headerRight={
                             <TurnMessageHeaderOptions
@@ -761,7 +765,7 @@ const TurnMessageAdapter: React.FC<Props> = ({
                                 resultHashes={propsResultHashes ?? resultHashes}
                                 results={results}
                                 text={editorText}
-                                viewMode={viewMode}
+                                viewMode={chatViewMode}
                                 onViewModeChange={setViewMode}
                                 collapsed={isMessageCollapsed}
                                 allowFileUpload={isUserRole && !effectiveDisabled}
diff --git a/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx b/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx
index a1a4640ce9..0a9ce04c2c 100644
--- a/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx
+++ b/web/packages/agenta-playground-ui/src/components/adapters/VariableControlAdapter.tsx
@@ -522,7 +522,7 @@ const VariableControlAdapter: React.FC<VariableControlAdapterProps> = ({
                 enableTokens={!editorProps?.codeOnly}
                 disabled={isEffectivelyDisabled}
             >
-                <MarkdownViewSynchronizer enabled={viewMode === "markdown"} />
+                <MarkdownViewSynchronizer enabled={viewMode === "text"} />
                 <SharedEditor
                     id={editorId}
                     noProvider
diff --git a/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx b/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx
index 8ae340cd73..cee1fe9435 100644
--- a/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx
+++ b/web/packages/agenta-playground-ui/src/context/PlaygroundUIContext.tsx
@@ -59,6 +59,8 @@ export interface SharedGenerationResultUtilsProps {
     traceId?: string | null
     showStatus?: boolean
     className?: string
+    /** Render only the trace action (compact "open trace" icon, no metrics/status). */
+    actionsOnly?: boolean
 }
 
 /**
diff --git a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
index 86a5394efd..d974eb5e0c 100644
--- a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
+++ b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
@@ -24,7 +24,7 @@
 
 import {loadableStateAtomFamily} from "@agenta/entities/loadable"
 import {loadableController, snapshotAdapterRegistry} from "@agenta/entities/runnable"
-import {fetchTestcasesPage} from "@agenta/entities/testcase"
+import {fetchTestcasesPage, testcaseMolecule} from "@agenta/entities/testcase"
 import type {TraceSpan, TraceSpanNode} from "@agenta/entities/trace"
 import {extractAgData, extractInputs, extractOutputs} from "@agenta/entities/trace"
 import {
@@ -83,6 +83,11 @@ import {
     newTestcaseDataHashAtom,
 } from "../execution/selectors"
 import {pruneDanglingConnections} from "../helpers/connectionGraph"
+import {
+    collectDownstreamReferencedColumns,
+    reconcileRowDataForEntity,
+    resolveEntityInputContract,
+} from "../helpers/entityInputContract"
 import {extractAndLoadChatMessagesAtom} from "../helpers/extractAndLoadChatMessages"
 import {normalizeTestcaseRowsForLoad} from "../helpers/testcaseRowNormalization"
 import type {EntitySelection, PlaygroundNode, RunnableType} from "../types"
@@ -1907,6 +1912,13 @@ const setEntityIdsAtom = atom(null, (get, set, next: string[] | ((prev: string[]
                     oldLoadableId,
                     newLoadableId: newAnchorLoadableId,
                 })
+                // After the loadable re-link, the testcase row store is still
+                // carrying every key the *previous* primary populated (chat
+                // `messages`, old completion variables, etc.). Reconcile each
+                // row against the NEW primary's input schema so the UI shows
+                // only the relevant variables and execution doesn't have to
+                // strip them later (#4525 / AGE-3793).
+                pruneTestcaseRowsForEntity(get, set, anchorSwap.newEntityId)
             }
         }
     }
@@ -2125,6 +2137,113 @@ function relinkLoadableSessions(
     }
 }
 
+type PruneStatus = "acted" | "noop" | "unresolved"
+
+/**
+ * Reconcile every testcase row against the given entity's input contract.
+ *
+ * Why this exists: the testcase row store (`testcaseMolecule`) is shared
+ * across loadables. When the user swaps the primary app in the
+ * LLM-as-a-judge playground, the row data keeps every key the *previous* primary
+ * populated — chat `messages`, completion template variables that the new
+ * app doesn't declare, etc. Without reconciliation, those stale keys leak
+ * into the new app's request body and the downstream evaluator's envelope.
+ *
+ * Allow-list source is `inputPorts` (via `resolveEntityInputContract`), NOT
+ * `inputSchema.properties` — completion apps surface their variables as
+ * prompt template placeholders through `inputPorts` and have an EMPTY static
+ * input schema, so schema-based filtering keeps everything. Policy:
+ *   - App with a resolved contract → strict: keep only declared (or
+ *     downstream-evaluator-protected) keys.
+ *   - Evaluator → chat-transport only: evaluators spread extra testcase
+ *     columns, so we never strict-filter them.
+ *   - Unresolved contract (ports mid-hydration) → no-op; returns
+ *     `"unresolved"` so the caller can retry once the contract resolves. The
+ *     run-time reconciliation in `webWorkerIntegration` is the backstop.
+ *
+ * Columns referenced by downstream evaluator `<input>_key` settings (e.g.
+ * `correct_answer_key → ground_truth`) are protected so a strict clean
+ * against the app contract doesn't drop intentional evaluation inputs.
+ *
+ * Mutations go through `testcaseMolecule.actions.batchUpdate` setting stale
+ * keys to `undefined`, which the store's update reducer interprets as a
+ * delete. Drafts are created as needed (one per affected row).
+ */
+function pruneTestcaseRowsForEntity(get: Getter, set: Setter, entityId: string): PruneStatus {
+    // Apps need a resolved contract before a strict filter is safe. Evaluators
+    // only get the chat-transport strip, which needs no variable list, so they
+    // proceed even while their contract is unresolved.
+    const {isEvaluator, resolved} = resolveEntityInputContract(get, entityId)
+    if (!(isEvaluator || resolved)) return "unresolved"
+
+    const rowIds = get(testcaseMolecule.atoms.displayRowIds)
+    if (!Array.isArray(rowIds) || rowIds.length === 0) return "noop"
+
+    // Columns referenced by downstream evaluators must survive the clean.
+    const protectedKeys = collectDownstreamReferencedColumns(get, get(playgroundNodesAtom))
+
+    const patches: {id: string; updates: {data: Record<string, unknown>}}[] = []
+    for (const id of rowIds) {
+        const entry = get(testcaseMolecule.data(id)) as {
+            data?: Record<string, unknown>
+        } | null
+        const rowData = entry?.data
+        if (!rowData || typeof rowData !== "object") continue
+
+        const {dropped: staleKeys} = reconcileRowDataForEntity(get, entityId, rowData, {
+            protectedKeys,
+        })
+        if (staleKeys.length === 0) continue
+
+        // Each stale key is written as `undefined`, which the testcase store's
+        // update reducer interprets as "delete this key".
+        const deletions: Record<string, unknown> = {}
+        for (const key of staleKeys) {
+            deletions[key] = undefined
+        }
+        patches.push({id, updates: {data: deletions}})
+    }
+
+    if (patches.length === 0) return "noop"
+    set(testcaseMolecule.actions.batchUpdate, patches)
+    return "acted"
+}
+
+/**
+ * Reconcile all testcase rows against the CURRENT primary (depth-0) entity's
+ * input contract, on demand — call this right after a primary swap so the
+ * shared row is cleaned the instant the app changes, without waiting for a
+ * run. The run-time reconciliation in `webWorkerIntegration` is the backstop.
+ *
+ * Hydration handling: when the prune reports `"unresolved"` AND the entity
+ * isn't loaded yet, we subscribe to its `inputPorts` and retry once they
+ * resolve, then unsubscribe. An already-loaded entity with no resolvable
+ * variables has nothing to wait for, so we don't subscribe. The deferred
+ * retry re-checks that the entity is STILL the primary: pruning against a
+ * superseded primary's contract would strip the new primary's variables.
+ */
+const reconcileRowsToPrimaryAtom = atom(null, (get, set) => {
+    const primary = get(playgroundNodesAtom).find((node) => node.depth === 0)
+    if (!primary) return
+    const entityId = primary.entityId
+
+    if (pruneTestcaseRowsForEntity(get, set, entityId) !== "unresolved") return
+
+    // Unresolved: either the workflow is still loading, or it's a genuinely
+    // no-variable app. Only wait if it hasn't loaded yet.
+    if (get(workflowMolecule.selectors.data(entityId)) != null) return
+
+    const store = getDefaultStore()
+    const unsub = store.sub(workflowMolecule.selectors.inputPorts(entityId), () => {
+        // Primary changed while we were waiting → this retry is stale; drop it.
+        const current = store.get(playgroundNodesAtom).find((node) => node.depth === 0)
+        if (current?.entityId !== entityId) return unsub()
+        const retryStatus = pruneTestcaseRowsForEntity(store.get, store.set, entityId)
+        const nowLoaded = store.get(workflowMolecule.selectors.data(entityId)) != null
+        if (retryStatus !== "unresolved" || nowLoaded) unsub()
+    })
+})
+
 /**
  * Switch one entity for another in the displayed selection.
  * Handles both single and comparison mode. The loadable-scoped re-link
@@ -2246,6 +2365,13 @@ export const playgroundController = {
         /** Change the primary node */
         changePrimaryNode: changePrimaryNodeAtom,
 
+        /**
+         * Reconcile all testcase rows against the current primary entity's
+         * input contract. Call after a primary swap to clean stale keys from a
+         * previous app off the shared row immediately (#4525 / AGE-3793).
+         */
+        reconcileRowsToPrimary: reconcileRowsToPrimaryAtom,
+
         /** Disconnect from testset and reset to local mode */
         disconnectAndResetToLocal: disconnectAndResetToLocalAtom,
 
diff --git a/web/packages/agenta-playground/src/state/execution/executionItems.ts b/web/packages/agenta-playground/src/state/execution/executionItems.ts
index ae30f11cc3..5127d0fcdc 100644
--- a/web/packages/agenta-playground/src/state/execution/executionItems.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionItems.ts
@@ -1,4 +1,5 @@
 import {loadableController, type RequestPayloadData} from "@agenta/entities/runnable"
+import {isLocalDraftId, isPlaceholderId} from "@agenta/entities/shared"
 import {
     stripAgentaMetadataDeep,
     stripEnhancedWrappers,
@@ -210,6 +211,39 @@ function asRecord(value: unknown): Record<string, unknown> | null {
     return value as Record<string, unknown>
 }
 
+/**
+ * Strip reference `id`s that aren't real server UUIDs (local-draft or
+ * placeholder ids) from a request body's `references` map.
+ *
+ * The backend `/invoke` validator rejects a non-UUID reference id with a 422
+ * (QA 2026-06-05: an unsaved evaluator opened from the drawer shipped
+ * `references.evaluator_revision.id = "local-…"` → "Input should be a valid
+ * UUID"). This is the last line of defense, applied to the FINAL merged
+ * references regardless of which builder produced them (requestPayload
+ * references, executionRunner stage self/upstream references, or
+ * trace-span-extracted references). Slugs and versions are plain strings the
+ * backend accepts and are kept; a slot left with no fields is dropped.
+ */
+function sanitizeReferenceIds(references: unknown): Record<string, unknown> | null {
+    const source = asRecord(references)
+    if (!source) return null
+    const cleaned: Record<string, unknown> = {}
+    let changed = false
+    for (const [slot, raw] of Object.entries(source)) {
+        const entry = asRecord(raw)
+        const id = entry?.id
+        if (!entry || typeof id !== "string" || !(isLocalDraftId(id) || isPlaceholderId(id))) {
+            cleaned[slot] = raw // nothing to strip: pass the slot through untouched
+            continue
+        }
+        const withoutId = {...entry}
+        delete withoutId.id
+        changed = true
+        if (Object.keys(withoutId).length > 0) cleaned[slot] = withoutId // else drop the slot
+    }
+    return changed ? cleaned : source
+}
+
 function unwrapValue(value: unknown): unknown {
     const rec = asRecord(value)
     return rec && "value" in rec ? rec.value : value
@@ -1322,6 +1356,18 @@ function buildExecutionItem(
             : params.references
     }
 
+    // Final guard: never ship a local-draft / placeholder id in a reference —
+    // the backend `/invoke` validator 422s on non-UUID reference ids (QA
+    // 2026-06-05). Covers every reference source after they're merged above.
+    if (requestBody.references !== undefined) {
+        const sanitized = sanitizeReferenceIds(requestBody.references)
+        if (sanitized && Object.keys(sanitized).length > 0) {
+            requestBody.references = sanitized
+        } else {
+            delete requestBody.references
+        }
+    }
+
     const references: ExecutionItemReference = {
         loadableId: params.loadableId,
         rowId: params.rowId,
diff --git a/web/packages/agenta-playground/src/state/execution/executionRunner.ts b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
index 83339f1478..40929d9c1e 100644
--- a/web/packages/agenta-playground/src/state/execution/executionRunner.ts
+++ b/web/packages/agenta-playground/src/state/execution/executionRunner.ts
@@ -9,6 +9,7 @@ import {
     type StageExecutionResult,
     type EntitySelection,
 } from "@agenta/entities/runnable"
+import {isLocalDraftId} from "@agenta/entities/shared"
 import {workflowMolecule} from "@agenta/entities/workflow"
 import {generateId} from "@agenta/shared/utils"
 import type {Getter, Setter} from "jotai"
@@ -16,6 +17,7 @@ import {getDefaultStore} from "jotai/vanilla"
 
 import {messageIdsAtomFamily, messagesByIdAtomFamily} from "../chat/messageAtoms"
 import {SHARED_SESSION_ID, type ChatMessage} from "../chat/messageTypes"
+import {reconcileRowDataForEntity} from "../helpers/entityInputContract"
 import type {OutputConnection, PlaygroundNode} from "../types"
 
 import {
@@ -162,6 +164,118 @@ function buildUpstreamReferences(params: {
     return normalizeApplicationReferences(sourcePayload?.references)
 }
 
+/**
+ * Build the `references.evaluator{,_variant,_revision}` map for a chain stage
+ * whose target node is an evaluator.
+ *
+ * The playground node's `entity.id` is a REVISION id. We read the merged
+ * revision record from the workflow molecule and pull both the revision-level
+ * fields (id / slug / version) and the parent workflow + variant identity
+ * (workflow_id, workflow_slug, workflow_variant_id, workflow_variant_slug)
+ * that the backend writes on revision responses.
+ *
+ * The trace storage layer indexes evaluator references by these fields:
+ *   - `references.evaluator.{id, slug}` ← parent workflow identity
+ *   - `references.evaluator_variant.{id, slug}` ← parent variant identity
+ *   - `references.evaluator_revision.{id, slug, version}` ← this revision
+ *
+ * Without these, traces emitted from playground chain runs don't surface on
+ * the evaluator's `/apps/<evalId>/traces` page — the page filters by
+ * `references.evaluator.slug`, and a missing slot returns 0 matches.
+ * Matches the shape backend evaluation runs emit (verified against real
+ * auto-evaluation trace data on 2026-05-28).
+ *
+ * Returns `undefined` when the node isn't an evaluator workflow, or when the
+ * revision data isn't available yet (rare — only during initial hydration).
+ */
+function buildEvaluatorSelfReferences(params: {
+    get: Getter
+    revisionId: string
+}): TraceReferenceMap | undefined {
+    const record = params.get(workflowMolecule.selectors.data(params.revisionId)) as
+        | (Record<string, unknown> & {flags?: Record<string, unknown> | null})
+        | null
+    // No revision data yet, or not an evaluator workflow → nothing to attach.
+    if (!record || !record.flags?.is_evaluator) return undefined
+
+    // Unsaved (local-draft) evaluators carry `local-…` ids that the backend
+    // reference validator rejects as non-UUIDs (422, QA 2026-06-05). Every id
+    // goes through this filter so a draft id is never shipped; slugs and the
+    // version are plain strings the backend accepts, so they are read as-is.
+    const serverId = (candidate: unknown): string | undefined => {
+        const text = readString(candidate)
+        if (!text || isLocalDraftId(text)) return undefined
+        return text
+    }
+
+    const result: TraceReferenceMap = {}
+
+    // `evaluator` slot: identity of the parent workflow.
+    const parentId = serverId(record.workflow_id)
+    const parentSlug = readString(record.workflow_slug)
+    if (parentId || parentSlug) {
+        result.evaluator = {
+            ...(parentId ? {id: parentId} : {}),
+            ...(parentSlug ? {slug: parentSlug} : {}),
+        }
+    }
+
+    // `evaluator_variant` slot: identity of the parent variant. The id falls
+    // back to `variant_id` when `workflow_variant_id` yields no usable id.
+    const parentVariantId = serverId(record.workflow_variant_id) ?? serverId(record.variant_id)
+    const parentVariantSlug = readString(record.workflow_variant_slug)
+    if (parentVariantId || parentVariantSlug) {
+        result.evaluator_variant = {
+            ...(parentVariantId ? {id: parentVariantId} : {}),
+            ...(parentVariantSlug ? {slug: parentVariantSlug} : {}),
+        }
+    }
+
+    // `evaluator_revision` slot: this revision. Numeric versions are stringified.
+    const ownId = serverId(record.id) ?? serverId(params.revisionId)
+    const ownSlug = readString(record.slug)
+    const rawVersion = record.version
+    const ownVersion = typeof rawVersion === "number" ? String(rawVersion) : readString(rawVersion)
+    if (ownId || ownSlug || ownVersion) {
+        result.evaluator_revision = {
+            ...(ownId ? {id: ownId} : {}),
+            ...(ownSlug ? {slug: ownSlug} : {}),
+            ...(ownVersion ? {version: ownVersion} : {}),
+        }
+    }
+
+    // An empty map is reported as `undefined` rather than `{}`.
+    if (Object.keys(result).length === 0) return undefined
+    return result
+}
+
+/**
+ * Execution-time pass that narrows a testcase row to what `entityId` accepts.
+ *
+ * Runtime safety net for #4525 / AGE-3793. Testcase rows live in a shared
+ * store and keep every key any earlier run wrote (chat apps add `messages`,
+ * completion apps add template variables, ...), so after a primary-app swap
+ * the same row still carries the previous app's keys.
+ *
+ * The main clean-up happens at swap time in the playground controller
+ * (`pruneTestcaseRowsForEntity`). This pass covers the hydration window: the
+ * new entity's contract was unresolved at swap time but IS resolved once the
+ * request is being built.
+ *
+ * Thin wrapper over the shared `reconcileRowDataForEntity`, whose allow-list
+ * comes from `inputPorts` (the source `executionItems` uses for `variables`),
+ * NOT `inputSchema.properties` (empty for completion apps). Apps are
+ * strict-filtered; evaluators / unresolved contracts only lose chat-transport keys.
+ */
+function reconcileEntityInputData(
+    get: Getter,
+    data: Record<string, unknown>,
+    entityId: string,
+): Record<string, unknown> {
+    const {data: reconciled} = reconcileRowDataForEntity(get, entityId, data)
+    return reconciled
+}
+
 function createConcurrencyLimiter(concurrency: number) {
     let active = 0
     const queue: (() => void)[] = []
@@ -360,8 +474,26 @@ export async function executeStepForSessionWithExecutionItems(
 
                     let nodeInputs: Record<string, unknown>
                     if (node.depth === 0) {
-                        nodeInputs = {...data}
+                        // Reconcile the row to the root entity's input contract so
+                        // stale keys from a previous primary app (e.g. chat `messages`
+                        // / `context` after swapping the upstream app in the
+                        // LLM-as-a-judge playground — issue #4525 / AGE-3793) don't
+                        // leak into the new app's request body via the downstream
+                        // "spread all keys" fallback in resolveVariableValues. Apps
+                        // get a strict allow-list (from inputPorts); evaluators get a
+                        // chat-transport-only strip.
+                        const rootEntityId = node.entity.id as string
+                        nodeInputs = reconcileEntityInputData(get, data, rootEntityId)
                     } else {
+                        // Reconcile testcase data before chain / evaluator input
+                        // construction, so the downstream "spread all keys" fallbacks
+                        // (resolveChainInputs no-mapping branch and
+                        // buildEvaluatorExecutionInputs additionalProperties spread)
+                        // can't carry stale keys from a previous app into the current
+                        // target entity (#4525 / AGE-3793).
+                        const targetEntityId = node.entity.id as string
+                        const dataForChain = reconcileEntityInputData(get, data, targetEntityId)
+
                         // Check whether the incoming connection has explicit valid mappings.
                         // resolveChainInputs always returns non-empty (fallback spreads testcaseData
                         // + prediction), so we can't rely on its result length alone.
@@ -379,7 +511,7 @@ export async function executeStepForSessionWithExecutionItems(
                                 allConnections,
                                 nodeId,
                                 nodeResults,
-                                data,
+                                dataForChain,
                             )
                             nodeInputs = resolved
                         } else {
@@ -395,10 +527,10 @@ export async function executeStepForSessionWithExecutionItems(
 
                             const evalStore = getDefaultStore()
                             const stageConfiguration = evalStore.get(
-                                workflowMolecule.selectors.configuration(node.entity.id as string),
+                                workflowMolecule.selectors.configuration(targetEntityId),
                             )
                             const stageSchemas = evalStore.get(
-                                workflowMolecule.selectors.ioSchemas(node.entity.id as string),
+                                workflowMolecule.selectors.ioSchemas(targetEntityId),
                             )
                             const inputSchema =
                                 (stageSchemas?.inputSchema as
@@ -408,10 +540,15 @@ export async function executeStepForSessionWithExecutionItems(
                                 session.mode === "chat"
                                     ? buildSharedChatInputs(get, loadableId)
                                     : undefined
+                            // Base the evaluator testcase on the stripped
+                            // `dataForChain` (not raw `data`) so stale chat-
+                            // transport keys from a previous chat app can't leak
+                            // in (#4525 / AGE-3793), then layer the current
+                            // shared chat inputs on top for chat-mode runs.
                             const evaluatorTestcaseData =
                                 rootChatInputs && Object.keys(rootChatInputs).length > 0
-                                    ? {...data, ...rootChatInputs}
-                                    : data
+                                    ? {...dataForChain, ...rootChatInputs}
+                                    : dataForChain
 
                             const evaluatorInputContext = {
                                 testcaseData: evaluatorTestcaseData,
@@ -471,20 +608,27 @@ export async function executeStepForSessionWithExecutionItems(
                                   nodeResults,
                               })
                             : undefined
-
-                    const isEvaluatorStage =
-                        node.depth > 0 &&
-                        get(workflowMolecule.selectors.isEvaluator(node.entity.id as string))
-                    const stageReferences =
-                        node.depth > 0 && !isEvaluatorStage
-                            ? buildUpstreamReferences({
-                                  get,
-                                  incomingConnection: allConnections.find(
-                                      (connection) => connection.targetNodeId === nodeId,
-                                  ),
-                                  runnableNodes,
-                              })
-                            : undefined
+                    const stageReferences = (() => {
+                        if (node.depth === 0) return undefined
+                        const upstream = buildUpstreamReferences({
+                            get,
+                            incomingConnection: allConnections.find(
+                                (connection) => connection.targetNodeId === nodeId,
+                            ),
+                            runnableNodes,
+                        })
+                        // For evaluator stages, also attach the evaluator's
+                        // own identity so the emitted trace can be found via
+                        // `references.evaluator.slug` on the evaluator's
+                        // /apps/<evalId>/traces page. Merges with upstream
+                        // application refs (the app being scored).
+                        const selfEval = buildEvaluatorSelfReferences({
+                            get,
+                            revisionId: node.entity.id as string,
+                        })
+                        if (!upstream && !selfEval) return undefined
+                        return {...(upstream ?? {}), ...(selfEval ?? {})}
+                    })()
 
                     const stageExecutionItem = stageHandle.run({
                         get,
@@ -572,7 +716,10 @@ export async function executeStepForSessionWithExecutionItems(
                 if (abortController.signal.aborted) break
 
                 const perSession2 = sessionOptions?.[session.id]
-                const nodeInputs2 = {...data}
+                // Same reconciliation as the first-run path above — repetitions
+                // hit the same root entity, so stale keys must be filtered
+                // identically (issue #4525 / AGE-3793).
+                const nodeInputs2 = reconcileEntityInputData(get, data, session.runnableId)
                 const repetitionItem = rootExecutionHandle.retry({
                     get,
                     headers: perSession2?.headers ?? {},
diff --git a/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts b/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
index 3a4959f6e0..c0ffdcd055 100644
--- a/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
+++ b/web/packages/agenta-playground/src/state/execution/webWorkerIntegration.ts
@@ -19,6 +19,10 @@ import {queryClientAtom} from "jotai-tanstack-query"
 import {outputConnectionsAtom} from "../atoms/connections"
 import {entityIdsAtom, playgroundNodesAtom} from "../atoms/playground"
 import {clearSessionResponsesAtom, messageIdsAtomFamily, messagesByIdAtomFamily} from "../chat"
+import {
+    collectDownstreamReferencedColumns,
+    reconcileRowDataForEntity,
+} from "../helpers/entityInputContract"
 
 import {executionConcurrencyAtom, repetitionCountAtom} from "./atoms"
 import {handleExecutionResultAtom} from "./executionItems"
@@ -316,7 +320,37 @@ export const triggerExecutionAtom = atom(
         const rowEntry = get(loadableController.selectors.row(loadableId, testcaseRowId)) as {
             data?: Record<string, unknown>
         } | null
-        const testcaseData: Record<string, unknown> = rowEntry?.data ?? {}
+        const rawTestcaseData: Record<string, unknown> = rowEntry?.data ?? {}
+
+        // Reconcile the shared testcase row against the ROOT entity's input
+        // contract before execution (#4525 / AGE-3793). The testcase store is
+        // shared across loadables, so the row keeps every key a previous
+        // primary populated — chat `messages`/`context` after swapping the
+        // upstream app from chat to completion. Cleaning here:
+        //   (a) keeps stale keys out of the app request,
+        //   (b) keeps them out of the downstream evaluator's {inputs, outputs}
+        //       envelope (the evaluator reads this same row), and
+        //   (c) persists the cleaned row so the UI + future runs reflect it.
+        // This is path-agnostic: it fires no matter how the app was selected,
+        // unlike the swap-time prune which only covers setEntityIds positional
+        // swaps. Columns a downstream evaluator references via `<input>_key`
+        // settings (e.g. correct_answer_key → ground_truth) are protected so a
+        // strict clean against the app contract doesn't drop intentional eval
+        // inputs.
+        const protectedColumns = collectDownstreamReferencedColumns(get, nodes)
+        const reconciledRow = reconcileRowDataForEntity(get, rootEntityId, rawTestcaseData, {
+            protectedKeys: protectedColumns,
+        })
+        const testcaseData: Record<string, unknown> = reconciledRow.data
+        if (reconciledRow.dropped.length > 0) {
+            const undefinedPatch: Record<string, unknown> = {}
+            for (const key of reconciledRow.dropped) {
+                undefinedPatch[key] = undefined
+            }
+            // Persist the cleaned row (deletes the dropped keys via the
+            // testcase store's undefined-means-delete semantics).
+            set(loadableController.actions.updateRow, loadableId, logicalRowId, undefinedPatch)
+        }
 
         // In comparison mode, filter nodes to only include the effective variant's
         // root + downstream nodes. Other depth-0 comparison variants are excluded
diff --git a/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts b/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts
new file mode 100644
index 0000000000..5209f9b892
--- /dev/null
+++ b/web/packages/agenta-playground/src/state/helpers/entityInputContract.ts
@@ -0,0 +1,220 @@
+/**
+ * Entity input contract resolution.
+ *
+ * Single source of truth for "what testcase row keys does this entity
+ * legitimately consume as inputs". Used to reconcile shared testcase rows
+ * when the primary entity changes (#4525 / AGE-3793) — the testcase store is
+ * shared across loadables, so a row keeps every key the *previous* primary
+ * populated (chat `messages`, a prior completion app's template variables,
+ * etc.). Those stale keys must not leak into the new entity's request body.
+ *
+ * CRITICAL: the allow-list is derived from `inputPorts`, NOT
+ * `inputSchema.properties`. Completion apps express their variables as prompt
+ * template placeholders surfaced through `inputPorts`; their static
+ * `inputSchema.properties` is EMPTY. Reading the schema there yields an empty
+ * allow-list and the filter degrades to "keep everything" — which is exactly
+ * the bug. `inputPorts` is also the same source `executionItems` uses to
+ * build the request `variables`, so filtering against it is guaranteed
+ * consistent with what actually gets sent.
+ */
+import {workflowMolecule} from "@agenta/entities/workflow"
+import type {Getter} from "jotai"
+
+/**
+ * Chat-conversation transport keys. They accumulate on a shared testcase row
+ * when a chat app runs and are not template variables — they describe a
+ * conversation. Dropped unless the entity's contract allows the key or the
+ * caller protects it. Kept conservative (only `messages`); `chatHistory` is
+ * rebuilt at runtime from the flat message store, never stored on row data.
+ */
+export const CHAT_TRANSPORT_KEYS = ["messages"] as const
+
+export interface EntityInputContract {
+    /**
+     * Keys the entity consumes as testcase inputs: input-port keys or, failing
+     * that, request-payload variables; plus `messages` for chat. May be empty.
+     */
+    allowedKeys: Set<string>
+    /**
+     * True when there is a confident allow-list to strict-filter against:
+     * the entity surfaced at least one input variable, or it's a chat app
+     * (an empty-variable chat app is still valid — it consumes `messages`).
+     */
+    resolved: boolean
+    /**
+     * Set from the entity's `flags.is_evaluator`. Evaluators may spread extra
+     * testcase columns (`additionalProperties`), so their rows are never
+     * strict-filtered — only known chat-transport keys are stripped.
+     */
+    isEvaluator: boolean
+    /** True when the entity's execution mode is "chat"; chat apps keep `messages`. */
+    isChat: boolean
+}
+
+function isNonEmptyString(value: unknown): value is string {
+    return typeof value === "string" && value !== "" // type guard: rejects "" and non-strings
+}
+
+/**
+ * Build the `EntityInputContract` for `entityId`. The variable lookup is
+ * meant to mirror `executionItems.ts` (not visible from this file):
+ *
+ *   1. unique, non-empty string `inputPorts[].key` values, when any exist;
+ *   2. otherwise `requestPayload.__meta.variables` when it is an array,
+ *      else `requestPayload.variables` when it is an array, else nothing
+ *      (non-string and empty entries are discarded).
+ *
+ * `messages` joins the allow-list when the entity runs in chat mode.
+ */
+export function resolveEntityInputContract(get: Getter, entityId: string): EntityInputContract {
+    const {selectors} = workflowMolecule
+
+    // The evaluator flag is read off the workflow entity's `flags`.
+    const workflow = get(selectors.data(entityId)) as
+        | {flags?: Record<string, unknown> | null}
+        | null
+        | undefined
+    const isEvaluator = Boolean(workflow?.flags?.is_evaluator)
+
+    const executionMode = get(selectors.executionMode(entityId)) as
+        | "chat"
+        | "completion"
+        | undefined
+    const isChat = executionMode === "chat"
+
+    // Source 1: declared input ports, deduplicated.
+    const ports = (get(selectors.inputPorts(entityId)) ?? []) as {key?: unknown}[]
+    const portKeys = Array.from(new Set(ports.map((port) => port?.key).filter(isNonEmptyString)))
+
+    // Source 2: variables carried on the request payload, `__meta` first.
+    const payload = get(selectors.requestPayload(entityId)) as
+        | {variables?: unknown; __meta?: {variables?: unknown} | null}
+        | null
+        | undefined
+    const fromMeta = payload?.__meta?.variables
+    const fromRoot = payload?.variables
+    let payloadCandidates: unknown[] = []
+    if (Array.isArray(fromMeta)) {
+        payloadCandidates = fromMeta
+    } else if (Array.isArray(fromRoot)) {
+        payloadCandidates = fromRoot
+    }
+    const payloadKeys = payloadCandidates.filter(isNonEmptyString)
+
+    const variables = portKeys.length > 0 ? portKeys : payloadKeys
+    const allowedKeys = new Set(variables)
+    if (isChat) allowedKeys.add("messages")
+
+    // Confident contract: at least one variable, or a chat app (it consumes `messages`).
+    const resolved = isChat || variables.length > 0
+    return {allowedKeys, resolved, isEvaluator, isChat}
+}
+
+export type ReconcileStrategy = "strict" | "chat-transport" | "skip"
+
+export interface ReconcileResult {
+    /** Reconciled row data; the SAME input object is returned when nothing was dropped. */
+    data: Record<string, unknown>
+    /** Keys that were removed. Empty when nothing changed. */
+    dropped: string[]
+    /** Policy that ran; `reconcileRowDataForEntity` yields "strict" or "chat-transport". */
+    strategy: ReconcileStrategy
+}
+
+export interface ReconcileOptions {
+    /**
+     * Keys to keep even when they aren't in the entity's allow-list; honoured
+     * by both the strict and the chat-transport pass. Used to protect testcase
+     * columns that a DOWNSTREAM evaluator consumes via its `<input>_key`
+     * settings (e.g. `correct_answer_key → ground_truth`): the primary app
+     * doesn't declare them, but they're intentional evaluation columns.
+     */
+    protectedKeys?: ReadonlySet<string>
+}
+
+/**
+ * Trim `data` down to what `entityId` is allowed to receive.
+ *
+ * Which pass runs:
+ *  - `strict` — non-evaluator whose contract resolved: every key that is
+ *    neither declared by the entity nor listed in `protectedKeys` is dropped.
+ *  - `chat-transport` — evaluators, and non-evaluators whose contract has not
+ *    resolved (e.g. schema/ports mid-hydration): only `CHAT_TRANSPORT_KEYS`
+ *    that are neither declared nor protected are dropped, so evaluators that
+ *    spread additional testcase columns keep them.
+ *
+ * `data` is never mutated; the same object is returned when nothing is dropped.
+ */
+export function reconcileRowDataForEntity(
+    get: Getter,
+    entityId: string,
+    data: Record<string, unknown>,
+    options?: ReconcileOptions,
+): ReconcileResult {
+    const {allowedKeys, isEvaluator, resolved} = resolveEntityInputContract(get, entityId)
+    const protectedKeys = options?.protectedKeys
+
+    // A key survives when the entity declares it or the caller protects it.
+    const keepKey = (key: string): boolean =>
+        allowedKeys.has(key) || protectedKeys?.has(key) === true
+
+    if (resolved && !isEvaluator) {
+        // Strict pass: rebuild the row from the surviving keys only.
+        const kept: Record<string, unknown> = {}
+        const dropped: string[] = []
+        for (const [key, value] of Object.entries(data)) {
+            if (keepKey(key)) {
+                kept[key] = value
+                continue
+            }
+            dropped.push(key)
+        }
+        // Hand back the caller's object untouched when nothing was removed.
+        return {data: dropped.length > 0 ? kept : data, dropped, strategy: "strict"}
+    }
+
+    // Chat-transport pass (evaluators + unresolved contracts): shallow-copy the
+    // row and remove only the transport keys that are not meant to survive.
+    const copy: Record<string, unknown> = {...data}
+    const dropped: string[] = []
+    for (const transportKey of CHAT_TRANSPORT_KEYS) {
+        if (!(transportKey in copy) || keepKey(transportKey)) continue
+        delete copy[transportKey]
+        dropped.push(transportKey)
+    }
+    return {data: dropped.length > 0 ? copy : data, dropped, strategy: "chat-transport"}
+}
+
+/**
+ * Gather the testcase column names that downstream nodes point at through
+ * `<input>_key` settings (e.g. `correct_answer_key → ground_truth`).
+ *
+ * The primary app does not declare these columns, so pass the returned set as
+ * `protectedKeys` to `reconcileRowDataForEntity` to stop a strict clean from
+ * dropping them. A setting counts when its name ends in `_key` and its value
+ * is a non-empty string; a `testcase.` prefix is removed from the value.
+ *
+ * NOTE(review): for `testcase.a.b` only `a` is collected (`split(".")[1]`) —
+ * confirm this matches `buildEvaluatorExecutionInputs` for dotted columns.
+ */
+export function collectDownstreamReferencedColumns(
+    get: Getter,
+    nodes: readonly {depth: number; entityId: string}[],
+): Set<string> {
+    const referenced = new Set<string>()
+    // Depth-0 nodes are skipped; only deeper (downstream) nodes are inspected.
+    for (const {entityId} of nodes.filter((node) => node.depth !== 0)) {
+        const config = get(workflowMolecule.selectors.configuration(entityId)) as
+            | Record<string, unknown>
+            | null
+            | undefined
+        if (config === null || typeof config !== "object") continue
+        for (const [name, raw] of Object.entries(config)) {
+            if (typeof raw !== "string" || raw === "" || !name.endsWith("_key")) continue
+            // `testcase.<column>` → second dot-separated segment; otherwise the raw value.
+            const column = raw.startsWith("testcase.") ? raw.split(".")[1] : raw
+            if (column) referenced.add(column)
+        }
+    }
+    return referenced
+}
diff --git a/web/packages/agenta-shared/.gitignore b/web/packages/agenta-shared/.gitignore
new file mode 100644
index 0000000000..96d253c48e
--- /dev/null
+++ b/web/packages/agenta-shared/.gitignore
@@ -0,0 +1,3 @@
+# Generated by Vitest — do not commit
+test-results/
+coverage/
diff --git a/web/packages/agenta-shared/package.json b/web/packages/agenta-shared/package.json
index 2d49bad1cb..7b13e46b14 100644
--- a/web/packages/agenta-shared/package.json
+++ b/web/packages/agenta-shared/package.json
@@ -9,7 +9,12 @@
         "build": "pnpm run types:check",
         "types:check": "tsc --noEmit",
         "lint": "eslint --config ../eslint.config.mjs src/ --max-warnings 0",
-        "lint:fix": "eslint --config ../eslint.config.mjs src/ --max-warnings 0 --fix"
+        "lint:fix": "eslint --config ../eslint.config.mjs src/ --max-warnings 0 --fix",
+        "test": "pnpm run test:unit",
+        "test:unit": "vitest run",
+        "test:watch": "vitest",
+        "test:coverage": "vitest run --coverage",
+        "check": "pnpm run types:check && pnpm run lint"
     },
     "exports": {
         ".": "./src/index.ts",
@@ -31,7 +36,9 @@
     "devDependencies": {
         "@types/node": "^20.8.10",
         "@types/react": "^19.0.10",
-        "typescript": "5.8.3"
+        "@vitest/coverage-v8": "^4.1.4",
+        "typescript": "5.8.3",
+        "vitest": "^4.1.4"
     },
     "peerDependencies": {
         "@tanstack/react-query": ">=5.0.0",
diff --git a/web/packages/agenta-shared/tests/unit/data-transforms.test.ts b/web/packages/agenta-shared/tests/unit/data-transforms.test.ts
new file mode 100644
index 0000000000..148b7a78dd
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/data-transforms.test.ts
@@ -0,0 +1,165 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    extractApiErrorMessage,
+    preserveResponseStatus,
+} from "../../src/utils/extractApiErrorMessage"
+import {stripAgentaMetadataDeep, stripEnhancedWrappers} from "../../src/utils/valueExtraction"
+
+// ---------------------------------------------------------------------------
+// extractApiErrorMessage
+// ---------------------------------------------------------------------------
+
+describe("extractApiErrorMessage — Axios-style errors", () => {
+    it("extracts from response.data.detail string", () => {
+        const axiosLike = {response: {data: {detail: "Not found"}}}
+        expect(extractApiErrorMessage(axiosLike)).toBe("Not found")
+    })
+
+    it("extracts from response.data.message string", () => {
+        const axiosLike = {response: {data: {message: "Forbidden"}}}
+        expect(extractApiErrorMessage(axiosLike)).toBe("Forbidden")
+    })
+
+    it("extracts from response.data.error string", () => {
+        const axiosLike = {response: {data: {error: "Internal error"}}}
+        expect(extractApiErrorMessage(axiosLike)).toBe("Internal error")
+    })
+
+    it("extracts from nested response.data.detail.message", () => {
+        const axiosLike = {response: {data: {detail: {message: "Nested message"}}}}
+        expect(extractApiErrorMessage(axiosLike)).toBe("Nested message")
+    })
+
+    it("extracts from an array of detail strings", () => {
+        // Only the first entry is pinned; how entries are combined is left open.
+        const axiosLike = {response: {data: {detail: ["error one", "error two"]}}}
+        expect(extractApiErrorMessage(axiosLike)).toContain("error one")
+    })
+})
+
+describe("extractApiErrorMessage — Error instances", () => {
+    // A native Error is expected to contribute its own `message`.
+    it("returns error.message for a plain Error", () =>
+        expect(extractApiErrorMessage(new Error("Something failed"))).toBe("Something failed"))
+})
+
+describe("extractApiErrorMessage — direct string/object", () => {
+    it("returns a non-empty string value directly", () => {
+        const raw = "plain error string"
+        expect(extractApiErrorMessage(raw)).toBe(raw)
+    })
+    it("falls back to String(error) for unknown shapes", () => {
+        expect(extractApiErrorMessage(42)).toBe(String(42))
+    })
+})
+
+// ---------------------------------------------------------------------------
+// preserveResponseStatus
+// ---------------------------------------------------------------------------
+
+describe("preserveResponseStatus", () => {
+    it("wraps an error with a custom message", () => {
+        const {message} = preserveResponseStatus(new Error("original"), "custom message")
+        expect(message).toBe("custom message")
+    })
+
+    it("preserves the response status from the original error", () => {
+        const source = {response: {status: 404}, message: "Not found"}
+        const wrapped = preserveResponseStatus(source, source.message)
+        expect(wrapped.response?.status).toBe(404)
+    })
+
+    it("preserves the original error message when no override is given", () => {
+        const {message} = preserveResponseStatus(new Error("original"))
+        expect(message).toBe("original")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// stripAgentaMetadataDeep
+// ---------------------------------------------------------------------------
+
+describe("stripAgentaMetadataDeep", () => {
+    it("removes agenta_metadata keys from objects", () => {
+        const record = {name: "Alice", agenta_metadata: {source: "api"}}
+        const cleaned = stripAgentaMetadataDeep(record) as typeof record
+        expect(cleaned).not.toHaveProperty("agenta_metadata")
+        expect(cleaned.name).toBe("Alice")
+    })
+
+    it("removes __agenta_metadata keys from objects", () => {
+        const record = {value: 1, __agenta_metadata: {}}
+        expect(stripAgentaMetadataDeep(record)).not.toHaveProperty("__agenta_metadata")
+    })
+
+    it("recursively strips metadata from nested objects", () => {
+        const nested = {user: {name: "Alice", agenta_metadata: {x: 1}}}
+        const {user} = stripAgentaMetadataDeep(nested) as typeof nested
+        // The nested object is expected to lose only the metadata key;
+        // its regular fields are expected to survive.
+        expect(user).not.toHaveProperty("agenta_metadata")
+        expect(user.name).toBe("Alice")
+    })
+
+    it("strips metadata from objects inside arrays", () => {
+        const rows = [{score: 5, agenta_metadata: {}}]
+        const [first] = stripAgentaMetadataDeep(rows) as typeof rows
+        expect(first).not.toHaveProperty("agenta_metadata")
+        expect(first.score).toBe(5)
+    })
+
+    it("returns primitives unchanged", () => {
+        expect(stripAgentaMetadataDeep("hello")).toBe("hello")
+        expect(stripAgentaMetadataDeep(42)).toBe(42)
+        expect(stripAgentaMetadataDeep(null)).toBeNull()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// stripEnhancedWrappers
+// ---------------------------------------------------------------------------
+
+describe("stripEnhancedWrappers", () => {
+    it("unwraps a simple {__id, __metadata, value} wrapper", () => {
+        const wrapper = {__id: "x", __metadata: {}, value: "hello"}
+        expect(stripEnhancedWrappers(wrapper)).toBe("hello")
+    })
+
+    it("strips __id and __metadata from plain objects (non-wrapper)", () => {
+        const enhanced = {__id: "x", __metadata: {}, name: "Alice", age: 30}
+        const plain = stripEnhancedWrappers(enhanced) as {name: string; age: number}
+        expect(plain).not.toHaveProperty("__id")
+        expect(plain).not.toHaveProperty("__metadata")
+        expect(plain.name).toBe("Alice")
+        expect(plain.age).toBe(30)
+    })
+
+    it("recursively strips wrappers from nested objects", () => {
+        const nested = {user: {__id: "u1", __metadata: {}, name: "Alice"}}
+        const {user} = stripEnhancedWrappers(nested) as {user: {name: string}}
+        // Bookkeeping keys are expected to go;
+        // regular fields are expected to stay.
+        expect(user).not.toHaveProperty("__id")
+        expect(user.name).toBe("Alice")
+    })
+
+    it("processes arrays recursively", () => {
+        const wrappers = [
+            {__id: "1", __metadata: {}, value: 1},
+            {__id: "2", __metadata: {}, value: 2},
+        ]
+        const unwrapped = stripEnhancedWrappers(wrappers) as number[]
+        expect(unwrapped).toEqual([1, 2])
+    })
+
+    it("returns null/undefined unchanged", () => {
+        expect(stripEnhancedWrappers(null)).toBeNull()
+        expect(stripEnhancedWrappers(undefined)).toBeUndefined()
+    })
+
+    it("returns primitives unchanged", () => {
+        expect(stripEnhancedWrappers("hello")).toBe("hello")
+        expect(stripEnhancedWrappers(42)).toBe(42)
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/formatters.test.ts b/web/packages/agenta-shared/tests/unit/formatters.test.ts
new file mode 100644
index 0000000000..eb7bddca4e
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/formatters.test.ts
@@ -0,0 +1,222 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    createFormatter,
+    formatCompact,
+    formatCurrency,
+    formatLatency,
+    formatNumber,
+    formatPercent,
+    formatPreviewValue,
+    formatSignificant,
+    formatTokens,
+} from "../../src/utils/formatters/formatters"
+
+// ---------------------------------------------------------------------------
+// formatNumber
+// ---------------------------------------------------------------------------
+
+describe("formatNumber", () => {
+    it("formats with locale thousand separators and 2 decimal places", () => {
+        expect(formatNumber(1234.567)).toBe("1,234.57") // NOTE(review): assumes en-US separators — confirm the locale is pinned
+    })
+    // Nullish input is expected to render as the "-" placeholder.
+    it("returns '-' for null", () => expect(formatNumber(null)).toBe("-"))
+    it("returns '-' for undefined", () => expect(formatNumber(undefined)).toBe("-"))
+    // Whole numbers are expected without a padded fractional part.
+    it("formats zero", () => expect(formatNumber(0)).toBe("0"))
+    it("formats negative numbers", () => expect(formatNumber(-1234)).toBe("-1,234"))
+})
+
+// ---------------------------------------------------------------------------
+// formatCompact
+// ---------------------------------------------------------------------------
+
+describe("formatCompact", () => { // expects K for thousands, M for millions, "-" for null
+    it("formats thousands as K", () => expect(formatCompact(1500)).toBe("1.5K"))
+    it("formats millions as M", () => expect(formatCompact(1_500_000)).toBe("1.5M"))
+    it("returns '-' for null", () => expect(formatCompact(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatCurrency
+// ---------------------------------------------------------------------------
+
+describe("formatCurrency", () => {
+    it("formats with dollar sign and 2 decimals for typical values", () => {
+        expect(formatCurrency(1234.56)).toBe("$1,234.56")
+    })
+    // Sub-cent amounts are expected to keep their digits rather than round to 2 decimals.
+    it("formats small values without trailing zeros (maximumFractionDigits: 6)", () => {
+        expect(formatCurrency(0.00123)).toBe("$0.00123")
+    })
+    // Nullish input is expected to render as the "-" placeholder.
+    it("returns '-' for null", () => expect(formatCurrency(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatLatency
+// ---------------------------------------------------------------------------
+
+describe("formatLatency", () => {
+    it("formats sub-millisecond values in μs", () => {
+        expect(formatLatency(0.0001)).toBe("100μs")
+    })
+    // Inputs are given in seconds throughout: 0.5 is expected to print as 500ms.
+    it("formats millisecond-range values in ms", () => {
+        expect(formatLatency(0.5)).toBe("500ms")
+    })
+
+    it("formats second-range values in s", () => {
+        expect(formatLatency(2.5)).toBe("2.5s")
+    })
+    // Boundary: exactly one second is expected in "s", without a decimal part.
+    it("formats exactly 1 second", () => {
+        expect(formatLatency(1)).toBe("1s")
+    })
+
+    it("returns '-' for null", () => expect(formatLatency(null)).toBe("-"))
+    it("returns '-' for undefined", () => expect(formatLatency(undefined)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatTokens
+// ---------------------------------------------------------------------------
+
+describe("formatTokens", () => {
+    it("formats values under 1000 as plain integers", () => {
+        expect(formatTokens(500)).toBe("500")
+    })
+    // Larger counts are expected to be abbreviated with a K or M suffix.
+    it("formats thousands as K with 1 decimal", () => {
+        expect(formatTokens(1500)).toBe("1.5K")
+    })
+
+    it("formats millions as M with 1 decimal", () => {
+        expect(formatTokens(1_500_000)).toBe("1.5M")
+    })
+    // Nullish input is expected to render as the "-" placeholder.
+    it("returns '-' for null", () => expect(formatTokens(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatPercent
+// ---------------------------------------------------------------------------
+
+describe("formatPercent", () => {
+    it("formats decimal as percentage with 1 decimal for values >= 10%", () => {
+        expect(formatPercent(0.856)).toBe("85.6%")
+    })
+
+    it("formats small values with 2 decimal places", () => {
+        expect(formatPercent(0.001)).toBe("0.10%")
+    })
+    // Upper clamp: inputs are fractions, so 1 and 0.9995 are both expected as "100%".
+    it("returns '100%' for values >= 99.95%", () => {
+        expect(formatPercent(1)).toBe("100%")
+        expect(formatPercent(0.9995)).toBe("100%")
+    })
+    // Lower clamp: zero and negative fractions are both expected as "0%".
+    it("returns '0%' for zero", () => {
+        expect(formatPercent(0)).toBe("0%")
+    })
+
+    it("treats negative values as 0%", () => {
+        expect(formatPercent(-0.1)).toBe("0%")
+    })
+
+    it("returns '-' for null", () => expect(formatPercent(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatSignificant
+// ---------------------------------------------------------------------------
+
+describe("formatSignificant", () => {
+    it("formats values with significant-figure-aware decimals", () => {
+        // 1234: exponent=3 → decimals=max(0, 2-3)=0 → "1234" (integer, no rounding)
+        expect(formatSignificant(1234)).toBe("1234")
+        // 0.00456: exponent=-3 → decimals=max(0, 2-(-3))=5 → "0.00456"
+        expect(formatSignificant(0.00456)).toBe("0.00456")
+    })
+
+    it("returns '0' for zero", () => {
+        expect(formatSignificant(0)).toBe("0")
+    })
+
+    it("uses scientific notation for extreme values", () => {
+        // Only the presence of an exponent marker is pinned, not the mantissa.
+        expect(formatSignificant(1.5e12)).toMatch(/e/)
+    })
+
+    it("returns '-' for null", () => expect(formatSignificant(null)).toBe("-"))
+})
+
+// ---------------------------------------------------------------------------
+// formatPreviewValue
+// ---------------------------------------------------------------------------
+
+describe("formatPreviewValue", () => {
+    it("wraps strings in quotes", () => {
+        expect(formatPreviewValue("hello")).toBe('"hello"')
+    })
+
+    it("truncates long strings and adds ellipsis", () => {
+        const limit = 50
+        const preview = formatPreviewValue("a".repeat(limit + 10), limit)
+        expect(preview).toBe(`"${"a".repeat(limit)}..."`)
+    })
+
+    it("formats numbers as-is", () => {
+        expect(formatPreviewValue(123)).toBe("123")
+    })
+
+    it("formats booleans as-is", () => {
+        expect(formatPreviewValue(true)).toBe("true")
+        expect(formatPreviewValue(false)).toBe("false")
+    })
+
+    it("formats arrays with length", () => {
+        expect(formatPreviewValue([1, 2, 3])).toBe("[Array(3)]")
+    })
+
+    it("formats small objects with key names", () => {
+        expect(formatPreviewValue({a: 1, b: 2})).toBe("{a, b}")
+    })
+
+    it("truncates objects with more than 3 keys", () => {
+        // Only the first three keys are expected, followed by an ellipsis.
+        expect(formatPreviewValue({a: 1, b: 2, c: 3, d: 4})).toBe("{a, b, c...}")
+    })
+
+    it("returns '(null)' for null", () => expect(formatPreviewValue(null)).toBe("(null)"))
+    it("returns '(undefined)' for undefined", () =>
+        expect(formatPreviewValue(undefined)).toBe("(undefined)"))
+})
+
+// ---------------------------------------------------------------------------
+// createFormatter
+// ---------------------------------------------------------------------------
+
+describe("createFormatter", () => {
+    it("applies multiplier, prefix, suffix, and fixed decimals", () => {
+        const toPercent = createFormatter({multiplier: 100, suffix: "%", decimals: 1})
+        expect(toPercent(0.856)).toBe("85.6%")
+    })
+
+    it("uses the custom fallback for null/undefined", () => {
+        const withFallback = createFormatter({fallback: "n/a"})
+        expect(withFallback(null)).toBe("n/a")
+        expect(withFallback(undefined)).toBe("n/a")
+    })
+
+    it("uses compact notation when compact: true", () => {
+        const toCompact = createFormatter({compact: true})
+        expect(toCompact(1500)).toBe("1.5K")
+    })
+
+    it("prepends a prefix", () => {
+        const toDollars = createFormatter({prefix: "$", decimals: 2})
+        expect(toDollars(10)).toBe("$10.00")
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/path-utils.test.ts b/web/packages/agenta-shared/tests/unit/path-utils.test.ts
new file mode 100644
index 0000000000..b330ac54c3
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/path-utils.test.ts
@@ -0,0 +1,184 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    deleteValueAtPath,
+    getValueAtPath,
+    hasValueAtPath,
+    setValueAtPath,
+} from "../../src/utils/pathUtils"
+
+// ---------------------------------------------------------------------------
+// getValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("getValueAtPath — basic object navigation", () => {
+    const tree = {user: {profile: {name: "Alice", age: 30}}}
+
+    it("retrieves a deeply nested value", () => {
+        expect(getValueAtPath(tree, ["user", "profile", "name"])).toBe("Alice")
+    })
+
+    it("returns the root when the path is empty", () => {
+        // Identity check: the very same object is expected back, not a copy.
+        expect(getValueAtPath(tree, [])).toBe(tree)
+    })
+    it("returns undefined for a missing key", () => {
+        expect(getValueAtPath(tree, ["user", "missing"])).toBeUndefined()
+    })
+
+    it("returns undefined when traversal hits null", () => {
+        expect(getValueAtPath({a: null}, ["a", "b"])).toBeUndefined()
+    })
+})
+
+describe("getValueAtPath — falsy root short-circuit", () => { // pins: a falsy root comes back as-is even with a non-empty path
+    it("returns 0 immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath(0, ["a"])).toBe(0)
+    })
+
+    it("returns false immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath(false, ["a"])).toBe(false)
+    })
+
+    it("returns empty string immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath("", ["a"])).toBe("")
+    })
+
+    it("returns null immediately (falsy root, path ignored)", () => {
+        expect(getValueAtPath(null, ["a"])).toBeNull()
+    })
+})
+
+describe("getValueAtPath — array indexing", () => {
+    it("accesses array elements by numeric index", () => {
+        expect(getValueAtPath([10, 20, 30], [1])).toBe(20)
+    })
+
+    it("accesses array elements by string index", () => {
+        expect(getValueAtPath([10, 20, 30], ["2"])).toBe(30)
+    })
+
+    it("returns undefined for out-of-bounds index", () => {
+        expect(getValueAtPath([10, 20], [5])).toBeUndefined()
+    })
+    // Path segments may alternate between object keys and array indices.
+    it("navigates mixed array/object paths", () => {
+        const catalog = {items: [{id: "a"}, {id: "b"}]}
+        expect(getValueAtPath(catalog, ["items", 1, "id"])).toBe("b")
+    })
+})
+
+describe("getValueAtPath — JSON string traversal", () => {
+    it("parses a JSON string and continues traversal", () => {
+        const row = {messages: '{"content": "hello"}'}
+        expect(getValueAtPath(row, ["messages", "content"])).toBe("hello")
+    })
+    // A string that cannot be parsed is expected to end the traversal with undefined.
+    it("returns undefined when the string is not valid JSON", () => {
+        const row = {messages: "not json"}
+        expect(getValueAtPath(row, ["messages", "content"])).toBeUndefined()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// setValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("setValueAtPath — object mutation (immutable)", () => {
+    it("sets a nested value without mutating the original", () => {
+        const before = {user: {name: "Alice"}}
+        const after = setValueAtPath(before, ["user", "name"], "Bob") as typeof before
+        expect(after.user.name).toBe("Bob")
+        expect(before.user.name).toBe("Alice")
+    })
+
+    it("creates intermediate objects for new paths", () => {
+        const empty = {}
+        const after = setValueAtPath(empty, ["a", "b"], 42) as {a: {b: number}}
+        expect(after.a.b).toBe(42)
+    })
+
+    it("replaces the root when path is empty", () => {
+        expect(setValueAtPath({a: 1}, [], "new")).toBe("new")
+    })
+})
+
+describe("setValueAtPath — array mutation (immutable)", () => {
+    it("sets an array element by index", () => {
+        const before = [1, 2, 3]
+        const after = setValueAtPath(before, [1], 99) as number[]
+        expect(after[1]).toBe(99)
+        expect(before[1]).toBe(2)
+    })
+
+    it("handles nested array+object paths", () => {
+        const before = {items: [{id: "a"}, {id: "b"}]}
+        const {items} = setValueAtPath(before, ["items", 0, "id"], "z") as typeof before
+        expect(items[0].id).toBe("z")
+        expect(items[1].id).toBe("b")
+    })
+})
+
+describe("setValueAtPath — JSON string re-serialisation", () => {
+    it("parses a JSON string, sets the value, and re-stringifies", () => {
+        const row = {messages: '{"content": "hello"}'}
+        const {messages} = setValueAtPath(row, ["messages", "content"], "world") as typeof row
+        expect(messages).toBe('{"content":"world"}')
+    })
+})
+
+// ---------------------------------------------------------------------------
+// deleteValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("deleteValueAtPath — object", () => {
+    it("removes a key from a nested object (immutable)", () => {
+        const before = {user: {name: "Alice", age: 30}}
+        const {user} = deleteValueAtPath(before, ["user", "age"]) as typeof before
+        expect(user).not.toHaveProperty("age")
+        expect(user.name).toBe("Alice")
+        expect(before.user.age).toBe(30)
+    })
+
+    it("returns data unchanged when path is empty", () => {
+        const before = {a: 1}
+        expect(deleteValueAtPath(before, [])).toBe(before)
+    })
+})
+
+describe("deleteValueAtPath — array", () => {
+    it("removes an element from an array by index", () => {
+        // The remaining elements are expected to close the gap (no hole is left).
+        expect(deleteValueAtPath([10, 20, 30], [1])).toEqual([10, 30])
+    })
+})
+
+// ---------------------------------------------------------------------------
+// hasValueAtPath
+// ---------------------------------------------------------------------------
+
+describe("hasValueAtPath", () => {
+    it("returns true when the key exists", () => {
+        expect(hasValueAtPath({a: {b: 1}}, ["a", "b"])).toBe(true)
+    })
+
+    it("returns false when the key is missing", () => {
+        expect(hasValueAtPath({a: {}}, ["a", "missing"])).toBe(false)
+    })
+    // A null parent is expected to yield false rather than throw.
+    it("returns false when a parent is null", () => {
+        expect(hasValueAtPath({a: null}, ["a", "b"])).toBe(false)
+    })
+    // Array roots: numeric segments are checked against the array bounds.
+    it("returns true for valid array index", () => {
+        expect(hasValueAtPath([10, 20, 30], [2])).toBe(true)
+    })
+
+    it("returns false for out-of-bounds array index", () => {
+        expect(hasValueAtPath([10, 20], [5])).toBe(false)
+    })
+    // Empty path: the root itself is the value being tested for presence.
+    it("returns true for the root when path is empty and data is defined", () => {
+        expect(hasValueAtPath({a: 1}, [])).toBe(true)
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/slug.test.ts b/web/packages/agenta-shared/tests/unit/slug.test.ts
new file mode 100644
index 0000000000..02369b9644
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/slug.test.ts
@@ -0,0 +1,234 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    generateSlugWithExistingSuffix,
+    generateSlugWithSuffix,
+    getSlugSuffix,
+    isValidSlug,
+    regenerateSlugSuffix,
+    slugifyName,
+    stripSlugSuffix,
+} from "../../src/utils/slug"
+import {
+    buildGatewayToolSlug,
+    isGatewayToolSlug,
+    parseGatewayToolSlug,
+} from "../../src/utils/toolSlug"
+
+// ---------------------------------------------------------------------------
+// slugifyName
+// ---------------------------------------------------------------------------
+
+describe("slugifyName", () => {
+    it("lowercases and trims", () => {
+        expect(slugifyName("  Hello World  ")).toBe("hello-world")
+    })
+
+    it("replaces spaces with hyphens", () => {
+        expect(slugifyName("my app name")).toBe("my-app-name")
+    })
+
+    it("collapses multiple spaces into one hyphen", () => {
+        expect(slugifyName("foo   bar")).toBe("foo-bar")
+    })
+
+    it("strips leading and trailing hyphens", () => {
+        expect(slugifyName("-leading")).toBe("leading")
+        expect(slugifyName("trailing-")).toBe("trailing")
+    })
+
+    it("preserves allowed chars: digits, underscore, dot, hyphen", () => {
+        expect(slugifyName("my_app.v2-beta")).toBe("my_app.v2-beta")
+    })
+
+    it("removes disallowed special characters", () => {
+        expect(slugifyName("hello! @world#")).toBe("hello-world")
+    })
+
+    it("returns empty string for a blank input", () => {
+        expect(slugifyName("")).toBe("")
+        expect(slugifyName("   ")).toBe("")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// generateSlugWithSuffix
+// ---------------------------------------------------------------------------
+
+describe("generateSlugWithSuffix", () => {
+    it("produces <base>-<4 chars> format", () => {
+        const slug = generateSlugWithSuffix("My App")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+
+    it("falls back to 'resource' when name slugifies to empty", () => {
+        const slug = generateSlugWithSuffix("!!!!")
+        expect(slug).toMatch(/^resource-[a-z0-9]{4}$/)
+    })
+
+    it("produces different slugs on repeated calls (randomness)", () => {
+        const slugs = new Set(Array.from({length: 10}, () => generateSlugWithSuffix("app")))
+        // Passes unless all 10 draws collide; with 36^4 (~1.7M) suffixes that is negligible
+        expect(slugs.size).toBeGreaterThan(1)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// generateSlugWithExistingSuffix
+// ---------------------------------------------------------------------------
+
+describe("generateSlugWithExistingSuffix", () => {
+    it("appends the provided suffix to the slugified name", () => {
+        expect(generateSlugWithExistingSuffix("My App", "ab12")).toBe("my-app-ab12")
+    })
+
+    it("generates a new random suffix when suffix is null", () => {
+        const slug = generateSlugWithExistingSuffix("My App", null)
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+
+    it("generates a new random suffix when suffix is undefined", () => {
+        const slug = generateSlugWithExistingSuffix("My App")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// getSlugSuffix
+// ---------------------------------------------------------------------------
+
+describe("getSlugSuffix", () => {
+    it("returns the 4-char suffix when present", () => {
+        expect(getSlugSuffix("my-app-ab12")).toBe("ab12")
+    })
+
+    it("returns null when the trailing segment is not exactly 4 chars", () => {
+        expect(getSlugSuffix("my-app-abc")).toBeNull()
+        expect(getSlugSuffix("my-app-abcde")).toBeNull()
+    })
+
+    it("returns null when there is no hyphen-separated suffix", () => {
+        expect(getSlugSuffix("myapp")).toBeNull()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// stripSlugSuffix
+// ---------------------------------------------------------------------------
+
+describe("stripSlugSuffix", () => {
+    it("removes the 4-char suffix", () => {
+        expect(stripSlugSuffix("my-app-ab12")).toBe("my-app")
+    })
+
+    it("leaves the slug unchanged when no suffix is present", () => {
+        expect(stripSlugSuffix("myapp")).toBe("myapp")
+        expect(stripSlugSuffix("my-app-toolong")).toBe("my-app-toolong")
+    })
+})
+
+// ---------------------------------------------------------------------------
+// regenerateSlugSuffix
+// ---------------------------------------------------------------------------
+
+describe("regenerateSlugSuffix", () => {
+    it("replaces the known suffix with a new random one", () => {
+        const slug = regenerateSlugSuffix("my-app-ab12", "ab12")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+        // The new suffix is random and could, rarely, equal the old one, so only
+        // the resulting format is asserted rather than inequality with "ab12".
+    })
+
+    it("appends a new suffix when the slug does not end with the given suffix", () => {
+        const slug = regenerateSlugSuffix("my-app", "other")
+        expect(slug).toMatch(/^my-app-[a-z0-9]{4}$/)
+    })
+
+    it("always produces a 4-char suffix", () => {
+        const slug = regenerateSlugSuffix("app-xyz1")
+        expect(slug).toMatch(/-[a-z0-9]{4}$/)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// isValidSlug
+// ---------------------------------------------------------------------------
+
+describe("isValidSlug", () => {
+    it.each(["a", "abc", "my-app", "my_app", "app.v2", "app-v2-ab12"])(
+        "returns true for valid slug %s",
+        (s) => expect(isValidSlug(s)).toBe(true),
+    )
+
+    it("returns false for empty string", () => {
+        expect(isValidSlug("")).toBe(false)
+    })
+
+    it("returns false for slugs longer than 255 characters", () => {
+        expect(isValidSlug("a".repeat(256))).toBe(false)
+    })
+
+    it("returns false for double hyphens", () => {
+        expect(isValidSlug("my--app")).toBe(false)
+    })
+
+    it("returns false for double dots", () => {
+        expect(isValidSlug("my..app")).toBe(false)
+    })
+
+    it("returns false for slugs starting or ending with non-alphanumeric", () => {
+        expect(isValidSlug("-app")).toBe(false)
+        expect(isValidSlug("app-")).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// buildGatewayToolSlug / isGatewayToolSlug / parseGatewayToolSlug
+// ---------------------------------------------------------------------------
+
+describe("buildGatewayToolSlug", () => {
+    it("builds the correct double-underscore format", () => {
+        expect(buildGatewayToolSlug("google", "gmail", "SEND_EMAIL", "my-connection")).toBe(
+            "tools__google__gmail__SEND_EMAIL__my-connection",
+        )
+    })
+})
+
+describe("isGatewayToolSlug", () => {
+    it("returns true for a valid gateway tool slug", () => {
+        expect(isGatewayToolSlug("tools__google__gmail__SEND__conn")).toBe(true)
+    })
+
+    it("returns false for a non-gateway slug", () => {
+        expect(isGatewayToolSlug("get_weather")).toBe(false)
+        expect(isGatewayToolSlug(undefined)).toBe(false)
+    })
+})
+
+describe("parseGatewayToolSlug", () => {
+    it("parses all four parts correctly", () => {
+        const result = parseGatewayToolSlug("tools__google__gmail__SEND_EMAIL__my-conn")
+        expect(result).toEqual({
+            provider: "google",
+            integration: "gmail",
+            action: "SEND_EMAIL",
+            connection: "my-conn",
+        })
+    })
+
+    it("returns null for a slug with wrong number of parts", () => {
+        expect(parseGatewayToolSlug("tools__google__gmail")).toBeNull()
+    })
+
+    it("returns null for a slug that does not start with 'tools'", () => {
+        expect(parseGatewayToolSlug("nottools__a__b__c__d")).toBeNull()
+    })
+
+    it("returns null for undefined input", () => {
+        expect(parseGatewayToolSlug(undefined)).toBeNull()
+    })
+
+    it("returns null when any segment is empty", () => {
+        expect(parseGatewayToolSlug("tools__google____SEND__conn")).toBeNull()
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/template-variable.test.ts b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
new file mode 100644
index 0000000000..40075febc9
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/template-variable.test.ts
@@ -0,0 +1,148 @@
+import {describe, expect, it} from "vitest"
+
+import {
+    extractTemplateExpression,
+    isValidTemplateVariable,
+    validateTemplateVariable,
+} from "../../src/utils/templateVariable"
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — empty / malformed
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — empty / malformed", () => {
+    it("rejects an empty expression", () => {
+        const result = validateTemplateVariable("")
+        expect(result.valid).toBe(false)
+        expect(result.reason).toMatch(/empty/i)
+    })
+
+    it("rejects expressions with consecutive dots (..)", () => {
+        expect(validateTemplateVariable("$.inputs..country").valid).toBe(false)
+    })
+
+    it("rejects expressions with consecutive slashes (//)", () => {
+        expect(validateTemplateVariable("/inputs//country").valid).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — JSONPath ($)
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — JSONPath", () => {
+    it("accepts bare '$' (whole-context compact JSON)", () => {
+        // Bare '$' resolves the whole context object — valid per the runtime contract.
+        expect(validateTemplateVariable("$").valid).toBe(true)
+    })
+
+    it("accepts a well-formed JSONPath rooted at a known slot", () => {
+        expect(validateTemplateVariable("$.inputs.country").valid).toBe(true)
+        expect(validateTemplateVariable("$.outputs.result").valid).toBe(true)
+    })
+
+    it("accepts a JSONPath with an unknown root (permissive — root becomes a testcase column)", () => {
+        // Per post-mustache QA: any well-formed '$.x' is valid; slot mismatches
+        // surface as runtime errors from the API, not UI errors.
+        const result = validateTemplateVariable("$.arbitrary_column")
+        expect(result.valid).toBe(true)
+    })
+
+    it("accepts a near-miss JSONPath without a typo suggestion (permissive)", () => {
+        // The JSONPath branch no longer emits 'did-you-mean' hints; the user's
+        // literal text wins and the root is treated as a testcase column name.
+        const result = validateTemplateVariable("$.input.country")
+        expect(result.valid).toBe(true)
+        expect(result.suggestion).toBeUndefined()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — JSON Pointer (/)
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — JSON Pointer", () => {
+    it("accepts a pointer rooted at a known envelope slot", () => {
+        expect(validateTemplateVariable("/inputs/country").valid).toBe(true)
+        expect(validateTemplateVariable("/outputs/result").valid).toBe(true)
+    })
+
+    it("accepts a single-segment identifier (may be a mustache section close tag)", () => {
+        // '/identifier' is ambiguous: it could be '{{/close}}' in mustache or a
+        // JSON Pointer to an envelope slot. Single-segment paths are accepted
+        // unconditionally; the runtime is the source of truth.
+        const result = validateTemplateVariable("/section")
+        expect(result.valid).toBe(true)
+    })
+
+    it("rejects a multi-segment pointer with an unknown root slot", () => {
+        const result = validateTemplateVariable("/input/country")
+        expect(result.valid).toBe(false)
+        expect(result.suggestion).toBe("inputs")
+    })
+
+    it("rejects '/' with no segments", () => {
+        expect(validateTemplateVariable("/").valid).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateTemplateVariable — plain names / dot notation
+// ---------------------------------------------------------------------------
+
+describe("validateTemplateVariable — plain names", () => {
+    it("accepts plain identifiers", () => {
+        expect(validateTemplateVariable("question").valid).toBe(true)
+        expect(validateTemplateVariable("my_variable").valid).toBe(true)
+    })
+
+    it("accepts dot-notation paths", () => {
+        expect(validateTemplateVariable("user.name").valid).toBe(true)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// isValidTemplateVariable
+// ---------------------------------------------------------------------------
+
+describe("isValidTemplateVariable", () => {
+    it("returns true for a valid expression", () => {
+        expect(isValidTemplateVariable("$.inputs.country")).toBe(true)
+    })
+
+    it("returns false for an invalid expression", () => {
+        expect(isValidTemplateVariable("")).toBe(false)
+        expect(isValidTemplateVariable("$foo")).toBe(false) // missing '.' after '$'
+        expect(isValidTemplateVariable("$.")).toBe(false) // trailing dot, no field
+    })
+})
+
+// ---------------------------------------------------------------------------
+// extractTemplateExpression
+// ---------------------------------------------------------------------------
+
+describe("extractTemplateExpression", () => {
+    it("strips {{ }} wrappers", () => {
+        expect(extractTemplateExpression("{{ $.inputs.country }}")).toBe("$.inputs.country")
+    })
+
+    it("strips {% %} wrappers", () => {
+        expect(extractTemplateExpression("{% if condition %}")).toBe("if condition")
+    })
+
+    it("strips {%- -%} wrappers (whitespace-trimming variants)", () => {
+        expect(extractTemplateExpression("{%- block -%}")).toBe("block")
+    })
+
+    it("strips {# #} comment wrappers", () => {
+        expect(extractTemplateExpression("{# comment #}")).toBe("comment")
+    })
+
+    it("returns the raw text when no wrapper is present", () => {
+        expect(extractTemplateExpression("plain")).toBe("plain")
+    })
+
+    it("returns empty string for empty input", () => {
+        expect(extractTemplateExpression("")).toBe("")
+    })
+})
diff --git a/web/packages/agenta-shared/tests/unit/validators-and-ids.test.ts b/web/packages/agenta-shared/tests/unit/validators-and-ids.test.ts
new file mode 100644
index 0000000000..92fc346e63
--- /dev/null
+++ b/web/packages/agenta-shared/tests/unit/validators-and-ids.test.ts
@@ -0,0 +1,138 @@
+import {describe, expect, it} from "vitest"
+
+import {isValidHttpUrl, isValidRegex, isValidUUID, validateUUID} from "../../src/utils/validators"
+import {uuidToSpanId, uuidToTraceId} from "../../src/utils/traceIds"
+import {removeTrailingSlash} from "../../src/utils/uriUtils"
+
+// ---------------------------------------------------------------------------
+// isValidUUID
+// ---------------------------------------------------------------------------
+
+describe("isValidUUID", () => {
+    it.each([
+        "123e4567-e89b-12d3-a456-426614174000",
+        "00000000-0000-0000-0000-000000000000",
+        "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF",
+    ])("returns true for valid UUID %s", (uuid) => {
+        expect(isValidUUID(uuid)).toBe(true)
+    })
+
+    it.each([
+        "",
+        "not-a-uuid",
+        "123e4567-e89b-12d3-a456",
+        "123e4567-e89b-12d3-a456-42661417400Z",
+        "123e4567e89b12d3a456426614174000",
+    ])("returns false for invalid input %s", (input) => {
+        expect(isValidUUID(input)).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// validateUUID
+// ---------------------------------------------------------------------------
+
+describe("validateUUID", () => {
+    it("does not throw for a valid UUID", () => {
+        expect(() => validateUUID("123e4567-e89b-12d3-a456-426614174000", "id")).not.toThrow()
+    })
+
+    it("throws with a descriptive message for an invalid UUID", () => {
+        expect(() => validateUUID("not-valid", "userId")).toThrow(
+            "Invalid userId: must be a valid UUID",
+        )
+    })
+})
+
+// ---------------------------------------------------------------------------
+// isValidHttpUrl
+// ---------------------------------------------------------------------------
+
+describe("isValidHttpUrl", () => {
+    it.each(["http://example.com", "https://example.com/path?q=1"])("returns true for %s", (url) =>
+        expect(isValidHttpUrl(url)).toBe(true),
+    )
+
+    it.each(["ftp://example.com", "not-a-url", "", "javascript:alert(1)"])(
+        "returns false for %s",
+        (url) => expect(isValidHttpUrl(url)).toBe(false),
+    )
+})
+
+// ---------------------------------------------------------------------------
+// isValidRegex
+// ---------------------------------------------------------------------------
+
+describe("isValidRegex", () => {
+    it.each(["^[a-z]+$", "\\d+", "(foo|bar)", ".*"])("returns true for valid regex %s", (re) =>
+        expect(isValidRegex(re)).toBe(true),
+    )
+
+    it.each(["[invalid", "(unclosed", "*bad"])("returns false for invalid regex %s", (re) => {
+        expect(isValidRegex(re)).toBe(false)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// uuidToTraceId
+// ---------------------------------------------------------------------------
+
+describe("uuidToTraceId", () => {
+    it("strips dashes from a UUID", () => {
+        expect(uuidToTraceId("123e4567-e89b-12d3-a456-426614174000")).toBe(
+            "123e4567e89b12d3a456426614174000",
+        )
+    })
+
+    it("returns undefined for undefined input", () => {
+        expect(uuidToTraceId(undefined)).toBeUndefined()
+    })
+
+    it("returns undefined for empty string", () => {
+        expect(uuidToTraceId("")).toBeUndefined()
+    })
+})
+
+// ---------------------------------------------------------------------------
+// uuidToSpanId
+// ---------------------------------------------------------------------------
+
+describe("uuidToSpanId", () => {
+    it("returns the last 16 hex chars of the stripped UUID", () => {
+        // UUID: 123e4567-e89b-12d3-a456-426614174000
+        // Full hex: 123e4567e89b12d3a456426614174000  (32 chars)
+        // Last 16:                  a456426614174000
+        expect(uuidToSpanId("123e4567-e89b-12d3-a456-426614174000")).toBe("a456426614174000")
+    })
+
+    it("returns undefined for undefined input", () => {
+        expect(uuidToSpanId(undefined)).toBeUndefined()
+    })
+
+    it("span ID length is always 16", () => {
+        const spanId = uuidToSpanId("ffffffff-ffff-ffff-ffff-ffffffffffff")
+        expect(spanId).toHaveLength(16)
+    })
+})
+
+// ---------------------------------------------------------------------------
+// removeTrailingSlash
+// ---------------------------------------------------------------------------
+
+describe("removeTrailingSlash", () => {
+    it("removes a trailing slash", () => {
+        expect(removeTrailingSlash("http://example.com/")).toBe("http://example.com")
+    })
+
+    it("leaves a URI without trailing slash unchanged", () => {
+        expect(removeTrailingSlash("http://example.com")).toBe("http://example.com")
+    })
+
+    it("removes only the last slash, not interior ones", () => {
+        expect(removeTrailingSlash("http://example.com/path/")).toBe("http://example.com/path")
+    })
+
+    it("handles empty string", () => {
+        expect(removeTrailingSlash("")).toBe("")
+    })
+})
diff --git a/web/packages/agenta-shared/vitest.config.ts b/web/packages/agenta-shared/vitest.config.ts
new file mode 100644
index 0000000000..a9a2cfed1d
--- /dev/null
+++ b/web/packages/agenta-shared/vitest.config.ts
@@ -0,0 +1,19 @@
+import {defineConfig} from "vitest/config"
+
+export default defineConfig({
+    test: {
+        include: ["tests/unit/**/*.test.ts"],
+        environment: "node",
+        reporters: ["default", "junit"],
+        outputFile: {
+            junit: "./test-results/junit.xml",
+        },
+        coverage: {
+            provider: "v8",
+            include: ["src/**/*.ts"],
+            exclude: ["src/**/index.ts"],
+            reporter: ["text", "lcov", "json-summary"],
+            reportsDirectory: "./coverage",
+        },
+    },
+})
diff --git a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx
index 864cabe098..85ceca76ef 100644
--- a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx
+++ b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageEditor.tsx
@@ -214,7 +214,17 @@ const ChatMessageEditorInner: React.FC<ChatMessageEditorProps> = ({
             placeholder={placeholder}
             disabled={disabled}
             state={disabled ? "readOnly" : state}
-            className={cn("relative", flexLayouts.column, gapClasses.xs, "rounded-md", className)}
+            // `agenta-chat-message-editor` is the styling hook used in globals.css
+            // to align the message text with the role label (see that file). The
+            // padding can't go through `editorClassName` because ChatMessageEditor
+            // renders the Editor with `noProvider`, where `className` is dropped.
+            className={cn(
+                "agenta-chat-message-editor relative",
+                flexLayouts.column,
+                gapClasses.xs,
+                "rounded-md",
+                className,
+            )}
             footer={footer}
             onFocusChange={onFocusChange}
             maxPasteChars={maxPasteChars}
diff --git a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
index f42baa61ce..78fe345545 100644
--- a/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
+++ b/web/packages/agenta-ui/src/ChatMessage/components/ChatMessageList.tsx
@@ -12,10 +12,12 @@ import {
 } from "@agenta/shared/utils"
 import {Copy, MinusCircle, Plus} from "@phosphor-icons/react"
 import {Button, Tooltip} from "antd"
+import {useAtom} from "jotai"
 
 import {CollapseToggleButton, getCollapseStyle} from "../../components/presentational/buttons"
 import {ViewModeDropdown} from "../../drill-in/core/ViewModeDropdown"
-import {getViewOptions, type ViewMode} from "../../drill-in/utils/getViewOptions"
+import {messageViewModeAtom} from "../../drill-in/state/messageViewModeAtom"
+import {getViewOptions, toMessageViewMode, type ViewMode} from "../../drill-in/utils/getViewOptions"
 import {message, modal} from "../../utils/appMessageContext"
 import {cn, flexLayouts, gapClasses} from "../../utils/styles"
 import {createSnippetPdfAttachment} from "../utils/snippetAttachment"
@@ -89,9 +91,13 @@ const ChatMessageItem: React.FC<{
     onToggleMinimize,
 }) => {
     const containerRef = useRef<HTMLDivElement>(null)
-    const [viewMode, setViewMode] = useState<ChatViewMode>("text")
-    const isCodeMode = viewMode === "json" || viewMode === "yaml"
-    const editorLanguage: "json" | "yaml" = viewMode === "yaml" ? "yaml" : "json"
+    // Shared + persisted across all message editors (see messageViewModeAtom).
+    // The atom is typed `ViewMode` (can hold "form"), so coerce to a mode this
+    // editor can actually render before deriving any mode-dependent state.
+    const [viewMode, setViewMode] = useAtom(messageViewModeAtom)
+    const chatViewMode = toMessageViewMode(viewMode)
+    const isCodeMode = chatViewMode === "json" || chatViewMode === "yaml"
+    const editorLanguage: "json" | "yaml" = chatViewMode === "yaml" ? "yaml" : "json"
 
     const isToolResponse = msg.role === "tool"
     const hasToolCalls = Boolean(msg.tool_calls && msg.tool_calls.length > 0)
@@ -173,7 +179,7 @@ const ChatMessageItem: React.FC<{
                 onChangeText={(text) => onTextChange(index, text)}
                 isJSON={isCodeMode}
                 language={editorLanguage}
-                markdownView={viewMode === "markdown"}
+                markdownView={chatViewMode === "text"}
                 enableTokens={enableTokens && !isCodeMode}
                 templateFormat={templateFormat}
                 tokens={tokens}
@@ -196,7 +202,7 @@ const ChatMessageItem: React.FC<{
                         )}
                     >
                         <ViewModeDropdown<ChatViewMode>
-                            value={viewMode}
+                            value={chatViewMode}
                             options={viewOptions}
                             onChange={setViewMode}
                         />
diff --git a/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx b/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx
index c90a8c8204..ea19c0f50e 100644
--- a/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx
+++ b/web/packages/agenta-ui/src/ChatMessage/components/MarkdownToggleButton.tsx
@@ -22,11 +22,11 @@ const MarkdownToggleButton = ({id}: MarkdownToggleButtonProps) => {
     }, [editor])
 
     return (
-        <Tooltip title={markdownView ? "Preview text" : "Preview markdown"}>
+        <Tooltip title={markdownView ? "Preview markdown" : "Preview text"}>
             <Button
                 type="text"
                 size="small"
-                icon={markdownView ? <TextAa size={14} /> : <MarkdownLogoIcon size={14} />}
+                icon={markdownView ? <MarkdownLogoIcon size={14} /> : <TextAa size={14} />}
                 onClick={onToggleMarkdown}
                 className={cn(flexLayouts.rowCenter, justifyClasses.center)}
             />
diff --git a/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx b/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx
index e46f7bd745..e09857d162 100644
--- a/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx
+++ b/web/packages/agenta-ui/src/drill-in/FieldRenderers/JsonObjectField.tsx
@@ -65,7 +65,7 @@ function ChatMessageObjectField({
             disabled={!editable}
             isJSON={isCodeMode}
             language={editorLanguage}
-            markdownView={viewMode === "markdown"}
+            markdownView={viewMode === "text"}
             enableTokens={!isCodeMode}
             templateFormat="curly"
             onChangeRole={(newRole: string) => {
diff --git a/web/packages/agenta-ui/src/drill-in/index.ts b/web/packages/agenta-ui/src/drill-in/index.ts
index e2081547b1..58ba59db23 100644
--- a/web/packages/agenta-ui/src/drill-in/index.ts
+++ b/web/packages/agenta-ui/src/drill-in/index.ts
@@ -126,8 +126,9 @@ export {
     canToggleRawMode,
     detectDataType,
 } from "./utils"
-export {getViewOptions} from "./utils/getViewOptions"
-export type {ViewMode, ViewOption} from "./utils/getViewOptions"
+export {getViewOptions, toMessageViewMode} from "./utils/getViewOptions"
+export type {ViewMode, MessageViewMode, ViewOption} from "./utils/getViewOptions"
+export {messageViewModeAtom} from "./state/messageViewModeAtom"
 
 // ============================================================================
 // FIELD RENDERERS
diff --git a/web/packages/agenta-ui/src/drill-in/state/messageViewModeAtom.ts b/web/packages/agenta-ui/src/drill-in/state/messageViewModeAtom.ts
new file mode 100644
index 0000000000..b3cab4975a
--- /dev/null
+++ b/web/packages/agenta-ui/src/drill-in/state/messageViewModeAtom.ts
@@ -0,0 +1,19 @@
+import {atomWithStorage} from "jotai/utils"
+
+import type {ViewMode} from "../utils/getViewOptions"
+
+/**
+ * Shared, persisted view mode for chat / prompt message editors.
+ *
+ * Replaces the per-message local `useState` so that:
+ *  - switching one message's view (Text / Markdown / JSON / YAML) switches every
+ *    message editor at once, and
+ *  - the choice survives a page refresh (persisted to localStorage).
+ *
+ * Scope note: this is a single app-wide atom, so it is shared by every consumer
+ * of the message editors (playground prompt + chat turns, and also the drill-in
+ * message fields). The key is intentionally not namespaced to "playground".
+ *
+ * Defaults to "text" so messages open as plain, raw text.
+ */
+export const messageViewModeAtom = atomWithStorage<ViewMode>("agenta:message-view-mode", "text")
diff --git a/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts b/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts
index 9a1b28f759..2daae9eb98 100644
--- a/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts
+++ b/web/packages/agenta-ui/src/drill-in/utils/getViewOptions.ts
@@ -1,5 +1,17 @@
 export type ViewMode = "text" | "markdown" | "json" | "yaml" | "form"
 
+/** The view modes a chat / prompt message editor can render ("form" is for objects). */
+export type MessageViewMode = Exclude<ViewMode, "form">
+
+/**
+ * Coerce a (possibly app-wide / persisted) view mode to one a message editor can
+ * render. Only "markdown", "json" and "yaml" pass through; everything else maps
+ * to "text". That covers "form" (valid for the shared `ViewMode`-typed atom but
+ * not renderable here) and any unrecognized value read back from storage.
+ */
+export const toMessageViewMode = (mode: ViewMode): MessageViewMode =>
+    mode === "markdown" || mode === "json" || mode === "yaml" ? mode : "text"
+
 export interface ViewOption {
     value: ViewMode
     label: string
diff --git a/web/pnpm-lock.yaml b/web/pnpm-lock.yaml
index 7fc761e2d7..f4003d76db 100644
--- a/web/pnpm-lock.yaml
+++ b/web/pnpm-lock.yaml
@@ -1222,9 +1222,15 @@ importers:
       '@types/react':
         specifier: ^19.0.10
         version: 19.2.14
+      '@vitest/coverage-v8':
+        specifier: ^4.1.4
+        version: 4.1.6(vitest@4.1.6)
       typescript:
         specifier: 5.8.3
         version: 5.8.3
+      vitest:
+        specifier: ^4.1.4
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@20.19.39)(@vitest/coverage-v8@4.1.6)(vite@8.0.12(@types/node@20.19.39)(esbuild@0.27.7)(jiti@2.7.0)(terser@5.47.0)(tsx@4.21.0)(yaml@2.8.4))
 
   packages/agenta-ui:
     dependencies: