diff --git a/.github/workflows/13-check-pr-contribution.yml b/.github/workflows/13-check-pr-contribution.yml index a4217327f7..04da371052 100644 --- a/.github/workflows/13-check-pr-contribution.yml +++ b/.github/workflows/13-check-pr-contribution.yml @@ -10,8 +10,11 @@ name: "13 - check PR contribution" # It never checks out or runs the PR's code, so this is safe. on: + # No 'reopened': a maintainer who manually reopens a flagged PR should win, + # otherwise the reopen event would immediately re-close it. Auto-reopen on a + # fixed description still works through 'edited' and 'synchronize'. pull_request_target: - types: [opened, edited, synchronize, reopened, ready_for_review] + types: [opened, edited, synchronize, ready_for_review] workflow_dispatch: inputs: pr_number: @@ -134,27 +137,24 @@ jobs: const reasons = []; - // 1) Template is present and filled. - const headers = ['Summary', 'Testing', 'Demo', 'Checklist']; - const lower = body.toLowerCase(); - const missing = headers.filter((h) => !lower.includes('## ' + h.toLowerCase())); + // 1) The PR is described. We only require a non-empty Summary, not the + // full template. Missing Testing/Checklist sections do not close a PR; + // a thorough PR with a demo should never be closed over a checklist. if (!body.trim()) { reasons.push('The pull request description is empty. Please fill in the PR template.'); - } else if (missing.length) { - reasons.push('The description is missing required sections (' + missing.join(', ') + '). Please use the PR template without removing its sections.'); } else if (!section('Summary')) { - reasons.push('The **Summary** section is empty. Describe what changed and why.'); + reasons.push('The **Summary** section is missing or empty. Describe what changed and why using the PR template.'); } - // 2) Demo is present for functional changes. + // 2) Demo is present for functional changes. Scan the whole body, not + // just the Demo section, so a screenshot or video placed anywhere counts. const files = await github.paginate(github.rest.pulls.listFiles, { owner, repo, pull_number: number, per_page: 100, }); const functional = files.some((f) => !EXEMPT.some((r) => r.test(f.filename))); - const demo = section('Demo') || ''; - const hasMedia = MEDIA.some((r) => r.test(demo)); + const hasMedia = MEDIA.some((r) => r.test(body)); if (functional && !hasMedia) { - reasons.push('This PR changes functional code (SDK, API, or frontend) but the **Demo** section has no screenshot or video. A short demo recording is required. Only test-only, docs-only, or chore changes may mark Demo as N/A.'); + reasons.push('This PR changes functional code (SDK, API, or frontend) but includes no demo. Add a screenshot or short video of the change. Only test-only, docs-only, or chore changes may skip it.'); } async function upsertComment(text) { diff --git a/api/pyproject.toml b/api/pyproject.toml index f189225c27..bed728e76f 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "api" -version = "0.101.1" +version = "0.102.0" description = "Agenta API" requires-python = ">=3.11,<3.14" authors = [ diff --git a/api/uv.lock b/api/uv.lock index 8e399a88c5..e03183d4a2 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -8,7 +8,7 @@ resolution-markers = [ [[package]] name = "agenta" -version = "0.101.1" +version = "0.102.0" source = { editable = "../sdks/python" } dependencies = [ { name = "agenta-client" }, @@ -70,7 +70,7 @@ dev = [ [[package]] name = "agenta-client" -version = "0.101.1" +version = "0.102.0" source = { editable = "../clients/python" } dependencies = [ { name = "httpx" }, @@ -259,7 +259,7 @@ wheels = [ [[package]] name = "api" -version = "0.101.1" +version = "0.102.0" source = { virtual = "." } dependencies = [ { name = "agenta" }, diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index eb0167cfb9..b6b760f4c4 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agenta-client" -version = "0.101.1" +version = "0.102.0" description = "Fern-generated Python client for the Agenta API." requires-python = ">=3.11,<3.14" authors = [ diff --git a/clients/python/uv.lock b/clients/python/uv.lock index bcf7e18af6..96b0d44d15 100644 --- a/clients/python/uv.lock +++ b/clients/python/uv.lock @@ -4,7 +4,7 @@ requires-python = ">=3.11, <3.14" [[package]] name = "agenta-client" -version = "0.101.1" +version = "0.102.0" source = { editable = "." } dependencies = [ { name = "httpx" }, diff --git a/hosting/kubernetes/helm/Chart.yaml b/hosting/kubernetes/helm/Chart.yaml index 5a80e7f5c7..3c88f08f85 100644 --- a/hosting/kubernetes/helm/Chart.yaml +++ b/hosting/kubernetes/helm/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: agenta description: A Helm chart for deploying Agenta (OSS or EE) on Kubernetes type: application -version: 0.101.1 -appVersion: "v0.101.1" +version: 0.102.0 +appVersion: "v0.102.0" keywords: - agenta - llm diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 94d30e10eb..f5dffe454d 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agenta" -version = "0.101.1" +version = "0.102.0" description = "The SDK for agenta is an open-source LLMOps platform." readme = "README.md" requires-python = ">=3.11,<3.14" diff --git a/sdks/python/uv.lock b/sdks/python/uv.lock index 45f9972644..33bb5ed7cd 100644 --- a/sdks/python/uv.lock +++ b/sdks/python/uv.lock @@ -4,7 +4,7 @@ requires-python = ">=3.11, <3.14" [[package]] name = "agenta" -version = "0.101.1" +version = "0.102.0" source = { editable = "." } dependencies = [ { name = "agenta-client" }, @@ -83,7 +83,7 @@ dev = [ [[package]] name = "agenta-client" -version = "0.101.1" +version = "0.102.0" source = { editable = "../../clients/python" } dependencies = [ { name = "httpx" }, diff --git a/services/pyproject.toml b/services/pyproject.toml index fb3bcc09f2..b29077b98f 100644 --- a/services/pyproject.toml +++ b/services/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "services" -version = "0.101.1" +version = "0.102.0" description = "Agenta Services (Chat & Completion)" requires-python = ">=3.11,<3.14" authors = [ diff --git a/services/uv.lock b/services/uv.lock index 120dd8a54e..61aea0ab8f 100644 --- a/services/uv.lock +++ b/services/uv.lock @@ -8,7 +8,7 @@ resolution-markers = [ [[package]] name = "agenta" -version = "0.101.1" +version = "0.102.0" source = { editable = "../sdks/python" } dependencies = [ { name = "agenta-client" }, @@ -70,7 +70,7 @@ dev = [ [[package]] name = "agenta-client" -version = "0.101.1" +version = "0.102.0" source = { editable = "../clients/python" } dependencies = [ { name = "httpx" }, @@ -2363,7 +2363,7 @@ wheels = [ [[package]] name = "services" -version = "0.101.1" +version = "0.102.0" source = { virtual = "." } dependencies = [ { name = "agenta" }, diff --git a/web/ee/package.json b/web/ee/package.json index 9b6f2b07a9..9e87a9a17a 100644 --- a/web/ee/package.json +++ b/web/ee/package.json @@ -1,6 +1,6 @@ { "name": "@agenta/ee", - "version": "0.101.1", + "version": "0.102.0", "private": true, "engines": { "node": "24.x" diff --git a/web/oss/package.json b/web/oss/package.json index bb9da8b419..29dc761848 100644 --- a/web/oss/package.json +++ b/web/oss/package.json @@ -1,6 +1,6 @@ { "name": "@agenta/oss", - "version": "0.101.1", + "version": "0.102.0", "private": true, "engines": { "node": "24.x" diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts index 3b3941cc15..bee2d3f3e6 100644 --- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts +++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/context.ts @@ -16,6 +16,12 @@ export interface EvaluationRunsTableOverrides { evaluationKind: EvaluationRunKind includePreview: boolean scope?: TableScope + /** + * Over-fetch successive server pages until a full page of subject runs is + * collected. Set by fixed-size, non-paginating surfaces (the Overview + * summary) so the subject filter doesn't leave them falsely empty. + */ + fillToLimit?: boolean } type TableScope = "app" | "project" @@ -34,6 +40,7 @@ export interface EvaluationRunsTableContext { storageKey: string createSupported: boolean createEvaluationType: "auto" | "human" | "online" | "custom" + fillToLimit: boolean } export const defaultEvaluationRunsTableOverrides: EvaluationRunsTableOverrides = { @@ -66,6 +73,7 @@ export const evaluationRunsTableContextAtom = atom(( const evaluationKind = overrides.evaluationKind const includePreview = overrides.includePreview + const fillToLimit = overrides.fillToLimit ?? false const projectId = overrides.projectIdOverride ?? identifiers.projectId ?? fallbackProjectId ?? null @@ -130,6 +138,7 @@ export const evaluationRunsTableContextAtom = atom(( storageKey, createSupported, createEvaluationType, + fillToLimit, } return context @@ -188,6 +197,7 @@ export const evaluationRunsMetaContextSliceAtom = selectAtom( includePreview: context.includePreview, evaluationKind: context.evaluationKind, derivedPreviewFlags: context.derivedPreviewFlags, + fillToLimit: context.fillToLimit, }), (a, b) => a.projectId === b.projectId && @@ -196,6 +206,7 @@ export const evaluationRunsMetaContextSliceAtom = selectAtom( a.activeAppId === b.activeAppId && a.includePreview === b.includePreview && a.evaluationKind === b.evaluationKind && + a.fillToLimit === b.fillToLimit && arrayEquals(a.effectiveAppIds, b.effectiveAppIds) && shallowEqualFlags(a.derivedPreviewFlags, b.derivedPreviewFlags), ) diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts index b7a2bba238..5d4d35b843 100644 --- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts +++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/fetchAutoEvaluationRuns.ts @@ -1,3 +1,5 @@ +import {hasResolvableSubject, isSubjectRun} from "@agenta/entities/evaluationRun/etl" + import type {WindowingState} from "@/oss/components/InfiniteVirtualTable/types" import {deriveEvaluationKind} from "@/oss/lib/evaluations/utils/evaluationKind" @@ -45,8 +47,21 @@ interface FetchEvaluationRunsWindowParams { statusFilters?: string[] | null evaluationTypeFilters?: ConcreteEvaluationRunKind[] | null dateRange?: {from?: string | null; to?: string | null} | null + /** + * Over-fetch successive server pages until `limit` *subject* runs are + * collected (or the stream is exhausted / a safety cap is hit). For the + * fixed-size Overview summary (no infinite scroll): a single server page can + * filter down to few/zero subject runs even when more exist deeper, which + * would falsely read as "this workflow has never been evaluated". The full + * page leaves this off — infinite scroll fills lazily on scroll. + */ + fillToLimit?: boolean } +// Over-fetch tuning (only used when `fillToLimit` + a subject filter are active). +const FILL_MAX_SERVER_PAGES = 8 +const FILL_MIN_SERVER_PAGE = 25 + const fetchPreviewRuns = async ({ projectId, appId, @@ -238,6 +253,7 @@ export const fetchEvaluationRunsWindow = async ({ cursor = null, evaluationTypeFilters, dateRange, + fillToLimit = false, }: FetchEvaluationRunsWindowParams): Promise => { if (!projectId) { return { @@ -259,31 +275,6 @@ export const fetchEvaluationRunsWindow = async ({ evaluationKind === "all" && allowedKinds && allowedKinds.size ? Array.from(allowedKinds) : null - const windowingPayload: QueryWindowingPayload = { - limit, - order: "descending" as const, - next: cursor ?? undefined, - } - if (dateRange?.to) { - windowingPayload.newest = dateRange.to - } - if (dateRange?.from) { - windowingPayload.oldest = dateRange.from - } - - const previewResult = includePreview - ? await fetchPreviewRuns({ - projectId, - appId: previewAppId, - searchQuery: previewSearchQuery, - references: previewReferences, - flags: previewFlags, - statuses: statusFilters && statusFilters.length ? statusFilters : undefined, - evaluationTypes: evaluationTypesPayload, - windowing: windowingPayload, - }) - : {runs: [], count: 0, windowing: null} - const rows: EvaluationRunApiRow[] = [] const normalizedSearch = previewSearchQuery?.trim().toLowerCase() ?? null @@ -305,11 +296,31 @@ export const fetchEvaluationRunsWindow = async ({ return normalizedStatusSet.has(statusValue.toLowerCase()) } - const allowedAppIds = appIds.filter((id) => typeof id === "string" && id.trim().length > 0) - const allowedAppSet = - allowedAppIds.length > 0 ? new Set(allowedAppIds.map((id) => id.trim())) : null + const allowedAppIds = appIds + .filter((id) => typeof id === "string" && id.trim().length > 0) + .map((id) => id.trim()) + const allowedAppSet = allowedAppIds.length > 0 ? new Set(allowedAppIds) : null - previewResult.runs.forEach((run) => { + // Run-list SUBJECT predicate (feature F): when scoped to a workflow, keep + // runs that *evaluated this workflow* — runs where the scoped id is the + // run's `application`/invocation reference (the evaluated subject) — and + // drop runs where it merely appears as a grader (`evaluator` reference). + // + // This replaces the prior `meta.application.id` heuristic, which is + // unreliable: a null `meta.application` silently bypassed the guard, which + // is how grader runs leaked onto an evaluator's Evaluations tab. The run's + // `data.steps` are the structural source of truth. We fall back to the + // `meta` heuristic only when a run carries no resolvable subject reference. + // + // `subjectScanned`/`subjectMatched` feed the hit-ratio meter: a low rolling + // pass-ratio means the scoped workflow is graded far more than it's + // evaluated — the signal that the backend role-aware reference filter (v2) + // is warranted. (The FE already sends the role via the payload's dict key; + // v2 is the backend honoring it. See evaluations/utils.py query_run_references.) + let subjectScanned = 0 + let subjectMatched = 0 + + const processRun = (run: PreviewEvaluationRun) => { // Derive kind from run.data.steps - this is the reliable source of truth // Do NOT rely on meta.evaluation_kind as it's flaky and unreliable const derivedKind = derivePreviewRunKind(run) @@ -331,8 +342,22 @@ export const fetchEvaluationRunsWindow = async ({ const runId = run.id ?? null const metaApplication = (run as any)?.meta?.application ?? {} const runAppId = metaApplication?.id ?? (run as any)?.meta?.appId ?? null - if (allowedAppSet && runAppId && !allowedAppSet.has(runAppId)) { - return + const previewMeta = extractPreviewRunMeta(run) + + if (allowedAppSet) { + subjectScanned += 1 + const steps = previewMeta.steps + const passesSubject = hasResolvableSubject(steps) + ? // Structural: the scoped workflow is the run's evaluated subject. + allowedAppIds.some((id) => isSubjectRun(steps, id)) + : // Fallback for runs with no resolvable subject reference: + // keep the prior `meta.application.id` behaviour rather than + // dropping a run we can't classify structurally. + !runAppId || allowedAppSet.has(runAppId) + if (!passesSubject) { + return + } + subjectMatched += 1 } const previewName = typeof (run as any)?.name === "string" ? (run as any).name : null if (!matchesSearch([runId, previewName, metaApplication?.id, metaApplication?.name])) { @@ -354,10 +379,66 @@ export const fetchEvaluationRunsWindow = async ({ : (run as any)?.status?.value) ?? null, appId: runAppId ?? null, preview: runId ? {id: runId} : undefined, - previewMeta: extractPreviewRunMeta(run), + previewMeta, evaluationKind: derivedKind, }) - }) + } + + // Over-fetch loop. The fixed-size summary (`fillToLimit`) can filter a single + // server page down to few/zero subject runs even when more exist deeper — + // which would falsely read as "this workflow has never been evaluated". When + // filling, pull successive server pages (advancing the cursor) until we have + // `limit` subject runs, the stream is exhausted, or the safety cap is hit. + // The full page leaves this off (single page) — its infinite scroll fills + // lazily on scroll, so changing its pagination here isn't needed. + const wantFill = Boolean(fillToLimit) && Boolean(allowedAppSet) + const serverPageLimit = wantFill ? Math.max(limit, FILL_MIN_SERVER_PAGE) : limit + const maxPages = wantFill ? FILL_MAX_SERVER_PAGES : 1 + + let currentCursor: string | undefined = cursor ?? undefined + let firstPageCount: number | null = null + let lastWindowing: QueryWindowingPayload | null = null + let pagesFetched = 0 + + while (pagesFetched < maxPages) { + pagesFetched += 1 + + const windowingPayload: QueryWindowingPayload = { + limit: serverPageLimit, + order: "descending" as const, + next: currentCursor, + } + if (dateRange?.to) { + windowingPayload.newest = dateRange.to + } + if (dateRange?.from) { + windowingPayload.oldest = dateRange.from + } + + const previewResult = includePreview + ? await fetchPreviewRuns({ + projectId, + appId: previewAppId, + searchQuery: previewSearchQuery, + references: previewReferences, + flags: previewFlags, + statuses: statusFilters && statusFilters.length ? statusFilters : undefined, + evaluationTypes: evaluationTypesPayload, + windowing: windowingPayload, + }) + : {runs: [], count: 0, windowing: null} + + if (firstPageCount === null) { + firstPageCount = previewResult.count ?? null + } + lastWindowing = previewResult.windowing + previewResult.runs.forEach(processRun) + + currentCursor = previewResult.windowing?.next ?? undefined + if (!wantFill || rows.length >= limit || !currentCursor) { + break + } + } rows.sort((a, b) => { const tsA = a.createdAt ? new Date(a.createdAt).getTime() : 0 @@ -365,14 +446,16 @@ export const fetchEvaluationRunsWindow = async ({ return tsB - tsA }) + // The fixed-size summary shows at most `limit` (latest N subject runs); the + // last over-fetched server page may carry a few extra past the limit. + const pageRows = wantFill ? rows.slice(0, limit) : rows const totalCount = - evaluationKind === "all" && allowedKinds - ? rows.length - : (previewResult.count ?? rows.length) - const pageRows = rows + evaluationKind === "all" && allowedKinds ? pageRows.length : (firstPageCount ?? rows.length) const nextOffset = offset + pageRows.length - const previewNextCursor = previewResult.windowing?.next ?? null - const hasMore = Boolean(previewNextCursor) + // The summary doesn't paginate (infinite scroll off), so it never advertises + // "more"; the full page advertises the page's server cursor as before. + const previewNextCursor = lastWindowing?.next ?? null + const hasMore = wantFill ? false : Boolean(previewNextCursor) return { rows: pageRows, @@ -380,6 +463,9 @@ export const fetchEvaluationRunsWindow = async ({ hasMore, nextOffset: hasMore ? nextOffset : null, nextCursor: previewNextCursor, - nextWindowing: normalizeWindowing(previewResult.windowing), + nextWindowing: normalizeWindowing(lastWindowing), + subjectFilterStats: allowedAppSet + ? {scanned: subjectScanned, matched: subjectMatched} + : undefined, } } diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts new file mode 100644 index 0000000000..0bfd513091 --- /dev/null +++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/subjectFilterMeter.ts @@ -0,0 +1,79 @@ +/** + * Per-context hit-ratio meter for the run-list SUBJECT predicate (feature F). + * + * The subject filter (`fetchAutoEvaluationRuns`) keeps runs that *evaluated the + * scoped workflow* and drops runs where it was only a grader. When the scoped + * workflow is graded far more often than it's evaluated, most fetched runs get + * dropped client-side — the "low hit-ratio" case the eval-filtering RFC's meter + * is built to detect (docs/designs/eval-filtering.md §D2 + §C3). + * + * A low rolling ratio is the signal that the backend role-aware reference + * filter (v2) is warranted. The FE already encodes the role as the reference + * payload's dict key; v2 is purely the backend honoring it + * (`evaluations/utils.py` `query_run_references` — see line 66). So this meter + * **reports the regime** (dev log + a readable getter for diagnostics); it does + * not — and cannot, from the FE — swap to a server-side filter. + * + * Meters are keyed by the subject-filter context (project + scoped workflow ids + * + kind). Each distinct context gets its own rolling window. + */ + +import { + createHitRatioMeter, + type HitRatioMeter, + type HitRatioRegime, +} from "@agenta/entities/evaluationRun/etl" + +const meters = new Map() + +const meterFor = (signature: string): HitRatioMeter => { + let meter = meters.get(signature) + if (!meter) { + meter = createHitRatioMeter() + meters.set(signature, meter) + } + return meter +} + +/** Stable signature for a subject-filter context. */ +export const subjectFilterSignature = ({ + projectId, + appIds, + evaluationKind, +}: { + projectId: string | null + appIds: string[] | null | undefined + evaluationKind: string +}): string => `${projectId ?? "null"}::${(appIds ?? []).join("|")}::${evaluationKind}` + +/** + * Record one page of subject-filter stats and return the resulting regime. + * + * `page` should be the fetch offset (monotonic, unique per page within a + * context). The meter dedups by it, so a refetch from offset 0 — common after + * cache invalidation — doesn't double-count. + */ +export const recordSubjectFilterPage = ({ + signature, + page, + scanned, + matched, +}: { + signature: string + page: number + scanned: number + matched: number +}): HitRatioRegime => { + const meter = meterFor(signature) + meter.record({chunk: page, scanned, matched}) + return meter.regime() +} + +/** Read the current regime without recording (diagnostics / banners). */ +export const getSubjectFilterRegime = (signature: string): HitRatioRegime | null => + meters.get(signature)?.regime() ?? null + +/** Drop a context's meter (e.g. when its filter signature is retired). */ +export const resetSubjectFilterMeter = (signature: string): void => { + meters.delete(signature) +} diff --git a/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts b/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts index b79bfe6cf2..803a19f0f4 100644 --- a/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts +++ b/web/oss/src/components/EvaluationRunsTablePOC/atoms/tableStore.ts @@ -16,6 +16,7 @@ import {buildReferencePayload} from "../utils/referencePayload" import {computeContextSignature, evaluationRunsMetaContextSliceAtom} from "./context" import {fetchEvaluationRunsWindow} from "./fetchAutoEvaluationRuns" +import {recordSubjectFilterPage, subjectFilterSignature} from "./subjectFilterMeter" import type {RunFlagsFilter} from "@/agenta-oss-common/lib/hooks/usePreviewEvaluations/index" @@ -31,6 +32,8 @@ export interface EvaluationRunsTableMeta { referenceFilters?: Record | null evaluationTypeFilters?: ConcreteEvaluationRunKind[] | null dateRange?: {from?: string | null; to?: string | null} | null + /** Over-fetch to fill a full page of subject runs (fixed-size summaries). */ + fillToLimit?: boolean /** Internal refresh trigger - incrementing this forces a refetch */ _refreshTrigger?: number } @@ -199,6 +202,7 @@ export const evaluationRunsTableMetaAtom = atom< referenceFilters, evaluationTypeFilters, dateRange, + fillToLimit: context.fillToLimit, _refreshTrigger: refreshTrigger, } @@ -377,8 +381,34 @@ const evaluationRunsDatasetStoreInternal = createInfiniteDatasetStore< statusFilters: meta.statusFilters ?? null, evaluationTypeFilters: meta.evaluationTypeFilters ?? null, dateRange: meta.dateRange ?? null, + fillToLimit: meta.fillToLimit ?? false, }) + // Feed the run-list subject predicate's pass-ratio to the hit-ratio + // meter. A low rolling ratio means the scoped workflow is graded far + // more than it's evaluated — the v1→v2 escalation signal (the backend + // role-aware reference filter is warranted). Observation only today. + if (result.subjectFilterStats) { + const signature = subjectFilterSignature({ + projectId: meta.projectId, + appIds: meta.appIds, + evaluationKind: meta.evaluationKind, + }) + const regime = recordSubjectFilterPage({ + signature, + page: offset, + scanned: result.subjectFilterStats.scanned, + matched: result.subjectFilterStats.matched, + }) + if (process.env.NODE_ENV !== "production" && regime.state === "escalate") { + console.log( + "[evaluationRunsTableStore] subject filter low hit-ratio —", + regime.reason, + {appIds: meta.appIds, kind: meta.evaluationKind}, + ) + } + } + return { rows: result.rows, totalCount: result.totalCount, diff --git a/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx b/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx index 543990186a..e00ab76843 100644 --- a/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx +++ b/web/oss/src/components/EvaluationRunsTablePOC/components/LatestEvaluationRunsTable/index.tsx @@ -73,6 +73,10 @@ const LatestEvaluationRunsTable = ({ appId, projectIdOverride, includePreview, + // Fixed-size summary (no infinite scroll): over-fetch so the + // subject filter doesn't leave it falsely empty when the + // workflow is graded more than it's evaluated. + fillToLimit: true, ...(appScoped && {scope: "app" as const}), }} pageSize={limit} diff --git a/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx b/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx index 2ec43bc3f4..a05874dab8 100644 --- a/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx +++ b/web/oss/src/components/EvaluationRunsTablePOC/components/filters/EvaluationRunsHeaderFilters.tsx @@ -10,6 +10,7 @@ import { type ReferenceTone, } from "@/oss/components/References/referenceColors" import {testsetsListQueryAtomFamily} from "@/oss/state/entities/testset" +import {currentWorkflowAtom} from "@/oss/state/workflow" import { evaluationRunsFilterOptionsAtom, @@ -137,10 +138,19 @@ const FiltersSummary = () => { () => optionMap(filterOptions.evaluatorOptions ?? []), [filterOptions.evaluatorOptions], ) - const appLabels = useMemo( - () => optionMap(filterOptions.appOptions ?? []), - [filterOptions.appOptions], - ) + const currentWorkflow = useAtomValue(currentWorkflowAtom) + const appLabels = useMemo(() => { + const map = optionMap(filterOptions.appOptions ?? []) + // The locked "Apps" chip is preset to the route workflow. Evaluator + // workflows aren't in the apps list (`appOptions`), so their id won't + // resolve to a name and the chip would show a raw id. Seed the map from + // the current workflow so the chip renders its name instead. + const workflowName = currentWorkflow?.name ?? currentWorkflow?.slug + if (currentWorkflow?.id && workflowName && !map.has(currentWorkflow.id)) { + map.set(currentWorkflow.id, workflowName) + } + return map + }, [filterOptions.appOptions, currentWorkflow]) const variantLabels = useMemo( () => optionMap( diff --git a/web/oss/src/components/EvaluationRunsTablePOC/types.ts b/web/oss/src/components/EvaluationRunsTablePOC/types.ts index 8370f7b945..18c1a4eb0a 100644 --- a/web/oss/src/components/EvaluationRunsTablePOC/types.ts +++ b/web/oss/src/components/EvaluationRunsTablePOC/types.ts @@ -70,4 +70,18 @@ export interface EvaluationRunsWindowResult { nextOffset: number | null nextCursor: string | null nextWindowing: WindowingState | null + /** + * Per-page stats for the run-list **subject** predicate (the structural + * "is this run an evaluation of the scoped workflow?" filter). Feeds the + * hit-ratio meter: when the rolling pass-ratio is low, the scoped workflow + * is being graded far more often than it's evaluated, signalling the + * backend role-aware reference filter (v2) is warranted. Absent when no + * subject filter is active (project scope). + */ + subjectFilterStats?: { + /** Runs reaching the subject check (already past kind/status/search). */ + scanned: number + /** Of those, runs whose subject is the scoped workflow. */ + matched: number + } } diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx index 879f13436f..b0b970eb4d 100644 --- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader.tsx @@ -1,40 +1,22 @@ /** * EvaluatorPlaygroundHeader * - * Simplified playground header for the evaluator configuration page. - * Shows evaluator name, app workflow selector, and testset dropdown. - * Reads evaluator info from playground nodes (URL-driven, no props needed). + * Header for the evaluator configuration page: the evaluator name plus the + * shared run controls. The controls (run-on selector, app picker, testset) + * live in `EvaluatorRunControls` so the page and the creation drawer share one + * implementation. Reads evaluator info from playground nodes (URL-driven). */ import {useMemo} from "react" import {workflowMolecule} from "@agenta/entities/workflow" -import {EntityPicker} from "@agenta/entity-ui" -import type { - EntitySelectionAdapter, - WorkflowRevisionSelectionResult, -} from "@agenta/entity-ui/selection" import {playgroundController} from "@agenta/playground" import {Typography} from "antd" import {useAtomValue} from "jotai" -import dynamic from "next/dynamic" -import {selectedAppLabelAtom} from "./atoms" +import EvaluatorRunControls from "./EvaluatorRunControls" -const TestsetDropdown = dynamic( - () => import("@/oss/components/Playground/Components/TestsetDropdown"), - {ssr: false}, -) - -interface EvaluatorPlaygroundHeaderProps { - appWorkflowAdapter: EntitySelectionAdapter - onAppSelect: (selection: WorkflowRevisionSelectionResult) => void -} - -const EvaluatorPlaygroundHeader: React.FC = ({ - appWorkflowAdapter, - onAppSelect, -}) => { +const EvaluatorPlaygroundHeader: React.FC = () => { // Read evaluator node from playground nodes // Phase 1: evaluator is at depth 0 (primary) // Phase 2: evaluator is at depth 1 (downstream) @@ -69,12 +51,6 @@ const EvaluatorPlaygroundHeader: React.FC = ({ evaluatorData?.slug?.trim() || "Evaluator" - // Selected app label for display in the picker trigger - const selectedAppLabel = useAtomValue(selectedAppLabelAtom) - - // Check if we have an app node (depth-0 with a different entity than evaluator) - const hasAppSelected = nodes.some((n) => n.depth === 0 && n.entityId !== evaluatorEntityId) - return (
@@ -83,16 +59,7 @@ const EvaluatorPlaygroundHeader: React.FC = ({
-
- - variant="popover-cascader" - adapter={appWorkflowAdapter} - onSelect={onAppSelect} - size="small" - placeholder={selectedAppLabel ?? "Select app"} - /> - {hasAppSelected && } -
+
) } diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx new file mode 100644 index 0000000000..b52c0271ac --- /dev/null +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/EvaluatorRunControls.tsx @@ -0,0 +1,83 @@ +/** + * EvaluatorRunControls + * + * The run-on + app + testset control cluster, shared by the evaluator + * playground page header and the evaluator-creation drawer header so the two + * stay identical. Reads everything from `useEvaluatorRunControls` (atom-backed), + * so it takes no props — drop it next to a title and it works on either surface. + * + * - Run-on selector (test case / app output / trace). + * - App picker — only in "app" mode, with a disconnect affordance once connected. + * - Test set dropdown — always available: it's the data source in test-case + * mode and feeds the app in app mode. + */ + +import {EntityPicker} from "@agenta/entity-ui" +import type {WorkflowRevisionSelectionResult} from "@agenta/entity-ui/selection" +import {X} from "@phosphor-icons/react" +import {Button, Tooltip} from "antd" +import dynamic from "next/dynamic" + +import RunOnSelector from "./RunOnSelector" +import {useEvaluatorRunControls} from "./useEvaluatorRunControls" + +const TestsetDropdown = dynamic( + () => import("@/oss/components/Playground/Components/TestsetDropdown"), + {ssr: false}, +) + +const EvaluatorRunControls = () => { + const { + appWorkflowAdapter, + handleAppSelect, + disconnectApp, + runOnMode, + handlePickRunOn, + hasAppConnected, + selectedAppLabel, + } = useEvaluatorRunControls() + + const isAppMode = runOnMode === "app" + + // Footer inside the picker popover — only when an app is currently connected. + const popupFooter = hasAppConnected ? ( +
+ +
+ ) : undefined + + return ( +
+ + + {isAppMode && ( + + variant="popover-cascader" + adapter={appWorkflowAdapter} + onSelect={handleAppSelect} + size="small" + placeholder={selectedAppLabel ?? "Select app"} + popupFooter={popupFooter} + /> + )} + + {isAppMode && hasAppConnected && ( + +
+ ) +} + +export default EvaluatorRunControls diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx new file mode 100644 index 0000000000..4fd8a2ee61 --- /dev/null +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/RunOnSelector.tsx @@ -0,0 +1,295 @@ +/** + * RunOnSelector + * + * The "Run on" control for the evaluator playground header. A leading dropdown + * that names the data source the evaluator runs against and draws the resulting + * data-flow, so the empty/first state explains itself instead of leaving the + * user with two disconnected loaders. + * + * Three modes: + * - Run directly on data (Data → Evaluator → Score) + * - Run on an app (Data → App → Output → Evaluator → Score) — default + * - Run on a trace (Trace → Evaluator → Score) — disabled for now + * + * All colors come from the live antd token (`theme.useToken()`) so the control + * follows light/dark mode automatically. + */ + +import {useState} from "react" + +import {AppstoreOutlined} from "@ant-design/icons" +import { + CaretDownIcon, + CheckIcon, + DatabaseIcon, + GavelIcon, + TreeViewIcon, +} from "@phosphor-icons/react" +import {Button, Dropdown, theme} from "antd" +import type {GlobalToken} from "antd" +import clsx from "clsx" + +import type {RunOnMode} from "./atoms" + +// The app icon used across the product (the sidebar "Prompts" item). Wrapped so +// it accepts the same `size`/`style` props as the phosphor icons it sits beside. +const AppIcon = ({size = 16, style}: {size?: number; style?: React.CSSProperties}) => ( + +) + +// ── flow pills ────────────────────────────────────────────────────────────── + +type FlowVariant = "data" | "app" | "out" | "eval" | "trace" + +interface FlowNode { + label: string + variant: FlowVariant +} + +const flowStyle = (token: GlobalToken, variant: FlowVariant): React.CSSProperties => { + switch (variant) { + case "data": + return {background: token.blue1, color: token.blue7, borderColor: token.blue2} + case "app": + return { + background: token.colorPrimaryBg, + color: token.colorText, + borderColor: token.colorPrimaryBorder, + } + case "out": + return {background: token.green1, color: token.green7, borderColor: token.green3} + case "eval": + // index 7 (not 6) so the text brightens under the dark algorithm — + // purple6 lands dark-on-dark and disappears on a dark background. + return {background: token.purple1, color: token.purple7, borderColor: token.purple3} + case "trace": + return {background: token.cyan1, color: token.cyan7, borderColor: token.cyan3} + } +} + +const FlowIcon = ({variant}: {variant: FlowVariant}) => { + switch (variant) { + case "data": + return + case "app": + return + case "eval": + return + case "trace": + return + default: + return null + } +} + +const FlowPills = ({steps, token}: {steps: FlowNode[]; token: GlobalToken}) => ( +
+ {steps.map((step, i) => ( + + {i > 0 && ( + + → + + )} + + + {step.label} + + + ))} +
+) + +// ── modes ─────────────────────────────────────────────────────────────────── + +interface ModeDef { + key: RunOnMode + /** Full label shown in the dropdown option. */ + label: string + /** Short label shown after "Run on:" in the trigger button. */ + shortLabel: string + Icon: React.ComponentType<{size?: number; style?: React.CSSProperties}> + desc: string + flow: FlowNode[] + badge?: "default" | "soon" + disabled?: boolean +} + +const MODES: ModeDef[] = [ + { + key: "data", + label: "Run directly on a test case", + shortLabel: "Test case", + Icon: DatabaseIcon, + desc: "Evaluate data you provide. Connect a test set, or type the input and output in by hand.", + flow: [ + {label: "Data", variant: "data"}, + {label: "Evaluator", variant: "eval"}, + {label: "Score", variant: "out"}, + ], + }, + { + key: "app", + label: "Run on an app output", + shortLabel: "App output", + Icon: AppIcon, + badge: "default", + desc: "Run an app over your data, then the evaluator grades its output. The usual evaluation flow.", + flow: [ + {label: "Data", variant: "data"}, + {label: "App", variant: "app"}, + {label: "Output", variant: "out"}, + {label: "Evaluator", variant: "eval"}, + {label: "Score", variant: "out"}, + ], + }, + { + key: "trace", + label: "Run on a trace", + shortLabel: "Trace", + Icon: TreeViewIcon, + badge: "soon", + disabled: true, + desc: "Pull the input and output straight from a logged trace in Observability.", + flow: [ + {label: "Trace", variant: "trace"}, + {label: "Evaluator", variant: "eval"}, + {label: "Score", variant: "out"}, + ], + }, +] + +// ── component ─────────────────────────────────────────────────────────────── + +interface RunOnSelectorProps { + mode: RunOnMode + onPick: (mode: RunOnMode) => void +} + +const RunOnSelector = ({mode, onPick}: RunOnSelectorProps) => { + const {token} = theme.useToken() + const [open, setOpen] = useState(false) + const [hovered, setHovered] = useState(null) + const current = MODES.find((m) => m.key === mode) ?? MODES.find((m) => m.key === "app")! + + const overlay = ( +
+
+ What should the evaluator run on? +
+ {MODES.map((m) => { + const selected = m.key === mode + const isHovered = hovered === m.key + const background = selected + ? token.controlItemBgActive + : isHovered && !m.disabled + ? token.colorFillTertiary + : "transparent" + return ( +
setHovered(m.key)} + onMouseLeave={() => setHovered((h) => (h === m.key ? null : h))} + onClick={() => { + if (m.disabled) return + onPick(m.key) + setOpen(false) + }} + className={clsx( + "flex items-start gap-3 rounded-md p-2.5", + m.disabled ? "cursor-default opacity-55" : "cursor-pointer", + )} + style={{background}} + > + + {selected && } + +
+
+ + {m.label} + {m.badge === "default" && ( + + default + + )} + {m.badge === "soon" && ( + + soon + + )} +
+
+ {m.desc} +
+
+ +
+
+
+ ) + })} +
+ ) + + return ( + overlay} + > + + + ) +} + +export default RunOnSelector diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx new file mode 100644 index 0000000000..6b31aad851 --- /dev/null +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState.tsx @@ -0,0 +1,56 @@ +/** + * SelectAppEmptyState + * + * Centered empty state shown in the run/generation panel when the evaluator is + * in "Run on an app" mode but no app is connected yet. The evaluator can't run + * until an app is picked, so this guides the user to the one action that + * unblocks them. Shared by the evaluator playground page and the + * evaluator-creation drawer so both read identically. + */ + +import {EntityPicker} from "@agenta/entity-ui" +import type { + EntitySelectionAdapter, + WorkflowRevisionSelectionResult, +} from "@agenta/entity-ui/selection" +import {AppstoreOutlined} from "@ant-design/icons" +import {Typography, theme} from "antd" + +interface SelectAppEmptyStateProps { + adapter: EntitySelectionAdapter + onSelect: (selection: WorkflowRevisionSelectionResult) => void + selectedAppLabel?: string | null +} + +const SelectAppEmptyState = ({adapter, onSelect, selectedAppLabel}: SelectAppEmptyStateProps) => { + const {token} = theme.useToken() + + return ( +
+
+ +
+
+ + Select an app to run the evaluator on + + + The evaluator grades this app's output. Pick which app to run, then fill + its inputs or load a test set. + +
+ + variant="popover-cascader" + adapter={adapter} + onSelect={onSelect} + size="middle" + placeholder={selectedAppLabel ?? "Select app"} + /> +
+ ) +} + +export default SelectAppEmptyState diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts index fdbd5d271b..0c83b594af 100644 --- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/atoms.ts @@ -83,6 +83,53 @@ export const persistedTestsetSelectionAtom = atom( }, ) +// ============================================================================ +// RUN-ON MODE +// ============================================================================ + +/** + * What the evaluator runs on: + * - "data" → run directly on data you provide (test set or manual input/output) + * - "app" → run an app over the data, then grade its output (the usual flow) + * - "trace" → grade the input/output of a logged trace (not yet available) + * + * "app" is the default so a fresh playground guides the user down the most + * common path (pick an app → run against it). The "trace" mode is disabled in + * the UI for now. + */ +export type RunOnMode = "data" | "app" | "trace" + +const runOnModeByProjectAtom = atomWithStorage>( + "agenta:evaluator:run-on-mode", + {}, +) + +/** Read/write the persisted run-on mode for the current project (default "app"). */ +export const runOnModeAtom = atom( + (get) => { + const projectId = get(projectIdAtom) || "__global__" + return get(runOnModeByProjectAtom)[projectId] ?? "app" + }, + (get, set, next: RunOnMode) => { + const projectId = get(projectIdAtom) || "__global__" + const all = get(runOnModeByProjectAtom) + set(runOnModeByProjectAtom, {...all, [projectId]: next}) + }, +) + +/** + * The mode actually in effect. + * + * A connected app (downstream evaluator node) always means we're in "app" mode, + * regardless of the stored preference — the node graph is the source of truth. + * Only when nothing is connected do we fall back to the stored mode. + */ +export const effectiveRunOnModeAtom = atom((get) => { + const nodes = get(playgroundNodesAtom) + if (nodes.some((n) => n.depth > 0)) return "app" + return get(runOnModeAtom) +}) + // ============================================================================ // DERIVED SELECTORS // ============================================================================ @@ -115,8 +162,27 @@ export const hasAppConnectedAtom = atom((get) => { return nodes.some((n) => n.depth > 0) }) -/** Label of the currently selected app workflow (for display in header picker). */ -export const selectedAppLabelAtom = atom(null) +/** + * Label of the currently selected app workflow (for display in header picker). + * + * Derived from the node graph: when an evaluator-as-downstream (depth > 0) + * exists, the primary (depth-0) node is the connected app, and its `label` + * is what we want to show. Returns `null` in standalone mode (no downstream). + * + * Derived (not a primitive atom) so URL-hydration of the snapshot — which + * restores `playgroundNodesAtom` along with each node's `label` — automatically + * surfaces the right label without any explicit re-seeding from the page. + * Previously the atom was a primitive `atom(null)`, which left + * the picker placeholder empty after reload while the disconnect button and + * testset dropdown (both gated on the node graph) showed normally. + */ +export const selectedAppLabelAtom = atom((get) => { + const nodes = get(playgroundNodesAtom) + const hasDownstream = nodes.some((n) => n.depth > 0) + if (!hasDownstream) return null + const primary = nodes.find((n) => n.depth === 0) + return primary?.label ?? null +}) // ============================================================================ // CONNECT APP (on app select) @@ -143,11 +209,14 @@ export const connectAppToEvaluatorAtom = atom( ) => { const {appRevisionId, appLabel, evaluatorRevisionId, evaluatorLabel} = params - // Track selected app label for display + persist across sessions - set(selectedAppLabelAtom, appLabel) - set(persistedAppSelectionAtom, {appRevisionId, appLabel}) - - // Replace primary node with app + // Replace primary node with the app FIRST — if the graph mutation + // bails out (changePrimaryNode returns null when there's no current + // primary to swap), we must not commit a stale persisted record. + // Pre-fix the persist happened before this call, which could leave + // an `{appRevisionId, appLabel}` entry in localStorage referring to + // a connection that never actually formed; the next mount would + // re-hydrate from that record and the picker would show "connected" + // for an app the playground never linked. const nodeId = set(playgroundController.actions.changePrimaryNode, { type: "workflow", id: appRevisionId, @@ -165,5 +234,84 @@ export const connectAppToEvaluatorAtom = atom( label: evaluatorLabel, }, }) + + // Clean the shared testcase row against the newly-selected app's input + // contract so stale keys from a previously-selected app (e.g. chat + // `messages`/`context` after swapping a chat app for a completion app) + // are dropped immediately — not only at run time (#4525 / AGE-3793). + // Runs AFTER connectDownstreamNode so the evaluator is in the graph and + // its referenced columns (correct_answer_key → ground_truth, etc.) are + // protected from the strict app-contract clean. + set(playgroundController.actions.reconcileRowsToPrimary) + + // Persist only after both graph mutations succeeded. The picker + // display label is derived from the depth-0 node's `label` via + // `selectedAppLabelAtom`, so no extra write needed here. + set(persistedAppSelectionAtom, {appRevisionId, appLabel}) + + // Pin the stored run-on mode to "app" too. While connected, + // `effectiveRunOnModeAtom` overrides to "app" regardless, but the + // stored mode is what we fall back to on disconnect — without this a + // user who connected an app from "data" mode would snap back to the + // testcase panel on disconnect instead of the "Select an app" state. + set(runOnModeAtom, "app") + + // Force the node-derived display atoms to re-settle after the two + // sequential `playgroundNodesAtom` writes above (changePrimaryNode → + // connectDownstreamNode). On a disconnect→reconnect cycle jotai applies + // the writes (the value is correct) but does NOT notify the mounted + // dependents — `selectedAppLabelAtom` / `hasAppConnectedAtom` and the + // package's generation-panel atoms stay stale, so the UI keeps showing + // the "Select an app" empty state even though an app is connected + // (QA 2026-06-05 — re-selecting the same app after disconnect). Reading + // the derived atoms here re-establishes the dependency and flushes the + // pending notification to their subscribers. + get(selectedAppLabelAtom) + get(hasAppConnectedAtom) }, ) + +// ============================================================================ +// DISCONNECT APP (reverse the connect) +// ============================================================================ + +/** + * Disconnect the upstream app and return to standalone evaluator mode. + * + * Reverse of `connectAppToEvaluatorAtom`: + * 1. Capture the downstream evaluator's identity (we need it after removal). + * 2. Remove the downstream evaluator node (`removeNodeAtom` keeps primary if + * target is depth > 0; if there's no depth-1 node, this is a no-op and we + * just swap primary). + * 3. Swap the primary node back to the evaluator. `changePrimaryNodeAtom` + * clears `outputConnectionsAtom` for us as a side-effect. + * 4. Clear the persisted app selection + display label so the picker placeholder + * reverts to "Select app". + */ +export const disconnectAppFromEvaluatorAtom = atom(null, (get, set) => { + const nodes = get(playgroundController.selectors.nodes()) + const downstreamEvaluator = nodes.find((n) => n.depth > 0) + if (!downstreamEvaluator) { + // No downstream node means the graph is already in the + // standalone-evaluator shape, but a stale `persistedAppSelectionAtom` + // entry could still be on disk (e.g., from a previous session where + // `connectAppToEvaluatorAtom` persisted before its swap silently + // failed mid-mutation). Clear it on this path too so the next mount + // doesn't re-hydrate a phantom "connected" app. + set(persistedAppSelectionAtom, null) + return + } + + const evaluatorEntity = { + type: downstreamEvaluator.entityType, + id: downstreamEvaluator.entityId, + label: downstreamEvaluator.label ?? "Evaluator", + } + + set(playgroundController.actions.removeNode, downstreamEvaluator.id) + set(playgroundController.actions.changePrimaryNode, evaluatorEntity) + // `selectedAppLabelAtom` is derived from the node graph — clearing the + // downstream above is what flips it back to `null`. Only the persisted + // localStorage cache needs an explicit clear. + set(persistedAppSelectionAtom, null) +}) diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx index 35ff909bbd..f4db283381 100644 --- a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/index.tsx @@ -14,15 +14,8 @@ import {useCallback, useEffect, useMemo} from "react" import {loadableController} from "@agenta/entities/loadable" import {testcaseMolecule} from "@agenta/entities/testcase" -import {EntityPicker} from "@agenta/entity-ui" -import { - createWorkflowRevisionAdapter, - type WorkflowRevisionSelectionResult, -} from "@agenta/entity-ui/selection" -import {playgroundController} from "@agenta/playground" import {type PlaygroundUIProviders} from "@agenta/playground-ui" import {preloadEditorPlugins, SyncStateTag} from "@agenta/ui" -import {Typography} from "antd" import {useAtomValue, useSetAtom} from "jotai" import dynamic from "next/dynamic" @@ -32,13 +25,10 @@ import {OSSPlaygroundShell} from "@/oss/components/Playground/OSSPlaygroundShell import SharedGenerationResultUtils from "@/oss/components/SharedGenerationResultUtils" import {playgroundSyncAtom} from "@/oss/state/url/playground" -import { - connectAppToEvaluatorAtom, - evaluatorConfigEntityIdsAtom, - hasAppConnectedAtom, - selectedAppLabelAtom, -} from "./atoms" +import {evaluatorConfigEntityIdsAtom} from "./atoms" import EvaluatorPlaygroundHeader from "./EvaluatorPlaygroundHeader" +import SelectAppEmptyState from "./SelectAppEmptyState" +import {useEvaluatorRunControls} from "./useEvaluatorRunControls" const PlaygroundMainView = dynamic( () => import("@/oss/components/Playground/Components/MainLayout"), @@ -77,63 +67,24 @@ const ConfigureEvaluatorPageInner = () => { useAtomValue(playgroundSyncAtom) const configEntityIds = useAtomValue(evaluatorConfigEntityIdsAtom) - const hasAppConnected = useAtomValue(hasAppConnectedAtom) - const connectApp = useSetAtom(connectAppToEvaluatorAtom) - const selectedAppLabel = useAtomValue(selectedAppLabelAtom) - - // Read the current evaluator entity from playground nodes - // Phase 1: evaluator is at depth 0 (primary) - // Phase 2: evaluator is at depth 1 (downstream) - const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), [])) - const evaluatorNode = useMemo(() => { - const downstream = nodes.find((n) => n.depth > 0) - if (downstream) return downstream - return nodes[0] ?? null - }, [nodes]) + + // Shared run controls (app adapter, app-select, run-on mode, run gate) — the + // same hook the header and the creation drawer use, so all surfaces agree. + const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} = + useEvaluatorRunControls() // Preload editor plugins useEffect(() => { void preloadEditorPlugins() }, []) - // App workflow picker (shared between header and empty state) - const appWorkflowAdapter = useMemo( - () => - createWorkflowRevisionAdapter({ - skipVariantLevel: true, - excludeRevisionZero: true, - flags: {is_evaluator: false, is_feedback: false}, - }), - [], - ) - - const handleAppSelect = useCallback( - (selection: WorkflowRevisionSelectionResult) => { - if (!evaluatorNode) return - connectApp({ - appRevisionId: selection.id, - appLabel: selection.label, - evaluatorRevisionId: evaluatorNode.entityId, - evaluatorLabel: evaluatorNode.label ?? "Evaluator", - }) - }, - [connectApp, evaluatorNode], - ) - const runDisabledContent = useMemo( () => ( - <> - - Select an app to run the evaluator chain - - - variant="popover-cascader" - adapter={appWorkflowAdapter} - onSelect={handleAppSelect} - size="middle" - placeholder={selectedAppLabel ?? "Select app"} - /> - + ), [appWorkflowAdapter, handleAppSelect, selectedAppLabel], ) @@ -151,15 +102,16 @@ const ConfigureEvaluatorPageInner = () => { return ( -
- + {/* Definite height (viewport minus the app topbar) so the run panel's + * `h-full` centering resolves — same pattern as the app playground + * (`Playground.tsx`). With a plain `h-full` here the chain collapses + * to content height and the empty state sticks to the top. */} +
+
diff --git a/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts new file mode 100644 index 0000000000..c75dba5a98 --- /dev/null +++ b/web/oss/src/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls.ts @@ -0,0 +1,107 @@ +/** + * useEvaluatorRunControls + * + * Single source of truth for the evaluator playground's run controls, shared by + * the full-page playground, the evaluator-creation drawer, and the workflow + * revision drawer. Before this hook, the app adapter, app-select handler, + * evaluator-node lookup, and run-on wiring were copy-pasted across every + * surface — which is exactly how the drawers drifted out of sync with the page + * (they kept forcing an app even in test-case mode). Centralizing it here means + * every surface behaves identically by construction. + */ + +import {useCallback, useMemo} from "react" + +import { + createWorkflowRevisionAdapter, + type WorkflowRevisionSelectionResult, +} from "@agenta/entity-ui/selection" +import {playgroundController} from "@agenta/playground" +import {useAtomValue, useSetAtom} from "jotai" + +import { + connectAppToEvaluatorAtom, + disconnectAppFromEvaluatorAtom, + effectiveRunOnModeAtom, + hasAppConnectedAtom, + runOnModeAtom, + selectedAppLabelAtom, + type RunOnMode, +} from "./atoms" + +export function useEvaluatorRunControls() { + // Evaluator node — phase 1: evaluator at depth 0 (primary); phase 2: + // evaluator at depth 1 (downstream of a connected app). + const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), [])) + const evaluatorNode = useMemo(() => { + const downstream = nodes.find((n) => n.depth > 0) + if (downstream) return downstream + return nodes[0] ?? null + }, [nodes]) + + // App picker — picks an upstream *app* workflow to attach to the evaluator. + // `parentLabel: "Application"` keeps the search bar saying "Search app…" + // rather than the adapter's historical "Search evaluator…" default. + const appWorkflowAdapter = useMemo( + () => + createWorkflowRevisionAdapter({ + skipVariantLevel: true, + excludeRevisionZero: true, + flags: {is_evaluator: false, is_feedback: false}, + parentLabel: "Application", + }), + [], + ) + + const connectApp = useSetAtom(connectAppToEvaluatorAtom) + const disconnectApp = useSetAtom(disconnectAppFromEvaluatorAtom) + + const handleAppSelect = useCallback( + (selection: WorkflowRevisionSelectionResult) => { + if (!evaluatorNode) return + connectApp({ + appRevisionId: selection.id, + appLabel: selection.label, + evaluatorRevisionId: evaluatorNode.entityId, + evaluatorLabel: evaluatorNode.label ?? "Evaluator", + }) + }, + [connectApp, evaluatorNode], + ) + + // Run-on mode. A connected app forces effective "app" mode (the node graph + // is the source of truth); the stored preference only applies when nothing + // is connected. + const runOnMode = useAtomValue(effectiveRunOnModeAtom) + const setRunOnMode = useSetAtom(runOnModeAtom) + const handlePickRunOn = useCallback( + (next: RunOnMode) => { + if (next === "trace") return // disabled, not selectable + // Leaving "app" mode drops the connected app so the graph returns to + // standalone-evaluator shape. + if (next === "data") disconnectApp() + setRunOnMode(next) + }, + [disconnectApp, setRunOnMode], + ) + + const hasAppConnected = useAtomValue(hasAppConnectedAtom) + const selectedAppLabel = useAtomValue(selectedAppLabelAtom) + + // In "app" mode with no app connected yet, the evaluator can't run — the run + // panel surfaces the app selector instead of the testcase rows. In test-case + // mode the evaluator runs standalone, so it's never blocked on an app. Only + // takes effect where the run panel renders (the page and expanded drawers). + const runDisabled = runOnMode === "app" && !hasAppConnected + + return { + appWorkflowAdapter, + handleAppSelect, + disconnectApp, + runOnMode, + handlePickRunOn, + hasAppConnected, + selectedAppLabel, + runDisabled, + } +} diff --git a/web/oss/src/components/Evaluators/index.tsx b/web/oss/src/components/Evaluators/index.tsx index ed318a1469..00e8737b30 100644 --- a/web/oss/src/components/Evaluators/index.tsx +++ b/web/oss/src/components/Evaluators/index.tsx @@ -3,7 +3,6 @@ import {memo, useCallback, useEffect, useMemo, useState} from "react" import { createEvaluatorFromTemplate, type EvaluatorCatalogTemplate, - hasFullPagePlaygroundUX, invalidateEvaluatorsListCache, workflowMolecule, } from "@agenta/entities/workflow" @@ -260,22 +259,18 @@ const EvaluatorsRegistry = ({scope = "project", mode = "active"}: EvaluatorsRegi return } - // Only prompt/code-authored evaluators open in the full-page - // playground. Declarative classifiers (match, contains, regex, - // json_multi_field_match, …) fall back to the drawer-edit flow — - // their config is a handful of form fields and the playground - // page would surface misleading envelope variable inputs. + // All non-archived automatic evaluators open in the full-page + // playground. Earlier this was gated on classifier type + // (`hasFullPagePlaygroundUX`) so declarative classifiers stayed in + // the drawer-edit flow, but in practice that meant whole evaluator + // types had no UI path into the per-evaluator pages (variants, + // traces). Drawer stays available as a secondary affordance via + // the row context menu's Configure action. // // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is - // off, every row click resolves to the drawer regardless of the - // evaluator's classifier (the new flow stays code-complete but - // hidden until follow-up fixes land). - const entity = record.revisionId ? workflowMolecule.get.data(record.revisionId) : null + // off, every row click resolves to the drawer. const shouldNavigateToFullPage = Boolean( - EVALUATOR_FULL_PAGE_NAV_ENABLED && - record.workflowId && - entity && - hasFullPagePlaygroundUX(entity as Parameters[0]), + EVALUATOR_FULL_PAGE_NAV_ENABLED && record.workflowId, ) const navigated = diff --git a/web/oss/src/components/Filters/Filters.tsx b/web/oss/src/components/Filters/Filters.tsx index b1dd2d8385..42a53d736b 100644 --- a/web/oss/src/components/Filters/Filters.tsx +++ b/web/oss/src/components/Filters/Filters.tsx @@ -283,6 +283,7 @@ const Filters: React.FC = ({ onApplyFilter, onClearFilter, buttonProps, + reconcileFilterRows, }) => { const evaluatorPreviews = useAtomValue(evaluatorsListDataAtom) @@ -358,6 +359,37 @@ const Filters: React.FC = ({ : item.value == null ? [] : [item.value] + + // Prefer a candidate whose `referenceCategory` matches + // the entry's `"attributes.key"`. This disambiguates + // the `references` family — application.id / + // evaluator.id / environment.id all share + // `baseField: "references"` and + // `referenceProperty: "id"`, so without this check the + // first match (application.id) always wins, mislabelling + // an evaluator-scoped filter as "Application ID". + const attributesKey = (() => { + for (const entry of valuesArray) { + if (entry && typeof entry === "object") { + const ak = (entry as Record)["attributes.key"] + if (typeof ak === "string") return ak + } + } + return undefined + })() + if (attributesKey) { + for (const candidate of matches) { + if (candidate.referenceCategory !== attributesKey) continue + if (!candidate.referenceProperty) continue + const refProp = candidate.referenceProperty + const hasMatch = valuesArray.some( + (entry) => + entry && typeof entry === "object" && refProp in entry, + ) + if (hasMatch) return candidate + } + } + for (const candidate of matches) { if (!candidate.referenceProperty) continue const refProp = candidate.referenceProperty @@ -511,6 +543,22 @@ const Filters: React.FC = ({ const [isFilterOpen, setIsFilterOpen] = useState(false) const [keySearchTerms, setKeySearchTerms] = useState>({}) + /** + * Display-only projection of `filter`. The reconciler is opt-in (passed by + * the parent) and may rewrite *cosmetic* row fields like `selectedField` / + * `selectedLabel` so the UI reflects an in-flight choice (e.g., + * observability flipping the references row's label between "Application + * ID" / "Evaluator ID" as the user picks a trace_type, before Apply). + * + * Mutations still call `setFilter(filter)` by index, so the reconciler is + * required to preserve array length and per-index order — that contract + * is documented on the prop. + */ + const displayedFilter = useMemo( + () => (reconcileFilterRows ? reconcileFilterRows(filter) : filter), + [filter, reconcileFilterRows], + ) + const sanitizedFilters = useMemo(() => { return sanitizeFilterItems( filter.filter(({field, operator, isPermanent, isCustomField}) => { @@ -816,7 +864,7 @@ const Filters: React.FC = ({
- {filter.map((item, idx) => { + {displayedFilter.map((item, idx) => { const uiKey = item.selectedField || item.field || "" const baseFieldCfg = getField(uiKey) const field = effectiveFieldForRow(baseFieldCfg, item) diff --git a/web/oss/src/components/Filters/types.d.ts b/web/oss/src/components/Filters/types.d.ts index b25512a170..03f00d6a63 100644 --- a/web/oss/src/components/Filters/types.d.ts +++ b/web/oss/src/components/Filters/types.d.ts @@ -8,6 +8,20 @@ export interface Props { onApplyFilter: (filters: Filter[]) => void onClearFilter: (filters: Filter[]) => void buttonProps?: ButtonProps + /** + * Optional callback to derive a *display-only* view of the local filter + * state. Called whenever the user changes a row in the dialog. The dialog + * renders from the returned array, but mutations still target the + * underlying `filter` state by index, so the reconciler MUST preserve + * array length and per-index order. + * + * Used by observability to keep the permanent references row's label + * ("Application ID" vs "Evaluator ID") in sync with the dialog's local + * `trace_type` selection *before* the user clicks Apply — without the + * reconciler, the label only refreshes after Apply when the atom + * re-derives the permanent row. + */ + reconcileFilterRows?: (rows: FilterItem[]) => FilterItem[] } export type CustomValueType = "string" | "number" | "boolean" diff --git a/web/oss/src/components/Layout/FooterIsland.tsx b/web/oss/src/components/Layout/FooterIsland.tsx deleted file mode 100644 index b5b335575d..0000000000 --- a/web/oss/src/components/Layout/FooterIsland.tsx +++ /dev/null @@ -1,30 +0,0 @@ -import {memo} from "react" - -import {GithubFilled, LinkedinFilled, TwitterOutlined} from "@ant-design/icons" -import {Layout, Space, Typography} from "antd" -import Link from "next/link" - -const {Footer} = Layout - -interface FooterIslandProps { - className?: string -} - -export const FooterIsland = memo(function FooterIsland({className}: FooterIslandProps) { - return ( -
- - - - - - - - - - - - Copyright © {new Date().getFullYear()} | Agenta. -
- ) -}) diff --git a/web/oss/src/components/Layout/Layout.tsx b/web/oss/src/components/Layout/Layout.tsx index 8e73230639..eb415201f3 100644 --- a/web/oss/src/components/Layout/Layout.tsx +++ b/web/oss/src/components/Layout/Layout.tsx @@ -1,16 +1,12 @@ -import {memo, useCallback, useEffect, useRef, useState, type ReactNode, type RefObject} from "react" +import {memo, useCallback, useEffect, useRef, useState, type ReactNode} from "react" -import {GithubFilled, LinkedinFilled, TwitterOutlined} from "@ant-design/icons" -import {ConfigProvider, Layout, Modal, Space, theme} from "antd" +import {ConfigProvider, Layout, Modal, theme} from "antd" import clsx from "clsx" import {atom} from "jotai" import {useAtom, useAtomValue, useSetAtom, useStore} from "jotai" import {selectAtom} from "jotai/utils" -import dynamic from "next/dynamic" -import Link from "next/link" import {useRouter} from "next/router" import {ErrorBoundary} from "react-error-boundary" -import {useResizeObserver} from "usehooks-ts" import useURL from "@/oss/hooks/useURL" import {currentAppAtom} from "@/oss/state/app" @@ -146,11 +142,6 @@ const useCommittedLayoutFlags = (): LayoutRouteFlags => { return committedFlags } -const FooterIsland = dynamic(() => import("./FooterIsland").then((m) => m.FooterIsland), { - ssr: false, - loading: () => null, -}) - type StyleClasses = ReturnType const {Content} = Layout @@ -169,7 +160,6 @@ const AppWithVariants = memo( isEvaluator, isFullHeight, appTheme, - footerHeight, }: { children: ReactNode isAppRoute: boolean @@ -179,7 +169,6 @@ const AppWithVariants = memo( classes: StyleClasses appTheme: string isPlayground?: boolean - footerHeight?: number }) => { const {baseAppURL} = useURL() const appState = useAppState() @@ -361,24 +350,6 @@ const AppWithVariants = memo( )}
-
- - - - - - - - - - - - -
Copyright © {new Date().getFullYear()} | Agenta.
-
@@ -388,12 +359,7 @@ const AppWithVariants = memo( const App: React.FC = ({children}) => { const {appTheme} = useAppTheme() - const ref = useRef(null) - const {height: footerHeight} = useResizeObserver({ - ref: ref as RefObject, - box: "border-box", - }) - const classes = useStyles({themeMode: appTheme, footerHeight} as StyleProps) + const classes = useStyles({themeMode: appTheme}) const {isHumanEval, isPlayground, isAppRoute, isAuthRoute, isEvaluator, isFullHeight} = useCommittedLayoutFlags() @@ -419,7 +385,6 @@ const App: React.FC = ({children}) => { isHumanEval={isHumanEval} isEvaluator={isEvaluator} isFullHeight={isFullHeight} - footerHeight={footerHeight} > {children} {contextHolder} diff --git a/web/oss/src/components/Layout/assets/styles.ts b/web/oss/src/components/Layout/assets/styles.ts index cc47034abc..49a9112c52 100644 --- a/web/oss/src/components/Layout/assets/styles.ts +++ b/web/oss/src/components/Layout/assets/styles.ts @@ -2,9 +2,7 @@ import {createUseStyles} from "react-jss" import type {JSSTheme, StyleProps as MainStyleProps} from "@/oss/lib/Types" -export interface StyleProps extends MainStyleProps { - footerHeight: number -} +export type StyleProps = MainStyleProps export const useStyles = createUseStyles((theme: JSSTheme) => ({ layout: ({themeMode}: StyleProps) => ({ @@ -14,15 +12,15 @@ export const useStyles = createUseStyles((theme: JSSTheme) => ({ minHeight: "100vh", position: "relative", }), - content: ({footerHeight}: StyleProps) => ({ - height: `calc(100% - ${footerHeight ?? 0}px)`, + content: { + height: "100%", paddingTop: "24px", paddingLeft: "1.5rem", paddingRight: "1.5rem", - marginBottom: `calc(2rem + ${footerHeight ?? 0}px)`, + marginBottom: "2rem", flex: 1, gap: 16, - }), + }, breadcrumbContainer: { display: "flex", alignItems: "center", @@ -31,38 +29,6 @@ export const useStyles = createUseStyles((theme: JSSTheme) => ({ padding: "8px 1.5rem", borderBottom: `1px solid ${theme.colorBorderSecondary}`, }, - footer: { - position: "absolute", - bottom: 0, - left: 0, - right: 0, - textAlign: "center", - padding: "5px 20px", - display: "flex", - alignItems: "center", - justifyContent: "space-between", - // antd's Layout.Footer defaults to colorBgLayout (#000 in dark), which - // reads as a mismatched black band against the #141414 content. Blend - // with whatever's behind it instead, and add a top border to separate - // it from the content above. - backgroundColor: "transparent", - borderTop: `1px solid ${theme.colorBorderSecondary}`, - // The social links are anchors that would otherwise inherit antd's - // colorLink (blue in dark). Use neutral text color so they read as icons, - // not links — matches the prior navy look in light, flips to light in dark. - "& a": { - color: theme.colorText, - }, - "& a:hover": { - color: theme.colorTextSecondary, - }, - }, - footerLeft: { - fontSize: 18, - }, - footerLinkIcon: ({themeMode}: StyleProps) => ({ - color: themeMode === "dark" ? "#fff" : "#000", - }), topRightBar: { display: "flex", alignItems: "center", diff --git a/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx b/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx index 6ec0ba479d..315f0f7093 100644 --- a/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx +++ b/web/oss/src/components/Playground/Components/Modals/DeployVariantModal/assets/DeployVariantButton/index.tsx @@ -29,6 +29,9 @@ const DeployVariantButton = ({ const runnableData = useAtomValue(workflowMolecule.selectors.data(revisionId || "")) const workflowId = runnableData?.workflow_id || "" + // Workflow-level evaluator flag — canonical, unlike the revision-level + // `flags.is_evaluator` which is `false` on v0 revisions of evaluators. + const isEvaluator = useAtomValue(workflowMolecule.selectors.isEvaluator(workflowId)) const variants = useAtomValue(workflowVariantsListDataAtomFamily(workflowId)) const {environments, variantName, revision} = useMemo(() => { @@ -46,6 +49,12 @@ const DeployVariantButton = ({ const handleCloseDeployModal = useCallback(() => setIsDeployModalOpen(false), []) + // Evaluator workflows aren't deployed to environments — never render a + // deploy trigger for them. Central guard so every surface that reuses this + // button (registry/overview menus, variant headers, the revision drawer) is + // covered without each call site repeating the check. + if (isEvaluator) return null + return ( <> {isValidElement(children) ? ( diff --git a/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx b/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx index 864f2d938d..94e2478278 100644 --- a/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx +++ b/web/oss/src/components/Playground/Components/PlaygroundVariantConfig/assets/PlaygroundVariantConfigHeader.tsx @@ -72,6 +72,10 @@ const PlaygroundVariantConfigHeader = ({ skipVariantLevel: true, excludeRevisionZero: true, flags: {is_evaluator: false, is_feedback: false}, + // App browse picker — without this the search bar would say + // "Search evaluator…" (the adapter's default in skip-variant + // mode) while the user is browsing apps. + parentLabel: "Application", }), [], ) diff --git a/web/oss/src/components/PlaygroundRouter/index.tsx b/web/oss/src/components/PlaygroundRouter/index.tsx index bd983f1461..4abb241beb 100644 --- a/web/oss/src/components/PlaygroundRouter/index.tsx +++ b/web/oss/src/components/PlaygroundRouter/index.tsx @@ -1,24 +1,13 @@ -import {memo, useEffect, useMemo, useRef} from "react" +import {memo} from "react" -import { - hasFullPagePlaygroundUX, - workflowLatestRevisionIdAtomFamily, - workflowMolecule, -} from "@agenta/entities/workflow" import {bgColors} from "@agenta/ui" import {DownOutlined} from "@ant-design/icons" import {Flask, Plus} from "@phosphor-icons/react" import {Button, Space, Typography} from "antd" import {useAtomValue} from "jotai" import dynamic from "next/dynamic" -import {useRouter} from "next/router" -import {appIdentifiersAtom} from "@/oss/state/appState" -import { - currentWorkflowAtom, - currentWorkflowContextAtom, - EVALUATOR_FULL_PAGE_NAV_ENABLED, -} from "@/oss/state/workflow" +import {currentWorkflowContextAtom} from "@/oss/state/workflow" const PlaygroundLoadingShell = () => { return ( @@ -60,92 +49,40 @@ const Playground = dynamic(() => import("../Playground/Playground"), { loading: PlaygroundLoadingShell, }) -/** - * Stale-URL guard for evaluator playgrounds. Most evaluators (classifiers, - * matchers, JSON validators, …) have no meaningful full-page playground UX — - * just a handful of form fields the drawer already renders. When the - * resolved workflow is one of those evaluators, redirect to the evaluators - * registry with the revision pre-selected so the drawer opens automatically. - * Prompt/code-authored evaluators (auto_ai_critique, llm, code) are kept on - * the playground page. - * - * Classification source: the workflow LIST entry has no `data.uri` (data is - * only populated on revision-detail responses), so we resolve the latest - * revision via `workflowLatestRevisionIdAtomFamily` and read its seeded - * entity from the molecule to get the URI. Without this, every evaluator - * playground briefly looks "unknown" and the guard would mis-redirect - * prompt-based evaluators like LLM-as-a-judge. - */ -const useEvaluatorPlaygroundGuard = () => { - const ctx = useAtomValue(currentWorkflowContextAtom) - const workflow = useAtomValue(currentWorkflowAtom) - const {workspaceId, projectId} = useAtomValue(appIdentifiersAtom) - const router = useRouter() - const redirectedFor = useRef(null) - - const workflowId = ctx.workflowId ?? "" - const latestRevisionId = useAtomValue( - useMemo(() => workflowLatestRevisionIdAtomFamily(workflowId), [workflowId]), - ) - - useEffect(() => { - if (ctx.isResolving || ctx.isError || ctx.isNotFound) return - if (ctx.workflowKind !== "evaluator") return - if (!workflow || !ctx.workflowId) return - if (!workspaceId || !projectId) return - if (redirectedFor.current === ctx.workflowId) return - - // Resolve the latest revision data — it carries `data.uri` and the - // URI-derived flags (`is_llm`, `is_code`) that classifier vs prompt - // evaluators differ on. The workflow list entry has neither. - const latestRevision = latestRevisionId - ? (workflowMolecule.get.data(latestRevisionId) as - | Parameters[0] - | null) - : null - - // Bail until we have a classifiable record. Redirecting on a half- - // loaded workflow would bounce prompt-based evaluators (whose URI - // hasn't been seeded yet) into the drawer mid-load. - const hasUri = Boolean(latestRevision?.data?.uri) - const hasTypeFlag = Boolean( - latestRevision?.flags?.is_llm || - latestRevision?.flags?.is_code || - workflow.flags?.is_llm || - workflow.flags?.is_code, - ) - if (!hasUri && !hasTypeFlag) return - - // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off, - // skip the "stay on /playground" early return so every evaluator URL - // (including direct visits / bookmarks) bounces back to /evaluators - // and opens the drawer. - const classifyTarget = latestRevision ?? workflow - if (EVALUATOR_FULL_PAGE_NAV_ENABLED && hasFullPagePlaygroundUX(classifyTarget)) return - - const base = `/w/${encodeURIComponent(workspaceId)}/p/${encodeURIComponent(projectId)}` - const target = latestRevisionId - ? `${base}/evaluators?revisionId=${encodeURIComponent(latestRevisionId)}` - : `${base}/evaluators` - - redirectedFor.current = ctx.workflowId - router.replace(target) - }, [ - ctx.isResolving, - ctx.isError, - ctx.isNotFound, - ctx.workflowKind, - ctx.workflowId, - workflow, - latestRevisionId, - workspaceId, - projectId, - router, - ]) -} +// When the current workflow is an evaluator we render the evaluator-flavored +// page (with `EvaluatorPlaygroundHeader` + `connectAppToEvaluatorAtom`) instead +// of the generic app ``. Same code path that powers +// `/evaluators/playground` today — `playgroundSyncAtom` matches `/playground` +// anywhere in the pathname so hydration works at both URLs unchanged. +const ConfigureEvaluatorPage = dynamic( + () => import("@/oss/components/Evaluators/components/ConfigureEvaluator"), + {ssr: false, loading: PlaygroundLoadingShell}, +) const PlaygroundRouter = () => { - useEvaluatorPlaygroundGuard() + const ctx = useAtomValue(currentWorkflowContextAtom) + + // Evaluators get the evaluator-flavored page so the upstream-app picker + // is visible (the generic header only exposes the reverse direction — + // app-needs-evaluator — not evaluator-needs-app). All evaluator kinds + // (LLM/code, declarative classifiers, custom hooks, …) land here on + // direct URL visits + sidebar switcher clicks; for simple classifiers + // ConfigureEvaluatorPage renders the same few form fields the drawer + // would, with the bonus of the evaluator-as-app surface (variants, + // traces, sidebar context). + // + // Exception: `is_feedback` evaluators (human-annotation workflows) are + // intentionally drawer-only in /evaluators — they don't run, they capture + // human input. Routing them to `ConfigureEvaluatorPage` would render a + // page with no testset/run controls that make sense for them. Direct + // URL visits to `/apps//playground` fall through to the + // generic ``, which will (correctly) treat them as an + // unsupported playground target and let the upstream route guard / + // landing logic redirect them back to /evaluators. + const isFeedbackEvaluator = ctx.workflow?.flags?.is_feedback === true + if (ctx.workflowKind === "evaluator" && !isFeedbackEvaluator) { + return + } return } diff --git a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx index 734ddbd2b4..fd265ce7d1 100644 --- a/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx +++ b/web/oss/src/components/Sidebar/components/WorkflowEntityCard.tsx @@ -1,7 +1,7 @@ import {memo, useCallback, useMemo, useState} from "react" import { - fullPagePlaygroundEvaluatorsAtom, + nonHumanEvaluatorsAtom, nonArchivedAppWorkflowsAtom, nonArchivedEvaluatorsAtom, parseWorkflowKeyFromUri, @@ -116,25 +116,25 @@ const SWITCHER_MENU_CLASS = clsx( const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => { const ctx = useAtomValue(currentWorkflowContextAtom) const apps = useAtomValue(nonArchivedAppWorkflowsAtom) as readonly Workflow[] - // Full set of evaluators — used for resolving the *active* workflow (the - // user may be inside a drawer-only evaluator currently). The switcher - // dropdown below uses `fullPagePlaygroundEvaluators` instead so it only - // lists evaluators whose destination is /apps/[id]/playground — clicking - // a declarative classifier or human evaluator from the sidebar would - // route through the route guard and bounce back to /evaluators, which is - // confusing. const evaluators = useAtomValue(nonArchivedEvaluatorsAtom) as readonly Workflow[] + // The switcher lists every AUTOMATIC evaluator — LLM, code, AND the + // declarative classifiers (exact match, regex, similarity / semantic + // similarity, json diff, contains json, …). `nonHumanEvaluatorsAtom` + // resolves `is_feedback` from each evaluator's LATEST REVISION — the + // workflow LIST records this card reads from `nonArchivedEvaluatorsAtom` + // carry NO `is_feedback`/`is_llm`/`is_code` flags (those live on the + // revision, not the parent artifact), which is why the old + // `!w.flags?.is_feedback` filter never excluded anything and human + // evaluators leaked in (QA 2026-06-05). It drops ONLY human (`is_feedback`) + // evaluators; navigation lands on the workflow's current sub-page (Overview/ + // Evaluations are valid for every evaluator), so matchers no longer dead-end. + const automaticEvaluators = useAtomValue(nonHumanEvaluatorsAtom) as readonly Workflow[] // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the flag is off, the - // switcher dropdown hides the "Evaluators" group entirely. Clicking an - // entry would route to `/apps//playground`, which the - // (also-gated) `PlaygroundRouter` guard would immediately bounce back to - // `/evaluators` — exposing the entry would just produce a flicker. - const fullPagePlaygroundEvaluatorsRaw = useAtomValue( - fullPagePlaygroundEvaluatorsAtom, - ) as readonly Workflow[] - const fullPagePlaygroundEvaluators: readonly Workflow[] = EVALUATOR_FULL_PAGE_NAV_ENABLED - ? fullPagePlaygroundEvaluatorsRaw - : EMPTY_WORKFLOWS + // switcher dropdown hides the "Evaluators" group entirely. + const switcherEvaluators: readonly Workflow[] = useMemo(() => { + if (!EVALUATOR_FULL_PAGE_NAV_ENABLED) return EMPTY_WORKFLOWS + return automaticEvaluators + }, [automaticEvaluators]) const recentAppId = useAtomValue(recentAppIdAtom) const recentEvaluatorId = useAtomValue(recentEvaluatorIdAtom) const navigateToWorkflow = useSetAtom(routerAppNavigationAtom) @@ -192,16 +192,16 @@ const WorkflowEntityCard = memo(({collapsed}: WorkflowEntityCardProps) => { children: apps.map((w) => toMenuItem(w, false)), }) } - if (fullPagePlaygroundEvaluators.length) { + if (switcherEvaluators.length) { items.push({ key: "evaluators-header", type: "group", label: "Evaluators", - children: fullPagePlaygroundEvaluators.map((w) => toMenuItem(w, true)), + children: switcherEvaluators.map((w) => toMenuItem(w, true)), }) } return items - }, [apps, fullPagePlaygroundEvaluators]) + }, [apps, switcherEvaluators]) const handleSwitcherClick = useCallback>( ({key}) => { diff --git a/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx b/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx index 70a915b7ac..eb467d00f8 100644 --- a/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx +++ b/web/oss/src/components/Sidebar/hooks/useSidebarConfig/index.tsx @@ -19,7 +19,7 @@ import { RocketLaunchIcon, ListChecksIcon, } from "@phosphor-icons/react" -import {useAtomValue, useSetAtom} from "jotai" +import {useSetAtom} from "jotai" import {useCrispChat} from "@/oss/hooks/useCrispChat" import {useSession} from "@/oss/hooks/useSession" @@ -30,7 +30,6 @@ import {openWidgetAtom} from "@/oss/lib/onboarding" import {useAppsData} from "@/oss/state/app" import {useAppState} from "@/oss/state/appState" import {useOrgData} from "@/oss/state/org" -import {currentWorkflowContextAtom} from "@/oss/state/workflow" import {SidebarConfig} from "../../types" @@ -47,15 +46,6 @@ export const useSidebarConfig = () => { const hasAppContext = routeLayer === "app" && Boolean(routedAppId || appURL || recentlyVisitedAppURL) - // Phase 4: when the current workflow is an evaluator, DISABLE (not hide) - // the app-section items that don't apply to evaluators (overview, - // evaluations). Items stay visible but greyed out so the user understands - // they exist — they just aren't applicable for this workflow type. - // Endpoints and deployments aren't in the sidebar today, so no extra - // gating needed for those. - const workflowCtx = useAtomValue(currentWorkflowContextAtom) - const isCurrentWorkflowEvaluator = workflowCtx.workflowKind === "evaluator" - const sidebarConfig: SidebarConfig[] = [ { key: "app-management-link", @@ -123,9 +113,10 @@ export const useSidebarConfig = () => { icon: , isHidden: !hasAppContext && !currentApp && !recentlyVisitedAppId, isAppSection: true, - // Disabled (not hidden) for evaluator workflows so the user still - // sees these surfaces exist — just not applicable here. - disabled: !hasProjectURL || isCurrentWorkflowEvaluator, + // Enabled for evaluators too — Overview surfaces the workflow's + // details, variants, and the evaluation runs that evaluated it + // (scoped by the workflow id as the `application` reference). + disabled: !hasProjectURL, }, { key: "app-playground-link", @@ -153,8 +144,10 @@ export const useSidebarConfig = () => { isHidden: !hasAppContext && !currentApp && !recentlyVisitedAppId, isAppSection: true, icon: , - // Disabled (not hidden) for evaluator workflows. - disabled: !hasProjectURL || isCurrentWorkflowEvaluator, + // Enabled for evaluators too — shows the evaluation runs that + // evaluated this evaluator (scoped by its id as the `application` + // reference, same machinery as the app-scoped evaluations page). + disabled: !hasProjectURL, dataTour: "evaluations-nav", }, { diff --git a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx index 6726f2423c..2b236c2243 100644 --- a/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx +++ b/web/oss/src/components/WorkflowRevisionDrawerWrapper/index.tsx @@ -15,18 +15,12 @@ import {testcaseMolecule} from "@agenta/entities/testcase" import { registerWorkflowCommitCallbacks, getWorkflowCommitCallbacks, - hasFullPagePlaygroundUX, parseEvaluatorKeyFromUri, evaluatorTemplatesMapAtom, workflowMolecule, discardLocalServerDataAtom, } from "@agenta/entities/workflow" -import {EntityPicker} from "@agenta/entity-ui" import {PlaygroundConfigSection} from "@agenta/entity-ui/drill-in" -import { - createWorkflowRevisionAdapter, - type WorkflowRevisionSelectionResult, -} from "@agenta/entity-ui/selection" import {VariantDetailsWithStatus, VariantNameCell} from "@agenta/entity-ui/variant" import {playgroundController} from "@agenta/playground" import { @@ -53,7 +47,7 @@ import { } from "@agenta/playground-ui/workflow-revision-drawer" import {EnvironmentTag} from "@agenta/ui" import {Rocket} from "@phosphor-icons/react" -import {Button, Typography, message} from "antd" +import {Button, message} from "antd" import {getDefaultStore, useAtom, useAtomValue, useSetAtom} from "jotai" import dynamic from "next/dynamic" import {useRouter} from "next/router" @@ -64,9 +58,10 @@ import { connectAppToEvaluatorAtom, persistedAppSelectionAtom, persistedTestsetSelectionAtom, - selectedAppLabelAtom, } from "@/oss/components/Evaluators/components/ConfigureEvaluator/atoms" import EvaluatorPlaygroundHeader from "@/oss/components/Evaluators/components/ConfigureEvaluator/EvaluatorPlaygroundHeader" +import SelectAppEmptyState from "@/oss/components/Evaluators/components/ConfigureEvaluator/SelectAppEmptyState" +import {useEvaluatorRunControls} from "@/oss/components/Evaluators/components/ConfigureEvaluator/useEvaluatorRunControls" import {clearEvaluatorWorkflowCache} from "@/oss/components/Evaluators/store/evaluatorsPaginatedStore" import {invalidateAppManagementWorkflowQueries} from "@/oss/components/pages/app-management/store" import {invalidatePromptsWorkflowQueries} from "@/oss/components/pages/prompts/store" @@ -200,7 +195,6 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => { const resetAll = useSetAtom(playgroundController.actions.resetAll) const clearAllRuns = useSetAtom(clearAllRunsMutationAtom) const setInitialized = useSetAtom(playgroundInitializedAtom) - const setSelectedAppLabel = useSetAtom(selectedAppLabelAtom) const setConnectedTestset = useSetAtom(connectedTestsetAtom) const connectApp = useSetAtom(connectAppToEvaluatorAtom) const setPersistedTestset = useSetAtom(persistedTestsetSelectionAtom) @@ -211,10 +205,12 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => { const store = getDefaultStore() - // Restore persisted app selection (survives drawer close/reopen and commits) + // Restore persisted app selection (survives drawer close/reopen and commits). + // `selectedAppLabelAtom` is derived from the node graph now — the + // `connectApp` call below seeds the depth-0 node with the persisted + // label, which the derived atom picks up automatically. const persisted = store.get(persistedAppSelectionAtom) if (persisted) { - setSelectedAppLabel(persisted.appLabel) connectApp({ appRevisionId: persisted.appRevisionId, appLabel: persisted.appLabel, @@ -272,7 +268,8 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => { resetAll() setInitialized(false) - setSelectedAppLabel(null) + // `selectedAppLabelAtom` is derived from the node graph — `resetAll` + // above clears the nodes, which flips the label back to `null`. setConnectedTestset(null) } }, [ @@ -281,7 +278,6 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => { resetAll, clearAllRuns, setInitialized, - setSelectedAppLabel, setConnectedTestset, connectApp, ]) @@ -311,60 +307,28 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => { }) }, [connectedTestset, setPersistedTestset]) - const selectedAppLabel = useAtomValue(selectedAppLabelAtom) + // Shared run controls — the same hook the full page and the creation drawer + // use, so every evaluator surface gates runs identically (run-on aware) and + // can't drift apart again. (This drawer previously hardcoded + // `runDisabled={!hasAppConnected}`, which ignored the run-on mode and forced + // an app even in test-case mode.) + const {appWorkflowAdapter, handleAppSelect, selectedAppLabel, runDisabled} = + useEvaluatorRunControls() const nodes = useAtomValue(useMemo(() => playgroundController.selectors.nodes(), [])) - const evaluatorNode = useMemo(() => { - const downstream = nodes.find((n) => n.depth > 0) - if (downstream) return downstream - return nodes[0] ?? null - }, [nodes]) - - // Derive from nodes directly (single source of truth, no atom indirection) - const hasAppConnected = useMemo(() => nodes.some((n) => n.depth > 0), [nodes]) const configEntityIds = useMemo(() => { const downstream = nodes.filter((n) => n.depth > 0) if (downstream.length > 0) return downstream.map((n) => n.entityId) return nodes.map((n) => n.entityId) }, [nodes]) - const appWorkflowAdapter = useMemo( - () => - createWorkflowRevisionAdapter({ - skipVariantLevel: true, - excludeRevisionZero: true, - flags: {is_evaluator: false, is_feedback: false}, - }), - [], - ) - - const handleAppSelect = useCallback( - (selection: WorkflowRevisionSelectionResult) => { - if (!evaluatorNode) return - connectApp({ - appRevisionId: selection.id, - appLabel: selection.label, - evaluatorRevisionId: evaluatorNode.entityId, - evaluatorLabel: evaluatorNode.label ?? "Evaluator", - }) - }, - [connectApp, evaluatorNode], - ) - const runDisabledContent = useMemo( () => ( - <> - - Select an app to run the evaluator chain - - - variant="popover-cascader" - adapter={appWorkflowAdapter} - onSelect={handleAppSelect} - size="middle" - placeholder={selectedAppLabel ?? "Select app"} - /> - + ), [appWorkflowAdapter, handleAppSelect, selectedAppLabel], ) @@ -382,12 +346,7 @@ const DrawerEvaluatorPlayground = memo(({entityId}: {entityId: string}) => { return (
- {isExpanded && ( - - )} + {isExpanded && } { configViewMode={configViewMode} onConfigViewModeChange={setConfigViewMode} configEntityIdsOverride={configEntityIds} - runDisabled={!hasAppConnected} + runDisabled={runDisabled} runDisabledContent={runDisabledContent} />
@@ -492,23 +451,18 @@ const useDrawerCreateCommitCallback = () => { // (`Router.pathname` only flips on `routeChangeComplete`, // so a synchronous close after `router.push` would patch // the still-current `/evaluators` URL and push back to it.) + // // Gated by `EVALUATOR_FULL_PAGE_NAV_ENABLED`: while the - // flag is off, post-create stays in the drawer flow even - // for evaluators whose classifier supports full-page UX. - let eligibleForPlayground = false - if ( - EVALUATOR_FULL_PAGE_NAV_ENABLED && - newAppId && - newRevisionId && - newWorkflow - ) { - eligibleForPlayground = hasFullPagePlaygroundUX({ - flags: newWorkflow.flags ?? null, - data: newWorkflow.data ?? null, - meta: newWorkflow.meta ?? null, - slug: newWorkflow.slug ?? null, - }) - } + // flag is off, post-create stays in the drawer flow. When + // on, every freshly committed evaluator (regardless of + // template type) lands on `/apps//playground` — + // mirroring app-create's post-commit navigation. The + // earlier classifier-only gate was removed so declarative + // evaluators get the same surface (variants, traces, + // sidebar context) as LLM/code ones. + const eligibleForPlayground = Boolean( + EVALUATOR_FULL_PAGE_NAV_ENABLED && newAppId && newRevisionId, + ) if (eligibleForPlayground && newAppId && newRevisionId) { const url = `${baseAppURLRef.current}/${encodeURIComponent( diff --git a/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx b/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx index 8d4ae5b560..d5cf926818 100644 --- a/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx +++ b/web/oss/src/components/pages/app-management/ArchivedAppsPage.tsx @@ -1,6 +1,8 @@ +import {PageLayout} from "@agenta/ui" +import {ArrowLeft} from "@phosphor-icons/react" +import {Button} from "antd" import {useRouter} from "next/router" -import ArchivedEntityLayout from "@/oss/components/ArchivedEntityLayout" import useURL from "@/oss/hooks/useURL" import ApplicationManagementSection from "./components/ApplicationManagementSection" @@ -9,13 +11,26 @@ export default function ArchivedAppsPage() { const router = useRouter() const {baseAppURL} = useURL() + // Mirror the Archived Evaluators header: the back arrow sits inline with the + // title (no standalone "Back" button, no subtitle) so both archived pages + // share one layout via PageLayout. + const title = ( + +