diff --git a/apps/web/src/app/admin/api/cloud-agent-next/hooks.ts b/apps/web/src/app/admin/api/cloud-agent-next/hooks.ts new file mode 100644 index 0000000000..54e3322263 --- /dev/null +++ b/apps/web/src/app/admin/api/cloud-agent-next/hooks.ts @@ -0,0 +1,62 @@ +'use client'; + +import { useTRPC } from '@/lib/trpc/utils'; +import { useQuery } from '@tanstack/react-query'; + +type CloudAgentNextFilters = { + /** Inclusive ISO datetime lower bound for observed-outcome reporting. */ + startDate: string; + /** Exclusive ISO datetime upper bound for observed-outcome reporting. */ + endDate: string; +}; + +export type CloudAgentNextHealthFilters = CloudAgentNextFilters & { + bucket: 'hour' | 'day'; + createdOnPlatform?: string | null; +}; + +type CloudAgentNextHealthError = { + source: 'setup' | 'run'; + stage: string; + code: string; +}; + +function enabledForInterval(params: CloudAgentNextFilters) { + return Boolean(params.startDate && params.endDate); +} + +export function useCloudAgentNextHealthPlatforms() { + const trpc = useTRPC(); + return useQuery(trpc.admin.cloudAgentNext.listHealthPlatforms.queryOptions()); +} + +export function useCloudAgentNextHealthOverview( + params: CloudAgentNextHealthFilters, + enabled = true +) { + const trpc = useTRPC(); + return useQuery({ + ...trpc.admin.cloudAgentNext.getHealthOverview.queryOptions(params), + enabled: enabled && enabledForInterval(params), + refetchOnReconnect: false, + refetchOnWindowFocus: false, + }); +} + +export function useCloudAgentNextHealthErrorSessions( + params: CloudAgentNextHealthFilters, + error: CloudAgentNextHealthError | null +) { + const trpc = useTRPC(); + return useQuery({ + ...trpc.admin.cloudAgentNext.listHealthErrorSessions.queryOptions({ + startDate: params.startDate, + endDate: params.endDate, + source: error?.source ?? 'run', + stage: error?.stage ?? 'not-selected', + code: error?.code ?? 'not-selected', + createdOnPlatform: params.createdOnPlatform, + }), + enabled: enabledForInterval(params) && Boolean(error), + }); +} diff --git a/apps/web/src/app/admin/cloud-agent-next/page.tsx b/apps/web/src/app/admin/cloud-agent-next/page.tsx new file mode 100644 index 0000000000..196148150a --- /dev/null +++ b/apps/web/src/app/admin/cloud-agent-next/page.tsx @@ -0,0 +1,5 @@ +import CloudAgentNextOutcomesPage from '@/app/admin/components/CloudAgentNextTelemetry/CloudAgentNextOutcomesPage'; + +export default function Page() { + return ; +} diff --git a/apps/web/src/app/admin/components/AppSidebar.tsx b/apps/web/src/app/admin/components/AppSidebar.tsx index e0f0881db5..9caba3b34a 100644 --- a/apps/web/src/app/admin/components/AppSidebar.tsx +++ b/apps/web/src/app/admin/components/AppSidebar.tsx @@ -190,6 +190,11 @@ const analyticsObservabilityItems: MenuItem[] = [ url: '/admin/model-eval-ingest', icon: () => , }, + { + title: () => 'Cloud Agent health', + url: '/admin/cloud-agent-next', + icon: () => , + }, { title: () => 'Session Traces', url: '/admin/session-traces', diff --git a/apps/web/src/app/admin/components/CloudAgentNextTelemetry/CloudAgentNextOutcomesPage.tsx b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/CloudAgentNextOutcomesPage.tsx new file mode 100644 index 0000000000..84a154f8b2 --- /dev/null +++ b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/CloudAgentNextOutcomesPage.tsx @@ -0,0 +1,666 @@ +'use client'; + +import { useEffect, useState } from 'react'; +import { AlertCircle, ChevronRight, Loader2, RefreshCw } from 'lucide-react'; +import { + Bar, + BarChart, + CartesianGrid, + Legend, + ResponsiveContainer, + Tooltip, + XAxis, + YAxis, +} from 'recharts'; +import AdminPage from '@/app/admin/components/AdminPage'; +import { + useCloudAgentNextHealthErrorSessions, + useCloudAgentNextHealthOverview, + useCloudAgentNextHealthPlatforms, + type CloudAgentNextHealthFilters, +} from '@/app/admin/api/cloud-agent-next/hooks'; +import { CopyButton } from '@/components/admin/CopyButton'; +import { Alert, AlertDescription, AlertTitle } from '@/components/ui/alert'; +import { Badge } from '@/components/ui/badge'; +import { BreadcrumbItem, BreadcrumbPage } from '@/components/ui/breadcrumb'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { Label } from '@/components/ui/label'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { Skeleton } from '@/components/ui/skeleton'; +import { cn } from '@/lib/utils'; +import { rollingHealthInterval } from './health-interval'; +import { getOperationalFailureStats } from './health-summary'; +import { + DEFAULT_HEALTH_PERIOD, + getStoredHealthPeriod, + isHealthPeriod, + setStoredHealthPeriod, + type HealthPeriod, +} from './health-period-preference'; +import { + Table, + TableBody, + TableCaption, + TableCell, + TableHead, + TableHeader, + TableRow, +} from '@/components/ui/table'; + +type RangeValue = HealthPeriod; +type HealthBucket = CloudAgentNextHealthFilters['bucket']; +type RangeOption = { + value: RangeValue; + label: string; + durationMs: number; + bucket: HealthBucket; +}; + +const RANGE_OPTIONS = [ + { value: '1h', label: 'Last hour', durationMs: 60 * 60 * 1000, bucket: 'hour' }, + { value: '3h', label: 'Last 3 hours', durationMs: 3 * 60 * 60 * 1000, bucket: 'hour' }, + { + value: '24h', + label: 'Last 24 hours', + durationMs: 24 * 60 * 60 * 1000, + bucket: 'hour', + }, + { value: '7d', label: 'Last 7 days', durationMs: 7 * 24 * 60 * 60 * 1000, bucket: 'hour' }, + { value: '14d', label: 'Last 14 days', durationMs: 14 * 24 * 60 * 60 * 1000, bucket: 'day' }, + { value: '30d', label: 'Last 30 days', durationMs: 30 * 24 * 60 * 60 * 1000, bucket: 'day' }, +] satisfies ReadonlyArray; + +type HealthData = NonNullable['data']>; +type SeriesPoint = HealthData['series'][number]; +type TopError = HealthData['topErrors'][number]; +type TooltipPayload = { payload: SeriesPoint }; + +const DEFAULT_RANGE: RangeValue = DEFAULT_HEALTH_PERIOD; +const ALL_PLATFORMS_VALUE = 'all-platforms'; +const UNKNOWN_PLATFORM_VALUE = 'unknown-platform'; +const EXACT_PLATFORM_PREFIX = 'platform:'; + +function platformSelectionValue(platform: string): string { + return `${EXACT_PLATFORM_PREFIX}${platform}`; +} + +function createdOnPlatformForSelection(selection: string): string | null | undefined { + if (selection === ALL_PLATFORMS_VALUE) return undefined; + if (selection === UNKNOWN_PLATFORM_VALUE) return null; + if (selection.startsWith(EXACT_PLATFORM_PREFIX)) { + return selection.slice(EXACT_PLATFORM_PREFIX.length); + } + return undefined; +} + +const utcLongLabel = new Intl.DateTimeFormat('en-US', { + timeZone: 'UTC', + month: 'short', + day: 'numeric', + hour: '2-digit', + minute: '2-digit', + hourCycle: 'h23', +}); +const utcShortTime = new Intl.DateTimeFormat('en-US', { + timeZone: 'UTC', + hour: '2-digit', + hourCycle: 'h23', +}); +const utcShortDay = new Intl.DateTimeFormat('en-US', { + timeZone: 'UTC', + month: 'short', + day: 'numeric', +}); + +function intervalForRange( + range: RangeValue, + platformSelection = ALL_PLATFORMS_VALUE +): CloudAgentNextHealthFilters { + const selectedRange = RANGE_OPTIONS.find(option => option.value === range) ?? RANGE_OPTIONS[3]; + const createdOnPlatform = createdOnPlatformForSelection(platformSelection); + return { + ...rollingHealthInterval(selectedRange), + ...(createdOnPlatform === undefined ? {} : { createdOnPlatform }), + }; +} + +function formatBucketLabel(bucketStart: string, range: RangeValue): string { + if (range === '1h' || range === '3h' || range === '24h') { + return utcShortTime.format(new Date(bucketStart)); + } + return utcShortDay.format(new Date(bucketStart)); +} + +function bucketLabel(bucket: HealthBucket): string { + return bucket === 'day' ? 'Daily' : 'Hourly'; +} + +function formatBucketStart(bucketStart: string, bucket: HealthBucket): string { + const date = new Date(bucketStart); + return bucket === 'day' ? `${utcShortDay.format(date)} UTC` : `${utcLongLabel.format(date)} UTC`; +} + +type MetricTone = 'success' | 'danger' | 'warning'; + +const metricToneStyles: Record = { + success: { + panel: 'border-green-500/20 bg-green-500/5', + value: 'text-green-400', + }, + danger: { + panel: 'border-red-500/20 bg-red-500/5', + value: 'text-red-400', + }, + warning: { + panel: 'border-yellow-500/20 bg-yellow-500/5', + value: 'text-yellow-400', + }, +}; + +function Metric({ + label, + value, + detail, + tone, +}: { + label: string; + value: string; + detail?: string; + tone: MetricTone; +}) { + const styles = metricToneStyles[tone]; + return ( +
+
{label}
+
{value}
+ {detail &&
{detail}
} +
+ ); +} + +function DashboardSkeleton() { + return ( +
+ + + +
+ ); +} + +function HealthSummary({ summary }: { summary: HealthData['summary'] }) { + const operationalFailures = getOperationalFailureStats(summary); + const failureRate = operationalFailures.failureRatePercent; + return ( + + + Observed health + + Events observed in the selected rolling period. Interruptions are excluded from failure + rate. + + + + 0 ? 'danger' : 'success'} + /> + + + + + + + ); +} + +function HealthTooltip({ + active, + payload, + bucket, +}: { + active?: boolean; + payload?: TooltipPayload[]; + bucket: HealthBucket; +}) { + const point = active ? payload?.[0]?.payload : undefined; + if (!point) return null; + return ( +
+

{formatBucketStart(point.bucketStart, bucket)}

+
+

+ Completed runs + {point.completedRuns.toLocaleString()} +

+

+ Failed runs + {point.failedRuns.toLocaleString()} +

+

+ Setup failures + {point.setupFailures.toLocaleString()} +

+

+ Interrupted runs + {point.interruptedRuns.toLocaleString()} +

+
+
+ ); +} + +function OutcomeTrendChart({ + data, + range, + bucket, +}: { + data: SeriesPoint[]; + range: RangeValue; + bucket: HealthBucket; +}) { + const label = bucketLabel(bucket); + return ( + + + {label} outcomes + + Completed, failed, setup-failed, and interrupted events in UTC- + {bucket === 'day' ? 'day' : 'hour'} buckets. Edge buckets may be partial. + + + +
+ + + + formatBucketLabel(String(bucketStart), range)} + minTickGap={32} + tick={{ fontSize: 11 }} + /> + + } /> + + + + + + + +
+
+
+ ); +} + +function errorSourceBadge(source: TopError['source']) { + return {source === 'setup' ? 'Setup' : 'Run'}; +} + +function ErrorSessionsDialog({ + error, + interval, + onClose, +}: { + error: TopError; + interval: CloudAgentNextHealthFilters; + onClose: () => void; +}) { + const sessions = useCloudAgentNextHealthErrorSessions(interval, error); + const rows = sessions.data?.rows ?? []; + return ( + !open && onClose()}> + + + Affected sessions + + + {error.source} / {error.stage} / {error.code} + {' '} + - {error.count.toLocaleString()} matching error events in the selected period. + + + {sessions.isLoading ? ( +
+ Loading affected sessions... +
+ ) : sessions.error ? ( + + + Could not load affected sessions + {sessions.error.message} + + ) : rows.length === 0 ? ( +

+ No retained sessions found for this error. +

+ ) : ( +
+
+

+ Showing {rows.length.toLocaleString()} of{' '} + {sessions.data?.totalSessions.toLocaleString()} affected sessions + {sessions.data && sessions.data.totalSessions > sessions.data.limit + ? ' (newest first)' + : ''} + . +

+ row.kiloSessionId).join('\n')} + label="visible Kilo session IDs" + showText + /> +
+
+ + + Sessions affected by the selected Cloud Agent error. + + + + Kilo session ID + Cloud Agent ID + Latest occurrence (UTC) + Events + + + + {rows.map(row => ( + + + + {row.kiloSessionId} + + + + + + {row.cloudAgentSessionId} + + + + + {row.occurredAt + ? `${utcLongLabel.format(new Date(row.occurredAt))} UTC` + : '--'} + + + {row.matchingEvents.toLocaleString()} + + + ))} + +
+
+
+ )} +
+
+ ); +} + +function TopErrors({ + errors, + interval, +}: { + errors: TopError[]; + interval: CloudAgentNextHealthFilters; +}) { + const [selectedError, setSelectedError] = useState(null); + const total = errors.reduce((count, error) => count + error.count, 0); + return ( + + + Top errors + + Setup failures and failed runs only. {total.toLocaleString()} events in the top 10. Select + an error to inspect sessions. + + + + {errors.length === 0 ? ( +

+ No operational errors observed in this period. +

+ ) : ( +
+ + + Top operational Cloud Agent errors in the selected period. Select an error to + inspect affected sessions. + + + + Source + Error + Events + + + + {errors.map(error => ( + + {errorSourceBadge(error.source)} + + + + + {error.count.toLocaleString()} + + + ))} + +
+
+ )} + {selectedError && ( + setSelectedError(null)} + /> + )} +
+
+ ); +} + +export default function CloudAgentNextOutcomesPage() { + const [range, setRange] = useState(DEFAULT_RANGE); + const [platformSelection, setPlatformSelection] = useState(ALL_PLATFORMS_VALUE); + const [interval, setInterval] = useState(() => intervalForRange(DEFAULT_RANGE)); + const [hasLoadedPeriodPreference, setHasLoadedPeriodPreference] = useState(false); + const healthPlatforms = useCloudAgentNextHealthPlatforms(); + const health = useCloudAgentNextHealthOverview(interval, hasLoadedPeriodPreference); + const bucket = interval.bucket; + + useEffect(() => { + const storedRange = getStoredHealthPeriod(); + if (storedRange !== DEFAULT_RANGE) { + setRange(storedRange); + setInterval(intervalForRange(storedRange)); + } + setHasLoadedPeriodPreference(true); + }, []); + + function updateRange(value: string) { + if (!isHealthPeriod(value)) return; + setStoredHealthPeriod(value); + setRange(value); + setInterval(intervalForRange(value, platformSelection)); + } + + function updatePlatformSelection(value: string) { + setPlatformSelection(value); + setInterval(intervalForRange(range, value)); + } + + function refresh() { + const nextInterval = intervalForRange(range, platformSelection); + if ( + nextInterval.startDate === interval.startDate && + nextInterval.endDate === interval.endDate && + nextInterval.bucket === interval.bucket && + nextInterval.createdOnPlatform === interval.createdOnPlatform + ) { + void health.refetch(); + return; + } + setInterval(nextInterval); + } + + return ( + + Cloud Agent health + + } + buttons={ + + } + > +
+
+
+

Cloud Agent health

+

+ Operational outcome trends from best-effort Cloud Agent reporting. +

+
+
+
+ + +
+
+ + +
+
+
+

+ Reporting is best-effort, so totals can undercount execution. Periods end at refresh time; + edge UTC {bucket === 'day' ? 'days' : 'hours'} may be partial. +

+ {health.error && ( + + + Could not load Cloud Agent health + {health.error.message} + + )} + {health.isFetching && !health.isLoading && ( +
+ Refreshing health data... +
+ )} + {!hasLoadedPeriodPreference || health.isLoading ? ( + + ) : health.data ? ( + <> + + + + + ) : null} +
+
+ ); +} diff --git a/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-interval.test.ts b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-interval.test.ts new file mode 100644 index 0000000000..247b566e85 --- /dev/null +++ b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-interval.test.ts @@ -0,0 +1,16 @@ +import { rollingHealthInterval } from './health-interval'; + +describe('rollingHealthInterval', () => { + it('ends an hourly range at the exact refresh time', () => { + expect( + rollingHealthInterval( + { durationMs: 3 * 60 * 60 * 1000, bucket: 'hour' }, + new Date('2035-01-10T12:34:56.789Z') + ) + ).toEqual({ + startDate: '2035-01-10T09:34:56.789Z', + endDate: '2035-01-10T12:34:56.789Z', + bucket: 'hour', + }); + }); +}); diff --git a/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-interval.ts b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-interval.ts new file mode 100644 index 0000000000..e51f813e5d --- /dev/null +++ b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-interval.ts @@ -0,0 +1,20 @@ +export type HealthBucket = 'hour' | 'day'; + +type HealthRange = { + durationMs: number; + bucket: HealthBucket; +}; + +type HealthInterval = { + startDate: string; + endDate: string; + bucket: HealthBucket; +}; + +export function rollingHealthInterval(range: HealthRange, now = new Date()): HealthInterval { + return { + startDate: new Date(now.getTime() - range.durationMs).toISOString(), + endDate: now.toISOString(), + bucket: range.bucket, + }; +} diff --git a/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-period-preference.test.ts b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-period-preference.test.ts new file mode 100644 index 0000000000..551e73f72a --- /dev/null +++ b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-period-preference.test.ts @@ -0,0 +1,17 @@ +import { describe, expect, it } from '@jest/globals'; +import { DEFAULT_HEALTH_PERIOD, parseHealthPeriod } from './health-period-preference'; + +describe('Cloud Agent health period preference', () => { + it('restores every supported period preset', () => { + for (const period of ['1h', '3h', '24h', '7d', '14d', '30d']) { + expect(parseHealthPeriod(period)).toBe(period); + } + }); + + it('defaults to the seven day period for absent or stale values', () => { + expect(DEFAULT_HEALTH_PERIOD).toBe('7d'); + expect(parseHealthPeriod(null)).toBe(DEFAULT_HEALTH_PERIOD); + expect(parseHealthPeriod('90d')).toBe(DEFAULT_HEALTH_PERIOD); + expect(parseHealthPeriod('')).toBe(DEFAULT_HEALTH_PERIOD); + }); +}); diff --git a/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-period-preference.ts b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-period-preference.ts new file mode 100644 index 0000000000..8d4d4deedc --- /dev/null +++ b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-period-preference.ts @@ -0,0 +1,24 @@ +import { safeLocalStorage } from '@/lib/localStorage'; + +const HEALTH_PERIOD_STORAGE_KEY = 'cloud-agent-next:admin-health-period'; +const HEALTH_PERIODS = ['1h', '3h', '24h', '7d', '14d', '30d'] as const; + +export type HealthPeriod = (typeof HEALTH_PERIODS)[number]; + +export const DEFAULT_HEALTH_PERIOD: HealthPeriod = '7d'; + +export function isHealthPeriod(value: string): value is HealthPeriod { + return HEALTH_PERIODS.some(period => period === value); +} + +export function parseHealthPeriod(value: string | null): HealthPeriod { + return value && isHealthPeriod(value) ? value : DEFAULT_HEALTH_PERIOD; +} + +export function getStoredHealthPeriod(): HealthPeriod { + return parseHealthPeriod(safeLocalStorage.getItem(HEALTH_PERIOD_STORAGE_KEY)); +} + +export function setStoredHealthPeriod(period: HealthPeriod): void { + safeLocalStorage.setItem(HEALTH_PERIOD_STORAGE_KEY, period); +} diff --git a/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-summary.test.ts b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-summary.test.ts new file mode 100644 index 0000000000..9a52c8a093 --- /dev/null +++ b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-summary.test.ts @@ -0,0 +1,26 @@ +import { describe, expect, it } from '@jest/globals'; +import { getOperationalFailureStats } from './health-summary'; + +describe('getOperationalFailureStats', () => { + it('calculates operational failure percentage without counting interruptions', () => { + expect( + getOperationalFailureStats({ + completedRuns: 90, + failedRuns: 7, + setupFailures: 3, + interruptedRuns: 25, + }) + ).toEqual({ failureEvents: 10, assessedOutcomes: 100, failureRatePercent: 10 }); + }); + + it('does not report a percentage when no operational outcomes were assessed', () => { + expect( + getOperationalFailureStats({ + completedRuns: 0, + failedRuns: 0, + setupFailures: 0, + interruptedRuns: 4, + }) + ).toEqual({ failureEvents: 0, assessedOutcomes: 0, failureRatePercent: null }); + }); +}); diff --git a/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-summary.ts b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-summary.ts new file mode 100644 index 0000000000..6408bd455e --- /dev/null +++ b/apps/web/src/app/admin/components/CloudAgentNextTelemetry/health-summary.ts @@ -0,0 +1,16 @@ +type ObservedHealthSummary = { + completedRuns: number; + failedRuns: number; + setupFailures: number; + interruptedRuns: number; +}; + +export function getOperationalFailureStats(summary: ObservedHealthSummary) { + const failureEvents = summary.failedRuns + summary.setupFailures; + const assessedOutcomes = summary.completedRuns + failureEvents; + return { + failureEvents, + assessedOutcomes, + failureRatePercent: assessedOutcomes === 0 ? null : (failureEvents / assessedOutcomes) * 100, + }; +} diff --git a/apps/web/src/components/admin/CopyButton.tsx b/apps/web/src/components/admin/CopyButton.tsx index 7d946a7a0d..5a19e2a34b 100644 --- a/apps/web/src/components/admin/CopyButton.tsx +++ b/apps/web/src/components/admin/CopyButton.tsx @@ -2,6 +2,7 @@ import { useState, useEffect } from 'react'; import { Copy, Check } from 'lucide-react'; +import { cn } from '@/lib/utils'; type CopyButtonProps = { text: string; @@ -52,16 +53,17 @@ export function CopyButton({ text, className = '', showText = false, label }: Co return (