feat(admin): show auto-routing reasoning effort (#4093)

iscekic · web-flow · commit a96d05a92b89 · 2026-06-18T13:31:07.000+02:00
* feat(admin): show auto-routing reasoning effort

* fix(auto-routing): resolve benchmark identity defaults in worker

* chore(auto-routing): run benchmark cron during downtime

* fix(auto-routing): resolve benchmark identity for classifier runs
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
@@ -113,10 +113,43 @@ describe('RoutingTableView', () => {
 
     expect(html.indexOf('threshold-meeting')).toBeLessThan(html.indexOf('below-threshold-cheaper'));
   });
+
+  it('renders reasoning effort next to the model name', () => {
+    const html = renderToStaticMarkup(
+      React.createElement(RoutingTableView, {
+        data: {
+          publishedAt: '2026-06-17T00:00:00.000Z',
+          table: {
+            version: 'run-1',
+            generatedAt: '2026-06-17T00:00:00.000Z',
+            minAccuracy: 0.7,
+            switchCostFactor: 3,
+            source: 'benchmark',
+            routes: {
+              'implementation/code_generation': [
+                {
+                  model: 'openai/gpt-5',
+                  accuracy: 0.8,
+                  avgCostUsd: 0.006,
+                  meetsThreshold: true,
+                  reasoningEffort: 'high',
+                },
+              ],
+            },
+          },
+        },
+      })
+    );
+
+    expect(html.indexOf('Model')).toBeLessThan(html.indexOf('Reasoning effort'));
+    expect(html.indexOf('Reasoning effort')).toBeLessThan(html.indexOf('Accuracy'));
+    expect(html.indexOf('openai/gpt-5')).toBeLessThan(html.indexOf('high'));
+    expect(html.indexOf('high')).toBeLessThan(html.indexOf('80.0%'));
+  });
 });
 
 describe('configToFormState', () => {
-  it('yields defaults including classifierMaxP95LatencyMs "1000" when config is null', () => {
+  it('yields defaults with blank benchmark identity override fields when config is null', () => {
     const state = configToFormState(null);
     expect(state.classifierRepetitions).toBe(1);
     expect(state.deciderRepetitions).toBe(1);
@@ -128,8 +161,8 @@ describe('configToFormState', () => {
     expect(state.autoDeciderModels).toEqual([]);
     expect(state.excludedAutoDeciderModels).toBe('');
     expect(state.maxConcurrency).toBe(100);
-    expect(state.benchmarkUserId).toBe('ce12ef3d-ae95-4d77-b4f0-23735f0a0591');
-    expect(state.benchmarkOrgId).toBe('9d278969-5453-4ae3-a51f-a8d2274a7b56');
+    expect(state.benchmarkUserId).toBe('');
+    expect(state.benchmarkOrgId).toBe('');
   });
 });
 
@@ -190,6 +223,16 @@ describe('formStateToConfig round-trip', () => {
     const result = formStateToConfig(stateWithEmpty, baseConfig);
     expect(result.classifierMaxP95LatencyMs).toBeNull();
   });
+
+  it('converts blank benchmark identity override fields to null in config', () => {
+    const state = configToFormState(baseConfig);
+    const result = formStateToConfig(
+      { ...state, benchmarkUserId: '  ', benchmarkOrgId: '  ' },
+      baseConfig
+    );
+    expect(result.benchmarkUserId).toBeNull();
+    expect(result.benchmarkOrgId).toBeNull();
+  });
 });
 
 describe('effectiveDeciderModels', () => {
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -6,6 +6,8 @@ import {
   BenchmarkConfigResponseSchema,
   BenchmarkRoutingTableResponseSchema,
   BenchmarkRunsResponseSchema,
+  DEFAULT_BENCHMARK_ORG_ID,
+  DEFAULT_BENCHMARK_USER_ID,
   StartBenchmarkRunResponseSchema,
   type BenchmarkConfig,
   type BenchmarkKind,
@@ -124,9 +126,6 @@ type DeciderModelRow = {
 
 type AutoDeciderModelRow = AutoBenchmarkDeciderModel;
 
-const DEFAULT_BENCHMARK_USER_ID = 'ce12ef3d-ae95-4d77-b4f0-23735f0a0591';
-const DEFAULT_BENCHMARK_ORG_ID = '9d278969-5453-4ae3-a51f-a8d2274a7b56';
-
 export function configToFormState(config: BenchmarkConfig | null): {
   classifierModels: string;
   deciderModels: DeciderModelRow[];
@@ -144,8 +143,8 @@ export function configToFormState(config: BenchmarkConfig | null): {
   autoDeciderMaxCostUsd: number;
 } {
   if (config === null) {
-    // No config saved yet: the worker fabricates nothing, so the form starts
-    // empty and the admin must enter and save a config before running.
+    // No config saved yet: identity fields are overrides, so blank means the
+    // worker uses its default benchmark user and org at run time.
     return {
       classifierModels: '',
       deciderModels: [],
@@ -154,8 +153,8 @@ export function configToFormState(config: BenchmarkConfig | null): {
       minAccuracy: 0.7,
       switchCostFactor: 3,
       maxConcurrency: 100,
-      benchmarkUserId: DEFAULT_BENCHMARK_USER_ID,
-      benchmarkOrgId: DEFAULT_BENCHMARK_ORG_ID,
+      benchmarkUserId: '',
+      benchmarkOrgId: '',
       classifierRepetitions: 1,
       deciderRepetitions: 1,
       classifierMaxP95LatencyMs: '1000',
@@ -652,37 +651,35 @@ function BenchmarkConfigEditor({
         </div>
 
         <div className="grid gap-4 md:grid-cols-2">
-          {/* Benchmark user id */}
           <div className="flex flex-col gap-1.5">
             <Label htmlFor="benchmark-user-id" className="text-sm font-medium">
-              Benchmark user id
+              Benchmark user override
             </Label>
             <Input
               id="benchmark-user-id"
               value={form.benchmarkUserId}
               onChange={e => updateForm(prev => ({ ...prev, benchmarkUserId: e.target.value }))}
               className="h-8 font-mono text-xs"
-              placeholder="(unset)"
+              placeholder={`Default: ${DEFAULT_BENCHMARK_USER_ID}`}
             />
             <p className="text-muted-foreground text-xs">
-              Kilo user the decider CLI authenticates as.
+              Leave blank to run decider benchmarks as the default benchmark user.
             </p>
           </div>
 
-          {/* Benchmark org id */}
           <div className="flex flex-col gap-1.5">
             <Label htmlFor="benchmark-org-id" className="text-sm font-medium">
-              Benchmark org id
+              Benchmark org override
             </Label>
             <Input
               id="benchmark-org-id"
               value={form.benchmarkOrgId}
               onChange={e => updateForm(prev => ({ ...prev, benchmarkOrgId: e.target.value }))}
               className="h-8 font-mono text-xs"
-              placeholder="(personal credits)"
+              placeholder={`Default: ${DEFAULT_BENCHMARK_ORG_ID}`}
             />
             <p className="text-muted-foreground text-xs">
-              Optional org context; when set, decider runs bill org credits.
+              Leave blank to bill decider benchmarks to the default benchmark org.
             </p>
           </div>
         </div>
@@ -943,6 +940,7 @@ export function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse
                 <TableHeader>
                   <TableRow>
                     <TableHead>Model</TableHead>
+                    <TableHead className="w-36">Reasoning effort</TableHead>
                     <TableHead className="text-right">Accuracy</TableHead>
                     <TableHead className="text-right">Avg cost</TableHead>
                     <TableHead className="text-right">Cost / accuracy</TableHead>
@@ -955,6 +953,9 @@ export function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse
                       <TableCell className="max-w-56 truncate font-mono text-xs">
                         {c.model}
                       </TableCell>
+                      <TableCell className="capitalize text-xs">
+                        {c.reasoningEffort ?? 'default'}
+                      </TableCell>
                       <TableCell className="text-right tabular-nums text-xs">
                         {formatAccuracy(c.accuracy)}
                       </TableCell>
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
@@ -20,6 +20,8 @@ export type BenchmarkDeciderModel = z.infer<typeof BenchmarkDeciderModelSchema>;
 
 export const AUTO_DECIDER_DEFAULT_MIN_COST_USD = 15;
 export const AUTO_DECIDER_DEFAULT_MAX_COST_USD = 25;
+export const DEFAULT_BENCHMARK_USER_ID = 'ce12ef3d-ae95-4d77-b4f0-23735f0a0591';
+export const DEFAULT_BENCHMARK_ORG_ID = '9d278969-5453-4ae3-a51f-a8d2274a7b56';
 
 export const AutoBenchmarkDeciderModelSchema = BenchmarkDeciderModelSchema.extend({
   avgAttemptCostUsd: z.number().nonnegative(),
@@ -61,12 +63,11 @@ export const BenchmarkConfigSchema = z
     // Benchmark-wide parallelism budget. Decider runs use it as a live
     // container budget; classifier runs use it for parallel OpenRouter calls.
     maxConcurrency: z.number().int().min(1).max(100),
-    // The Kilo user whose identity/billing the decider CLI runs execute under.
-    // Null until an admin configures it; decider runs fail fast while null.
+    // Optional override for the Kilo user whose identity/billing the decider
+    // CLI runs execute under. Null means the worker uses DEFAULT_BENCHMARK_USER_ID.
     benchmarkUserId: z.string().trim().min(1).nullable(),
-    // Optional organization context for the benchmark user. When present, the
-    // CLI token and container run execute in org context so usage bills org
-    // credits instead of personal credits.
+    // Optional override for the organization context. Null means the worker
+    // uses DEFAULT_BENCHMARK_ORG_ID.
     benchmarkOrgId: z.string().trim().min(1).nullable().default(null),
     // Session stickiness knob carried into published routing tables: a session
     // stays on its incumbent model while it meets the route's accuracy
@@ -122,6 +123,15 @@ export const BenchmarkConfigSchema = z
   });
 export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
 
+// Null overrides fall back to the shared default benchmark user and org.
+export function resolveBenchmarkIdentity(
+  config: Pick<BenchmarkConfig, 'benchmarkUserId' | 'benchmarkOrgId'>
+): { benchmarkUserId: string; benchmarkOrgId: string } {
+  const benchmarkUserId = config.benchmarkUserId ?? DEFAULT_BENCHMARK_USER_ID;
+  const benchmarkOrgId = config.benchmarkOrgId ?? DEFAULT_BENCHMARK_ORG_ID;
+  return { benchmarkUserId, benchmarkOrgId };
+}
+
 export const AutoBenchmarkDeciderCandidatesResponseSchema = z.object({
   candidates: z.array(
     z.object({
diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts
@@ -6,7 +6,12 @@ import {
   MirrorPayloadSchema,
   UpdateClassifierModelRequestSchema,
 } from './index';
-import { BenchmarkConfigSchema } from './benchmark';
+import {
+  BenchmarkConfigSchema,
+  DEFAULT_BENCHMARK_ORG_ID,
+  DEFAULT_BENCHMARK_USER_ID,
+  resolveBenchmarkIdentity,
+} from './benchmark';
 
 describe('auto routing contracts', () => {
   it('validates the cross-service request and response contracts', () => {
@@ -213,6 +218,27 @@ describe('BenchmarkConfigSchema defaults', () => {
   });
 });
 
+describe('resolveBenchmarkIdentity', () => {
+  it('uses worker defaults when benchmark identity overrides are null', () => {
+    const resolved = resolveBenchmarkIdentity({ benchmarkUserId: null, benchmarkOrgId: null });
+    expect(resolved).toEqual({
+      benchmarkUserId: DEFAULT_BENCHMARK_USER_ID,
+      benchmarkOrgId: DEFAULT_BENCHMARK_ORG_ID,
+    });
+  });
+
+  it('preserves configured benchmark identity overrides', () => {
+    const overrides = {
+      benchmarkUserId: 'override-user',
+      benchmarkOrgId: 'override-org',
+    };
+    expect(resolveBenchmarkIdentity(overrides)).toEqual({
+      benchmarkUserId: 'override-user',
+      benchmarkOrgId: 'override-org',
+    });
+  });
+});
+
 describe('BenchmarkConfigSchema duplicate model ids', () => {
   const base = {
     minAccuracy: 0.8,
diff --git a/services/auto-routing-benchmark/README.md b/services/auto-routing-benchmark/README.md
@@ -62,9 +62,10 @@ SECRET=$(grep '^INTERNAL_API_SECRET=' ../../.env.local | cut -d= -f2- | tr -d '"
 curl -s http://localhost:8814/admin/config -H "Authorization: Bearer $SECRET"
 ```
 
-Decider runs need a `benchmarkUserId` that exists locally with credits. When
-`benchmarkOrgId` is set, the benchmark user must belong to that org and usage
-bills org credits. The dev seed provides `auto-routing-cli-local`.
+Decider runs use the worker's default benchmark user and org unless
+`benchmarkUserId` / `benchmarkOrgId` overrides are saved in config. Any
+effective benchmark user must exist locally with credits and belong to the
+effective org. The dev seed provides `auto-routing-cli-local`.
 
 > Local KV/D1 writes from a *second* `wrangler` process are not seen by the
 > running dev process (miniflare holds its own view). After writing state out of
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
@@ -1,8 +1,10 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest';
-import type {
-  BenchmarkConfig,
-  BenchmarkModelSummary,
-  RoutingTable,
+import {
+  DEFAULT_BENCHMARK_ORG_ID,
+  DEFAULT_BENCHMARK_USER_ID,
+  type BenchmarkConfig,
+  type BenchmarkModelSummary,
+  type RoutingTable,
 } from '@kilocode/auto-routing-contracts';
 import { app } from './index';
 import { computeEngineIdentity } from './run';
@@ -423,6 +425,8 @@ describe('POST /admin/runs', () => {
     const [, runArg] = vi.mocked(insertRun).mock.calls[0];
     expect(runArg.min_accuracy).toBe(TEST_CONFIG.minAccuracy);
     expect(runArg.switch_cost_factor).toBe(TEST_CONFIG.switchCostFactor);
+    expect(runArg.benchmark_user_id).toBe(DEFAULT_BENCHMARK_USER_ID);
+    expect(runArg.benchmark_org_id).toBe(DEFAULT_BENCHMARK_ORG_ID);
     const queuedMessages = queueSendBatch.mock.calls.flatMap(([messages]) => messages);
     expect(queueSendBatch).toHaveBeenCalledTimes(2);
     expect(queuedMessages).toHaveLength(
@@ -437,6 +441,25 @@ describe('POST /admin/runs', () => {
     });
   });
 
+  it('starts a decider run with default benchmark identity when overrides are null', async () => {
+    vi.mocked(getConfigRows).mockResolvedValue({
+      ...TEST_CONFIG_ROWS,
+      config: {
+        ...TEST_CONFIG_ROWS.config,
+        benchmark_user_id: null,
+        benchmark_org_id: null,
+      },
+      deciderModels: [{ model: 'vendor/a', reasoning_effort: null }],
+    });
+
+    const res = await authedPost('/admin/runs', { kind: 'decider' });
+    expect(res.status).toBe(200);
+    expect(insertRun).toHaveBeenCalledOnce();
+    const [, runArg] = vi.mocked(insertRun).mock.calls[0];
+    expect(runArg.benchmark_user_id).toBe(DEFAULT_BENCHMARK_USER_ID);
+    expect(runArg.benchmark_org_id).toBe(DEFAULT_BENCHMARK_ORG_ID);
+  });
+
   it('carries a decider model only when its benchmark identity still matches', async () => {
     vi.mocked(getConfigRows).mockResolvedValue({
       ...TEST_CONFIG_ROWS,
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
@@ -1,6 +1,7 @@
 import * as z from 'zod';
 import {
   BenchmarkConfigSchema,
+  resolveBenchmarkIdentity,
   StartBenchmarkRunRequestSchema,
   type BenchmarkRun,
 } from '@kilocode/auto-routing-contracts';
@@ -93,19 +94,20 @@ export function registerAdminRoutes(app: Hono<HonoEnv>): void {
     zodJsonValidator(DebugCliRequestSchema, { errorMessage: 'Invalid debug request' }),
     async c => {
       const config = await getBenchmarkConfig(c.env.BENCH_DB);
-      if (!config?.benchmarkUserId) {
-        return c.json({ error: 'benchmarkUserId is not configured' }, 400);
+      if (!config) {
+        return c.json({ error: 'benchmark config is not configured' }, 400);
       }
+      const benchmarkIdentity = resolveBenchmarkIdentity(config);
       const kiloToken = await fetchBenchmarkUserToken(
         c.env,
-        config.benchmarkUserId,
-        config.benchmarkOrgId
+        benchmarkIdentity.benchmarkUserId,
+        benchmarkIdentity.benchmarkOrgId
       );
       const result = await debugRunCli(c.env, {
         ...c.req.valid('json'),
         kiloToken,
         kiloApiUrl: c.env.KILO_CLI_API_URL,
-        orgId: config.benchmarkOrgId,
+        orgId: benchmarkIdentity.benchmarkOrgId,
       });
       return c.json(result);
     }
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
diff --git a/services/auto-routing-benchmark/worker-configuration.d.ts b/services/auto-routing-benchmark/worker-configuration.d.ts
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc