Skip to content

Commit a96d05a

Browse files
authored
feat(admin): show auto-routing reasoning effort (#4093)
* feat(admin): show auto-routing reasoning effort
* fix(auto-routing): resolve benchmark identity defaults in worker
* chore(auto-routing): run benchmark cron during downtime
* fix(auto-routing): resolve benchmark identity for classifier runs
1 parent 0a02b3a commit a96d05a

10 files changed

Lines changed: 155 additions & 54 deletions

File tree

apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,43 @@ describe('RoutingTableView', () => {
113113

114114
expect(html.indexOf('threshold-meeting')).toBeLessThan(html.indexOf('below-threshold-cheaper'));
115115
});
116+
117+
it('renders reasoning effort next to the model name', () => {
118+
const html = renderToStaticMarkup(
119+
React.createElement(RoutingTableView, {
120+
data: {
121+
publishedAt: '2026-06-17T00:00:00.000Z',
122+
table: {
123+
version: 'run-1',
124+
generatedAt: '2026-06-17T00:00:00.000Z',
125+
minAccuracy: 0.7,
126+
switchCostFactor: 3,
127+
source: 'benchmark',
128+
routes: {
129+
'implementation/code_generation': [
130+
{
131+
model: 'openai/gpt-5',
132+
accuracy: 0.8,
133+
avgCostUsd: 0.006,
134+
meetsThreshold: true,
135+
reasoningEffort: 'high',
136+
},
137+
],
138+
},
139+
},
140+
},
141+
})
142+
);
143+
144+
expect(html.indexOf('Model')).toBeLessThan(html.indexOf('Reasoning effort'));
145+
expect(html.indexOf('Reasoning effort')).toBeLessThan(html.indexOf('Accuracy'));
146+
expect(html.indexOf('openai/gpt-5')).toBeLessThan(html.indexOf('high'));
147+
expect(html.indexOf('high')).toBeLessThan(html.indexOf('80.0%'));
148+
});
116149
});
117150

118151
describe('configToFormState', () => {
119-
it('yields defaults including classifierMaxP95LatencyMs "1000" when config is null', () => {
152+
it('yields defaults with blank benchmark identity override fields when config is null', () => {
120153
const state = configToFormState(null);
121154
expect(state.classifierRepetitions).toBe(1);
122155
expect(state.deciderRepetitions).toBe(1);
@@ -128,8 +161,8 @@ describe('configToFormState', () => {
128161
expect(state.autoDeciderModels).toEqual([]);
129162
expect(state.excludedAutoDeciderModels).toBe('');
130163
expect(state.maxConcurrency).toBe(100);
131-
expect(state.benchmarkUserId).toBe('ce12ef3d-ae95-4d77-b4f0-23735f0a0591');
132-
expect(state.benchmarkOrgId).toBe('9d278969-5453-4ae3-a51f-a8d2274a7b56');
164+
expect(state.benchmarkUserId).toBe('');
165+
expect(state.benchmarkOrgId).toBe('');
133166
});
134167
});
135168

@@ -190,6 +223,16 @@ describe('formStateToConfig round-trip', () => {
190223
const result = formStateToConfig(stateWithEmpty, baseConfig);
191224
expect(result.classifierMaxP95LatencyMs).toBeNull();
192225
});
226+
227+
it('converts blank benchmark identity override fields to null in config', () => {
228+
const state = configToFormState(baseConfig);
229+
const result = formStateToConfig(
230+
{ ...state, benchmarkUserId: ' ', benchmarkOrgId: ' ' },
231+
baseConfig
232+
);
233+
expect(result.benchmarkUserId).toBeNull();
234+
expect(result.benchmarkOrgId).toBeNull();
235+
});
193236
});
194237

195238
describe('effectiveDeciderModels', () => {

apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import {
66
BenchmarkConfigResponseSchema,
77
BenchmarkRoutingTableResponseSchema,
88
BenchmarkRunsResponseSchema,
9+
DEFAULT_BENCHMARK_ORG_ID,
10+
DEFAULT_BENCHMARK_USER_ID,
911
StartBenchmarkRunResponseSchema,
1012
type BenchmarkConfig,
1113
type BenchmarkKind,
@@ -124,9 +126,6 @@ type DeciderModelRow = {
124126

125127
type AutoDeciderModelRow = AutoBenchmarkDeciderModel;
126128

127-
const DEFAULT_BENCHMARK_USER_ID = 'ce12ef3d-ae95-4d77-b4f0-23735f0a0591';
128-
const DEFAULT_BENCHMARK_ORG_ID = '9d278969-5453-4ae3-a51f-a8d2274a7b56';
129-
130129
export function configToFormState(config: BenchmarkConfig | null): {
131130
classifierModels: string;
132131
deciderModels: DeciderModelRow[];
@@ -144,8 +143,8 @@ export function configToFormState(config: BenchmarkConfig | null): {
144143
autoDeciderMaxCostUsd: number;
145144
} {
146145
if (config === null) {
147-
// No config saved yet: the worker fabricates nothing, so the form starts
148-
// empty and the admin must enter and save a config before running.
146+
// No config saved yet: identity fields are overrides, so blank means the
147+
// worker uses its default benchmark user and org at run time.
149148
return {
150149
classifierModels: '',
151150
deciderModels: [],
@@ -154,8 +153,8 @@ export function configToFormState(config: BenchmarkConfig | null): {
154153
minAccuracy: 0.7,
155154
switchCostFactor: 3,
156155
maxConcurrency: 100,
157-
benchmarkUserId: DEFAULT_BENCHMARK_USER_ID,
158-
benchmarkOrgId: DEFAULT_BENCHMARK_ORG_ID,
156+
benchmarkUserId: '',
157+
benchmarkOrgId: '',
159158
classifierRepetitions: 1,
160159
deciderRepetitions: 1,
161160
classifierMaxP95LatencyMs: '1000',
@@ -652,37 +651,35 @@ function BenchmarkConfigEditor({
652651
</div>
653652

654653
<div className="grid gap-4 md:grid-cols-2">
655-
{/* Benchmark user id */}
656654
<div className="flex flex-col gap-1.5">
657655
<Label htmlFor="benchmark-user-id" className="text-sm font-medium">
658-
Benchmark user id
656+
Benchmark user override
659657
</Label>
660658
<Input
661659
id="benchmark-user-id"
662660
value={form.benchmarkUserId}
663661
onChange={e => updateForm(prev => ({ ...prev, benchmarkUserId: e.target.value }))}
664662
className="h-8 font-mono text-xs"
665-
placeholder="(unset)"
663+
placeholder={`Default: ${DEFAULT_BENCHMARK_USER_ID}`}
666664
/>
667665
<p className="text-muted-foreground text-xs">
668-
Kilo user the decider CLI authenticates as.
666+
Leave blank to run decider benchmarks as the default benchmark user.
669667
</p>
670668
</div>
671669

672-
{/* Benchmark org id */}
673670
<div className="flex flex-col gap-1.5">
674671
<Label htmlFor="benchmark-org-id" className="text-sm font-medium">
675-
Benchmark org id
672+
Benchmark org override
676673
</Label>
677674
<Input
678675
id="benchmark-org-id"
679676
value={form.benchmarkOrgId}
680677
onChange={e => updateForm(prev => ({ ...prev, benchmarkOrgId: e.target.value }))}
681678
className="h-8 font-mono text-xs"
682-
placeholder="(personal credits)"
679+
placeholder={`Default: ${DEFAULT_BENCHMARK_ORG_ID}`}
683680
/>
684681
<p className="text-muted-foreground text-xs">
685-
Optional org context; when set, decider runs bill org credits.
682+
Leave blank to bill decider benchmarks to the default benchmark org.
686683
</p>
687684
</div>
688685
</div>
@@ -943,6 +940,7 @@ export function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse
943940
<TableHeader>
944941
<TableRow>
945942
<TableHead>Model</TableHead>
943+
<TableHead className="w-36">Reasoning effort</TableHead>
946944
<TableHead className="text-right">Accuracy</TableHead>
947945
<TableHead className="text-right">Avg cost</TableHead>
948946
<TableHead className="text-right">Cost / accuracy</TableHead>
@@ -955,6 +953,9 @@ export function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse
955953
<TableCell className="max-w-56 truncate font-mono text-xs">
956954
{c.model}
957955
</TableCell>
956+
<TableCell className="capitalize text-xs">
957+
{c.reasoningEffort ?? 'default'}
958+
</TableCell>
958959
<TableCell className="text-right tabular-nums text-xs">
959960
{formatAccuracy(c.accuracy)}
960961
</TableCell>

packages/auto-routing-contracts/src/benchmark.ts

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ export type BenchmarkDeciderModel = z.infer<typeof BenchmarkDeciderModelSchema>;
2020

2121
export const AUTO_DECIDER_DEFAULT_MIN_COST_USD = 15;
2222
export const AUTO_DECIDER_DEFAULT_MAX_COST_USD = 25;
23+
export const DEFAULT_BENCHMARK_USER_ID = 'ce12ef3d-ae95-4d77-b4f0-23735f0a0591';
24+
export const DEFAULT_BENCHMARK_ORG_ID = '9d278969-5453-4ae3-a51f-a8d2274a7b56';
2325

2426
export const AutoBenchmarkDeciderModelSchema = BenchmarkDeciderModelSchema.extend({
2527
avgAttemptCostUsd: z.number().nonnegative(),
@@ -61,12 +63,11 @@ export const BenchmarkConfigSchema = z
6163
// Benchmark-wide parallelism budget. Decider runs use it as a live
6264
// container budget; classifier runs use it for parallel OpenRouter calls.
6365
maxConcurrency: z.number().int().min(1).max(100),
64-
// The Kilo user whose identity/billing the decider CLI runs execute under.
65-
// Null until an admin configures it; decider runs fail fast while null.
66+
// Optional override for the Kilo user whose identity/billing the decider
67+
// CLI runs execute under. Null means the worker uses DEFAULT_BENCHMARK_USER_ID.
6668
benchmarkUserId: z.string().trim().min(1).nullable(),
67-
// Optional organization context for the benchmark user. When present, the
68-
// CLI token and container run execute in org context so usage bills org
69-
// credits instead of personal credits.
69+
// Optional override for the organization context. Null means the worker
70+
// uses DEFAULT_BENCHMARK_ORG_ID.
7071
benchmarkOrgId: z.string().trim().min(1).nullable().default(null),
7172
// Session stickiness knob carried into published routing tables: a session
7273
// stays on its incumbent model while it meets the route's accuracy
@@ -122,6 +123,15 @@ export const BenchmarkConfigSchema = z
122123
});
123124
export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
124125

126+
export function resolveBenchmarkIdentity(
127+
config: Pick<BenchmarkConfig, 'benchmarkUserId' | 'benchmarkOrgId'>
128+
): { benchmarkUserId: string; benchmarkOrgId: string } {
129+
return {
130+
benchmarkUserId: config.benchmarkUserId ?? DEFAULT_BENCHMARK_USER_ID,
131+
benchmarkOrgId: config.benchmarkOrgId ?? DEFAULT_BENCHMARK_ORG_ID,
132+
};
133+
}
134+
125135
export const AutoBenchmarkDeciderCandidatesResponseSchema = z.object({
126136
candidates: z.array(
127137
z.object({

packages/auto-routing-contracts/src/contracts.test.ts

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@ import {
66
MirrorPayloadSchema,
77
UpdateClassifierModelRequestSchema,
88
} from './index';
9-
import { BenchmarkConfigSchema } from './benchmark';
9+
import {
10+
BenchmarkConfigSchema,
11+
DEFAULT_BENCHMARK_ORG_ID,
12+
DEFAULT_BENCHMARK_USER_ID,
13+
resolveBenchmarkIdentity,
14+
} from './benchmark';
1015

1116
describe('auto routing contracts', () => {
1217
it('validates the cross-service request and response contracts', () => {
@@ -213,6 +218,27 @@ describe('BenchmarkConfigSchema defaults', () => {
213218
});
214219
});
215220

221+
describe('resolveBenchmarkIdentity', () => {
222+
it('uses worker defaults when benchmark identity overrides are null', () => {
223+
expect(resolveBenchmarkIdentity({ benchmarkUserId: null, benchmarkOrgId: null })).toEqual({
224+
benchmarkUserId: DEFAULT_BENCHMARK_USER_ID,
225+
benchmarkOrgId: DEFAULT_BENCHMARK_ORG_ID,
226+
});
227+
});
228+
229+
it('preserves configured benchmark identity overrides', () => {
230+
expect(
231+
resolveBenchmarkIdentity({
232+
benchmarkUserId: 'override-user',
233+
benchmarkOrgId: 'override-org',
234+
})
235+
).toEqual({
236+
benchmarkUserId: 'override-user',
237+
benchmarkOrgId: 'override-org',
238+
});
239+
});
240+
});
241+
216242
describe('BenchmarkConfigSchema duplicate model ids', () => {
217243
const base = {
218244
minAccuracy: 0.8,

services/auto-routing-benchmark/README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,10 @@ SECRET=$(grep '^INTERNAL_API_SECRET=' ../../.env.local | cut -d= -f2- | tr -d '"
6262
curl -s http://localhost:8814/admin/config -H "Authorization: Bearer $SECRET"
6363
```
6464

65-
Decider runs need a `benchmarkUserId` that exists locally with credits. When
66-
`benchmarkOrgId` is set, the benchmark user must belong to that org and usage
67-
bills org credits. The dev seed provides `auto-routing-cli-local`.
65+
Decider runs use the worker's default benchmark user and org unless
66+
`benchmarkUserId` / `benchmarkOrgId` overrides are saved in config. Any
67+
effective benchmark user must exist locally with credits and belong to the
68+
effective org. The dev seed provides `auto-routing-cli-local`.
6869

6970
> Local KV/D1 writes from a *second* `wrangler` process are not seen by the
7071
> running dev process (miniflare holds its own view). After writing state out of

services/auto-routing-benchmark/src/admin.test.ts

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import { beforeEach, describe, expect, it, vi } from 'vitest';
2-
import type {
3-
BenchmarkConfig,
4-
BenchmarkModelSummary,
5-
RoutingTable,
2+
import {
3+
DEFAULT_BENCHMARK_ORG_ID,
4+
DEFAULT_BENCHMARK_USER_ID,
5+
type BenchmarkConfig,
6+
type BenchmarkModelSummary,
7+
type RoutingTable,
68
} from '@kilocode/auto-routing-contracts';
79
import { app } from './index';
810
import { computeEngineIdentity } from './run';
@@ -423,6 +425,8 @@ describe('POST /admin/runs', () => {
423425
const [, runArg] = vi.mocked(insertRun).mock.calls[0];
424426
expect(runArg.min_accuracy).toBe(TEST_CONFIG.minAccuracy);
425427
expect(runArg.switch_cost_factor).toBe(TEST_CONFIG.switchCostFactor);
428+
expect(runArg.benchmark_user_id).toBe(DEFAULT_BENCHMARK_USER_ID);
429+
expect(runArg.benchmark_org_id).toBe(DEFAULT_BENCHMARK_ORG_ID);
426430
const queuedMessages = queueSendBatch.mock.calls.flatMap(([messages]) => messages);
427431
expect(queueSendBatch).toHaveBeenCalledTimes(2);
428432
expect(queuedMessages).toHaveLength(
@@ -437,6 +441,25 @@ describe('POST /admin/runs', () => {
437441
});
438442
});
439443

444+
it('starts a decider run with default benchmark identity when overrides are null', async () => {
445+
vi.mocked(getConfigRows).mockResolvedValue({
446+
...TEST_CONFIG_ROWS,
447+
config: {
448+
...TEST_CONFIG_ROWS.config,
449+
benchmark_user_id: null,
450+
benchmark_org_id: null,
451+
},
452+
deciderModels: [{ model: 'vendor/a', reasoning_effort: null }],
453+
});
454+
455+
const res = await authedPost('/admin/runs', { kind: 'decider' });
456+
expect(res.status).toBe(200);
457+
expect(insertRun).toHaveBeenCalledOnce();
458+
const [, runArg] = vi.mocked(insertRun).mock.calls[0];
459+
expect(runArg.benchmark_user_id).toBe(DEFAULT_BENCHMARK_USER_ID);
460+
expect(runArg.benchmark_org_id).toBe(DEFAULT_BENCHMARK_ORG_ID);
461+
});
462+
440463
it('carries a decider model only when its benchmark identity still matches', async () => {
441464
vi.mocked(getConfigRows).mockResolvedValue({
442465
...TEST_CONFIG_ROWS,

services/auto-routing-benchmark/src/admin.ts

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import * as z from 'zod';
22
import {
33
BenchmarkConfigSchema,
4+
resolveBenchmarkIdentity,
45
StartBenchmarkRunRequestSchema,
56
type BenchmarkRun,
67
} from '@kilocode/auto-routing-contracts';
@@ -93,19 +94,20 @@ export function registerAdminRoutes(app: Hono<HonoEnv>): void {
9394
zodJsonValidator(DebugCliRequestSchema, { errorMessage: 'Invalid debug request' }),
9495
async c => {
9596
const config = await getBenchmarkConfig(c.env.BENCH_DB);
96-
if (!config?.benchmarkUserId) {
97-
return c.json({ error: 'benchmarkUserId is not configured' }, 400);
97+
if (!config) {
98+
return c.json({ error: 'benchmark config is not configured' }, 400);
9899
}
100+
const benchmarkIdentity = resolveBenchmarkIdentity(config);
99101
const kiloToken = await fetchBenchmarkUserToken(
100102
c.env,
101-
config.benchmarkUserId,
102-
config.benchmarkOrgId
103+
benchmarkIdentity.benchmarkUserId,
104+
benchmarkIdentity.benchmarkOrgId
103105
);
104106
const result = await debugRunCli(c.env, {
105107
...c.req.valid('json'),
106108
kiloToken,
107109
kiloApiUrl: c.env.KILO_CLI_API_URL,
108-
orgId: config.benchmarkOrgId,
110+
orgId: benchmarkIdentity.benchmarkOrgId,
109111
});
110112
return c.json(result);
111113
}

0 commit comments

Comments
 (0)