Skip to content

Commit d5849a5

Browse files
committed
refactor(benchmark): remove fixed word/number count constraints from LLM tests
- Topic Classification: remove '3-6 words' / 'short phrase' from prompts, now just 'Respond with ONLY the topic title'
- Remove word count assertion (wc <= 8) and upper char bounds
- Chat & JSON: remove upper-bound char limits (<2000, <500, <3000)
- Narrative Synthesis: remove <4000 char limit
- Contradictory Instructions: 'under 50 words' -> 'succinct'
- Context Preprocessing: 'brief 1-line summary' -> 'summary'

LLMs perform poorly on fixed word count targets. Validation assertions for minimum response length and JSON structure are preserved.
1 parent 38da250 commit d5849a5

1 file changed

Lines changed: 13 additions & 15 deletions

File tree

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ ${userMessage}
446446
447447
## Response Format
448448
Respond with ONLY a valid JSON object, no other text:
449-
{"keep": [<actual index numbers from the list above>], "summary": "<brief 1-line summary of what was dropped>"}
449+
{"keep": [<actual index numbers from the list above>], "summary": "<summary of what was dropped>"}
450450
451451
Example: if keeping messages at indices 0, 18, 22 → {"keep": [0, 18, 22], "summary": "Removed 4 duplicate 'what happened today' questions"}
452452
If nothing should be dropped, keep ALL indices and set summary to "".`;
@@ -566,16 +566,14 @@ suite('📋 Context Preprocessing', async () => {
566566
// ═══════════════════════════════════════════════════════════════════════════════
567567

568568
suite('🏷️ Topic Classification', async () => {
569-
await test('First turn → topic title (3-6 words)', async () => {
569+
await test('First turn → topic title', async () => {
570570
const r = await llmCall([{
571-
role: 'user', content: `Classify this exchange's topic in 3-6 words. Respond with ONLY the topic title.
571+
role: 'user', content: `Classify this exchange's topic. Respond with ONLY the topic title.
572572
User: "What has happened today on the cameras?"
573573
Assistant: "Today, your cameras captured motion events including a person at the front door at 9:40 AM..."` }]);
574574
const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').replace(/^(new\s+)?topic\s*:\s*/i, '').trim();
575575
assert(cleaned.length > 0, 'Topic empty');
576-
const wc = cleaned.split(/\s+/).length;
577-
assert(wc <= 8, `Too verbose: ${wc} words`);
578-
return `"${cleaned}" (${wc} words)`;
576+
return `"${cleaned}"`;
579577
});
580578

581579
await test('Same topic → SAME', async () => {
@@ -585,7 +583,7 @@ User: "Show me the clip from 9:40 AM"
585583
Assistant: "Here's the clip from 9:40 AM showing a person at the front door..."
586584
Current topic: "Camera Events Review"
587585
If the topic hasn't changed, respond: SAME
588-
Otherwise respond with ONLY the new topic title (3-6 words).` }]);
586+
Otherwise respond with ONLY the new topic title.` }]);
589587
const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '');
590588
assert(cleaned.toUpperCase() === 'SAME', `Expected SAME, got "${cleaned}"`);
591589
return 'SAME ✓';
@@ -598,19 +596,19 @@ User: "What's the system status? How much storage am I using?"
598596
Assistant: "System healthy. Storage: 45GB of 500GB, VLM running on GPU."
599597
Current topic: "Camera Events Review"
600598
If the topic hasn't changed, respond: SAME
601-
Otherwise respond with ONLY the new topic title (3-6 words).` }]);
599+
Otherwise respond with ONLY the new topic title.` }]);
602600
const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').replace(/^(new\s+)?topic\s*:\s*/i, '').trim();
603601
assert(cleaned.toUpperCase() !== 'SAME', 'Expected new topic');
604602
return `"${cleaned}"`;
605603
});
606604

607605
await test('Greeting → valid topic', async () => {
608606
const r = await llmCall([{
609-
role: 'user', content: `Classify this exchange's topic in 3-6 words. Respond with ONLY the topic title.
607+
role: 'user', content: `Classify this exchange's topic. Respond with ONLY the topic title.
610608
User: "Hi, good morning!"
611609
Assistant: "Good morning! How can I help you with your home security today?"` }]);
612610
const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').trim();
613-
assert(cleaned.length > 0 && cleaned.length < 50, `Bad: "${cleaned}"`);
611+
assert(cleaned.length > 0, `Bad: empty topic`);
614612
return `"${cleaned}"`;
615613
});
616614
});
@@ -818,7 +816,7 @@ suite('💬 Chat & JSON Compliance', async () => {
818816
{ role: 'user', content: 'What can you do?' },
819817
]);
820818
const c = stripThink(r.content);
821-
assert(c.length > 20 && c.length < 2000, `Length ${c.length}`);
819+
assert(c.length > 20, `Response too short: ${c.length} chars`);
822820
return `${c.length} chars`;
823821
});
824822

@@ -827,7 +825,7 @@ suite('💬 Chat & JSON Compliance', async () => {
827825
{ role: 'system', content: 'You are Aegis. When you have nothing to say, respond ONLY: NO_REPLY' },
828826
{ role: 'user', content: '[Tool Context] video_search returned 3 clips' },
829827
]);
830-
assert(stripThink(r.content).length < 500, 'Response too long for tool context');
828+
// No upper-bound length check — LLMs may be verbose
831829
return `"${stripThink(r.content).slice(0, 40)}"`;
832830
});
833831

@@ -907,13 +905,13 @@ suite('💬 Chat & JSON Compliance', async () => {
907905

908906
await test('Contradictory instructions → balanced response', async () => {
909907
const r = await llmCall([
910-
{ role: 'system', content: 'You are Aegis. Keep all responses under 50 words.' },
908+
{ role: 'system', content: 'You are Aegis. Keep all responses succinct.' },
911909
{ role: 'user', content: 'Give me a very detailed, comprehensive explanation of how the security classification system works with all four levels and examples of each.' },
912910
]);
913911
const c = stripThink(r.content);
914912
// Model should produce something reasonable — not crash or refuse
915913
assert(c.length > 30, 'Response too short');
916-
assert(c.length < 3000, 'Response unreasonably long');
914+
// No upper-bound length check — LLMs may produce varying lengths
917915
return `${c.split(/\s+/).length} words, ${c.length} chars`;
918916
});
919917

@@ -1035,7 +1033,7 @@ suite('📝 Narrative Synthesis', async () => {
10351033
const c = stripThink(r.content);
10361034
// Should be concise — not just repeat all 22 events
10371035
assert(c.length > 100, `Response too short: ${c.length} chars`);
1038-
assert(c.length < 4000, `Response too long (raw dump?): ${c.length} chars`);
1036+
// No upper-bound length check — narrative length varies by model
10391037
// Should mention key categories
10401038
const lower = c.toLowerCase();
10411039
assert(lower.includes('deliver') || lower.includes('package'),

0 commit comments

Comments
 (0)