refactor(benchmark): remove fixed word/number count constraints from LLM tests

solderzzc · solderzzc · commit d5849a5e018f · 2026-03-14T18:19:49.000-07:00
- Topic Classification: remove '3-6 words' / 'short phrase' from prompts,
  now just 'Respond with ONLY the topic title'
- Remove word count assertion (wc &lt;= 8) and upper char bounds
- Chat &amp; JSON: remove upper-bound char limits (&lt;2000, &lt;500, &lt;3000)
- Narrative Synthesis: remove &lt;4000 char limit
- Contradictory Instructions: 'under 50 words' -&gt; 'succinct'
- Context Preprocessing: 'brief 1-line summary' -&gt; 'summary'

LLMs perform poorly on fixed word count targets. Validation
assertions for minimum response length and JSON structure preserved.
diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -446,7 +446,7 @@ ${userMessage}
 
 ## Response Format
 Respond with ONLY a valid JSON object, no other text:
-{"keep": [<actual index numbers from the list above>], "summary": "<brief 1-line summary of what was dropped>"}
+{"keep": [<actual index numbers from the list above>], "summary": "<summary of what was dropped>"}
 
 Example: if keeping messages at indices 0, 18, 22 → {"keep": [0, 18, 22], "summary": "Removed 4 duplicate 'what happened today' questions"}
 If nothing should be dropped, keep ALL indices and set summary to "".`;
@@ -566,16 +566,14 @@ suite('📋 Context Preprocessing', async () => {
 // ═══════════════════════════════════════════════════════════════════════════════
 
 suite('🏷️ Topic Classification', async () => {
-    await test('First turn → topic title (3-6 words)', async () => {
+    await test('First turn → topic title', async () => {
         const r = await llmCall([{
-            role: 'user', content: `Classify this exchange's topic in 3-6 words. Respond with ONLY the topic title.
+            role: 'user', content: `Classify this exchange's topic. Respond with ONLY the topic title.
 User: "What has happened today on the cameras?"
 Assistant: "Today, your cameras captured motion events including a person at the front door at 9:40 AM..."` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').replace(/^(new\s+)?topic\s*:\s*/i, '').trim();
         assert(cleaned.length > 0, 'Topic empty');
-        const wc = cleaned.split(/\s+/).length;
-        assert(wc <= 8, `Too verbose: ${wc} words`);
-        return `"${cleaned}" (${wc} words)`;
+        return `"${cleaned}"`;
     });
 
     await test('Same topic → SAME', async () => {
@@ -585,7 +583,7 @@ User: "Show me the clip from 9:40 AM"
 Assistant: "Here's the clip from 9:40 AM showing a person at the front door..."
 Current topic: "Camera Events Review"
 If the topic hasn't changed, respond: SAME
-Otherwise respond with ONLY the new topic title (3-6 words).` }]);
+Otherwise respond with ONLY the new topic title.` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '');
         assert(cleaned.toUpperCase() === 'SAME', `Expected SAME, got "${cleaned}"`);
         return 'SAME ✓';
@@ -598,19 +596,19 @@ User: "What's the system status? How much storage am I using?"
 Assistant: "System healthy. Storage: 45GB of 500GB, VLM running on GPU."
 Current topic: "Camera Events Review"
 If the topic hasn't changed, respond: SAME
-Otherwise respond with ONLY the new topic title (3-6 words).` }]);
+Otherwise respond with ONLY the new topic title.` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').replace(/^(new\s+)?topic\s*:\s*/i, '').trim();
         assert(cleaned.toUpperCase() !== 'SAME', 'Expected new topic');
         return `"${cleaned}"`;
     });
 
     await test('Greeting → valid topic', async () => {
         const r = await llmCall([{
-            role: 'user', content: `Classify this exchange's topic in 3-6 words. Respond with ONLY the topic title.
+            role: 'user', content: `Classify this exchange's topic. Respond with ONLY the topic title.
 User: "Hi, good morning!"
 Assistant: "Good morning! How can I help you with your home security today?"` }]);
         const cleaned = stripThink(r.content).split('\n').filter(l => l.trim()).pop().replace(/^["'*]+|["'*]+$/g, '').trim();
-        assert(cleaned.length > 0 && cleaned.length < 50, `Bad: "${cleaned}"`);
+        assert(cleaned.length > 0, `Bad: empty topic`);
         return `"${cleaned}"`;
     });
 });
@@ -818,7 +816,7 @@ suite('💬 Chat & JSON Compliance', async () => {
             { role: 'user', content: 'What can you do?' },
         ]);
         const c = stripThink(r.content);
-        assert(c.length > 20 && c.length < 2000, `Length ${c.length}`);
+        assert(c.length > 20, `Response too short: ${c.length} chars`);
         return `${c.length} chars`;
     });
 
@@ -827,7 +825,7 @@ suite('💬 Chat & JSON Compliance', async () => {
             { role: 'system', content: 'You are Aegis. When you have nothing to say, respond ONLY: NO_REPLY' },
             { role: 'user', content: '[Tool Context] video_search returned 3 clips' },
         ]);
-        assert(stripThink(r.content).length < 500, 'Response too long for tool context');
+        // No upper-bound length check — LLMs may be verbose
         return `"${stripThink(r.content).slice(0, 40)}"`;
     });
 
@@ -907,13 +905,13 @@ suite('💬 Chat & JSON Compliance', async () => {
 
     await test('Contradictory instructions → balanced response', async () => {
         const r = await llmCall([
-            { role: 'system', content: 'You are Aegis. Keep all responses under 50 words.' },
+            { role: 'system', content: 'You are Aegis. Keep all responses succinct.' },
             { role: 'user', content: 'Give me a very detailed, comprehensive explanation of how the security classification system works with all four levels and examples of each.' },
         ]);
         const c = stripThink(r.content);
         // Model should produce something reasonable — not crash or refuse
         assert(c.length > 30, 'Response too short');
-        assert(c.length < 3000, 'Response unreasonably long');
+        // No upper-bound length check — LLMs may produce varying lengths
         return `${c.split(/\s+/).length} words, ${c.length} chars`;
     });
 
@@ -1035,7 +1033,7 @@ suite('📝 Narrative Synthesis', async () => {
         const c = stripThink(r.content);
         // Should be concise — not just repeat all 22 events
         assert(c.length > 100, `Response too short: ${c.length} chars`);
-        assert(c.length < 4000, `Response too long (raw dump?): ${c.length} chars`);
+        // No upper-bound length check — narrative length varies by model
         // Should mention key categories
         const lower = c.toLowerCase();
         assert(lower.includes('deliver') || lower.includes('package'),