
Commit 48f9bf8

simbasimba authored and committed
fix(benchmark): remove assistant prefill, fix VLM suite counting
- Remove assistant prefill injection that caused 400 errors with Qwen3.5 when enable_thinking is active
- Remove presence_penalty from JSON-expected requests
- Fix VLM/LLM split to only count image analysis suites as VLM
1 parent 9769445 commit 48f9bf8

1 file changed: 10 additions & 11 deletions

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -165,9 +165,10 @@ async function llmCall(messages, opts = {}) {
   }

   const model = opts.model || (opts.vlm ? VLM_MODEL : LLM_MODEL) || undefined;
-  // For JSON-expected tests, disable thinking (Qwen3.5 doesn't support /no_think)
-  // Method 1: Inject empty <think></think> assistant prefix to skip reasoning phase
-  // Method 2: chat_template_kwargs via extra_body (works if server supports it)
+  // For JSON-expected tests, use low temperature + top_p to encourage
+  // direct JSON output without extended reasoning.
+  // NOTE: Do NOT inject assistant prefill — Qwen3.5 rejects prefill
+  // when enable_thinking is active (400 error).
   if (opts.expectJSON) {
     messages = [...messages];
     // Remove any leftover /no_think from messages
@@ -177,8 +178,6 @@ async function llmCall(messages, opts = {}) {
       }
       return m;
     });
-    // Inject empty think block as assistant prefix (most portable method)
-    messages.push({ role: 'assistant', content: '<think>\n</think>\n' });
   }

   // Build request params
@@ -188,9 +187,8 @@ async function llmCall(messages, opts = {}) {
     ...(model && { model }),
     ...(opts.temperature !== undefined && { temperature: opts.temperature }),
     ...(opts.maxTokens && { max_tokens: opts.maxTokens }),
-    // Qwen3.5 non-thinking mode recommended params
     ...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }),
-    ...(opts.expectJSON && { top_p: 0.8, presence_penalty: 1.5 }),
+    ...(opts.expectJSON && { top_p: 0.8 }),
     ...(opts.tools && { tools: opts.tools }),
   };

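For illustration, here is a minimal sketch (not part of this commit) of how the conditional spreads above collapse for a typical expectJSON call that passes no explicit temperature, maxTokens, or tools. The model id is hypothetical and stands in for whatever LLM_MODEL / VLM_MODEL resolves to:

    // Sketch only: what the spread-based builder yields for an expectJSON call.
    const opts = { expectJSON: true };
    const model = 'qwen3.5';  // hypothetical id; the script resolves it from LLM_MODEL / VLM_MODEL
    const params = {
      ...(model && { model }),
      ...(opts.temperature !== undefined && { temperature: opts.temperature }),
      ...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }),
      ...(opts.expectJSON && { top_p: 0.8 }),
    };
    console.log(params);  // { model: 'qwen3.5', temperature: 0.7, top_p: 0.8 }; presence_penalty is no longer sent

Spreading a falsy value in an object literal is a no-op, so unset options simply drop out, and the request body carries no assistant prefill turn and no presence_penalty.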
@@ -2021,10 +2019,11 @@ async function main() {
   const indexFile = path.join(RESULTS_DIR, 'index.json');
   let index = [];
   try { index = JSON.parse(fs.readFileSync(indexFile, 'utf8')); } catch { }
-  // Compute LLM vs VLM split
-  const vlmSuite = results.suites.find(s => s.name.includes('VLM'));
-  const vlmPassed = vlmSuite ? vlmSuite.tests.filter(t => t.status === 'pass').length : 0;
-  const vlmTotal = vlmSuite ? vlmSuite.tests.length : 0;
+  // Compute LLM vs VLM split (only count image analysis suites as VLM)
+  const isVlmImageSuite = (name) => name.includes('VLM Scene') || name.includes('📸');
+  const vlmSuites = results.suites.filter(s => isVlmImageSuite(s.name));
+  const vlmPassed = vlmSuites.reduce((n, s) => n + s.tests.filter(t => t.status === 'pass').length, 0);
+  const vlmTotal = vlmSuites.reduce((n, s) => n + s.tests.length, 0);
   const llmPassed = passed - vlmPassed;
   const llmTotal = total - vlmTotal;

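And a small sketch (also not part of the commit, with made-up suite names) of how the new isVlmImageSuite predicate splits suites, and why the old name.includes('VLM') lookup over-counted:

    // Sketch only: classify hypothetical suite names with the new predicate.
    const isVlmImageSuite = (name) => name.includes('VLM Scene') || name.includes('📸');

    const exampleSuites = [
      { name: '📸 VLM Scene Analysis', tests: [{ status: 'pass' }, { status: 'fail' }] },  // image analysis suite
      { name: 'Alert Summaries (VLM-adjacent)', tests: [{ status: 'pass' }] },             // text-only, merely mentions VLM
    ];
    const vlmSuites = exampleSuites.filter(s => isVlmImageSuite(s.name));
    const vlmPassed = vlmSuites.reduce((n, s) => n + s.tests.filter(t => t.status === 'pass').length, 0);
    const vlmTotal = vlmSuites.reduce((n, s) => n + s.tests.length, 0);
    console.log(vlmSuites.length, vlmPassed, vlmTotal);  // 1 1 2: only the image suite counts as VLM
    // The old check, s.name.includes('VLM'), would have matched the text-only suite as well.

Suites that only mention "VLM" in their name now stay in the LLM bucket, so llmPassed / llmTotal are no longer understated.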