Skip to content

Commit f367a41

Browse files
authored
Merge pull request #151 from SharpAI/develop
Develop
2 parents 795bb29 + 62b4e28 commit f367a41

File tree

1 file changed

+48
-17
lines changed

1 file changed

+48
-17
lines changed

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,20 @@ async function llmCall(messages, opts = {}) {
165165
}
166166

167167
const model = opts.model || (opts.vlm ? VLM_MODEL : LLM_MODEL) || undefined;
168-
// For JSON-expected tests, disable thinking (Qwen3 /no_think directive)
169-
// This prevents the model from wasting tokens on reasoning before outputting JSON
168+
// For JSON-expected tests, disable thinking (Qwen3.5 doesn't support /no_think)
169+
// Method 1: Inject empty <think></think> assistant prefix to skip reasoning phase
170+
// Method 2: chat_template_kwargs via extra_body (works if server supports it)
170171
if (opts.expectJSON) {
171-
const lastUserIdx = messages.findLastIndex(m => m.role === 'user');
172-
if (lastUserIdx >= 0) {
173-
messages = [...messages];
174-
messages[lastUserIdx] = { ...messages[lastUserIdx], content: messages[lastUserIdx].content + ' /no_think' };
175-
}
172+
messages = [...messages];
173+
// Remove any leftover /no_think from messages
174+
messages = messages.map(m => {
175+
if (m.role === 'user' && typeof m.content === 'string' && m.content.endsWith(' /no_think')) {
176+
return { ...m, content: m.content.slice(0, -10) };
177+
}
178+
return m;
179+
});
180+
// Inject empty think block as assistant prefix (most portable method)
181+
messages.push({ role: 'assistant', content: '<think>\n</think>\n' });
176182
}
177183

178184
// Build request params
@@ -182,7 +188,9 @@ async function llmCall(messages, opts = {}) {
182188
...(model && { model }),
183189
...(opts.temperature !== undefined && { temperature: opts.temperature }),
184190
...(opts.maxTokens && { max_completion_tokens: opts.maxTokens }),
185-
...(opts.expectJSON && { response_format: { type: 'json_object' } }),
191+
// Qwen3.5 non-thinking mode recommended params
192+
...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }),
193+
...(opts.expectJSON && { top_p: 0.8, presence_penalty: 1.5 }),
186194
...(opts.tools && { tools: opts.tools }),
187195
};
188196

@@ -192,7 +200,7 @@ async function llmCall(messages, opts = {}) {
192200
let idleTimer = setTimeout(() => controller.abort(), idleMs);
193201
const resetIdle = () => { clearTimeout(idleTimer); idleTimer = setTimeout(() => controller.abort(), idleMs); };
194202
// Log prompt being sent
195-
log(`\n 📤 Prompt (${messages.length} messages, params: ${JSON.stringify({maxTokens: opts.maxTokens, expectJSON: !!opts.expectJSON, response_format: params.response_format})}):`);
203+
log(`\n 📤 Prompt (${messages.length} messages, params: ${JSON.stringify({maxTokens: opts.maxTokens, expectJSON: !!opts.expectJSON})}):`);
196204
for (const m of messages) {
197205
if (typeof m.content === 'string') {
198206
log(` [${m.role}] ${m.content}`);
@@ -274,10 +282,15 @@ async function llmCall(messages, opts = {}) {
274282
break;
275283
}
276284
}
277-
// Hard cap: abort if token count far exceeds maxTokens (server may
278-
// not count thinking tokens toward the limit)
279-
if (opts.maxTokens && tokenCount > opts.maxTokens * 3) {
280-
log(` ⚠ Aborting: ${tokenCount} tokens exceeds ${opts.maxTokens}×3 safety limit`);
285+
// Hard cap: abort if token count far exceeds maxTokens
286+
if (opts.maxTokens && tokenCount > opts.maxTokens * 2) {
287+
log(` ⚠ Aborting: ${tokenCount} tokens exceeds ${opts.maxTokens}×2 safety limit`);
288+
controller.abort();
289+
break;
290+
}
291+
// Global safety limit: no benchmark test should ever need >2000 tokens
292+
if (tokenCount > 2000) {
293+
log(` ⚠ Aborting: ${tokenCount} tokens exceeds global 2000-token safety limit`);
281294
controller.abort();
282295
break;
283296
}
@@ -334,10 +347,28 @@ function parseJSON(text) {
334347
const cleaned = stripThink(text);
335348
let jsonStr = cleaned;
336349
const codeBlock = cleaned.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
337-
if (codeBlock) jsonStr = codeBlock[1];
338-
else {
339-
const idx = cleaned.search(/[{[]/);
340-
if (idx > 0) jsonStr = cleaned.slice(idx);
350+
if (codeBlock) {
351+
jsonStr = codeBlock[1];
352+
} else {
353+
// Find first { or [ and extract balanced JSON
354+
const startIdx = cleaned.search(/[{[]/);
355+
if (startIdx >= 0) {
356+
const opener = cleaned[startIdx];
357+
const closer = opener === '{' ? '}' : ']';
358+
let depth = 0;
359+
let inString = false;
360+
let escape = false;
361+
for (let i = startIdx; i < cleaned.length; i++) {
362+
const ch = cleaned[i];
363+
if (escape) { escape = false; continue; }
364+
if (ch === '\\' && inString) { escape = true; continue; }
365+
if (ch === '"') { inString = !inString; continue; }
366+
if (!inString) {
367+
if (ch === opener) depth++;
368+
else if (ch === closer) { depth--; if (depth === 0) { jsonStr = cleaned.slice(startIdx, i + 1); break; } }
369+
}
370+
}
371+
}
341372
}
342373
return JSON.parse(jsonStr.trim());
343374
}

0 commit comments

Comments (0)