Skip to content

Commit 795bb29

Browse files
authored
Merge pull request #149 from SharpAI/develop
Develop
2 parents d4ab3dd + b57d469 commit 795bb29

File tree

2 files changed

+98
-21
lines changed

2 files changed

+98
-21
lines changed

skills.json

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -101,8 +101,9 @@
101101
"id": "camera-claw",
102102
"name": "Camera Claw",
103103
"description": "Security camera for your AI agent — sandbox, record, and monitor OpenClaw activity.",
104-
"version": "1.1.0",
104+
"version": "2026.3.12",
105105
"category": "integrations",
106+
"url": "https://github.com/SharpAI/CameraClaw",
106107
"repo_url": "https://github.com/SharpAI/CameraClaw",
107108
"code_structure": [
108109
{ "path": "SKILL.md", "desc": "Aegis skill manifest (11 params)" },
@@ -114,7 +115,7 @@
114115
{ "path": "scripts/health-check.js", "desc": "Container health checker" },
115116
{ "path": "docs/aegis_openclaw_note.md", "desc": "Aegis integration requirements" }
116117
],
117-
"tags": ["security", "sandbox", "monitoring", "openclaw"],
118+
"tags": ["security", "sandbox", "monitoring", "openclaw", "ai-agent"],
118119
"platforms": [
119120
"linux-x64",
120121
"linux-arm64",

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 95 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,15 @@ async function llmCall(messages, opts = {}) {
165165
}
166166

167167
const model = opts.model || (opts.vlm ? VLM_MODEL : LLM_MODEL) || undefined;
168+
// For JSON-expected tests, disable thinking (Qwen3 /no_think directive)
169+
// This prevents the model from wasting tokens on reasoning before outputting JSON
170+
if (opts.expectJSON) {
171+
const lastUserIdx = messages.findLastIndex(m => m.role === 'user');
172+
if (lastUserIdx >= 0) {
173+
messages = [...messages];
174+
messages[lastUserIdx] = { ...messages[lastUserIdx], content: messages[lastUserIdx].content + ' /no_think' };
175+
}
176+
}
168177

169178
// Build request params
170179
const params = {
@@ -173,6 +182,7 @@ async function llmCall(messages, opts = {}) {
173182
...(model && { model }),
174183
...(opts.temperature !== undefined && { temperature: opts.temperature }),
175184
...(opts.maxTokens && { max_completion_tokens: opts.maxTokens }),
185+
...(opts.expectJSON && { response_format: { type: 'json_object' } }),
176186
...(opts.tools && { tools: opts.tools }),
177187
};
178188

@@ -181,6 +191,34 @@ async function llmCall(messages, opts = {}) {
181191
const idleMs = opts.timeout || IDLE_TIMEOUT_MS;
182192
let idleTimer = setTimeout(() => controller.abort(), idleMs);
183193
const resetIdle = () => { clearTimeout(idleTimer); idleTimer = setTimeout(() => controller.abort(), idleMs); };
194+
// Log prompt being sent
195+
log(`\n 📤 Prompt (${messages.length} messages, params: ${JSON.stringify({maxTokens: opts.maxTokens, expectJSON: !!opts.expectJSON, response_format: params.response_format})}):`);
196+
for (const m of messages) {
197+
if (typeof m.content === 'string') {
198+
log(` [${m.role}] ${m.content}`);
199+
} else if (Array.isArray(m.content)) {
200+
// Multi-part content (VLM with images)
201+
for (const part of m.content) {
202+
if (part.type === 'text') {
203+
log(` [${m.role}] ${part.text}`);
204+
} else if (part.type === 'image_url') {
205+
const url = part.image_url?.url || '';
206+
const b64Match = url.match(/^data:([^;]+);base64,(.+)/);
207+
if (b64Match) {
208+
const mimeType = b64Match[1];
209+
const b64Data = b64Match[2];
210+
const sizeKB = Math.round(b64Data.length * 3 / 4 / 1024);
211+
log(` [${m.role}] 🖼️ [Image: ${mimeType}, ~${sizeKB}KB]`);
212+
log(`[IMG:${url}]`);
213+
} else {
214+
log(` [${m.role}] 🖼️ [Image URL: ${url.slice(0, 80)}…]`);
215+
}
216+
}
217+
}
218+
} else {
219+
log(` [${m.role}] ${JSON.stringify(m.content).slice(0, 200)}`);
220+
}
221+
}
184222

185223
try {
186224
const stream = await client.chat.completions.create(params, {
@@ -193,6 +231,7 @@ async function llmCall(messages, opts = {}) {
193231
let model = '';
194232
let usage = {};
195233
let tokenCount = 0;
234+
let tokenBuffer = '';
196235

197236
for await (const chunk of stream) {
198237
resetIdle();
@@ -204,8 +243,43 @@ async function llmCall(messages, opts = {}) {
204243
if (delta?.reasoning_content) reasoningContent += delta.reasoning_content;
205244
if (delta?.content || delta?.reasoning_content) {
206245
tokenCount++;
246+
// Buffer and log tokens — tag with field source
247+
const isContent = !!delta?.content;
248+
const tok = delta?.content || delta?.reasoning_content || '';
249+
// Tag first token of each field type
250+
if (tokenCount === 1) tokenBuffer += isContent ? '[C] ' : '[R] ';
251+
tokenBuffer += tok;
252+
if (tokenCount % 20 === 0) {
253+
log(tokenBuffer);
254+
tokenBuffer = '';
255+
}
207256
if (tokenCount % 100 === 0) {
208-
log(` … ${tokenCount} tokens received`);
257+
log(` … ${tokenCount} tokens (content: ${content.length}c, reasoning: ${reasoningContent.length}c)`);
258+
}
259+
260+
// Smart early abort for JSON-expected tests:
261+
// If the model is producing reasoning_content (thinking) for a JSON test,
262+
// abort once more than 100 tokens (content + reasoning) have streamed and reasoning is still arriving — it should output JSON directly.
263+
if (opts.expectJSON && !isContent && tokenCount > 100) {
264+
log(` ⚠ Aborting: ${tokenCount} reasoning tokens for JSON test — model is thinking instead of outputting JSON`);
265+
controller.abort();
266+
break;
267+
}
268+
// If content is arriving, check it starts with JSON
269+
if (opts.expectJSON && isContent && content.length >= 50) {
270+
const stripped = content.replace(/<think>[\s\S]*?<\/think>\s*/gi, '').trimStart();
271+
if (stripped.length >= 50 && !/^\s*[{\[]/.test(stripped)) {
272+
log(` ⚠ Aborting: expected JSON but got: "${stripped.slice(0, 80)}…"`);
273+
controller.abort();
274+
break;
275+
}
276+
}
277+
// Hard cap: abort if token count far exceeds maxTokens (server may
278+
// not count thinking tokens toward the limit)
279+
if (opts.maxTokens && tokenCount > opts.maxTokens * 3) {
280+
log(` ⚠ Aborting: ${tokenCount} tokens exceeds ${opts.maxTokens}×3 safety limit`);
281+
controller.abort();
282+
break;
209283
}
210284
}
211285

@@ -224,6 +298,9 @@ async function llmCall(messages, opts = {}) {
224298
if (chunk.usage) usage = chunk.usage;
225299
}
226300

301+
// Flush remaining token buffer
302+
if (tokenBuffer) log(tokenBuffer);
303+
227304
// If the model only produced reasoning_content (thinking) with no content,
228305
// use the reasoning output as the response content for evaluation purposes.
229306
if (!content && reasoningContent) {
@@ -337,12 +414,11 @@ ${userMessage}
337414
4. Keep system messages (they contain tool results)
338415
339416
## Response Format
340-
Return ONLY this JSON (no other text):
341-
{"keep": [0, 5, 8], "summary": "brief 1-line summary of dropped exchanges"}
417+
Respond with ONLY a valid JSON object, no other text:
418+
{"keep": [<actual index numbers from the list above>], "summary": "<brief 1-line summary of what was dropped>"}
342419
343-
- "keep": array of message indices to KEEP (from the index list above)
344-
- "summary": what the dropped messages were about (so context is not lost entirely)
345-
- If nothing should be dropped, set keep to ALL indices and summary to ""`;
420+
Example: if keeping messages at indices 0, 18, 22 → {"keep": [0, 18, 22], "summary": "Removed 4 duplicate 'what happened today' questions"}
421+
If nothing should be dropped, keep ALL indices and set summary to "".`;
346422
}
347423

348424
suite('📋 Context Preprocessing', async () => {
@@ -356,7 +432,7 @@ suite('📋 Context Preprocessing', async () => {
356432
{ idx: 18, ts: '12:56 PM', text: 'What has happened today' },
357433
{ idx: 22, ts: '1:08 PM', text: 'What has happened today' },
358434
];
359-
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What has happened today?') }]);
435+
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What has happened today?') }], { maxTokens: 300, expectJSON: true });
360436
const p = parseJSON(r.content);
361437
assert(Array.isArray(p.keep), 'keep must be array');
362438
assert(p.keep.length <= 3, `Expected ≤3, got ${p.keep.length}`);
@@ -373,7 +449,7 @@ suite('📋 Context Preprocessing', async () => {
373449
{ idx: 18, ts: '12:00 PM', text: 'What is the system status?' },
374450
{ idx: 22, ts: '1:00 PM', text: 'What has happened today' },
375451
];
376-
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any alerts triggered?') }]);
452+
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any alerts triggered?') }], { maxTokens: 300, expectJSON: true });
377453
const p = parseJSON(r.content);
378454
assert(Array.isArray(p.keep), 'keep must be array');
379455
assert(p.keep.includes(3) || p.keep.includes(10) || p.keep.includes(18), 'Should keep unique topics');
@@ -387,7 +463,7 @@ suite('📋 Context Preprocessing', async () => {
387463
{ idx: 6, ts: '10:00 AM', text: 'What is the system status?' },
388464
{ idx: 10, ts: '11:00 AM', text: 'Analyze the clip from 9:40 AM' },
389465
];
390-
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any new motion events?') }]);
466+
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any new motion events?') }], { maxTokens: 300, expectJSON: true });
391467
const p = parseJSON(r.content);
392468
assert(Array.isArray(p.keep) && p.keep.length === 4, `Expected 4, got ${p.keep?.length}`);
393469
return `kept all 4 ✓`;
@@ -398,7 +474,7 @@ suite('📋 Context Preprocessing', async () => {
398474
{ idx: 0, ts: '9:00 AM', text: 'Hello' },
399475
{ idx: 2, ts: '9:05 AM', text: 'Show cameras' },
400476
];
401-
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Thanks') }]);
477+
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Thanks') }], { maxTokens: 300, expectJSON: true });
402478
const p = parseJSON(r.content);
403479
assert(Array.isArray(p.keep), 'keep must be array');
404480
return `kept ${p.keep.length}/2`;
@@ -427,7 +503,7 @@ suite('📋 Context Preprocessing', async () => {
427503
{ idx: 36, ts: '12:30 PM', text: 'What happened today?' },
428504
{ idx: 38, ts: '12:45 PM', text: 'Were there any packages delivered?' },
429505
];
430-
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What happened today?') }]);
506+
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What happened today?') }], { maxTokens: 300, expectJSON: true });
431507
const p = parseJSON(r.content);
432508
assert(Array.isArray(p.keep), 'keep must be array');
433509
// 10 duplicates of "What happened today?" → should keep ≤12 of 20
@@ -444,7 +520,7 @@ suite('📋 Context Preprocessing', async () => {
444520
{ idx: 3, ts: '9:05 AM', text: '[System] Alert triggered: person at front door' },
445521
{ idx: 4, ts: '9:10 AM', text: 'What happened today?' },
446522
];
447-
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Show me alerts') }]);
523+
const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Show me alerts') }], { maxTokens: 300, expectJSON: true });
448524
const p = parseJSON(r.content);
449525
assert(Array.isArray(p.keep), 'keep must be array');
450526
// System messages (idx 1, 3) must be kept
@@ -538,7 +614,7 @@ suite('🧠 Knowledge Distillation', async () => {
538614
const r = await llmCall([
539615
{ role: 'system', content: DISTILL_PROMPT },
540616
{ role: 'user', content: `## Topic: Camera Setup\n## Existing KIs: (none)\n## Conversation\nUser: I have three cameras. Front door is a Blink Mini, living room is Blink Indoor, side parking is Blink Outdoor.\nAegis: Got it! Want to set up alerts?\nUser: Yes, person detection on front door after 10pm. My name is Sam.\nAegis: Alert set. Nice to meet you, Sam!` },
541-
]);
617+
], { maxTokens: 500, expectJSON: true });
542618
const p = parseJSON(r.content);
543619
assert(p && typeof p === 'object', 'Must return object');
544620
const facts = (p.items || []).reduce((n, i) => n + (i.facts?.length || 0), 0) + (p.new_items || []).reduce((n, i) => n + (i.facts?.length || 0), 0);
@@ -550,7 +626,7 @@ suite('🧠 Knowledge Distillation', async () => {
550626
const r = await llmCall([
551627
{ role: 'system', content: DISTILL_PROMPT },
552628
{ role: 'user', content: `## Topic: Greeting\n## Existing KIs: (none)\n## Conversation\nUser: Hi\nAegis: Hello! How can I help?\nUser: Thanks, bye\nAegis: Goodbye!` },
553-
]);
629+
], { maxTokens: 500, expectJSON: true });
554630
const p = parseJSON(r.content);
555631
const facts = (p.items || []).reduce((n, i) => n + (i.facts?.length || 0), 0) + (p.new_items || []).reduce((n, i) => n + (i.facts?.length || 0), 0);
556632
assert(facts === 0, `Expected 0 facts, got ${facts}`);
@@ -561,7 +637,7 @@ suite('🧠 Knowledge Distillation', async () => {
561637
const r = await llmCall([
562638
{ role: 'system', content: DISTILL_PROMPT },
563639
{ role: 'user', content: `## Topic: Alert Configuration\n## Existing KIs: alert_preferences\n## Conversation\nUser: No notifications from side parking 8am-5pm. Too many false alarms from passing cars.\nAegis: Quiet hours set for side parking 8 AM-5 PM.\nUser: Front door alerts go to Telegram. Discord for everything else.\nAegis: Done — front door to Telegram, rest to Discord.` },
564-
]);
640+
], { maxTokens: 500, expectJSON: true });
565641
const p = parseJSON(r.content);
566642
const facts = (p.items || []).reduce((n, i) => n + (i.facts?.length || 0), 0) + (p.new_items || []).reduce((n, i) => n + (i.facts?.length || 0), 0);
567643
assert(facts >= 2, `Expected ≥2 facts, got ${facts}`);
@@ -572,7 +648,7 @@ suite('🧠 Knowledge Distillation', async () => {
572648
const r = await llmCall([
573649
{ role: 'system', content: DISTILL_PROMPT },
574650
{ role: 'user', content: `## Topic: Camera Update\n## Existing KIs: home_profile (facts: ["3 cameras: Blink Mini front, Blink Indoor living, Blink Outdoor side", "Owner: Sam"])\n## Conversation\nUser: I just installed a fourth camera in the backyard. It's a Reolink Argus 3 Pro.\nAegis: Nice upgrade! I've noted your new backyard Reolink camera. That brings your total to 4 cameras.\nUser: Also, I got a dog named Max, golden retriever.\nAegis: Welcome, Max! I'll note that for the pet detections.` },
575-
]);
651+
], { maxTokens: 500, expectJSON: true });
576652
const p = parseJSON(r.content);
577653
const allFacts = [...(p.items || []).flatMap(i => i.facts || []), ...(p.new_items || []).flatMap(i => i.facts || [])];
578654
assert(allFacts.length >= 2, `Expected ≥2 facts, got ${allFacts.length}`);
@@ -587,7 +663,7 @@ suite('🧠 Knowledge Distillation', async () => {
587663
const r = await llmCall([
588664
{ role: 'system', content: DISTILL_PROMPT },
589665
{ role: 'user', content: `## Topic: Camera Change\n## Existing KIs: home_profile (facts: ["3 cameras: Blink Mini front, Blink Indoor living, Blink Outdoor side"])\n## Conversation\nUser: I replaced the living room camera. The Blink Indoor died. I put a Ring Indoor there now.\nAegis: Got it — living room camera is now a Ring Indoor. Updated.\nUser: Actually I also moved the side parking camera to the garage instead.\nAegis: Camera moved from side parking to garage, noted.` },
590-
]);
666+
], { maxTokens: 500, expectJSON: true });
591667
const p = parseJSON(r.content);
592668
const allFacts = [...(p.items || []).flatMap(i => i.facts || []), ...(p.new_items || []).flatMap(i => i.facts || [])];
593669
assert(allFacts.length >= 1, `Expected ≥1 fact, got ${allFacts.length}`);
@@ -634,7 +710,7 @@ suite('🔔 Event Deduplication', async () => {
634710
const r = await llmCall([
635711
{ role: 'system', content: 'You are a security event classifier. Respond only with valid JSON.' },
636712
{ role: 'user', content: buildDedupPrompt(s.current, s.recent, s.age_sec) },
637-
], { maxTokens: 150, temperature: 0.1 });
713+
], { maxTokens: 150, temperature: 0.1, expectJSON: true });
638714
const p = parseJSON(r.content);
639715
if (s.expected_duplicate !== undefined) {
640716
assert(p.duplicate === s.expected_duplicate, `Expected duplicate=${s.expected_duplicate}, got ${p.duplicate}`);
@@ -847,7 +923,7 @@ suite('🛡️ Security Classification', async () => {
847923
const r = await llmCall([
848924
{ role: 'system', content: SECURITY_CLASSIFY_PROMPT },
849925
{ role: 'user', content: `Event description: ${s.description}` },
850-
], { maxTokens: 200, temperature: 0.1 });
926+
], { maxTokens: 200, temperature: 0.1, expectJSON: true });
851927
const p = parseJSON(r.content);
852928
assert(expectedClassifications.includes(p.classification),
853929
`Expected "${expectedLabel}", got "${p.classification}"`);

0 commit comments

Comments
 (0)