Skip to content

Commit a61af3e

Browse files
committed
feat: expand benchmark from 31 to 54 tests
New suites:
- 🛡️ Security Classification (8 tests): normal/monitor/suspicious/critical including package delivery, child, photographing, vehicle tampering
- 📝 Narrative Synthesis (3 tests): chronological summary, suspicious event prioritization, multi-camera grouping

Expanded existing suites:
- Tool Use: 6→12 scenarios (package inquiry, ambiguous intent, urgent, vehicle filter, animal alert subscription)
- Event Dedup: 3→5 (car leave/return, delivery sequence)
- Chat: +1 emergency response tone test
- VLM: 4→7 frames (package doorstep, backyard animal, group of people)

Qwen3.5-4B baseline: 50/54 (92.6%)
1 parent 6ff12b0 commit a61af3e

File tree

5 files changed

+469
-49
lines changed

5 files changed

+469
-49
lines changed
876 KB
Loading
792 KB
Loading
783 KB
Loading

skills/analysis/home-security-benchmark/fixtures/tool-use-scenarios.json

Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,72 @@
9090
"time_range"
9191
],
9292
"context": "User asking about specific events in a time range should trigger video_search"
93+
},
94+
{
95+
"id": "tool_7_package_delivery",
96+
"name": "Package delivery inquiry",
97+
"user_message": "Did any packages get delivered today? I'm expecting an Amazon order.",
98+
"expected_tool": "video_search",
99+
"expected_params": [
100+
"query",
101+
"time_range"
102+
],
103+
"context": "Package/delivery questions should search video clips, not analyze"
104+
},
105+
{
106+
"id": "tool_8_ambiguous_safety",
107+
"name": "Ambiguous safety check",
108+
"user_message": "Is everything okay at home? I've been away all day.",
109+
"expected_tool": "video_search",
110+
"expected_params": [
111+
"query"
112+
],
113+
"context": "Ambiguous safety question should trigger a general video search"
114+
},
115+
{
116+
"id": "tool_9_ambiguous_suspicious",
117+
"name": "Ambiguous suspicious activity",
118+
"user_message": "Anything weird going on in the last hour?",
119+
"expected_tool": "video_search",
120+
"expected_params": [
121+
"query",
122+
"time_range"
123+
],
124+
"context": "Vague suspicion query should trigger time-filtered video search"
125+
},
126+
{
127+
"id": "tool_10_urgent_intruder",
128+
"name": "Urgent intruder report",
129+
"user_message": "Someone is at my door right now! Check the cameras immediately!",
130+
"expected_tool": "video_search",
131+
"expected_params": [
132+
"query",
133+
"time_range"
134+
],
135+
"context": "Urgent real-time query should still trigger video_search (LLM cannot view live)"
136+
},
137+
{
138+
"id": "tool_11_vehicle_camera_filter",
139+
"name": "Vehicle activity on specific camera",
140+
"user_message": "Show me all the cars that came and went on the driveway camera today",
141+
"expected_tool": "video_search",
142+
"expected_params": [
143+
"query",
144+
"time_range",
145+
"camera"
146+
],
147+
"context": "Vehicle activity on a specific camera should use camera filter"
148+
},
149+
{
150+
"id": "tool_12_animal_alert",
151+
"name": "Animal detection alert",
152+
"user_message": "Let me know if there's any animal in the backyard, especially at night",
153+
"expected_tool": "event_subscribe",
154+
"expected_params": [
155+
"eventType",
156+
"camera"
157+
],
158+
"context": "Animal alert request should trigger event_subscribe with animal type"
93159
}
94160
],
95161
"security_scenarios": [
@@ -125,6 +191,276 @@
125191
"parked",
126192
"extended"
127193
]
194+
},
195+
{
196+
"id": "sec_4_child_playing",
197+
"name": "Child playing in yard",
198+
"description": "Small child riding a bicycle in the front yard during daytime",
199+
"expected_classification": "normal",
200+
"expected_tags": [
201+
"person",
202+
"child",
203+
"play"
204+
]
205+
},
206+
{
207+
"id": "sec_5_photographing",
208+
"name": "Person photographing house",
209+
"description": "Adult standing on sidewalk holding phone up, appears to be photographing the house and cameras",
210+
"expected_classification": "suspicious",
211+
"expected_tags": [
212+
"person",
213+
"phone",
214+
"surveillance"
215+
]
216+
},
217+
{
218+
"id": "sec_6_cat_night",
219+
"name": "Cat at night",
220+
"description": "Small cat walking across yard at night, visible on infrared camera",
221+
"expected_classification": "normal",
222+
"expected_tags": [
223+
"animal",
224+
"cat",
225+
"night"
226+
]
227+
},
228+
{
229+
"id": "sec_7_tampering",
230+
"name": "Vehicle tampering",
231+
"description": "Two people walking between parked cars, trying door handles on multiple vehicles at 2 AM",
232+
"expected_classification": "critical",
233+
"expected_tags": [
234+
"person",
235+
"vehicle",
236+
"tampering"
237+
]
238+
},
239+
{
240+
"id": "sec_8_amazon_delivery",
241+
"name": "Amazon delivery van",
242+
"description": "Amazon branded delivery van stops in front of house, driver walks to front door with package, drops it and returns to van",
243+
"expected_classification": "normal",
244+
"expected_tags": [
245+
"vehicle",
246+
"delivery",
247+
"package"
248+
]
249+
}
250+
],
251+
"dedup_scenarios": [
252+
{
253+
"id": "dedup_1_same_person",
254+
"current": {
255+
"camera": "Front Door",
256+
"type": "motion",
257+
"summary": "Person in blue shirt standing on sidewalk looking at camera"
258+
},
259+
"recent": {
260+
"camera": "Front Door",
261+
"type": "motion",
262+
"summary": "Man in blue shirt on sidewalk, inspecting security camera"
263+
},
264+
"age_sec": 120,
265+
"expected_duplicate": true,
266+
"name": "Same person lingering → duplicate"
267+
},
268+
{
269+
"id": "dedup_2_different_person",
270+
"current": {
271+
"camera": "Front Door",
272+
"type": "motion",
273+
"summary": "Woman in red dress carrying package to front door"
274+
},
275+
"recent": {
276+
"camera": "Front Door",
277+
"type": "motion",
278+
"summary": "Man in blue shirt on sidewalk looking at camera"
279+
},
280+
"age_sec": 300,
281+
"expected_duplicate": false,
282+
"name": "Different person → unique"
283+
},
284+
{
285+
"id": "dedup_3_multi_camera_vehicle",
286+
"current": {
287+
"camera": "Side Parking",
288+
"type": "motion",
289+
"summary": "Car pulling into driveway"
290+
},
291+
"recent": {
292+
"camera": "Front Door",
293+
"type": "motion",
294+
"summary": "Car visible on street near driveway"
295+
},
296+
"age_sec": 60,
297+
"expected_duplicate": true,
298+
"name": "Multi-camera same vehicle"
299+
},
300+
{
301+
"id": "dedup_4_car_leave_return",
302+
"current": {
303+
"camera": "Side Parking",
304+
"type": "motion",
305+
"summary": "Silver SUV pulling into driveway from street"
306+
},
307+
"recent": {
308+
"camera": "Side Parking",
309+
"type": "motion",
310+
"summary": "Silver SUV backing out of driveway onto street"
311+
},
312+
"age_sec": 1800,
313+
"expected_duplicate": false,
314+
"name": "Same car leaving then returning → unique events"
315+
},
316+
{
317+
"id": "dedup_5_delivery_sequence",
318+
"current": {
319+
"camera": "Front Door",
320+
"type": "motion",
321+
"summary": "Delivery person walking back to van after leaving package"
322+
},
323+
"recent": {
324+
"camera": "Front Door",
325+
"type": "motion",
326+
"summary": "Delivery person approaching door with package"
327+
},
328+
"age_sec": 45,
329+
"expected_duplicate": true,
330+
"name": "Delivery person ring-drop-leave → same event"
331+
}
332+
],
333+
"narrative_scenarios": [
334+
{
335+
"id": "narr_1_chronological",
336+
"name": "Chronological summary from search results",
337+
"clips": [
338+
{
339+
"time": "8:15 AM",
340+
"camera": "Front Door",
341+
"summary": "Mail carrier delivering letters",
342+
"id": "clip_001"
343+
},
344+
{
345+
"time": "9:40 AM",
346+
"camera": "Front Door",
347+
"summary": "Person approaching door, rang doorbell",
348+
"id": "clip_002"
349+
},
350+
{
351+
"time": "11:22 AM",
352+
"camera": "Side Parking",
353+
"summary": "Car pulling into driveway",
354+
"id": "clip_003"
355+
},
356+
{
357+
"time": "2:05 PM",
358+
"camera": "Front Door",
359+
"summary": "Amazon delivery driver leaving package",
360+
"id": "clip_004"
361+
},
362+
{
363+
"time": "5:30 PM",
364+
"camera": "Backyard",
365+
"summary": "Cat walking across lawn",
366+
"id": "clip_005"
367+
}
368+
],
369+
"user_question": "What happened today?",
370+
"must_include": [
371+
"morning",
372+
"afternoon"
373+
],
374+
"must_not_include": [
375+
"clip_00",
376+
"ID:"
377+
],
378+
"context": "LLM should produce a chronological narrative, not dump raw clip data"
379+
},
380+
{
381+
"id": "narr_2_highlight_important",
382+
"name": "Highlight suspicious among normal events",
383+
"clips": [
384+
{
385+
"time": "7:00 AM",
386+
"camera": "Front Door",
387+
"summary": "Morning newspaper delivery",
388+
"id": "clip_010"
389+
},
390+
{
391+
"time": "10:30 AM",
392+
"camera": "Front Door",
393+
"summary": "Mail carrier delivering package",
394+
"id": "clip_011"
395+
},
396+
{
397+
"time": "1:15 PM",
398+
"camera": "Front Door",
399+
"summary": "Unknown person trying front door handle, then walked away",
400+
"id": "clip_012"
401+
},
402+
{
403+
"time": "3:00 PM",
404+
"camera": "Backyard",
405+
"summary": "Squirrel on fence",
406+
"id": "clip_013"
407+
}
408+
],
409+
"user_question": "Anything I should be worried about from today?",
410+
"must_include": [
411+
"door handle",
412+
"1:15"
413+
],
414+
"must_not_include": [
415+
"squirrel"
416+
],
417+
"context": "LLM should lead with the suspicious event, not bury it"
418+
},
419+
{
420+
"id": "narr_3_multi_camera_grouping",
421+
"name": "Group events by camera in summary",
422+
"clips": [
423+
{
424+
"time": "8:00 AM",
425+
"camera": "Front Door",
426+
"summary": "Person walking by",
427+
"id": "clip_020"
428+
},
429+
{
430+
"time": "8:30 AM",
431+
"camera": "Backyard",
432+
"summary": "Dog running in yard",
433+
"id": "clip_021"
434+
},
435+
{
436+
"time": "9:00 AM",
437+
"camera": "Front Door",
438+
"summary": "Delivery person with box",
439+
"id": "clip_022"
440+
},
441+
{
442+
"time": "9:15 AM",
443+
"camera": "Side Parking",
444+
"summary": "Car arriving",
445+
"id": "clip_023"
446+
},
447+
{
448+
"time": "9:30 AM",
449+
"camera": "Backyard",
450+
"summary": "Cat on fence",
451+
"id": "clip_024"
452+
}
453+
],
454+
"user_question": "Give me a breakdown of what happened on each camera this morning",
455+
"must_include": [
456+
"Front Door",
457+
"Backyard",
458+
"Side Parking"
459+
],
460+
"must_not_include": [
461+
"clip_02"
462+
],
463+
"context": "LLM should organize by camera when asked"
128464
}
129465
]
130466
}

0 commit comments

Comments
 (0)