You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
if "<|channel>" in content or "<channel|>" in content:
355
+
return "DEGENERATE", "leaked control tokens"
356
+
return "TEXT", content[:60]
357
+
358
+
# Collects one human-readable message per failed check in this script.
FAILS = []

# ── Check 1 of 3: ambiguous prompt sent together with a tool schema ──
# A run passes when classify() labels the reply "TOOL_CALL" or "TEXT";
# any other label counts against the run. At least 3 of 5 must pass.
print("\n─── [1/3] Vague query WITH tool schema (must handle ambiguity naturally, tool call or text) ───")
vague_ok = 0
for run_no in range(1, 6):
    # Build the conversation fresh for every run.
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "what is the news"},
    ]
    tool_calls, reply, elapsed, ptok = call(messages, tools=[TOOL])
    kind, detail = classify(tool_calls, reply)

    ok = kind in ("TOOL_CALL", "TEXT")
    vague_ok += int(ok)

    mark = "✅" if ok else "❌"
    # Flatten newlines and cap the length so each run prints on one line.
    one_line = detail.replace("\n", " ")[:75]
    print(f" {mark} run {run_no} [{elapsed:.1f}s P={ptok}t]: {kind} — {one_line}")

print(f" → {vague_ok}/5 runs passed without degenerating")
if vague_ok < 3:
    FAILS.append(f"Vague query: only {vague_ok}/5 clean runs (need ≥3)")
372
+
373
+
# ── Check 2 of 3: control run, same vague prompt but WITHOUT a tool schema ──
# Every run must be classified as plain "TEXT"; all 3 runs have to pass,
# otherwise a failure message is appended to FAILS.
print("\n─── [2/3] Control: same query WITHOUT tools (must be coherent text) ───")
coherent_ok = 0
for i in range(3):
    tc, content, t, pt = call(
        [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": "what is the news"},
        ],
        temp=0.7,
        max_tokens=200,
    )
    kind, detail = classify(tc, content)
    ok = kind == "TEXT"
    if ok:
        coherent_ok += 1
    # The detail string can be a slice of the raw reply and may therefore span
    # several lines; flatten it so each run occupies exactly one output line.
    flat_detail = detail.replace("\n", " ")
    print(f" {'✅' if ok else '❌'} run {i+1} [{t:.1f}s P={pt}t]: {kind} — {flat_detail}")
print(f" → {coherent_ok}/3 coherent text responses")
if coherent_ok < 3:
    FAILS.append(f"No-tool control: only {coherent_ok}/3 coherent (need 3)")
[{"role":"system","content":"You are a helpful AI assistant."}, {"role":"user","content":"Use web_search to find news today"}], tools=[TOOL], max_tokens=2000)
390
+
kind, detail = classify(tc, content)
391
+
ok = kind == "TOOL_CALL"
392
+
if ok: explicit_ok += 1
393
+
print(f" {'✅' if ok else '❌'} run {i+1} [{t:.1f}s P={pt}t]: {kind} — {detail}")
394
+
print(f" → {explicit_ok}/3 tool_calls")
395
+
if explicit_ok < 3:
396
+
FAILS.append(f"Explicit query: only {explicit_ok}/3 tool_calls (need 3)")
397
+
398
+
# Final verdict: draw a separator rule, then report success when no check
# recorded a failure message in FAILS.
rule = "─" * 60
print(f"\n{rule}")
if not FAILS:
    print("✅ REGRESSION PASSED — tool-call degeneration bug is fixed.")
0 commit comments