Skip to content

Commit dc7c004

Browse files
tbitcs and oz-agent committed
feat: fail-fast on tool crashes + conversational agent + tool_crash event
- System prompt: TOOL ERROR RULE — stop immediately on any tool error, report in one sentence, ask user to report, do nothing else
- System prompt: RESPONSE STYLE RULE — always respond in plain sentences, never dump raw tool output or JSON; summarize in 1-3 sentences
- AgentRunner: _is_critical_error() detects Python exceptions/import errors in tool output (distinct from normal governance failures)
- AgentRunner: _collect_diagnostics() gathers tool name, error summary, specsmith version, Python version, OS, project type for the report
- AgentRunner: emit tool_crash JSON event with full diagnostics on crash
- AgentRunner: _hard_stop flag breaks _agent_turn loop immediately, bypassing the LLM so it cannot attempt to self-troubleshoot

Co-Authored-By: Oz <oz-agent@warp.dev>
1 parent e3a4d74 commit dc7c004

1 file changed

Lines changed: 118 additions & 1 deletion

File tree

src/specsmith/agent/runner.py

Lines changed: 118 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,25 @@ def build_system_prompt(
173173
If the user inputs another language, internally translate it, then reply IN ENGLISH ONLY.
174174
VIOLATING THIS RULE IS A CRITICAL ERROR.
175175
176+
## TOOL ERROR RULE — HARD STOP (NEVER TROUBLESHOOT ERRORS):
177+
When ANY tool returns an error, exception, or non-zero exit code:
178+
1. STOP immediately. Do not attempt to fix, diagnose, or retry.
179+
2. Say in ONE sentence: what you were doing and what failed.
180+
Example: "The audit tool hit an unexpected error and needs to be reported."
181+
3. Then say: "Would you like to report this bug?"
182+
4. Wait. Do nothing else. The user will decide.
183+
This tool is not designed to fix itself. Fail fast, report quickly.
184+
185+
## RESPONSE STYLE RULE — CONVERSATIONAL PLAIN ENGLISH:
186+
Always respond in natural sentences, like a helpful colleague would.
187+
- NEVER dump raw tool output, JSON, tables of IDs, or code blocks in your reply.
188+
- Summarize what you found in 1-3 plain sentences.
189+
- If a command found issues: say how many and what kind.
190+
- If everything is fine: say so briefly.
191+
- Details go in the tool result panel; your words give the meaning.
192+
Example good: "Audit found 3 issues: LEDGER.md is missing and 2 requirements lack tests."
193+
Example bad: "The tool returned: [\u2717] LEDGER.md MISSING, [\u2717] REQ-001 uncovered..."
194+
176195
You are an AEE-integrated specsmith agent for this project.
177196
178197
## Project Governance
@@ -281,6 +300,7 @@ def __init__(
281300
self._skills: list[Skill] = load_skills(Path(self.project_dir))
282301
self._hooks = HookRegistry()
283302
self._system_prompt = ""
303+
self._hard_stop: bool = False # set True when a critical tool crash is detected
284304

285305
# Execution profile — loaded from scaffold.yml at session start
286306
from specsmith import profiles
@@ -379,6 +399,76 @@ def _has_non_english(self, text: str) -> bool:
379399
hits = len(self._NON_ASCII_BLOCKS.findall(text))
380400
return hits > 5 and (hits / max(len(text), 1)) > 0.05
381401

402+
# ---- Critical error patterns that trigger a hard stop ----
403+
# Signatures that mark tool output as an unexpected crash. Each entry is one
# regex alternative; they are OR-ed together into a single compiled pattern.
# NOTE(review): IGNORECASE applies to every alternative, so exception names
# also match case-insensitively — confirm that is intended.
_CRITICAL_PATTERNS = re.compile(
    "|".join(
        (
            r"Traceback \(most recent call last\)",
            r"\[ERROR\]",
            r"UnicodeDecodeError",
            r"UnicodeEncodeError",
            r"ImportError",
            r"ModuleNotFoundError",
            r"AttributeError: '",
            r"TypeError: unsupported",
            r"PermissionError",
            r"OSError: ",
            r"RuntimeError: ",
        )
    ),
    flags=re.IGNORECASE,
)
417+
418+
@staticmethod
def _is_critical_error(output: str) -> bool:
    """Report whether tool output carries the signature of an unexpected crash.

    Ordinary governance failures (audit findings, missing files) do not
    count as critical; only output matching ``_CRITICAL_PATTERNS`` does.

    Args:
        output: Text produced by a tool; may be empty.

    Returns:
        True when ``output`` is non-empty and matches one of the critical
        patterns, otherwise False.
    """
    # A non-zero exit status by itself is routine (e.g. an audit that found
    # issues), so the decision rests solely on the crash signatures.
    return bool(output) and AgentRunner._CRITICAL_PATTERNS.search(output) is not None
430+
431+
def _collect_diagnostics(self, tool_name: str, output: str) -> dict:
    """Collect diagnostic context for a crash report.

    Args:
        tool_name: Name of the tool whose output triggered the report.
        output: Raw tool output containing the error.

    Returns:
        A dict with the keys ``tool``, ``summary`` (at most 200 characters),
        ``detail`` (first 4000 characters of ``output``),
        ``specsmith_version``, ``python_version``, ``os_info``,
        ``project_type`` (empty string when it cannot be determined) and
        ``repo``.
    """
    import platform as _platform
    import sys as _sys

    from specsmith import __version__ as _ver

    # Best-effort lookup of the project type from scaffold.yml. Any failure
    # here (missing PyYAML, unreadable or malformed file, non-mapping YAML)
    # is deliberately swallowed: building the crash report must never raise.
    project_type = ""
    try:
        import yaml as _yaml

        sf = Path(self.project_dir) / "scaffold.yml"
        if sf.exists():
            raw = _yaml.safe_load(sf.read_text(encoding="utf-8")) or {}
            project_type = str(raw.get("type", ""))
    except Exception:  # noqa: BLE001
        pass

    # Classify repo: Python exceptions from specsmith module → specsmith CLI
    # Extension/bridge errors would never reach here (they don't use this runner)
    repo = "specsmith"

    # Extract the last line that looks like an exception report. Tracebacks
    # print non-builtin exceptions with their module path
    # ("pkg.mod.SomeError: msg"), so dots are allowed in the name; names
    # ending in "Exception" are accepted as well. The pattern is anchored
    # at the start of the (stripped) line via ``match``.
    lines = output.strip().splitlines()
    _err_pat = re.compile(r"[\w.]+Error|[\w.]*Exception")
    summary_line = next(
        (ln.strip() for ln in reversed(lines) if _err_pat.match(ln.strip())),
        # No exception-looking line: fall back to the first output line.
        lines[0] if lines else "Unknown error",
    )[:200]

    return {
        "tool": tool_name,
        "summary": summary_line,
        "detail": output[:4000],
        "specsmith_version": _ver,
        "python_version": _sys.version.split()[0],
        "os_info": f"{_platform.system()} {_platform.release()}",
        "project_type": project_type,
        "repo": repo,
    }
471+
382472
def _agent_turn(self, user_input: str, silent: bool = False) -> str:
383473
"""Execute one user→agent turn with tool loop."""
384474
# Inject a lightweight English-only reminder into every user message.
@@ -443,9 +533,15 @@ def _agent_turn(self, user_input: str, silent: bool = False) -> str:
443533
break
444534

445535
# Process tool calls
536+
self._hard_stop = False # reset before each batch
446537
tool_results = self._execute_tool_calls(response.tool_calls, silent=silent)
447538
self._state.tool_calls_made += len(tool_results)
448539

540+
# Fail fast: a critical tool crash was detected — break immediately
541+
# without sending the error back to the LLM (which would try to fix it).
542+
if self._hard_stop:
543+
break
544+
449545
# Add assistant message with tool calls
450546
self._state.messages.append(
451547
Message(
@@ -543,7 +639,12 @@ def _call_provider(self, messages: list[Message], silent: bool = False) -> Compl
543639
def _execute_tool_calls(
544640
self, tool_calls: list[dict[str, Any]], silent: bool = False
545641
) -> list[ToolResult]:
546-
"""Execute tool calls and return results."""
642+
"""Execute tool calls and return results.
643+
644+
Sets ``self._hard_stop = True`` if any tool produces a critical error
645+
(Python exception, import error, etc.) so the caller can break the
646+
agentic loop immediately without sending the error to the LLM.
647+
"""
547648
from specsmith import profiles as _profiles
548649

549650
results: list[ToolResult] = []
@@ -669,6 +770,22 @@ def _execute_tool_calls(
669770

670771
elapsed = time.time() * 1000 - start_ms
671772

773+
# ---- Fail-fast: detect critical errors -------------------------
774+
# A critical error is an unexpected crash (Python exception, import
775+
# failure, etc.) — NOT a normal governance failure (audit issues,
776+
# missing files) which the LLM should describe conversationally.
777+
if self._is_critical_error(output):
778+
self._hard_stop = True
779+
diagnostics = self._collect_diagnostics(name, output)
780+
if not silent and self._json_events:
781+
self._emit_event(type="tool_crash", **diagnostics)
782+
elif not silent:
783+
self._print(
784+
f"\n[CRITICAL ERROR in {name}] "
785+
f"{diagnostics['summary']}\n"
786+
"Session stopped. Please report this bug."
787+
)
788+
672789
if not silent:
673790
if self._json_events:
674791
self._emit_event(type="tool_finished", name=name, result=output, is_error=error)

0 commit comments

Comments
 (0)