Bump version to 1.15.1. Clean up and extend agent tools. Small fix to discovery error handling.

monoxgas · monoxgas · commit 55072bb9d5e9 · 2025-10-28T01:01:29.000-06:00
diff --git a/dreadnode/agent/tools/__init__.py b/dreadnode/agent/tools/__init__.py
@@ -16,7 +16,7 @@
 )
 
 if t.TYPE_CHECKING:
-    from dreadnode.agent.tools import fs, planning, reporting, tasking
+    from dreadnode.agent.tools import execute, fs, memory, planning, reporting, tasking
 
 __all__ = [
     "AnyTool",
@@ -28,15 +28,17 @@
     "ToolMode",
     "Toolset",
     "discover_tools_on_obj",
+    "execute",
     "fs",
+    "memory",
     "planning",
     "reporting",
     "tasking",
     "tool",
     "tool_method",
 ]
 
-__lazy_submodules__: list[str] = ["fs", "planning", "reporting", "tasking"]
+__lazy_submodules__: list[str] = ["fs", "planning", "reporting", "tasking", "execute", "memory"]
 __lazy_components__: dict[str, str] = {}
 
 
diff --git a/dreadnode/agent/tools/execute.py b/dreadnode/agent/tools/execute.py
@@ -0,0 +1,111 @@
+import asyncio
+import contextlib
+import sys
+
+from loguru import logger
+
+from dreadnode.agent.tools.base import tool
+
+
+@tool(catch=True)
+async def command(
+    cmd: list[str],
+    *,
+    timeout: int = 120,
+    cwd: str | None = None,
+    env: dict[str, str] | None = None,
+) -> str:
+    """
+    Execute a program directly, without a shell.
+
+    Use this tool to run system utilities and command-line programs (e.g., `ls`, `cat`, `grep`). \
+    It is designed for straightforward, single-shot operations and returns the combined output and error streams.
+
+    ## Best Practices
+    - Argument Format: The command and its arguments *must* be provided as a \
+    list of strings (e.g., `["ls", "-la", "/tmp"]`), not as a single string.
+    - No Shell Syntax: Does not use a shell. Features like pipes (`|`), \
+    redirection (`>`), and variable expansion (`$VAR`) are not supported.
+    - Error on Failure: The tool will raise a `RuntimeError` if the command returns a non-zero exit code, or a `TimeoutError` if it runs longer than `timeout`.
+
+    Args:
+        cmd: The command to execute, provided as a list of strings.
+        timeout: Maximum time in seconds to allow for command execution.
+        cwd: The working directory in which to execute the command.
+        env: Optional environment for the command. If provided, it replaces the inherited environment instead of extending it.
+    """
+    command_str = " ".join(cmd)  # bound outside `try` so the handlers below can always use it
+    logger.debug(f"Executing '{command_str}'")
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            *cmd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+            env=env,
+            cwd=cwd,
+        )
+        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
+        output = stdout.decode(errors="ignore") + stderr.decode(errors="ignore")  # binary-safe
+    except asyncio.TimeoutError as e:
+        logger.warning(f"Command '{command_str}' timed out after {timeout} seconds.")
+        with contextlib.suppress(OSError):
+            proc.kill()
+        await proc.wait()  # reap the killed child so it does not linger as a zombie
+        raise TimeoutError(f"Command timed out after {timeout} seconds") from e
+    except Exception as e:
+        logger.error(f"Error executing '{command_str}': {e}")
+        raise
+
+    if proc.returncode != 0:
+        logger.error(f"Command '{command_str}' failed with return code {proc.returncode}: {output}")
+        raise RuntimeError(f"Command failed ({proc.returncode}): {output}")
+    logger.debug(f"Command '{command_str}':\n{output}")
+    return output
+
+
+@tool(catch=True)
+async def python(code: str, *, timeout: int = 120) -> str:
+    """
+    Execute Python code.
+
+    This tool is ideal for tasks that require custom logic like loops and conditionals, \
+    or for parsing and transforming the output from other tools. Use it to implement a \
+    sequence of actions, perform file I/O, or create functionality not covered by other \
+    available tools.
+
+    ## Best Practices
+    - Capture Output: Your script *must* print results to standard output (`print(...)`) to be captured.
+    - Self-Contained: Import all required standard libraries (e.g., `os`, `json`) within the script.
+    - Handle Errors: Write robust code. Unhandled exceptions in your script will cause the tool to fail.
+    - String-Based I/O: Ensure all printed output can be represented as a string. Use formats like JSON (`json.dumps`) for complex data.
+
+    Args:
+        code: The Python code to execute as a string.
+        timeout: Maximum time in seconds to allow for code execution.
+    """
+    try:
+        logger.debug(f"Executing python:\n{code}")
+        proc = await asyncio.create_subprocess_exec(
+            *[sys.executable, "-"],  # "-" tells the interpreter to read the program from stdin
+            stdin=asyncio.subprocess.PIPE,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        source = code.encode("utf-8")
+        stdout, stderr = await asyncio.wait_for(proc.communicate(input=source), timeout=timeout)
+        output = stdout.decode(errors="ignore") + stderr.decode(errors="ignore")
+    except asyncio.TimeoutError as e:
+        with contextlib.suppress(ProcessLookupError):
+            proc.kill()
+        await proc.wait()  # reap the killed child so it does not linger as a zombie
+        raise TimeoutError(f"Execution timed out after {timeout} seconds") from e
+    except Exception as e:
+        logger.error(f"Error executing code in Python: {e}")
+        raise
+
+    if proc.returncode != 0:
+        logger.error(f"Execution failed with return code {proc.returncode}:\n{output}")
+        raise RuntimeError(f"Execution failed ({proc.returncode}):\n{output}")
+
+    logger.debug(f"Execution successful. Output:\n{output}")
+    return output
diff --git a/dreadnode/agent/tools/memory.py b/dreadnode/agent/tools/memory.py
@@ -0,0 +1,56 @@
+import typing as t
+
+from pydantic import PrivateAttr
+
+from dreadnode.agent.tools import Toolset, tool_method
+
+
+class Memory(Toolset):
+    """
+    Provides a stateful, in-memory key-value store for the toolset's lifetime.
+
+    This toolset allows the agent to save, retrieve, and manage data, enabling it to
+    remember information across multiple steps and tool calls.
+    """
+
+    _memory: dict[str, str] = PrivateAttr(default_factory=dict)
+
+    @tool_method
+    def save_memory(
+        self,
+        key: t.Annotated[str, "The unique key to store the value under."],
+        value: t.Annotated[str, "The string value to store in memory."],
+    ) -> str:
+        """Saves a value to memory with the specified key, overwriting any existing value."""
+        self._memory[key] = value
+        return f"Value saved to memory key: '{key}'"
+
+    @tool_method(catch=True)
+    def retrieve_memory(self, key: t.Annotated[str, "The key of the value to retrieve."]) -> str:
+        """Retrieves the value stored under the specified key; errors if the key is unknown."""
+        if key not in self._memory:  # a bare KeyError would surface only the key itself
+            raise KeyError(f"Key '{key}' not found in memory.")
+        return self._memory[key]
+
+    @tool_method
+    def list_memory_keys(self) -> list[str]:
+        """Lists all keys currently stored in memory."""
+        return list(self._memory)
+
+    @tool_method(catch=True)
+    def clear_memory(
+        self,
+        key: t.Annotated[
+            str | None, "The specific key to clear. If not provided, all memory is cleared."
+        ] = None,
+    ) -> str:
+        """Clears a specific key from memory, or clears all memory if no key is provided."""
+        if key is None:
+            self._memory.clear()
+            return "All memory has been cleared."
+
+        if key not in self._memory:
+            return f"Key '{key}' not found in memory. Nothing to clear."
+
+        del self._memory[key]
+        return f"Cleared memory for key: '{key}'"
diff --git a/dreadnode/agent/tools/planning.py b/dreadnode/agent/tools/planning.py
@@ -22,15 +22,14 @@ class TodoItem(BaseModel):
     )
 
 
-@tool
+@tool(catch=True)
 def update_todo(todos: t.Annotated[list[TodoItem], "The full, updated list of todo items."]) -> str:
     """
     Use this tool to create and manage a structured task list for your current session.
     This helps you track progress, organize complex tasks, and demonstrate thoroughness to the user.
     It also helps the user understand the progress of the task and overall progress of their requests.
 
     ## When to Use This Tool
-    Use this tool proactively in these scenarios:
 
     1. Complex multi-step tasks - When a task requires 3 or more distinct steps or actions
     2. Non-trivial and complex tasks - Tasks that require careful planning or multiple operations
@@ -42,7 +41,6 @@ def update_todo(todos: t.Annotated[list[TodoItem], "The full, updated list of to
 
     ## When NOT to Use This Tool
 
-    Skip using this tool when:
     1. There is only a single, straightforward task
     2. The task is trivial and tracking it provides no organizational benefit
     3. The task can be completed in less than 3 trivial steps
@@ -111,3 +109,28 @@ def update_todo(todos: t.Annotated[list[TodoItem], "The full, updated list of to
         f"{status_counts['in_progress']} in progress, "
         f"{status_counts['pending']} pending."
     )
+
+
+@tool
+def think(thought: str) -> None:
+    """
+    Records a thought, reflection, or plan to document your reasoning process.
+
+    This tool acts as your internal monologue, allowing you to articulate your strategy. Use it to:
+    - Break down a complex problem into smaller steps.
+    - Formulate a multi-step plan before you act.
+    - Interpret the results of another tool's output.
+    - Document a change in strategy (self-correction).
+
+    A clear chain of thought is essential for explaining your actions.
+
+    ## Best Practices
+    - Do Not Substitute for Action: After thinking, you must call the appropriate \
+    tool to execute your plan. This tool performs no action on its own.
+    - Do Not Repeat Information: Never use this to repeat the output of other tools. \
+    Use it to state your *conclusion* or *next step* based on that output.
+
+    Args:
+        thought: A clear, concise statement of your thought process or plan.
+    """
+    logger.info(f"Agent thought: {thought}")
diff --git a/dreadnode/agent/tools/reporting.py b/dreadnode/agent/tools/reporting.py
@@ -4,21 +4,21 @@
 from dreadnode.data_types import Markdown
 
 
-@tool
+@tool(catch=True)
 async def highlight_for_review(title: str, interest_level: str, justification: str) -> str:
     """
-    Flags a potential area of interest for a human operator to review.
-
-    This is your primary tool for surfacing leads. Use it when you discover something
-    anomalous, high-value, or potentially vulnerable that warrants human attention.
-
-    `interest_level` should be one of:
-    - "high": Urgent. Potential for immediate impact (e.g., exposed login, sensitive keywords).
-    - "medium": Interesting. Warrants follow-up (e.g., dev subdomain, unusual tech stack).
-    - "low": Informational. Good context but not an immediate priority (e.g., interesting directory found).
-
-    `justification` should be a structured technical markdown explanation of *why* this is
-    interesting and what the potential next steps for a human could be.
+    Flag a finding for human review. Use this to surface leads that warrant further investigation.
+
+    This tool is essential for escalating findings that appear anomalous, valuable, or potentially
+    vulnerable. It creates a "lead" for a human operator to pick up.
+
+    Args:
+        title: A brief, descriptive summary of the finding.
+        interest_level: The priority of the finding. Must be one of:
+            - "high": Urgent. Potential for immediate impact or exploitation. (exposed credentials, pre-authentication vulnerability).
+            - "medium": Noteworthy. Suggests a potential weakness or area for deeper investigation. (debug endpoint, verbose error messages, PII exposure).
+            - "low": Informational. Provides useful context but is not an immediate risk. (software version disclosure, interesting file path).
+        justification: A technical, markdown-formatted explanation. Detail *why* the finding is interesting, what its potential impact is, and suggest next steps for a human analyst.
     """
     from dreadnode import log_metric, log_output, tag
 
@@ -32,4 +32,4 @@ async def highlight_for_review(title: str, interest_level: str, justification: s
     log_output("markdown", Markdown(f"# {title} ({interest_level})\n\n{justification}"))
     log_metric("count", 1, mode="count")
 
-    return "Area of interest has been highlighted for human review."
+    return "Highlighted."
diff --git a/dreadnode/agent/tools/tasking.py b/dreadnode/agent/tools/tasking.py
@@ -7,44 +7,52 @@
 @tool
 async def finish_task(success: bool, summary: str) -> None:  # noqa: ARG001, FBT001
     """
-    Mark your task as complete with a success/failure status and markdown summary of actions taken.
+    Concludes the task by reporting a final status and a comprehensive summary.
 
-    ## When to Use This Tool
-    This tool should be called under the following circumstances:
-    1.  **All TODOs are complete**: If you are managing todos, every task in your TODO list has been marked as 'completed'.
-    2.  **No more actions**: You have no further actions to take and have addressed all aspects of the user's request.
-    3.  **Irrecoverable failure**: You have encountered an error that you cannot resolve, and there are no further steps you can take.
-    4.  **Final Summary**: You are ready to provide a comprehensive summary of all actions taken.
-
-    ## When NOT to Use This Tool
-    Do not use this tool if:
-    2.  **You are in the middle of a multi-step process**: The overall task is not yet finished.
-    3.  **A recoverable error has occurred**: You should first attempt to fix the error through all available means.
-    4.  **You are waiting for user feedback**: The task is paused, not finished.
+    This is the **final tool** to call when your planned sequence of actions is complete, \
+    regardless of whether the outcome was successful. Use it when you have no more \
+    steps to take and are ready to present a final report.
 
     ## Best Practices
-    *   **Final Step**: This should be the absolute last tool you call. Once invoked, your task is considered finished.
-    *   **Honest Status**: Accurately report the success or failure of the overall task. If any part of the task failed or was not completed, `success` should be `False`.
-    *   **Comprehensive Summary**: The `summary` should be a complete and detailed markdown-formatted report of everything you did, including steps taken, tools used, and the final outcome. This is your final report to the user.
+    - Honest Status: The `success` flag must accurately reflect the final outcome. \
+    If any part of the task failed or objectives were not met, it must be `False`.
+    - Comprehensive Summary: The `summary` is your final report. It must be a complete, \
+    markdown-formatted document detailing all actions taken, tools used, and the results.
+
+    Args:
+        success: True if the task's objectives were fully met, False otherwise.
+        summary: A complete markdown-formatted report of all actions and outcomes.
     """
     from dreadnode import log_metric
 
     log_func = logger.success if success else logger.warning
     log_func(f"Agent finished the task (success={success})")
-
     log_metric("task_success", success)
 
     raise Finish if success else Fail("Agent marked the task as failed.")
 
 
 @tool
-async def give_up_on_task(reason: str) -> None:  # noqa: ARG001
+async def give_up_on_task(reason: str) -> None:
     """
-    Give up on your task.
+    Aborts the task when you are irrecoverably stuck and cannot make progress.
+
+    This tool is a last resort and should only be used when you have exhausted all \
+    possible strategies and alternative approaches. It signals that you were unable \
+    to complete your assigned process.
+
+    ## Best Practices
+    - Do Not Use for a Failed Outcome: If the `finish_task` tool is available, use it to report failures. \
+    This tool is strictly for when you cannot *finish* your work.
+    - Provide a Clear Justification: The `reason` must clearly explain why you are stuck. \
+    Detail the final obstacle you could not overcome and the approaches you already tried.
+
+    Args:
+        reason: A concise explanation of why you are unable to continue the task.
     """
     from dreadnode import log_metric
 
-    logger.info("Agent gave up on the task")
+    logger.warning(f"Agent gave up on the task: {reason}")
     log_metric("task_give_up", 1)
 
     raise Fail("Agent gave up on the task.")
diff --git a/dreadnode/discovery.py b/dreadnode/discovery.py
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/uv.lock b/uv.lock