review comments

akihikokuroda · akihikokuroda · commit cfe2a64a4a61 · 2026-05-12T15:51:26.000-04:00
Signed-off-by: Akihiko Kuroda <akihikokuroda2020@gmail.com>
diff --git a/docs/examples/tools/shell_example.py b/docs/examples/tools/shell_example.py
@@ -84,6 +84,10 @@ def example_3_llm_with_forced_tool_use(m: MelleaSession) -> None:
 
     This mirrors the Python interpreter pattern: ask the LLM to generate
     a bash command, force it to use the tool, then execute the command.
+
+    Requirements:
+        - Ollama running locally (or compatible LLM configured)
+        - Run: ollama serve
     """
     print("=== Example 3: LLM-Generated Bash Commands with Forced Tool Use ===")
 
@@ -99,15 +103,32 @@ def example_3_llm_with_forced_tool_use(m: MelleaSession) -> None:
     if result.tool_calls is None:
         raise ValueError("Expected tool_calls but got None")
 
+    if "unsafe_local_bash_executor" not in result.tool_calls:
+        available_tools = list(result.tool_calls.keys())
+        raise ValueError(
+            f"Expected tool 'unsafe_local_bash_executor' in tool_calls, "
+            f"but got: {available_tools}"
+        )
+
     # Extract the bash command the LLM generated
-    command = result.tool_calls["unsafe_local_bash_executor"].args["command"]
+    tool_call = result.tool_calls["unsafe_local_bash_executor"]
+    if "command" not in tool_call.args:
+        raise ValueError(
+            f"Expected 'command' argument in tool call args, "
+            f"but got: {list(tool_call.args.keys())}"
+        )
+
+    command = tool_call.args["command"]
     print(f"LLM generated bash command:\n  {command}\n")
 
     # Execute the command
-    exec_result = result.tool_calls["unsafe_local_bash_executor"].call_func()
+    exec_result = tool_call.call_func()
 
     print("Execution result:")
     print(f"  Success: {exec_result.success}")
+    print(f"  Skipped: {exec_result.skipped}")
+    if exec_result.skip_message:
+        print(f"  Skip reason: {exec_result.skip_message}")
     print(f"  Output: {exec_result.stdout}")
     if exec_result.stderr:
         print(f"  Error: {exec_result.stderr}")
@@ -202,10 +223,15 @@ def example_5_error_handling() -> None:
     example_1_direct_execution()
     example_2_wrapped_as_tool()
 
-    # Example 3 requires a running Mellea session with LLM (Ollama recommended)
-    # Uncomment to run with LLM:
-    # m = start_session()
-    # example_3_llm_with_forced_tool_use(m)
+    # Example 3: Run with LLM-based tool calling (requires Ollama or compatible LLM)
+    # Uncomment these lines to test LLM-generated commands:
+    # try:
+    #     m = start_session()
+    #     example_3_llm_with_forced_tool_use(m)
+    # except Exception as e:
+    #     print(f"Example 3 skipped: {e!s}")
+    #     print("  Requires: Ollama running locally or compatible LLM configured")
+    #     print("  See: https://docs.ollama.ai/")
 
     example_3_with_working_dir()
     example_4_safety_features()
diff --git a/mellea/stdlib/tools/shell.py b/mellea/stdlib/tools/shell.py
@@ -9,6 +9,12 @@
 ``working_dir`` and ``allowed_paths``. The top-level ``bash_executor`` (recommended
 for production) and ``unsafe_local_bash_executor`` (development-only) functions are
 ready to be wrapped as ``MelleaTool`` instances for ReACT or other agentic loops.
+
+Security note: The denylist covers inline code execution (e.g., bash -c, python -c) and
+dangerous commands in argv. However, it does not prevent execution of pre-existing
+script files (e.g., bash script.sh, python script.py), which can execute arbitrary
+code from the file. For untrusted inputs, ensure that script files are either absent
+or come from a trusted source.
 """
 
 import shlex
@@ -138,10 +144,18 @@ def _is_dangerous_command(argv: list[str]) -> tuple[bool, str]:
             )
 
     # Check if any argument is a dangerous command (e.g., env sudo, timeout sudo)
-    # Only check arguments that are not paths (don't contain / or are not flag values)
+    # Only check positional arguments that are not paths or flag values.
+    # Known value-taking flags that consume the next argument (space-separated only):
+    flag_value_flags = {
+        "-c", "--config", "-f", "--file", "-o", "--output",
+        "-i", "--input", "-d", "--dir", "-p", "--path",
+        "-t", "--timeout", "-w", "--wait",
+    }
     for i, arg in enumerate(argv[1:], start=1):
-        # Skip if this looks like a flag value (argument to a preceding flag)
-        if i > 1 and argv[i - 1].startswith("-"):
+        # Skip if this argument is the value for a preceding flag (space-separated)
+        # E.g., in "timeout -t 10 sudo", skip "10" (it's the value for -t)
+        # But don't skip "sudo" when the flag uses = notation (e.g., --kill-after=1)
+        if i > 1 and argv[i - 1] in flag_value_flags:
             continue
         # Skip if argument contains / (it's a path, not a command name)
         if "/" in arg:
@@ -640,11 +654,11 @@ def execute(self, command: str) -> ExecutionResult:
             )
 
         sandbox_workdir = self.working_dir or "/sandbox"
-        shell_command = " ".join(shlex.quote(arg) for arg in argv)
+        # Pass argv as a list (not shell string) to avoid re-parse and unnecessary quoting
         python_wrapper = (
             "import subprocess\n"
             "import sys\n"
-            f"result = subprocess.run({shell_command!r}, shell=True, cwd={sandbox_workdir!r}, "
+            f"result = subprocess.run({argv!r}, shell=False, cwd={sandbox_workdir!r}, "
             "capture_output=True, text=True)\n"
             "sys.stdout.write(result.stdout)\n"
             "sys.stderr.write(result.stderr)\n"
diff --git a/test/stdlib/tools/test_shell.py b/test/stdlib/tools/test_shell.py
@@ -150,6 +150,17 @@ def test_timeout_with_sudo_rejected(self) -> None:
         assert result.skip_message is not None
         assert "not allowed" in result.skip_message.lower()
 
+    def test_timeout_with_flag_value_and_sudo_rejected(self) -> None:
+        """timeout with --kill-after=value and sudo should be rejected (checks value-taking flags)."""
+        env = StaticBashEnvironment()
+        # Regression test: "sudo" after an =-style flag (--kill-after=1) must not be skipped as a flag value
+        result = env.execute("timeout --kill-after=1 sudo whoami")
+
+        assert result.skipped is True
+        assert result.success is False
+        assert result.skip_message is not None
+        assert "not allowed" in result.skip_message.lower()
+
     def test_python_c_arbitrary_code_rejected(self) -> None:
         """python -c with arbitrary code should be rejected."""
         env = StaticBashEnvironment()
@@ -363,18 +374,16 @@ class TestWorkingDirRestriction:
     """Tests for working directory restrictions."""
 
     def test_working_dir_restriction_blocks_outside_writes(self) -> None:
-        """Writing outside working_dir should be rejected."""
+        """Writing outside working_dir should be rejected by working_dir check."""
         env = StaticBashEnvironment(working_dir="/home/user/project")
-        result = env.execute("touch /var/log/test.log")
+        # Use a safe path that is not in DANGEROUS_PATHS (so working_dir check fires first)
+        result = env.execute("touch /home/other/file.txt")
 
         assert result.skipped is True
         assert result.success is False
-        # Could be blocked by either path restriction or working dir restriction
         assert result.skip_message is not None
-        assert (
-            "not allowed" in result.skip_message.lower()
-            or "outside" in result.skip_message.lower()
-        )
+        # Must be rejected by working_dir check, not dangerous-path check
+        assert "outside" in result.skip_message.lower()
 
     def test_working_dir_allows_inside_writes(self) -> None:
         """Writing inside working_dir should be allowed."""