[argus] sandbox_audit: raise _MAX_COMMAND_LENGTH 10_000 → 131_072

Nichol4s · Nichol4s · commit 2f114e52edc7 · 2026-04-28T10:40:51.000+02:00
The 10 000-char cap rejects legitimate heredocs that DeerFlow agents use
to write small-to-medium files in one shot — e.g. a 20 KB self-contained
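HTML page produced by a research task. 128 KB is still roughly an order
of magnitude below the typical 2 MB Linux ARG_MAX, and the cap remains
an effective tripwire for base64 payload injection (which would
otherwise produce megabytes-long lines). Note that 131 072 bytes is also
the kernel's per-argument limit (MAX_ARG_STRLEN, with 4 KB pages), so a
command at the cap may still fail with E2BIG if it is passed as a single
argv string.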

Updates the existing length tests to read the constant rather than
hard-coding 10_000 / 10_001, and adds:
  - test_max_length_at_128k — pins the new value
  - test_20kb_heredoc_accepted — realistic positive case from the bug
    that motivated the change

PR-candidate: yes
Upstream-issue: none
Reason: Easy win, well-scoped, with a believable use case in the test.
        Making the limit configurable might be more broadly acceptable
        upstream, but a flat bump is the smallest defensible change.
diff --git a/backend/packages/harness/deerflow/agents/middlewares/sandbox_audit_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/sandbox_audit_middleware.py
@@ -271,11 +271,13 @@ def _append_warn_to_result(self, result: ToolMessage | Command, command: str) ->
     # Input sanitisation
     # ------------------------------------------------------------------
 
-    # Normal bash commands rarely exceed a few hundred characters.  10 000 is
-    # well above any legitimate use case yet a tiny fraction of Linux ARG_MAX.
-    # Anything longer is almost certainly a payload injection or base64-encoded
-    # attack string.
-    _MAX_COMMAND_LENGTH = 10_000
+    # Most bash commands are tiny, but DeerFlow agents legitimately route
+    # heredocs through bash to write small-to-medium files in one shot —
+    # e.g. a 20 KB self-contained HTML page. The previous 10 000-char cap
+    # rejected those. 131 072 (128 KB) is still roughly an order of
+    # magnitude below the typical 2 MB Linux ARG_MAX and remains an
+    # effective tripwire for base64 payload injection.
+    _MAX_COMMAND_LENGTH = 131_072
 
     def _validate_input(self, command: str) -> str | None:
         """Return ``None`` if *command* is acceptable, else a rejection reason."""
diff --git a/backend/tests/test_sandbox_audit_middleware.py b/backend/tests/test_sandbox_audit_middleware.py
@@ -276,13 +276,26 @@ def test_normal_command_accepted(self):
         assert self.mw._validate_input("ls -la") is None
 
     def test_command_at_max_length_accepted(self):
-        cmd = "a" * 10_000
+        cmd = "a" * self.mw._MAX_COMMAND_LENGTH
         assert self.mw._validate_input(cmd) is None
 
     def test_command_exceeding_max_length_rejected(self):
-        cmd = "a" * 10_001
+        cmd = "a" * (self.mw._MAX_COMMAND_LENGTH + 1)
         assert self.mw._validate_input(cmd) == "command too long"
 
+    def test_max_length_at_128k(self):
+        """The cap is 128 KB — large enough to allow heredocs that write
+        small-to-medium files (e.g. a 20 KB HTML page) without rejection,
+        small enough to remain a tripwire for base64 payload injection."""
+        assert self.mw._MAX_COMMAND_LENGTH == 131_072
+
+    def test_20kb_heredoc_accepted(self):
+        """Realistic case: an agent writes a 20 KB self-contained HTML file
+        via a bash heredoc. This was rejected by the old 10 000 cap."""
+        payload = "x" * 20_000
+        heredoc = f"cat <<'EOF' > /tmp/page.html\n{payload}\nEOF"
+        assert self.mw._validate_input(heredoc) is None
+
     def test_null_byte_rejected(self):
         assert self.mw._validate_input("ls\x00; rm -rf /") == "null byte detected"
 
@@ -318,7 +331,7 @@ def test_null_byte_command_blocked_with_reason(self):
         assert "null byte" in result.content.lower()
 
     def test_oversized_command_blocked_with_reason(self):
-        request = _make_request("a" * 10_001)
+        request = _make_request("a" * (self.mw._MAX_COMMAND_LENGTH + 1))
         handler = _make_handler()
         result = self.mw.wrap_tool_call(request, handler)
         assert not handler.called
@@ -339,7 +352,7 @@ def test_none_command_coerced_to_empty(self):
 
     def test_oversized_command_audit_log_truncated(self):
         """Oversized commands should be truncated in audit logs to prevent log amplification."""
-        big_cmd = "x" * 10_001
+        big_cmd = "x" * (self.mw._MAX_COMMAND_LENGTH + 1)
         request = _make_request(big_cmd)
         handler = _make_handler()
         with unittest.mock.patch.object(self.mw, "_write_audit", wraps=self.mw._write_audit) as spy:
@@ -597,7 +610,7 @@ async def test_null_byte_command_blocked_with_reason(self):
 
     @pytest.mark.anyio
     async def test_oversized_command_blocked_with_reason(self):
-        request = _make_request("a" * 10_001)
+        request = _make_request("a" * (SandboxAuditMiddleware._MAX_COMMAND_LENGTH + 1))
         result, called = await self._call_async(request)
         assert not called
         assert isinstance(result, ToolMessage)