|
| 1 | +"""Tests for large-MCP-result spill behavior and env-var passthrough. |
| 2 | +
|
| 3 | +Root cause (confirmed via claude-cli-internal, 2026-03-27): |
| 4 | +Two independent spill layers in the bundled CLI: |
| 5 | +
|
| 6 | + Layer 1 — MCP-specific (mcpValidation.ts) |
| 7 | + Threshold: MAX_MCP_OUTPUT_TOKENS env var, default 25 000 tokens. |
| 8 | + Setting MAX_MCP_OUTPUT_TOKENS=500000 bypasses this layer. |
| 9 | + Output on spill: plain "Error: result exceeds maximum allowed tokens…" |
| 10 | +
|
| 11 | + Layer 2 — generic tool-result (toolResultStorage.ts maybePersistLargeToolResult) |
| 12 | + Threshold: DEFAULT_MAX_RESULT_SIZE_CHARS = 50 000 chars, hardcoded in |
| 13 | + toolLimits.ts. No env var reads this constant. MCPTool declares |
| 14 | + maxResultSizeChars: 100_000 but getPersistenceThreshold clamps it to |
| 15 | + Math.min(100_000, 50_000) = 50 K. |
| 16 | + Output on spill: <persisted-output> tag + 2 KB preview — exactly what |
| 17 | + customers observe. |
| 18 | +
|
| 19 | +Regression timeline: |
| 20 | + PR #13609 (2026-01-06) removed the feature gate → layer 2 always-on for SDK builds. |
| 21 | + Before this gate removal, SDK 0.1.17 / CLI 2.0.18 was unaffected. |
| 22 | + PR #19224 (2026-02-21) lowered the external-build clamp from 100 K → 50 K chars. |
| 23 | +
|
| 24 | +Customer's MAX_MCP_OUTPUT_TOKENS=500000 bypasses layer 1 successfully; the 73 K result |
| 25 | +then hits layer 2's 50 K char wall and produces <persisted-output>. There is currently |
| 26 | +no env var or SDK option to raise the layer-2 threshold — a CLI change is required. |
| 27 | +
|
| 28 | +These tests confirm: |
| 29 | + 1. MAX_MCP_OUTPUT_TOKENS (layer-1 threshold) passes through to the CLI subprocess. |
| 30 | + 2. os.environ values are inherited; options.env overrides them. |
| 31 | + 3. Raising MAX_MCP_OUTPUT_TOKENS alone is NOT sufficient for results over 50 K chars because |
| 32 | + layer 2 is still in the path (documented by the TestLayer2Boundary size-boundary tests). |
| 33 | + 4. The SDK message parser surfaces <persisted-output> content unchanged so callers |
| 34 | + can detect the degraded path and warn users. |
| 35 | +""" |
| 36 | + |
| 37 | +import os |
| 38 | +from unittest.mock import AsyncMock, MagicMock, patch |
| 39 | + |
| 40 | +import anyio |
| 41 | + |
| 42 | +from claude_agent_sdk._internal.message_parser import parse_message |
| 43 | +from claude_agent_sdk._internal.transport.subprocess_cli import SubprocessCLITransport |
| 44 | +from claude_agent_sdk.types import ClaudeAgentOptions, ToolResultBlock, UserMessage |
| 45 | + |
# CLI path handed to ClaudeAgentOptions by make_transport(). _capture_env()
# patches anyio.open_process, so those tests never launch a binary from it.
DEFAULT_CLI_PATH = "/usr/bin/claude"

# Layer-2 (generic tool-result persistence) threshold, in characters, as
# confirmed in claude-cli-internal toolLimits.ts.
_LAYER2_THRESHOLD_CHARS = 50_000
| 50 | + |
| 51 | + |
def make_transport(env: dict | None = None, **kwargs) -> SubprocessCLITransport:
    """Build a SubprocessCLITransport pointed at DEFAULT_CLI_PATH.

    Args:
        env: Value for ``ClaudeAgentOptions.env``. ``None`` (or any other
            falsy value) is replaced by a fresh empty dict.
        **kwargs: Forwarded unchanged to ``ClaudeAgentOptions``.

    Returns:
        A transport constructed with the fixed prompt ``"test"``.
    """
    option_fields = {"cli_path": DEFAULT_CLI_PATH, "env": env if env else {}}
    return SubprocessCLITransport(
        prompt="test",
        options=ClaudeAgentOptions(**option_fields, **kwargs),
    )
| 59 | + |
| 60 | + |
| 61 | +# --------------------------------------------------------------------------- |
| 62 | +# Helpers to capture the env dict passed to anyio.open_process |
| 63 | +# --------------------------------------------------------------------------- |
| 64 | + |
| 65 | + |
def _capture_env(transport: SubprocessCLITransport) -> dict[str, str]:
    """Connect ``transport`` against a mocked process and return the spawn env.

    ``anyio.open_process`` (as referenced from the subprocess_cli module) and
    ``SubprocessCLITransport._check_claude_version`` are both replaced with
    ``AsyncMock`` objects, so no real process is started. The returned dict is
    a copy of the ``env`` keyword argument passed to ``open_process`` (empty
    if that keyword was not supplied).
    """

    async def _connect_and_grab_env() -> dict[str, str]:
        # Stand-in process exposing only the attributes configured below.
        fake_process = MagicMock()
        fake_process.stdin = MagicMock()
        fake_process.stdout = MagicMock()
        fake_process.stderr = None
        fake_process.returncode = None

        open_process_patch = patch(
            "claude_agent_sdk._internal.transport.subprocess_cli.anyio.open_process",
            new_callable=AsyncMock,
            return_value=fake_process,
        )
        version_check_patch = patch(
            "claude_agent_sdk._internal.transport.subprocess_cli.SubprocessCLITransport._check_claude_version",
            new_callable=AsyncMock,
        )
        with open_process_patch as open_process_mock, version_check_patch:
            await transport.connect()
            _, call_kwargs = open_process_mock.call_args
            return dict(call_kwargs.get("env", {}))

    return anyio.run(_connect_and_grab_env)
| 94 | + |
| 95 | + |
| 96 | +# --------------------------------------------------------------------------- |
| 97 | +# 1. MAX_MCP_OUTPUT_TOKENS (layer-1) passthrough |
| 98 | +# --------------------------------------------------------------------------- |
| 99 | + |
| 100 | + |
class TestLayer1EnvPassthrough:
    """MAX_MCP_OUTPUT_TOKENS (the layer-1 knob) must reach the subprocess env as given."""

    def test_max_mcp_output_tokens_reaches_subprocess(self):
        """A MAX_MCP_OUTPUT_TOKENS entry in options.env shows up in the subprocess env.

        The variable governs layer 1 only (mcpValidation.ts, ~25K token default).
        With it raised, a 73K-char result clears layer 1 but still runs into
        layer 2's 50K-char hard limit — see TestLayer2Boundary.
        """
        subprocess_env = _capture_env(
            make_transport(env={"MAX_MCP_OUTPUT_TOKENS": "500000"})
        )
        assert "MAX_MCP_OUTPUT_TOKENS" in subprocess_env, (
            "MAX_MCP_OUTPUT_TOKENS was not passed to the CLI subprocess. "
            "Layer 1 will use its default (~25K tokens) and spill to plain error text."
        )
        assert subprocess_env["MAX_MCP_OUTPUT_TOKENS"] == "500000"

    def test_default_absent_when_not_set(self):
        """With the variable unset everywhere, the SDK injects no default of its own."""
        # patch.dict snapshots os.environ and restores it on exit, so removing
        # the key inside the block cannot leak into other tests.
        with patch.dict(os.environ):
            os.environ.pop("MAX_MCP_OUTPUT_TOKENS", None)
            subprocess_env = _capture_env(make_transport(env={}))
        assert "MAX_MCP_OUTPUT_TOKENS" not in subprocess_env

    def test_arbitrary_threshold_values_pass_through(self):
        """Small, default-sized and very large values are all forwarded verbatim."""
        for threshold in ("1", "25000", "1000000"):
            subprocess_env = _capture_env(
                make_transport(env={"MAX_MCP_OUTPUT_TOKENS": threshold})
            )
            assert subprocess_env.get("MAX_MCP_OUTPUT_TOKENS") == threshold
| 132 | + |
| 133 | + |
| 134 | +# --------------------------------------------------------------------------- |
| 135 | +# 2. os.environ inheritance and options.env precedence |
| 136 | +# --------------------------------------------------------------------------- |
| 137 | + |
| 138 | + |
class TestEnvInheritanceAndPrecedence:
    """How os.environ, options.env and SDK-managed variables combine in the subprocess env."""

    def test_inherited_from_os_environ(self):
        """A MAX_MCP_OUTPUT_TOKENS value present in os.environ at connect() is inherited."""
        ambient = {"MAX_MCP_OUTPUT_TOKENS": "200000"}
        with patch.dict(os.environ, ambient):
            subprocess_env = _capture_env(make_transport(env={}))
        assert subprocess_env.get("MAX_MCP_OUTPUT_TOKENS") == "200000"

    def test_options_env_overrides_os_environ(self):
        """When both define the variable, the options.env value wins."""
        ambient = {"MAX_MCP_OUTPUT_TOKENS": "1000"}
        explicit = {"MAX_MCP_OUTPUT_TOKENS": "500000"}
        with patch.dict(os.environ, ambient):
            subprocess_env = _capture_env(make_transport(env=explicit))
        assert subprocess_env.get("MAX_MCP_OUTPUT_TOKENS") == "500000"

    def test_claudecode_stripped(self):
        """CLAUDECODE is dropped (so a child does not detect a parent CC); other vars survive."""
        ambient = {"CLAUDECODE": "1", "OTHER_VAR": "kept"}
        with patch.dict(os.environ, ambient):
            subprocess_env = _capture_env(make_transport(env={}))
        assert "CLAUDECODE" not in subprocess_env
        assert subprocess_env.get("OTHER_VAR") == "kept"

    def test_sdk_managed_vars_always_set(self):
        """The SDK always stamps its entrypoint and version into the subprocess env."""
        subprocess_env = _capture_env(make_transport(env={}))
        assert subprocess_env.get("CLAUDE_CODE_ENTRYPOINT") == "sdk-py"
        assert "CLAUDE_AGENT_SDK_VERSION" in subprocess_env

    def test_options_env_cannot_override_sdk_version(self):
        """A caller-supplied CLAUDE_AGENT_SDK_VERSION is replaced by the real version."""
        from claude_agent_sdk._version import __version__

        spoofed = {"CLAUDE_AGENT_SDK_VERSION": "0.0.0"}
        subprocess_env = _capture_env(make_transport(env=spoofed))
        assert subprocess_env.get("CLAUDE_AGENT_SDK_VERSION") == __version__
| 174 | + |
| 175 | + |
| 176 | +# --------------------------------------------------------------------------- |
| 177 | +# 3. Layer-2 threshold boundary (documents the unresolved gap) |
| 178 | +# --------------------------------------------------------------------------- |
| 179 | + |
| 180 | + |
class TestLayer2Boundary:
    """Layer 2 (toolResultStorage.ts maybePersistLargeToolResult) spills any result
    exceeding 50 000 chars regardless of MAX_MCP_OUTPUT_TOKENS. There is currently
    no env var or SDK option to raise this threshold — it requires a CLI change.

    The first two tests are arithmetic on constants only: they record the
    threshold and the customer's payload size but never invoke the CLI, so
    they cannot detect a change in the CLI's actual behaviour.
    """

    def test_content_under_50k_can_be_inline(self):
        """A result just below 50K chars is eligible to be passed inline by the CLI.

        Documentation-only check of the threshold constant.
        """
        content = "x" * (_LAYER2_THRESHOLD_CHARS - 1)
        assert len(content) < _LAYER2_THRESHOLD_CHARS

    def test_customer_reproducer_exceeds_layer2_threshold(self):
        """The customer's ~73K-char result exceeds the 50K layer-2 threshold.

        MAX_MCP_OUTPUT_TOKENS=500000 bypasses layer 1 for this result, but it
        then hits layer 2 and produces <persisted-output>. This is the bug.
        A fix requires exposing an env var or CLI flag for the layer-2 threshold.
        """
        customer_content_size = 73_000  # chars in customer's reproducer
        assert customer_content_size > _LAYER2_THRESHOLD_CHARS, (
            f"Customer's {customer_content_size}-char result exceeds the "
            f"{_LAYER2_THRESHOLD_CHARS}-char layer-2 threshold and will be spilled "
            "to a temp file even when MAX_MCP_OUTPUT_TOKENS is raised."
        )

    def test_no_layer2_env_var_exists(self):
        """The SDK does not inject any layer-2 override variable into the subprocess env.

        The fix (Option 3) uses tool annotations instead of an env var:
            ToolAnnotations(maxResultSizeChars=500_000)
        The CLI reads this from the tools/list JSONRPC response and skips the
        Math.min clamp in getPersistenceThreshold for that tool.

        See test_max_result_size_chars_annotation_flows_to_cli in
        test_sdk_mcp_integration.py for SDK-side confirmation.
        """
        layer2_candidates = (
            "MAX_TOOL_RESULT_CHARS",
            "DISABLE_TOOL_RESULT_PERSISTENCE",
        )
        # The subprocess env inherits os.environ, so scrub the candidate names
        # first; otherwise a developer or CI machine that happens to export one
        # of them would fail this test for reasons unrelated to the SDK.
        # patch.dict restores the original environment on exit.
        with patch.dict(os.environ):
            for name in layer2_candidates:
                os.environ.pop(name, None)
            env = _capture_env(
                make_transport(env={"MAX_MCP_OUTPUT_TOKENS": "500000"})
            )
        for name in layer2_candidates:
            assert name not in env, f"SDK unexpectedly injected {name}"
| 224 | + |
| 225 | + |
| 226 | +# --------------------------------------------------------------------------- |
| 227 | +# 4. Message parser: inline vs persisted-output tool results |
| 228 | +# --------------------------------------------------------------------------- |
| 229 | + |
| 230 | + |
def _user_message_with_tool_result(content: str, is_error: bool = False) -> dict:
    """Build a raw ``user`` message dict carrying exactly one ``tool_result`` block.

    Args:
        content: Stored as the tool_result block's ``content``.
        is_error: Stored as the tool_result block's ``is_error`` flag.

    Returns:
        A freshly built dict; the tool_use_id and uuid are fixed placeholders.
    """
    tool_result = {
        "type": "tool_result",
        "tool_use_id": "toolu_01ABC",
        "content": content,
        "is_error": is_error,
    }
    inner_message = {"role": "user", "content": [tool_result]}
    return {
        "type": "user",
        "message": inner_message,
        "parent_tool_use_id": None,
        "tool_use_result": None,
        "uuid": "test-uuid-1234",
    }
| 249 | + |
| 250 | + |
# Small payload, far below the layer-2 threshold — the CLI would pass it inline.
INLINE_CONTENT = "x" * 1000

# Shape of what the CLI emits after a layer-2 spill: a <persisted-output>
# wrapper around a notice and a 2 KB preview of the original result.
# Source: toolResultStorage.ts, PREVIEW_SIZE_BYTES = 2000.
PERSISTED_CONTENT = "".join(
    [
        "<persisted-output>\n",
        "Output too large (73.0KB). Full output saved to: /tmp/.claude/tool-results/abc123.txt\n",
        "\nPreview (first 2KB):\n",
        "x" * 2000,
        "\n...\n</persisted-output>",
    ]
)
| 261 | + |
| 262 | + |
class TestToolResultParsing:
    """parse_message must surface tool_result content and flags to callers unchanged."""

    @staticmethod
    def _parse_single_tool_result(
        content: str, is_error: bool = False
    ) -> ToolResultBlock:
        """Parse a one-tool_result user message and return its only ToolResultBlock.

        Asserting the block count here means a parser regression that drops or
        duplicates the block fails with a clear assertion in every test,
        instead of an IndexError from ``blocks[0]``.
        """
        msg = parse_message(
            _user_message_with_tool_result(content, is_error=is_error)
        )
        assert isinstance(msg, UserMessage)
        blocks = [b for b in msg.content if isinstance(b, ToolResultBlock)]
        assert len(blocks) == 1, (
            f"Expected exactly one ToolResultBlock, got {len(blocks)}"
        )
        return blocks[0]

    def test_inline_content_preserved(self):
        """Full tool-result content is preserved when the CLI passes it inline."""
        block = self._parse_single_tool_result(INLINE_CONTENT)
        assert block.content == INLINE_CONTENT
        assert not str(block.content).startswith("<persisted-output>")

    def test_persisted_output_detectable_by_prefix(self):
        """After a layer-2 spill, content starts with '<persisted-output>' —
        callers can detect this and warn users or raise an error."""
        content = str(self._parse_single_tool_result(PERSISTED_CONTENT).content)
        assert content.startswith("<persisted-output>"), (
            f"Expected persisted-output wrapper, got: {content[:100]!r}"
        )

    def test_persisted_output_is_not_full_content(self):
        """Claude receives only the 2 KB preview, not the original large content."""
        content = str(self._parse_single_tool_result(PERSISTED_CONTENT).content)
        assert len(content) < _LAYER2_THRESHOLD_CHARS, (
            f"Expected preview under {_LAYER2_THRESHOLD_CHARS} chars, got {len(content)}"
        )

    def test_error_tool_result_flagged(self):
        """is_error=True on the wire is reflected on the parsed block."""
        block = self._parse_single_tool_result("tool failed", is_error=True)
        assert block.is_error is True

    def test_normal_tool_result_not_flagged(self):
        """is_error=False on the wire is reflected on the parsed block."""
        block = self._parse_single_tool_result(INLINE_CONTENT, is_error=False)
        assert block.is_error is False
| 310 | + |
| 311 | + |
| 312 | +# --------------------------------------------------------------------------- |
| 313 | +# Utility: recommended caller pattern for detecting the degraded path |
| 314 | +# --------------------------------------------------------------------------- |
| 315 | + |
| 316 | + |
def is_persisted_output(block: "ToolResultBlock") -> bool:
    """Return True if the CLI spilled this tool result to a temp file (layer 2).

    Detection is by the ``<persisted-output>`` prefix the CLI writes in place
    of the original result.

    Args:
        block: Any object exposing a ``content`` attribute. String content is
            checked directly. For list content, the first ``{"type": "text"}``
            entry decides.
            NOTE(review): the list form is assumed to be a list of dict
            content blocks with a ``text`` key — confirm against
            ToolResultBlock's declared content type.

    Returns:
        True only when the relevant text starts with ``<persisted-output>``;
        False for ``None``, empty lists, lists with no text entry, or any
        other content type.
    """
    marker = "<persisted-output>"
    content = block.content

    if isinstance(content, str):
        return content.startswith(marker)

    if isinstance(content, list):
        for item in content:
            # Skip non-text entries (e.g. images); the first text entry is
            # where a spill wrapper would appear.
            if isinstance(item, dict) and item.get("type") == "text":
                text = item.get("text")
                return isinstance(text, str) and text.startswith(marker)

    return False
| 322 | + |
| 323 | + |
class TestPersistedOutputDetectionHelper:
    """is_persisted_output() flags spilled results and leaves inline results alone."""

    def test_helper_detects_persisted(self):
        """A parsed block carrying the <persisted-output> wrapper is detected."""
        parsed = parse_message(_user_message_with_tool_result(PERSISTED_CONTENT))
        assert isinstance(parsed, UserMessage)
        tool_results = [
            item for item in parsed.content if isinstance(item, ToolResultBlock)
        ]
        assert is_persisted_output(tool_results[0])

    def test_helper_passes_inline(self):
        """A parsed block carrying ordinary inline content is not flagged."""
        parsed = parse_message(_user_message_with_tool_result(INLINE_CONTENT))
        assert isinstance(parsed, UserMessage)
        tool_results = [
            item for item in parsed.content if isinstance(item, ToolResultBlock)
        ]
        assert not is_persisted_output(tool_results[0])
0 commit comments