Skip to content

Commit 730340e

Browse files
committed
Test 1
1 parent 15919ec commit 730340e

2 files changed

Lines changed: 147 additions & 4 deletions

File tree

code-interpreter/app/services/executor_kubernetes.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -344,12 +344,15 @@ def execute_python(
344344
_preload_content=False,
345345
)
346346

347-
# Write tar archive to stdin - write_stdin handles the channel protocol
347+
# Write tar archive to stdin as raw bytes — using bytes ensures the
348+
# kubernetes client sends via binary WebSocket frames, avoiding
349+
# text encoding corruption (latin-1 → UTF-8 round-trip mangles
350+
# any byte >= 0x80 in the tar archive).
348351
logger.debug("Writing tar archive to stdin")
349-
resp.write_stdin(tar_archive.decode("latin-1"))
350-
# Close stdin by writing empty string
352+
resp.write_stdin(tar_archive)
353+
# Signal end of input
351354
logger.debug("Closing stdin")
352-
resp.write_stdin("")
355+
resp.write_stdin(b"")
353356

354357
# Wait for tar extraction to complete by reading until the stream closes
355358
logger.debug("Waiting for tar extraction to complete")
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Tests that binary files with high bytes (>= 0x80) survive the tar pipeline.
2+
3+
Regression test for a bug where the Kubernetes executor decoded tar archives
4+
as latin-1 text before sending through a WebSocket. The WebSocket re-encoded
5+
as UTF-8, corrupting any byte >= 0x80 (single latin-1 bytes became multi-byte
6+
UTF-8 sequences), which produced "tar: Skipping to next header" errors.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import io
12+
import tarfile
13+
from unittest.mock import MagicMock, patch
14+
15+
import pytest
16+
17+
from app.services.executor_kubernetes import KubernetesExecutor
18+
19+
20+
@pytest.fixture()
21+
def executor() -> KubernetesExecutor:
22+
"""Create a KubernetesExecutor without calling __init__ (no cluster needed)."""
23+
inst = KubernetesExecutor.__new__(KubernetesExecutor)
24+
inst.v1 = MagicMock()
25+
inst.namespace = "test"
26+
inst.image = "test:latest"
27+
inst.service_account = ""
28+
return inst
29+
30+
31+
def _mock_pod_running(v1_mock: MagicMock) -> None:
32+
"""Make read_namespaced_pod return a Running pod."""
33+
pod_mock = MagicMock()
34+
pod_mock.status.phase = "Running"
35+
v1_mock.read_namespaced_pod.return_value = pod_mock
36+
37+
38+
def _mock_stream_resp() -> MagicMock:
39+
"""Create a mock WebSocket stream response that closes after one iteration."""
40+
resp = MagicMock()
41+
# is_open returns True once (to enter the loop), then False
42+
resp.is_open.side_effect = [True, False]
43+
resp.peek_stdout.return_value = False
44+
resp.peek_stderr.return_value = False
45+
# Return a Success status on the error channel (= command completed OK)
46+
resp.read_channel.return_value = "{'status': 'Success'}"
47+
return resp
48+
49+
50+
def test_write_stdin_receives_bytes_not_string(
51+
executor: KubernetesExecutor,
52+
) -> None:
53+
"""The critical assertion: write_stdin must be called with raw bytes,
54+
not a latin-1 decoded string. Passing a string causes the WebSocket
55+
text frame to re-encode as UTF-8, corrupting bytes >= 0x80.
56+
"""
57+
_mock_pod_running(executor.v1)
58+
59+
binary_content = bytes(range(0x80, 0x100))
60+
61+
# Two stream calls: first for tar extraction, second for python execution
62+
tar_resp = _mock_stream_resp()
63+
exec_resp = _mock_stream_resp()
64+
65+
with patch("app.services.executor_kubernetes.stream.stream") as mock_stream:
66+
mock_stream.side_effect = [tar_resp, exec_resp]
67+
68+
executor.execute_python(
69+
code="print('hello')",
70+
stdin=None,
71+
timeout_ms=5000,
72+
max_output_bytes=1024,
73+
files=[("data.bin", binary_content)],
74+
)
75+
76+
# Find the write_stdin calls on the tar extraction stream
77+
write_calls = tar_resp.write_stdin.call_args_list
78+
assert len(write_calls) >= 1, "write_stdin was never called"
79+
80+
# The first call should be the tar archive data
81+
tar_data_arg = write_calls[0][0][0]
82+
83+
assert isinstance(tar_data_arg, bytes), (
84+
f"write_stdin was called with {type(tar_data_arg).__name__}, expected bytes. "
85+
f"Passing a string causes UTF-8 re-encoding which corrupts binary tar data."
86+
)
87+
88+
# Verify the tar archive is valid and contains our binary file
89+
with tarfile.open(fileobj=io.BytesIO(tar_data_arg), mode="r") as tar:
90+
member = next(m for m in tar.getmembers() if m.name == "data.bin")
91+
extracted = tar.extractfile(member)
92+
assert extracted is not None
93+
assert extracted.read() == binary_content
94+
95+
96+
def test_write_stdin_string_causes_tar_corruption(
97+
executor: KubernetesExecutor,
98+
) -> None:
99+
"""Demonstrate that passing a latin-1 decoded string through a UTF-8
100+
WebSocket would produce a different (corrupted) byte sequence.
101+
"""
102+
binary_content = bytes(range(0x80, 0x100))
103+
tar_bytes = executor._create_tar_archive(
104+
code="print('hello')",
105+
files=[("data.bin", binary_content)],
106+
last_line_interactive=False,
107+
)
108+
109+
# This is what the old code did: decode as latin-1 to make a string
110+
as_string = tar_bytes.decode("latin-1")
111+
112+
# The WebSocket text frame encodes strings as UTF-8
113+
after_websocket = as_string.encode("utf-8")
114+
115+
# The byte sequences differ — this IS the corruption
116+
assert after_websocket != tar_bytes, (
117+
"latin-1→UTF-8 round-trip should corrupt bytes >= 0x80"
118+
)
119+
assert len(after_websocket) > len(tar_bytes), (
120+
"UTF-8 encoding expands bytes >= 0x80 into multi-byte sequences"
121+
)
122+
123+
124+
def test_ascii_only_tar_unaffected_by_encoding(
125+
executor: KubernetesExecutor,
126+
) -> None:
127+
"""ASCII-only archives survive latin-1→UTF-8, explaining why the bug
128+
only triggered with binary file uploads.
129+
"""
130+
tar_bytes = executor._create_tar_archive(
131+
code="print('hello')",
132+
files=[("readme.txt", b"just ascii\n")],
133+
last_line_interactive=False,
134+
)
135+
136+
roundtripped = tar_bytes.decode("latin-1").encode("utf-8")
137+
138+
assert roundtripped == tar_bytes, (
139+
"ASCII-only tar archives should be unaffected by latin-1→UTF-8"
140+
)

0 commit comments

Comments
 (0)