|
| 1 | +"""Tests that binary files with high bytes (>= 0x80) survive the tar pipeline. |
| 2 | +
|
| 3 | +Regression test for a bug where the Kubernetes executor decoded tar archives |
| 4 | +as latin-1 text before sending through a WebSocket. The WebSocket re-encoded |
| 5 | +as UTF-8, corrupting any byte >= 0x80 (single latin-1 bytes became multi-byte |
| 6 | +UTF-8 sequences), which produced "tar: Skipping to next header" errors. |
| 7 | +""" |
| 8 | + |
| 9 | +from __future__ import annotations |
| 10 | + |
| 11 | +import io |
| 12 | +import tarfile |
| 13 | +from unittest.mock import MagicMock, patch |
| 14 | + |
| 15 | +import pytest |
| 16 | + |
| 17 | +from app.services.executor_kubernetes import KubernetesExecutor |
| 18 | + |
| 19 | + |
| 20 | +@pytest.fixture() |
| 21 | +def executor() -> KubernetesExecutor: |
| 22 | + """Create a KubernetesExecutor without calling __init__ (no cluster needed).""" |
| 23 | + inst = KubernetesExecutor.__new__(KubernetesExecutor) |
| 24 | + inst.v1 = MagicMock() |
| 25 | + inst.namespace = "test" |
| 26 | + inst.image = "test:latest" |
| 27 | + inst.service_account = "" |
| 28 | + return inst |
| 29 | + |
| 30 | + |
| 31 | +def _mock_pod_running(v1_mock: MagicMock) -> None: |
| 32 | + """Make read_namespaced_pod return a Running pod.""" |
| 33 | + pod_mock = MagicMock() |
| 34 | + pod_mock.status.phase = "Running" |
| 35 | + v1_mock.read_namespaced_pod.return_value = pod_mock |
| 36 | + |
| 37 | + |
| 38 | +def _mock_stream_resp() -> MagicMock: |
| 39 | + """Create a mock WebSocket stream response that closes after one iteration.""" |
| 40 | + resp = MagicMock() |
| 41 | + # is_open returns True once (to enter the loop), then False |
| 42 | + resp.is_open.side_effect = [True, False] |
| 43 | + resp.peek_stdout.return_value = False |
| 44 | + resp.peek_stderr.return_value = False |
| 45 | + # Return a Success status on the error channel (= command completed OK) |
| 46 | + resp.read_channel.return_value = "{'status': 'Success'}" |
| 47 | + return resp |
| 48 | + |
| 49 | + |
| 50 | +def test_write_stdin_receives_bytes_not_string( |
| 51 | + executor: KubernetesExecutor, |
| 52 | +) -> None: |
| 53 | + """The critical assertion: write_stdin must be called with raw bytes, |
| 54 | + not a latin-1 decoded string. Passing a string causes the WebSocket |
| 55 | + text frame to re-encode as UTF-8, corrupting bytes >= 0x80. |
| 56 | + """ |
| 57 | + _mock_pod_running(executor.v1) |
| 58 | + |
| 59 | + binary_content = bytes(range(0x80, 0x100)) |
| 60 | + |
| 61 | + # Two stream calls: first for tar extraction, second for python execution |
| 62 | + tar_resp = _mock_stream_resp() |
| 63 | + exec_resp = _mock_stream_resp() |
| 64 | + |
| 65 | + with patch("app.services.executor_kubernetes.stream.stream") as mock_stream: |
| 66 | + mock_stream.side_effect = [tar_resp, exec_resp] |
| 67 | + |
| 68 | + executor.execute_python( |
| 69 | + code="print('hello')", |
| 70 | + stdin=None, |
| 71 | + timeout_ms=5000, |
| 72 | + max_output_bytes=1024, |
| 73 | + files=[("data.bin", binary_content)], |
| 74 | + ) |
| 75 | + |
| 76 | + # Find the write_stdin calls on the tar extraction stream |
| 77 | + write_calls = tar_resp.write_stdin.call_args_list |
| 78 | + assert len(write_calls) >= 1, "write_stdin was never called" |
| 79 | + |
| 80 | + # The first call should be the tar archive data |
| 81 | + tar_data_arg = write_calls[0][0][0] |
| 82 | + |
| 83 | + assert isinstance(tar_data_arg, bytes), ( |
| 84 | + f"write_stdin was called with {type(tar_data_arg).__name__}, expected bytes. " |
| 85 | + f"Passing a string causes UTF-8 re-encoding which corrupts binary tar data." |
| 86 | + ) |
| 87 | + |
| 88 | + # Verify the tar archive is valid and contains our binary file |
| 89 | + with tarfile.open(fileobj=io.BytesIO(tar_data_arg), mode="r") as tar: |
| 90 | + member = next(m for m in tar.getmembers() if m.name == "data.bin") |
| 91 | + extracted = tar.extractfile(member) |
| 92 | + assert extracted is not None |
| 93 | + assert extracted.read() == binary_content |
| 94 | + |
| 95 | + |
| 96 | +def test_write_stdin_string_causes_tar_corruption( |
| 97 | + executor: KubernetesExecutor, |
| 98 | +) -> None: |
| 99 | + """Demonstrate that passing a latin-1 decoded string through a UTF-8 |
| 100 | + WebSocket would produce a different (corrupted) byte sequence. |
| 101 | + """ |
| 102 | + binary_content = bytes(range(0x80, 0x100)) |
| 103 | + tar_bytes = executor._create_tar_archive( |
| 104 | + code="print('hello')", |
| 105 | + files=[("data.bin", binary_content)], |
| 106 | + last_line_interactive=False, |
| 107 | + ) |
| 108 | + |
| 109 | + # This is what the old code did: decode as latin-1 to make a string |
| 110 | + as_string = tar_bytes.decode("latin-1") |
| 111 | + |
| 112 | + # The WebSocket text frame encodes strings as UTF-8 |
| 113 | + after_websocket = as_string.encode("utf-8") |
| 114 | + |
| 115 | + # The byte sequences differ — this IS the corruption |
| 116 | + assert after_websocket != tar_bytes, ( |
| 117 | + "latin-1→UTF-8 round-trip should corrupt bytes >= 0x80" |
| 118 | + ) |
| 119 | + assert len(after_websocket) > len(tar_bytes), ( |
| 120 | + "UTF-8 encoding expands bytes >= 0x80 into multi-byte sequences" |
| 121 | + ) |
| 122 | + |
| 123 | + |
| 124 | +def test_ascii_only_tar_unaffected_by_encoding( |
| 125 | + executor: KubernetesExecutor, |
| 126 | +) -> None: |
| 127 | + """ASCII-only archives survive latin-1→UTF-8, explaining why the bug |
| 128 | + only triggered with binary file uploads. |
| 129 | + """ |
| 130 | + tar_bytes = executor._create_tar_archive( |
| 131 | + code="print('hello')", |
| 132 | + files=[("readme.txt", b"just ascii\n")], |
| 133 | + last_line_interactive=False, |
| 134 | + ) |
| 135 | + |
| 136 | + roundtripped = tar_bytes.decode("latin-1").encode("utf-8") |
| 137 | + |
| 138 | + assert roundtripped == tar_bytes, ( |
| 139 | + "ASCII-only tar archives should be unaffected by latin-1→UTF-8" |
| 140 | + ) |
0 commit comments