Skip to content

Commit fe11a6b

Browse files
committed
kubernetes: decode pod logs with errors=replace to survive torn UTF-8 bytes
The Kubernetes client's default read_namespaced_pod_log path does a strict .decode('utf8') over the full log payload before checking the HTTP status. When a pod with high-volume tqdm progress bars (block glyphs █▉▊▋▌▍▎▏, 3-byte UTF-8) runs with num_proc>1, concurrent writes to the same fd can split a multi-byte glyph across a chunk boundary, leaving an orphaned continuation byte. The strict decode raises UnicodeDecodeError, which bubbles through the log-upload retry wrapper and marks an otherwise-healthy training run as SYSTEM_ERROR. Fix: pass _preload_content=False to get the raw urllib3 response and decode manually with errors="replace". This is applied to both the single-pod (LaunchedKubernetesContainer.get_log) and multi-pod (LaunchedKubernetesJob._get_log_by_pod_key) log-read paths. A warning is logged whenever replacement characters are injected, so the next occurrence is visible in Observe without requiring a separate debug build. The existing "Bad Request" catch for PodInitializing is unaffected: the Kubernetes client's status check runs outside the _preload_content block and still raises ApiException with the correct reason phrase.
1 parent decae68 commit fe11a6b

1 file changed

Lines changed: 26 additions & 3 deletions

File tree

cloud_pipelines_backend/launchers/kubernetes_launchers.py

Lines changed: 26 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -922,7 +922,8 @@ def get_refreshed(self) -> "LaunchedKubernetesContainer":
922922
def get_log(self) -> str:
923923
launcher = self._get_launcher()
924924
core_api_client = k8s_client_lib.CoreV1Api(api_client=launcher._api_client)
925-
return core_api_client.read_namespaced_pod_log(
925+
# _preload_content=False bypasses the kubernetes client's strict .decode('utf8'); see _get_log_by_pod_key.
926+
response = core_api_client.read_namespaced_pod_log(
926927
name=self._pod_name,
927928
namespace=self._namespace,
928929
container=_MAIN_CONTAINER_NAME,
@@ -931,7 +932,17 @@ def get_log(self) -> str:
931932
# HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"PodLogOptions \"task-pod-xxxxx\" is invalid: stream: Forbidden: may not be specified","reason":"Invalid","details":{"name":"task-pod-xxxxx","kind":"PodLogOptions","causes":[{"reason":"FieldValueForbidden","message":"Forbidden: may not be specified","field":"stream"}]},"code":422}
932933
# stream="All",
933934
_request_timeout=launcher._request_timeout,
935+
_preload_content=False,
934936
)
937+
try:
938+
log = response.data.decode("utf-8", errors="replace")
939+
if "\N{REPLACEMENT CHARACTER}" in log:
940+
_logger.warning(
941+
f"Pod log for {self._pod_name} contained invalid UTF-8 bytes; substituted replacement characters."
942+
)
943+
return log
944+
finally:
945+
response.release_conn()
935946

936947
def upload_log(self):
937948
launcher = self._get_launcher()
@@ -1490,14 +1501,26 @@ def _get_log_by_pod_key(self, pod_name: str) -> str | None:
14901501
launcher = self._get_launcher()
14911502
core_api_client = k8s_client_lib.CoreV1Api(api_client=launcher._api_client)
14921503
try:
1493-
log = core_api_client.read_namespaced_pod_log(
1504+
# _preload_content=False bypasses the kubernetes client's strict .decode('utf8'),
1505+
# which would raise UnicodeDecodeError on torn multi-byte chars in pod logs (e.g.
1506+
# tqdm block glyphs split across concurrent-writer chunk boundaries).
1507+
response = core_api_client.read_namespaced_pod_log(
14941508
name=pod_name,
14951509
namespace=self._namespace,
14961510
container=_MAIN_CONTAINER_NAME,
14971511
timestamps=True,
14981512
_request_timeout=launcher._request_timeout,
1513+
_preload_content=False,
14991514
)
1500-
return log
1515+
try:
1516+
log = response.data.decode("utf-8", errors="replace")
1517+
if "\N{REPLACEMENT CHARACTER}" in log:
1518+
_logger.warning(
1519+
f"Pod log for {pod_name} contained invalid UTF-8 bytes; substituted replacement characters."
1520+
)
1521+
return log
1522+
finally:
1523+
response.release_conn()
15011524
except kubernetes.client.exceptions.ApiException as ex:
15021525
if ex.reason == "Bad Request":
15031526
# Kubernetes client raises kubernetes.client.exceptions.ApiException when Pod is still in PodInitializing phase

0 commit comments

Comments
 (0)