|
7 | 7 | import time |
8 | 8 | import uuid |
9 | 9 | from collections.abc import Generator, Sequence |
10 | | -from contextlib import contextmanager, suppress |
| 10 | +from contextlib import contextmanager |
11 | 11 | from dataclasses import dataclass |
12 | 12 | from pathlib import Path |
13 | 13 | from typing import Any |
|
41 | 41 |
|
42 | 42 | logger = logging.getLogger(__name__) |
43 | 43 |
|
| 44 | +POD_DELETE_RETRIES = 3 |
| 45 | +POD_DELETE_RETRY_DELAY_SECONDS = 0.2 |
| 46 | +POD_DELETE_CONFIRM_TIMEOUT_SECONDS = 2.0 |
| 47 | + |
44 | 48 |
|
45 | 49 | def _parse_exit_code(error: str) -> int | None: |
46 | 50 | """Parse the exit code from a Kubernetes exec error channel message.""" |
@@ -75,7 +79,11 @@ def __init__(self) -> None: |
75 | 79 | except config.ConfigException: |
76 | 80 | config.load_kube_config() |
77 | 81 |
|
78 | | - self.v1 = client.CoreV1Api() |
| 82 | + # Keep REST calls on a dedicated ApiClient. kubernetes.stream.stream mutates |
| 83 | + # the ApiClient request path for websocket exec calls, so mixing CRUD and |
| 84 | + # exec traffic on one client can leave later REST calls in a broken state. |
| 85 | + self._rest_api_client = client.ApiClient() |
| 86 | + self.v1 = client.CoreV1Api(api_client=self._rest_api_client) |
79 | 87 | self.namespace = KUBERNETES_EXECUTOR_NAMESPACE |
80 | 88 | self.image = KUBERNETES_EXECUTOR_IMAGE |
81 | 89 | self.service_account = KUBERNETES_EXECUTOR_SERVICE_ACCOUNT |
@@ -263,20 +271,42 @@ def _wait_for_pod_ready(self, pod_name: str, timeout_sec: int = 30) -> None: |
263 | 271 | time.sleep(0.1) |
264 | 272 | raise RuntimeError(f"Pod {pod_name} did not become ready in {timeout_sec} seconds") |
265 | 273 |
|
| 274 | + def _stream_pod_exec( |
| 275 | + self, |
| 276 | + pod_name: str, |
| 277 | + command: list[str], |
| 278 | + *, |
| 279 | + stderr: bool, |
| 280 | + stdin: bool, |
| 281 | + stdout: bool, |
| 282 | + tty: bool, |
| 283 | + preload_content: bool = False, |
| 284 | + ) -> ws_client.WSClient: |
| 285 | + """Run a websocket exec call using an isolated ApiClient instance.""" |
| 286 | + stream_api = client.CoreV1Api(api_client=client.ApiClient()) |
| 287 | + return stream.stream( |
| 288 | + stream_api.connect_get_namespaced_pod_exec, |
| 289 | + pod_name, |
| 290 | + self.namespace, |
| 291 | + command=command, |
| 292 | + stderr=stderr, |
| 293 | + stdin=stdin, |
| 294 | + stdout=stdout, |
| 295 | + tty=tty, |
| 296 | + _preload_content=preload_content, |
| 297 | + ) |
| 298 | + |
266 | 299 | def _upload_tar_to_pod(self, pod_name: str, tar_archive: bytes) -> None: |
267 | 300 | """Upload and extract a tar archive into the pod's workspace.""" |
268 | 301 | logger.info(f"Uploading tar archive ({len(tar_archive)} bytes) to pod {pod_name}") |
269 | 302 | exec_command = ["tar", "-x", "-C", "/workspace"] |
270 | | - resp = stream.stream( |
271 | | - self.v1.connect_get_namespaced_pod_exec, |
| 303 | + resp = self._stream_pod_exec( |
272 | 304 | pod_name, |
273 | | - self.namespace, |
274 | 305 | command=exec_command, |
275 | 306 | stderr=True, |
276 | 307 | stdin=True, |
277 | 308 | stdout=True, |
278 | 309 | tty=False, |
279 | | - _preload_content=False, |
280 | 310 | ) |
281 | 311 |
|
282 | 312 | resp.write_stdin(tar_archive) |
@@ -314,17 +344,17 @@ def _upload_tar_to_pod(self, pod_name: str, tar_archive: bytes) -> None: |
314 | 344 |
|
315 | 345 | def _kill_python_process(self, pod_name: str) -> None: |
316 | 346 | """Kill the Python process running in the pod.""" |
317 | | - with suppress(Exception): |
318 | | - stream.stream( |
319 | | - self.v1.connect_get_namespaced_pod_exec, |
| 347 | + try: |
| 348 | + self._stream_pod_exec( |
320 | 349 | pod_name, |
321 | | - self.namespace, |
322 | 350 | command=["pkill", "-9", "python"], |
323 | 351 | stderr=False, |
324 | 352 | stdin=False, |
325 | 353 | stdout=False, |
326 | 354 | tty=False, |
327 | 355 | ) |
| 356 | + except Exception: |
| 357 | + logger.warning("Failed to kill Python process in pod %s", pod_name, exc_info=True) |
328 | 358 |
|
329 | 359 | @contextmanager |
330 | 360 | def _run_in_pod( |
@@ -369,16 +399,13 @@ def _run_in_pod( |
369 | 399 | start = time.perf_counter() |
370 | 400 | exec_command = ["python", "/workspace/__main__.py"] |
371 | 401 |
|
372 | | - exec_resp = stream.stream( |
373 | | - self.v1.connect_get_namespaced_pod_exec, |
| 402 | + exec_resp = self._stream_pod_exec( |
374 | 403 | pod_name, |
375 | | - self.namespace, |
376 | 404 | command=exec_command, |
377 | 405 | stderr=True, |
378 | 406 | stdin=True, |
379 | 407 | stdout=True, |
380 | 408 | tty=False, |
381 | | - _preload_content=False, |
382 | 409 | ) |
383 | 410 |
|
384 | 411 | yield _KubeExecContext( |
@@ -409,16 +436,13 @@ def _extract_workspace_snapshot(self, pod_name: str) -> tuple[WorkspaceEntry, .. |
409 | 436 | ] |
410 | 437 |
|
411 | 438 | logger.info(f"Starting tar extraction from pod {pod_name}") |
412 | | - resp = stream.stream( |
413 | | - self.v1.connect_get_namespaced_pod_exec, |
| 439 | + resp = self._stream_pod_exec( |
414 | 440 | pod_name, |
415 | | - self.namespace, |
416 | 441 | command=exec_command, |
417 | 442 | stderr=True, |
418 | 443 | stdin=False, |
419 | 444 | stdout=True, |
420 | 445 | tty=False, |
421 | | - _preload_content=False, |
422 | 446 | ) |
423 | 447 |
|
424 | 448 | base64_data = "" |
@@ -485,14 +509,63 @@ def _extract_workspace_snapshot(self, pod_name: str) -> tuple[WorkspaceEntry, .. |
485 | 509 | logger.error(f"Failed to extract workspace snapshot: {e}", exc_info=True) |
486 | 510 | return tuple() |
487 | 511 |
|
| 512 | + def _wait_for_pod_deleted(self, pod_name: str, timeout_sec: float) -> bool: |
| 513 | + deadline = time.time() + timeout_sec |
| 514 | + while time.time() < deadline: |
| 515 | + try: |
| 516 | + self.v1.read_namespaced_pod(pod_name, self.namespace) |
| 517 | + except ApiException as e: |
| 518 | + if e.status == 404: |
| 519 | + return True |
| 520 | + logger.warning( |
| 521 | + "Error while checking pod deletion for %s in namespace %s: %s", |
| 522 | + pod_name, |
| 523 | + self.namespace, |
| 524 | + e, |
| 525 | + ) |
| 526 | + return False |
| 527 | + time.sleep(0.1) |
| 528 | + return False |
| 529 | + |
488 | 530 | def _cleanup_pod(self, pod_name: str) -> None: |
489 | | - """Delete a pod and wait for cleanup.""" |
490 | | - with suppress(ApiException): |
491 | | - self.v1.delete_namespaced_pod( |
492 | | - name=pod_name, |
493 | | - namespace=self.namespace, |
494 | | - body=client.V1DeleteOptions(grace_period_seconds=0), |
495 | | - ) |
| 531 | + """Delete a pod and log any cleanup failures.""" |
| 532 | + for attempt in range(1, POD_DELETE_RETRIES + 1): |
| 533 | + try: |
| 534 | + self.v1.delete_namespaced_pod( |
| 535 | + name=pod_name, |
| 536 | + namespace=self.namespace, |
| 537 | + body=client.V1DeleteOptions(grace_period_seconds=0), |
| 538 | + ) |
| 539 | + except ApiException as e: |
| 540 | + if e.status == 404: |
| 541 | + return |
| 542 | + logger.warning( |
| 543 | + "Failed to delete pod %s in namespace %s on attempt %s/%s: %s", |
| 544 | + pod_name, |
| 545 | + self.namespace, |
| 546 | + attempt, |
| 547 | + POD_DELETE_RETRIES, |
| 548 | + e, |
| 549 | + ) |
| 550 | + else: |
| 551 | + if self._wait_for_pod_deleted(pod_name, POD_DELETE_CONFIRM_TIMEOUT_SECONDS): |
| 552 | + return |
| 553 | + logger.warning( |
| 554 | + "Pod %s still exists after delete request on attempt %s/%s", |
| 555 | + pod_name, |
| 556 | + attempt, |
| 557 | + POD_DELETE_RETRIES, |
| 558 | + ) |
| 559 | + |
| 560 | + if attempt < POD_DELETE_RETRIES: |
| 561 | + time.sleep(POD_DELETE_RETRY_DELAY_SECONDS * attempt) |
| 562 | + |
| 563 | + logger.error( |
| 564 | + "Failed to confirm deletion of pod %s in namespace %s after %s attempts", |
| 565 | + pod_name, |
| 566 | + self.namespace, |
| 567 | + POD_DELETE_RETRIES, |
| 568 | + ) |
496 | 569 |
|
497 | 570 | def execute_python( |
498 | 571 | self, |
|
0 commit comments