|
3 | 3 | """Generic utility functions that are used in the framework.""" |
4 | 4 |
|
5 | 5 | import base64 |
6 | | -import errno |
7 | 6 | import hashlib |
8 | 7 | import json |
9 | 8 | import logging |
10 | 9 | import os |
11 | 10 | import platform |
12 | 11 | import re |
13 | | -import select |
14 | 12 | import signal |
15 | 13 | import subprocess |
16 | 14 | import time |
@@ -377,46 +375,156 @@ def run_guest_cmd(ssh_connection, cmd, expected, use_json=False): |
377 | 375 | assert stdout == expected |
378 | 376 |
|
379 | 377 |
|
380 | | -def get_process_pidfd(pid): |
381 | | - """Get a pidfd file descriptor for the process with PID `pid` |
| 378 | +def dump_proc_state(pid): |
| 379 | + """Log diagnostic info for a process that failed to exit after SIGKILL.""" |
| 380 | + log = logging.getLogger(__name__) |
382 | 381 |
|
383 | | - Will return a pid file descriptor for the process with PID `pid` if it is |
384 | | - still alive. If the process has already exited we will receive either a |
385 | | - `ProcessLookupError` exception or and an `OSError` exception with errno `EINVAL`. |
386 | | - In these cases, we will return `None`. |
387 | | -
|
388 | | - Any other error while calling the system call, will raise an OSError |
389 | | - exception. |
390 | | - """ |
| 382 | + # Confirm we can still see this PID at all (catches namespace issues). |
391 | 383 | try: |
392 | | - pidfd = os.pidfd_open(pid) |
| 384 | + os.kill(pid, 0) |
| 385 | + log.error("Process %d kill -0: still visible", pid) |
393 | 386 | except ProcessLookupError: |
394 | | - return None |
| 387 | + log.error("Process %d kill -0: ESRCH (gone or invisible from sender)", pid) |
| 388 | + return |
395 | 389 | except OSError as err: |
396 | | - if err.errno == errno.EINVAL: |
397 | | - return None |
398 | | - |
399 | | - raise |
400 | | - |
401 | | - return pidfd |
| 390 | + log.error("Process %d kill -0: errno=%d (%s)", pid, err.errno, err.strerror) |
402 | 391 |
|
403 | | - |
404 | | -def wait_process_termination(p_pid): |
405 | | - """Wait for a process to terminate. |
406 | | -
|
407 | | - Will return successfully if the process |
408 | | - got indeed killed or raises an exception if the process |
409 | | - is still alive after retrying several times. |
| 392 | + try: |
| 393 | + status = Path(f"/proc/{pid}/status").read_text(encoding="utf-8") |
| 394 | + log.error( |
| 395 | + "Process %d did not exit after SIGKILL. /proc/%d/status:\n%s", |
| 396 | + pid, |
| 397 | + pid, |
| 398 | + status, |
| 399 | + ) |
| 400 | + except FileNotFoundError: |
| 401 | + log.error("Process %d exited between timeout and diagnostics collection", pid) |
| 402 | + return |
| 403 | + try: |
| 404 | + cmdline = Path(f"/proc/{pid}/cmdline").read_bytes().decode(errors="replace") |
| 405 | + log.error("Process %d cmdline: %s", pid, cmdline.replace("\x00", " ")) |
| 406 | + except (FileNotFoundError, PermissionError): |
| 407 | + pass |
| 408 | + # /proc/PID/stat field 41 = exit_state (non-zero = task in exit path) |
| 409 | + try: |
| 410 | + stat = Path(f"/proc/{pid}/stat").read_text(encoding="utf-8").strip() |
| 411 | + # comm in stat is parenthesized and may contain spaces; split after last ')' |
| 412 | + rest = stat.rsplit(")", 1)[1].split() |
| 413 | + # field indexing per proc(5): pid(1) comm(2) state(3) ppid(4) ... |
| 414 | + # rest[0] is field 3 onwards. exit_state is field 41 → rest[38] |
| 415 | + if len(rest) > 38: |
| 416 | + log.error("Process %d exit_state (stat[41]): %s", pid, rest[38]) |
| 417 | + except (FileNotFoundError, PermissionError, IndexError, ValueError): |
| 418 | + pass |
| 419 | + # /proc/PID/sched scheduling stats (voluntary_ctxt_switches etc.) |
| 420 | + try: |
| 421 | + sched = Path(f"/proc/{pid}/sched").read_text(encoding="utf-8") |
| 422 | + relevant = [ |
| 423 | + line |
| 424 | + for line in sched.splitlines() |
| 425 | + if any( |
| 426 | + k in line for k in ("voluntary", "exec_runtime", "wait_sum", "switches") |
| 427 | + ) |
| 428 | + ] |
| 429 | + if relevant: |
| 430 | + log.error("Process %d sched:\n%s", pid, "\n".join(relevant)) |
| 431 | + except (FileNotFoundError, PermissionError): |
| 432 | + pass |
| 433 | + # Per-thread state: identify which thread is stuck |
| 434 | + try: |
| 435 | + tids = sorted(int(t) for t in os.listdir(f"/proc/{pid}/task")) |
| 436 | + except FileNotFoundError: |
| 437 | + return |
| 438 | + for tid in tids: |
| 439 | + for fname in ("comm", "wchan", "syscall", "stack"): |
| 440 | + try: |
| 441 | + content = ( |
| 442 | + Path(f"/proc/{pid}/task/{tid}/{fname}") |
| 443 | + .read_text(encoding="utf-8", errors="replace") |
| 444 | + .strip() |
| 445 | + ) |
| 446 | + log.error("Process %d task %d %s:\n%s", pid, tid, fname, content) |
| 447 | + except (FileNotFoundError, PermissionError): |
| 448 | + pass |
| 449 | + try: |
| 450 | + tstatus = Path(f"/proc/{pid}/task/{tid}/status").read_text(encoding="utf-8") |
| 451 | + for line in tstatus.splitlines(): |
| 452 | + if line.startswith(("State:", "SigBlk:", "SigPnd:", "ShdPnd:")): |
| 453 | + log.error("Process %d task %d %s", pid, tid, line.strip()) |
| 454 | + except (FileNotFoundError, PermissionError): |
| 455 | + pass |
| 456 | + |
| 457 | + # Tail of dmesg may show RCU stalls, hung_task, OOM, soft lockups. |
| 458 | + try: |
| 459 | + result = subprocess.run( |
| 460 | + ["dmesg", "--ctime", "--time-format", "iso"], |
| 461 | + capture_output=True, |
| 462 | + text=True, |
| 463 | + timeout=5, |
| 464 | + check=False, |
| 465 | + ) |
| 466 | + tail = "\n".join(result.stdout.splitlines()[-50:]) |
| 467 | + if tail: |
| 468 | + log.error("dmesg tail (last 50 lines):\n%s", tail) |
| 469 | + except (FileNotFoundError, subprocess.TimeoutExpired, OSError): |
| 470 | + pass |
| 471 | + |
| 472 | + # All processes parented to our test session (the subreaper) — reveals |
| 473 | + # any sibling processes (UFFD handlers, vhost-user backends, leftover FCs) |
| 474 | + # that might be keeping the stuck FC's resources alive. |
| 475 | + try: |
| 476 | + result = subprocess.run( |
| 477 | + ["ps", "axo", "pid,ppid,pgid,sid,stat,wchan,comm"], |
| 478 | + capture_output=True, |
| 479 | + text=True, |
| 480 | + timeout=5, |
| 481 | + check=False, |
| 482 | + ) |
| 483 | + own_pid = os.getpid() |
| 484 | + related = [result.stdout.splitlines()[0]] |
| 485 | + for line in result.stdout.splitlines()[1:]: |
| 486 | + fields = line.split(maxsplit=6) |
| 487 | + if len(fields) < 2: |
| 488 | + continue |
| 489 | + try: |
| 490 | + ppid = int(fields[1]) |
| 491 | + except ValueError: |
| 492 | + continue |
| 493 | + if ppid in (own_pid, pid): |
| 494 | + related.append(line) |
| 495 | + if len(related) > 1: |
| 496 | + log.error( |
| 497 | + "Processes parented to test worker %d or stuck FC %d:\n%s", |
| 498 | + own_pid, |
| 499 | + pid, |
| 500 | + "\n".join(related), |
| 501 | + ) |
| 502 | + except (FileNotFoundError, subprocess.TimeoutExpired, OSError): |
| 503 | + pass |
| 504 | + |
| 505 | + |
| 506 | +def wait_process_termination(p_pid, timeout=10.0): |
| 507 | + """Wait for a process to terminate via waitpid(). |
| 508 | +
|
| 509 | + Requires the test session to be a subreaper (set via PR_SET_CHILD_SUBREAPER |
| 510 | + in conftest.py) so orphaned descendants reparent to us. Polls with WNOHANG |
| 511 | + so we can enforce a timeout. |
| 512 | +
|
| 513 | + Returns successfully if the process exits within `timeout` seconds. |
| 514 | + Raises TimeoutError if it does not. |
410 | 515 | """ |
411 | | - pidfd = get_process_pidfd(p_pid) |
412 | | - |
413 | | - # If pidfd is None the process has already terminated |
414 | | - if pidfd is not None: |
415 | | - epoll = select.epoll() |
416 | | - epoll.register(pidfd, select.EPOLLIN) |
417 | | - # This will return once the process exits |
418 | | - epoll.poll() |
419 | | - os.close(pidfd) |
| 516 | + deadline = time.time() + timeout |
| 517 | + while True: |
| 518 | + try: |
| 519 | + pid, _status = os.waitpid(p_pid, os.WNOHANG) |
| 520 | + except ChildProcessError: |
| 521 | + # Process is not our child (already reaped, or never reparented to us). |
| 522 | + return |
| 523 | + if pid == p_pid: |
| 524 | + return |
| 525 | + if time.time() >= deadline: |
| 526 | + raise TimeoutError(f"Process {p_pid} did not exit within {timeout}s") |
| 527 | + time.sleep(0.05) |
420 | 528 |
|
421 | 529 |
|
422 | 530 | def get_firecracker_version_from_toml(): |
|
0 commit comments