diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e0673c551..c5baed880 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,22 +142,23 @@ jobs: # To restore sanity here is a goal for Isaac Lab Arena v0.3. - name: Run in-process Newton tests run: | - /isaac-sim/python.sh -m pytest -sv -m with_newton \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m with_newton \ isaaclab_arena/tests/ - name: Run in-process PhysX tests without cameras run: | - /isaac-sim/python.sh -m pytest -sv -m "not with_cameras and not with_subprocess and not with_newton" \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \ isaaclab_arena/tests/ - name: Run in-process PhysX tests with cameras run: | - /isaac-sim/python.sh -m pytest -sv -m "with_cameras and not with_subprocess and not with_newton" \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m "with_cameras and not with_subprocess and not with_newton" \ isaaclab_arena/tests/ - - name: Run subprocess-spawning PhysX tests + - name: Run subprocess-spawning PhysX tests run: | - /isaac-sim/python.sh -m pytest -sv -m with_subprocess \ + ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m with_subprocess \ isaaclab_arena/tests/ @@ -188,7 +189,9 @@ jobs: # Run the policy (GR00T) related tests. - name: Run policy-related pytest - run: /isaac-sim/python.sh -m pytest -sv isaaclab_arena_gr00t/tests/ + run: | + ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \ + /isaac-sim/python.sh -m pytest -sv --durations=0 isaaclab_arena_gr00t/tests/ build_docs_pre_merge: diff --git a/isaaclab_arena/tests/test_policy_runner.py b/isaaclab_arena/tests/test_policy_runner.py index 559c6891b..1e595f4f2 100644 --- a/isaaclab_arena/tests/test_policy_runner.py +++ b/isaaclab_arena/tests/test_policy_runner.py @@ -65,21 +65,18 @@ def test_zero_action_policy_press_button(): @pytest.mark.with_subprocess -def test_zero_action_policy_kitchen_pick_and_place(): +@pytest.mark.parametrize("embodiment", ["franka_ik", "gr1_pink", "gr1_joint"]) +@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can"]) +def test_zero_action_policy_kitchen_pick_and_place(embodiment, object_name): # TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments # from a registry when we have one. - example_environment = "kitchen_pick_and_place" - embodiments = ["franka_ik", "gr1_pink", "gr1_joint"] - object_names = ["cracker_box", "tomato_soup_can"] - for embodiment in embodiments: - for object_name in object_names: - run_policy_runner( - policy_type="zero_action", - example_environment=example_environment, - embodiment=embodiment, - object_name=object_name, - num_steps=NUM_STEPS, - ) + run_policy_runner( + policy_type="zero_action", + example_environment="kitchen_pick_and_place", + embodiment=embodiment, + object_name=object_name, + num_steps=NUM_STEPS, + ) @pytest.mark.with_subprocess @@ -98,20 +95,17 @@ def test_zero_action_policy_galileo_pick_and_place(): @pytest.mark.with_subprocess -def test_zero_action_policy_gr1_open_microwave(): +@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can", "mustard_bottle"]) +def test_zero_action_policy_gr1_open_microwave(object_name): # TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments # from a registry when we have one. - example_environment = "gr1_open_microwave" - object_name = ["cracker_box", "tomato_soup_can", "mustard_bottle"] - for object_name in object_name: - run_policy_runner( - policy_type="zero_action", - example_environment=example_environment, - embodiment="gr1_pink", - background=None, - object_name=object_name, - num_steps=NUM_STEPS, - ) + run_policy_runner( + policy_type="zero_action", + example_environment="gr1_open_microwave", + embodiment="gr1_pink", + object_name=object_name, + num_steps=NUM_STEPS, + ) @pytest.mark.with_subprocess diff --git a/isaaclab_arena/tests/utils/subprocess.py b/isaaclab_arena/tests/utils/subprocess.py index c78165234..ff1fa9b9c 100644 --- a/isaaclab_arena/tests/utils/subprocess.py +++ b/isaaclab_arena/tests/utils/subprocess.py @@ -5,6 +5,7 @@ import atexit import os +import signal import subprocess import sys import traceback @@ -17,6 +18,54 @@ from isaaclab_arena.tests.conftest import PYTEST_SESSION from isaaclab_arena.utils.isaaclab_utils.simulation_app import get_app_launcher, teardown_simulation_app +_ORPHAN_PATTERNS = ("omni.telemetry.transmitter", "omni.kit", "carb.windowing") + + +def _kill_kit_orphans(): + """Kill Kit child processes (telemetry transmitters, etc.) that survive app.close(). + + On CPU-constrained CI runners, even one orphan telemetry transmitter can starve + the next Kit subprocess launch, eventually deadlocking Kit startup entirely. + ``_kill_child_processes()`` in simulation_app.py handles direct children via /proc, + but reparented orphans (e.g. telemetry transmitter adopted by PID 1) escape that + scan. This function catches them by matching process names. + """ + import re + + try: + ps = subprocess.run( + ["ps", "-eo", "pid,ppid,comm,args"], + capture_output=True, + text=True, + timeout=5, + ) + except Exception: + return + + our_pid = os.getpid() + killed = [] + for line in ps.stdout.splitlines()[1:]: + line_lower = line.lower() + if not any(pat in line_lower for pat in _ORPHAN_PATTERNS): + continue + m = re.match(r"\s*(\d+)", line) + if not m: + continue + pid = int(m.group(1)) + if pid == our_pid: + continue + try: + os.kill(pid, signal.SIGKILL) + killed.append(pid) + except ProcessLookupError: + pass + except PermissionError: + pass + + if killed: + print(f"[run_subprocess] Killed {len(killed)} orphan process(es): {killed}") + + # NOTE(alexmillane): Isaac Sim makes testing complicated. During shutdown Isaac Sim will # terminate the surrounding pytest process with exit code 0, regardless # of whether the tests passed or failed. @@ -30,27 +79,73 @@ _PERSISTENT_INIT_ARGS = None # store (headless, enable_cameras) used at first init _AT_LEAST_ONE_TEST_FAILED = False +_SUBPROCESS_TIMEOUT_SEC = int(os.environ.get("ISAACLAB_ARENA_SUBPROCESS_TIMEOUT", "600")) + + +def run_subprocess( + cmd, + env=None, + timeout_sec: int | None = None, + capture_output: bool = False, +) -> subprocess.CompletedProcess | None: + """Run a command in a subprocess with timeout and orphan cleanup. -def run_subprocess(cmd, env=None): - print(f"Running command: {cmd}") + Args: + cmd: Command to run (list of strings). + env: Optional environment dict. Defaults to inheriting the parent env. + timeout_sec: Per-subprocess wall-clock timeout in seconds. + Defaults to ``_SUBPROCESS_TIMEOUT_SEC`` (env ``ISAACLAB_ARENA_SUBPROCESS_TIMEOUT``, fallback 600). + capture_output: If True, capture stdout/stderr and return a + ``CompletedProcess``. When False (default) output streams to + the parent process and the function returns None on success. + + Returns: + ``CompletedProcess`` when *capture_output* is True, else None. + """ + if timeout_sec is None: + timeout_sec = _SUBPROCESS_TIMEOUT_SEC + + print(f"Running command (timeout={timeout_sec}s): {cmd}") global _AT_LEAST_ONE_TEST_FAILED + + if env is None: + env = os.environ.copy() + env["ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE"] = "1" + + # start_new_session=True isolates the child into its own process group. + # The child-side SimulationAppContext uses this to SIGTERM its entire + # group before os._exit(), preventing orphaned Kit children (shader + # compiler, GPU workers, …) from holding GPU resources and blocking + # the next subprocess. After the child exits, _kill_kit_orphans() + # sweeps for any reparented Kit processes (e.g. telemetry transmitter) + # that escaped the child-side cleanup. try: result = subprocess.run( cmd, - check=True, env=env, - # Don't capture output, let it flow through in real-time - capture_output=False, - text=True, - # Explicitly set stdout and stderr to None to use parent process's pipes - stdout=None, - stderr=None, + timeout=timeout_sec, + capture_output=capture_output, + text=capture_output, + start_new_session=True, ) - print(f"Command completed with return code: {result.returncode}") - except subprocess.CalledProcessError as e: - sys.stderr.write(f"Command failed with return code {e.returncode}: {e}\n") + except subprocess.TimeoutExpired: + sys.stderr.write(f"\n[isaaclab-arena] Subprocess timed out after {timeout_sec}s\n") _AT_LEAST_ONE_TEST_FAILED = True - raise e + raise subprocess.SubprocessError(f"Subprocess timed out after {timeout_sec}s: {cmd}") + finally: + _kill_kit_orphans() + + print(f"Command completed with return code: {result.returncode}") + if result.returncode != 0: + sys.stderr.write(f"Command failed with return code {result.returncode}\n") + if capture_output and result.stderr: + sys.stderr.write(result.stderr) + _AT_LEAST_ONE_TEST_FAILED = True + raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr) + + if capture_output: + return result + return None class _IsolatedArgv: @@ -108,7 +203,7 @@ def get_persistent_simulation_app(headless: bool, enable_cameras: bool = False) first_headless, first_enable_cameras = _PERSISTENT_INIT_ARGS if (headless != first_headless) or (enable_cameras != first_enable_cameras): print( - "[isaac-arena] Warning: persistent SimulationApp already initialized with " + "[isaaclab-arena] Warning: persistent SimulationApp already initialized with " f"headless={first_headless}, enable_cameras={first_enable_cameras}. " "Ignoring new values." ) diff --git a/isaaclab_arena/utils/isaaclab_utils/simulation_app.py b/isaaclab_arena/utils/isaaclab_utils/simulation_app.py index edff1b75b..9a33c1b03 100644 --- a/isaaclab_arena/utils/isaaclab_utils/simulation_app.py +++ b/isaaclab_arena/utils/isaaclab_utils/simulation_app.py @@ -87,6 +87,26 @@ def reapply_viewer_cfg(env) -> None: vcc.update_view_location() +def _kill_child_processes() -> None: + """SIGKILL all direct child processes of the current process via /proc.""" + import signal + + my_pid = os.getpid() + with suppress(FileNotFoundError, PermissionError): + for entry in os.scandir("/proc"): + if not entry.name.isdigit(): + continue + try: + with open(f"/proc/{entry.name}/status") as f: + for line in f: + if line.startswith("PPid:"): + if int(line.split()[1]) == my_pid: + os.kill(int(entry.name), signal.SIGKILL) + break + except (FileNotFoundError, PermissionError, ProcessLookupError, ValueError): + continue + + class SimulationAppContext: """Context manager for launching and closing a simulation app.""" @@ -110,11 +130,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): print("Closing simulation app") - # app_launcher.close() will terminate the whole process with exit code 0, i.e. preventing errors from being seen by the caller. There are seemingly no ways around this. - # As a workaround, we call os._exit(1) that terminates immediately. The downside is that any cleanup would be omitted - if exc_type is None: - self.app_launcher.app.close() - else: + if exc_type is not None: print(f"Exception caught in SimulationAppContext: {exc_type.__name__}: {exc_val}") print("Traceback:") traceback.print_exception(exc_type, exc_val, exc_tb) @@ -122,3 +138,22 @@ def __exit__(self, exc_type, exc_val, exc_tb): sys.stdout.flush() sys.stderr.flush() os._exit(1) + + # When launched as a test subprocess, skip app.close() which can hang + # indefinitely in Kit's shutdown path. + if os.environ.get("ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE") == "1": + print("Force-exiting subprocess (ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE=1)") + sys.stdout.flush() + sys.stderr.flush() + # SIGKILL orphaned Kit children (shader compiler, GPU workers, …) + # so they don't hold GPU resources and block the next test subprocess. + # We target each child individually via /proc to avoid signalling + # ourselves (Kit installs a C-level SIGTERM handler that overrides + # Python's SIG_IGN, so os.killpg is not safe here). + _kill_child_processes() + os._exit(0) + + # Normal interactive / non-test path: attempt a clean Kit shutdown. + # app.close() may terminate the process with exit code 0 regardless of + # errors — see the error branch above for the workaround. + self.app_launcher.app.close() diff --git a/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py b/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py index ac83f59d9..1be6ffbd5 100644 --- a/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py +++ b/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py @@ -236,7 +236,7 @@ def test_g1_locomanip_gr00t_closedloop_policy_runner_eval_runner(gr00t_finetuned "name": "gr1_open_microwave_cracker_box", "arena_env_args": { "environment": "gr1_open_microwave", - "num_envs": 10, + "num_envs": 3, "object": "cracker_box", "embodiment": "gr1_joint", },