isaac-sim · xyao-nv · Apr 10, 2026 · Apr 10, 2026 · isaaclab-review-bot · Apr 10, 2026
@@ -142,22 +142,23 @@ jobs:
       # To restore sanity here is a goal for Isaac Lab Arena v0.3.
       - name: Run in-process Newton tests
         run: |
-          /isaac-sim/python.sh -m pytest -sv -m with_newton \
+          /isaac-sim/python.sh -m pytest -sv --durations=0 -m with_newton \
             isaaclab_arena/tests/
 
       - name: Run in-process PhysX tests without cameras
         run: |
-          /isaac-sim/python.sh -m pytest -sv -m "not with_cameras and not with_subprocess and not with_newton" \
+          /isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \
             isaaclab_arena/tests/
 
       - name: Run in-process PhysX tests with cameras
         run: |
-          /isaac-sim/python.sh -m pytest -sv -m "with_cameras and not with_subprocess and not with_newton" \
+          /isaac-sim/python.sh -m pytest -sv --durations=0 -m "with_cameras and not with_subprocess and not with_newton" \
             isaaclab_arena/tests/
 
-      - name: Run subprocess-spawning  PhysX tests
+      - name: Run subprocess-spawning PhysX tests
         run: |
-          /isaac-sim/python.sh -m pytest -sv -m with_subprocess \
+          ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \
+          /isaac-sim/python.sh -m pytest -sv --durations=0 -m with_subprocess \
             isaaclab_arena/tests/
 
 
@@ -188,7 +189,9 @@ jobs:
 
       # Run the policy (GR00T) related tests.
       - name: Run policy-related pytest
-        run: /isaac-sim/python.sh -m pytest -sv isaaclab_arena_gr00t/tests/
+        run: |
+          ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \
+          /isaac-sim/python.sh -m pytest -sv --durations=0 isaaclab_arena_gr00t/tests/
 
 
   build_docs_pre_merge:

@@ -65,21 +65,18 @@ def test_zero_action_policy_press_button():
 
 
 @pytest.mark.with_subprocess
-def test_zero_action_policy_kitchen_pick_and_place():
+@pytest.mark.parametrize("embodiment", ["franka_ik", "gr1_pink", "gr1_joint"])
+@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can"])
+def test_zero_action_policy_kitchen_pick_and_place(embodiment, object_name):
     # TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments
     # from a registry when we have one.
-    example_environment = "kitchen_pick_and_place"
-    embodiments = ["franka_ik", "gr1_pink", "gr1_joint"]
-    object_names = ["cracker_box", "tomato_soup_can"]
-    for embodiment in embodiments:
-        for object_name in object_names:
-            run_policy_runner(
-                policy_type="zero_action",
-                example_environment=example_environment,
-                embodiment=embodiment,
-                object_name=object_name,
-                num_steps=NUM_STEPS,
-            )
+    run_policy_runner(
+        policy_type="zero_action",
+        example_environment="kitchen_pick_and_place",
+        embodiment=embodiment,
+        object_name=object_name,
+        num_steps=NUM_STEPS,
+    )
 
 
 @pytest.mark.with_subprocess
@@ -98,20 +95,17 @@ def test_zero_action_policy_galileo_pick_and_place():
 
 
 @pytest.mark.with_subprocess
-def test_zero_action_policy_gr1_open_microwave():
+@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can", "mustard_bottle"])
+def test_zero_action_policy_gr1_open_microwave(object_name):
     # TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments
     # from a registry when we have one.
-    example_environment = "gr1_open_microwave"
-    object_name = ["cracker_box", "tomato_soup_can", "mustard_bottle"]
-    for object_name in object_name:
-        run_policy_runner(
-            policy_type="zero_action",
-            example_environment=example_environment,
-            embodiment="gr1_pink",
-            background=None,
-            object_name=object_name,
-            num_steps=NUM_STEPS,
-        )
+    run_policy_runner(
+        policy_type="zero_action",
+        example_environment="gr1_open_microwave",
+        embodiment="gr1_pink",
+        object_name=object_name,
+        num_steps=NUM_STEPS,
+    )
 
 
 @pytest.mark.with_subprocess

@@ -5,6 +5,7 @@
 
 import atexit
 import os
+import signal
 import subprocess
 import sys
 import traceback
@@ -17,6 +18,54 @@
 from isaaclab_arena.tests.conftest import PYTEST_SESSION
 from isaaclab_arena.utils.isaaclab_utils.simulation_app import get_app_launcher, teardown_simulation_app
 
+_ORPHAN_PATTERNS = ("omni.telemetry.transmitter", "omni.kit", "carb.windowing")
+
+
+def _kill_kit_orphans():
+    """Kill Kit child processes (telemetry transmitters, etc.) that survive app.close().
+
+    On CPU-constrained CI runners, even one orphan telemetry transmitter can starve
+    the next Kit subprocess launch, eventually deadlocking Kit startup entirely.
+    ``_kill_child_processes()`` in simulation_app.py handles direct children via /proc,
+    but reparented orphans (e.g. telemetry transmitter adopted by PID 1) escape that
+    scan.  This function catches them by matching process names.
+    """
+    import re
+
+    try:
+        ps = subprocess.run(
+            ["ps", "-eo", "pid,ppid,comm,args"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+    except Exception:
+        return
+
+    our_pid = os.getpid()
+    killed = []
+    for line in ps.stdout.splitlines()[1:]:
+        line_lower = line.lower()
+        if not any(pat in line_lower for pat in _ORPHAN_PATTERNS):
+            continue
+        m = re.match(r"\s*(\d+)", line)
+        if not m:
+            continue
+        pid = int(m.group(1))
+        if pid == our_pid:
+            continue
+        try:
+            os.kill(pid, signal.SIGKILL)
+            killed.append(pid)
+        except ProcessLookupError:
+            pass
+        except PermissionError:
+            pass
+
+    if killed:
+        print(f"[run_subprocess] Killed {len(killed)} orphan process(es): {killed}")
+
+
 # NOTE(alexmillane): Isaac Sim makes testing complicated. During shutdown Isaac Sim will
 # terminate the surrounding pytest process with exit code 0, regardless
 # of whether the tests passed or failed.
@@ -30,27 +79,73 @@
 _PERSISTENT_INIT_ARGS = None  # store (headless, enable_cameras) used at first init
 _AT_LEAST_ONE_TEST_FAILED = False
 
+_SUBPROCESS_TIMEOUT_SEC = int(os.environ.get("ISAACLAB_ARENA_SUBPROCESS_TIMEOUT", "600"))
+
+
+def run_subprocess(
+    cmd,
+    env=None,
+    timeout_sec: int | None = None,
+    capture_output: bool = False,
+) -> subprocess.CompletedProcess | None:
+    """Run a command in a subprocess with timeout and orphan cleanup.
 
-def run_subprocess(cmd, env=None):
-    print(f"Running command: {cmd}")
+    Args:
+        cmd: Command to run (list of strings).
+        env: Optional environment dict.  Defaults to inheriting the parent env.
+        timeout_sec: Per-subprocess wall-clock timeout in seconds.
+            Defaults to ``_SUBPROCESS_TIMEOUT_SEC`` (env ``ISAACLAB_ARENA_SUBPROCESS_TIMEOUT``, fallback 600).
+        capture_output: If True, capture stdout/stderr and return a
+            ``CompletedProcess``.  When False (default) output streams to
+            the parent process and the function returns None on success.
+
+    Returns:
+        ``CompletedProcess`` when *capture_output* is True, else None.
+    """
+    if timeout_sec is None:
+        timeout_sec = _SUBPROCESS_TIMEOUT_SEC
+
+    print(f"Running command (timeout={timeout_sec}s): {cmd}")
     global _AT_LEAST_ONE_TEST_FAILED
+
+    if env is None:
+        env = os.environ.copy()
+    env["ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE"] = "1"
+
+    # start_new_session=True isolates the child into its own process group.
+    # The child-side SimulationAppContext uses this to SIGTERM its entire
+    # group before os._exit(), preventing orphaned Kit children (shader
+    # compiler, GPU workers, …) from holding GPU resources and blocking
+    # the next subprocess.  After the child exits, _kill_kit_orphans()
+    # sweeps for any reparented Kit processes (e.g. telemetry transmitter)
+    # that escaped the child-side cleanup.
     try:
         result = subprocess.run(
             cmd,
-            check=True,
             env=env,
-            # Don't capture output, let it flow through in real-time
-            capture_output=False,
-            text=True,
-            # Explicitly set stdout and stderr to None to use parent process's pipes
-            stdout=None,
-            stderr=None,
+            timeout=timeout_sec,
+            capture_output=capture_output,
+            text=capture_output,
+            start_new_session=True,
         )
-        print(f"Command completed with return code: {result.returncode}")
-    except subprocess.CalledProcessError as e:
-        sys.stderr.write(f"Command failed with return code {e.returncode}: {e}\n")
+    except subprocess.TimeoutExpired:
+        sys.stderr.write(f"\n[isaaclab-arena] Subprocess timed out after {timeout_sec}s\n")
         _AT_LEAST_ONE_TEST_FAILED = True
-        raise e
+        raise subprocess.SubprocessError(f"Subprocess timed out after {timeout_sec}s: {cmd}")
+    finally:
+        _kill_kit_orphans()
+
+    print(f"Command completed with return code: {result.returncode}")
+    if result.returncode != 0:
+        sys.stderr.write(f"Command failed with return code {result.returncode}\n")
+        if capture_output and result.stderr:
+            sys.stderr.write(result.stderr)
+        _AT_LEAST_ONE_TEST_FAILED = True
+        raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)
+
+    if capture_output:
+        return result
+    return None
 
 
 class _IsolatedArgv:
@@ -108,7 +203,7 @@ def get_persistent_simulation_app(headless: bool, enable_cameras: bool = False)
         first_headless, first_enable_cameras = _PERSISTENT_INIT_ARGS
         if (headless != first_headless) or (enable_cameras != first_enable_cameras):
             print(
-                "[isaac-arena] Warning: persistent SimulationApp already initialized with "
+                "[isaaclab-arena] Warning: persistent SimulationApp already initialized with "
                 f"headless={first_headless}, enable_cameras={first_enable_cameras}. "
                 "Ignoring new values."
             )

@@ -87,6 +87,26 @@ def reapply_viewer_cfg(env) -> None:
         vcc.update_view_location()
 
 
+def _kill_child_processes() -> None:
+    """SIGKILL all direct child processes of the current process via /proc."""
+    import signal
+
+    my_pid = os.getpid()
+    with suppress(FileNotFoundError, PermissionError):
+        for entry in os.scandir("/proc"):
+            if not entry.name.isdigit():
+                continue
+            try:
+                with open(f"/proc/{entry.name}/status") as f:
+                    for line in f:
+                        if line.startswith("PPid:"):
+                            if int(line.split()[1]) == my_pid:
+                                os.kill(int(entry.name), signal.SIGKILL)
+                            break
+            except (FileNotFoundError, PermissionError, ProcessLookupError, ValueError):
+                continue
+
+
 class SimulationAppContext:
     """Context manager for launching and closing a simulation app."""
 
@@ -110,15 +130,30 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         print("Closing simulation app")
-        # app_launcher.close() will terminate the whole process with exit code 0, i.e. preventing errors from being seen by the caller. There are seemingly no ways around this.
-        # As a workaround, we call os._exit(1) that terminates immediately. The downside is that any cleanup would be omitted
-        if exc_type is None:
-            self.app_launcher.app.close()
-        else:
+        if exc_type is not None:
             print(f"Exception caught in SimulationAppContext: {exc_type.__name__}: {exc_val}")
             print("Traceback:")
             traceback.print_exception(exc_type, exc_val, exc_tb)
             print("Killing the process without cleaning up")
             sys.stdout.flush()
             sys.stderr.flush()
             os._exit(1)
+
+        # When launched as a test subprocess, skip app.close() which can hang
+        # indefinitely in Kit's shutdown path.
+        if os.environ.get("ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE") == "1":
+            print("Force-exiting subprocess (ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE=1)")
+            sys.stdout.flush()
+            sys.stderr.flush()
+            # SIGKILL orphaned Kit children (shader compiler, GPU workers, …)
+            # so they don't hold GPU resources and block the next test subprocess.
+            # We target each child individually via /proc to avoid signalling
+            # ourselves (Kit installs a C-level SIGTERM handler that overrides
+            # Python's SIG_IGN, so os.killpg is not safe here).
+            _kill_child_processes()
+            os._exit(0)
+
+        # Normal interactive / non-test path: attempt a clean Kit shutdown.
+        # app.close() may terminate the process with exit code 0 regardless of
+        # errors — see the error branch above for the workaround.
+        self.app_launcher.app.close()
@@ -236,7 +236,7 @@ def test_g1_locomanip_gr00t_closedloop_policy_runner_eval_runner(gr00t_finetuned
             "name": "gr1_open_microwave_cracker_box",
             "arena_env_args": {
                 "environment": "gr1_open_microwave",
-                "num_envs": 10,
+                "num_envs": 3,
                 "object": "cracker_box",
                 "embodiment": "gr1_joint",
             },