diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e0673c551..c5baed880 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,22 +142,23 @@ jobs: # To restore sanity here is a goal for Isaac Lab Arena v0.3. - name: Run in-process Newton tests run: | - /isaac-sim/python.sh -m pytest -sv -m with_newton \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m with_newton \ isaaclab_arena/tests/ - name: Run in-process PhysX tests without cameras run: | - /isaac-sim/python.sh -m pytest -sv -m "not with_cameras and not with_subprocess and not with_newton" \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \ isaaclab_arena/tests/ - name: Run in-process PhysX tests with cameras run: | - /isaac-sim/python.sh -m pytest -sv -m "with_cameras and not with_subprocess and not with_newton" \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m "with_cameras and not with_subprocess and not with_newton" \ isaaclab_arena/tests/ - - name: Run subprocess-spawning PhysX tests + - name: Run subprocess-spawning PhysX tests run: | - /isaac-sim/python.sh -m pytest -sv -m with_subprocess \ + ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \ + /isaac-sim/python.sh -m pytest -sv --durations=0 -m with_subprocess \ isaaclab_arena/tests/ @@ -188,7 +189,9 @@ jobs: # Run the policy (GR00T) related tests. - name: Run policy-related pytest - run: /isaac-sim/python.sh -m pytest -sv isaaclab_arena_gr00t/tests/ + run: | + ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \ + /isaac-sim/python.sh -m pytest -sv --durations=0 isaaclab_arena_gr00t/tests/ build_docs_pre_merge: diff --git a/isaaclab_arena/tests/test_policy_runner.py b/isaaclab_arena/tests/test_policy_runner.py index 559c6891b..1e595f4f2 100644 --- a/isaaclab_arena/tests/test_policy_runner.py +++ b/isaaclab_arena/tests/test_policy_runner.py @@ -65,21 +65,18 @@ def test_zero_action_policy_press_button(): @pytest.mark.with_subprocess -def test_zero_action_policy_kitchen_pick_and_place(): +@pytest.mark.parametrize("embodiment", ["franka_ik", "gr1_pink", "gr1_joint"]) +@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can"]) +def test_zero_action_policy_kitchen_pick_and_place(embodiment, object_name): # TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments # from a registry when we have one. - example_environment = "kitchen_pick_and_place" - embodiments = ["franka_ik", "gr1_pink", "gr1_joint"] - object_names = ["cracker_box", "tomato_soup_can"] - for embodiment in embodiments: - for object_name in object_names: - run_policy_runner( - policy_type="zero_action", - example_environment=example_environment, - embodiment=embodiment, - object_name=object_name, - num_steps=NUM_STEPS, - ) + run_policy_runner( + policy_type="zero_action", + example_environment="kitchen_pick_and_place", + embodiment=embodiment, + object_name=object_name, + num_steps=NUM_STEPS, + ) @pytest.mark.with_subprocess @@ -98,20 +95,17 @@ def test_zero_action_policy_galileo_pick_and_place(): @pytest.mark.with_subprocess -def test_zero_action_policy_gr1_open_microwave(): +@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can", "mustard_bottle"]) +def test_zero_action_policy_gr1_open_microwave(object_name): # TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments # from a registry when we have one. - example_environment = "gr1_open_microwave" - object_name = ["cracker_box", "tomato_soup_can", "mustard_bottle"] - for object_name in object_name: - run_policy_runner( - policy_type="zero_action", - example_environment=example_environment, - embodiment="gr1_pink", - background=None, - object_name=object_name, - num_steps=NUM_STEPS, - ) + run_policy_runner( + policy_type="zero_action", + example_environment="gr1_open_microwave", + embodiment="gr1_pink", + object_name=object_name, + num_steps=NUM_STEPS, + ) @pytest.mark.with_subprocess diff --git a/isaaclab_arena/tests/utils/subprocess.py b/isaaclab_arena/tests/utils/subprocess.py index c78165234..67a276813 100644 --- a/isaaclab_arena/tests/utils/subprocess.py +++ b/isaaclab_arena/tests/utils/subprocess.py @@ -31,26 +31,70 @@ _AT_LEAST_ONE_TEST_FAILED = False -def run_subprocess(cmd, env=None): - print(f"Running command: {cmd}") +_SUBPROCESS_TIMEOUT_SEC = int(os.environ.get("ISAACLAB_ARENA_SUBPROCESS_TIMEOUT", "600")) + + +def run_subprocess( + cmd, + env=None, + timeout_sec: int | None = None, + capture_output: bool = False, +) -> subprocess.CompletedProcess | None: + """Run a command in a subprocess with timeout. + + The child is launched with ``start_new_session=True`` so it lives in its + own process group. The child-side ``SimulationAppContext`` uses this to + SIGTERM its entire group before ``os._exit()``, preventing orphaned Kit + children (shader compiler, GPU workers, …) from holding GPU resources and + blocking the next subprocess. + + Args: + cmd: Command to run (list of strings). + env: Optional environment dict. Defaults to inheriting the parent env. + timeout_sec: Per-subprocess wall-clock timeout in seconds. + Defaults to ``_SUBPROCESS_TIMEOUT_SEC`` (env ``ISAACLAB_ARENA_SUBPROCESS_TIMEOUT``, fallback 600). + capture_output: If True, capture stdout/stderr and return a + ``CompletedProcess``. When False (default) output streams to + the parent process and the function returns None on success. + + Returns: + ``CompletedProcess`` when *capture_output* is True, else None. + """ + if timeout_sec is None: + timeout_sec = _SUBPROCESS_TIMEOUT_SEC + + print(f"Running command (timeout={timeout_sec}s): {cmd}") global _AT_LEAST_ONE_TEST_FAILED + + if env is None: + env = os.environ.copy() + env["ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE"] = "1" + try: result = subprocess.run( cmd, - check=True, env=env, - # Don't capture output, let it flow through in real-time - capture_output=False, - text=True, - # Explicitly set stdout and stderr to None to use parent process's pipes - stdout=None, - stderr=None, + timeout=timeout_sec, + capture_output=capture_output, + text=capture_output, + start_new_session=True, ) - print(f"Command completed with return code: {result.returncode}") - except subprocess.CalledProcessError as e: - sys.stderr.write(f"Command failed with return code {e.returncode}: {e}\n") + except subprocess.TimeoutExpired: + sys.stderr.write(f"\n[isaaclab-arena] Subprocess timed out after {timeout_sec}s\n") _AT_LEAST_ONE_TEST_FAILED = True - raise e + raise subprocess.SubprocessError(f"Subprocess timed out after {timeout_sec}s: {cmd}") + + print(f"Command completed with return code: {result.returncode}") + if result.returncode != 0: + sys.stderr.write(f"Command failed with return code {result.returncode}\n") + if capture_output and result.stderr: + sys.stderr.write(result.stderr) + _AT_LEAST_ONE_TEST_FAILED = True + raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr) + + if capture_output: + return result + return None class _IsolatedArgv: @@ -108,7 +152,7 @@ def get_persistent_simulation_app(headless: bool, enable_cameras: bool = False) first_headless, first_enable_cameras = _PERSISTENT_INIT_ARGS if (headless != first_headless) or (enable_cameras != first_enable_cameras): print( - "[isaac-arena] Warning: persistent SimulationApp already initialized with " + "[isaaclab-arena] Warning: persistent SimulationApp already initialized with " f"headless={first_headless}, enable_cameras={first_enable_cameras}. " "Ignoring new values." ) diff --git a/isaaclab_arena/utils/isaaclab_utils/simulation_app.py b/isaaclab_arena/utils/isaaclab_utils/simulation_app.py index edff1b75b..02db2a0f3 100644 --- a/isaaclab_arena/utils/isaaclab_utils/simulation_app.py +++ b/isaaclab_arena/utils/isaaclab_utils/simulation_app.py @@ -18,9 +18,18 @@ def get_isaac_sim_version() -> str: return omni.kit.app.get_app().get_app_version() +STARTUP_COMPLETE_MARKER = "[isaaclab-arena] AppLauncher initialization complete" + + def get_app_launcher(args: argparse.Namespace) -> AppLauncher: """Get an app launcher.""" + import time + + t0 = time.monotonic() app_launcher = AppLauncher(args) + elapsed = time.monotonic() - t0 + sys.__stderr__.write(f"{STARTUP_COMPLETE_MARKER} ({elapsed:.1f}s)\n") + sys.__stderr__.flush() return app_launcher @@ -87,6 +96,26 @@ def reapply_viewer_cfg(env) -> None: vcc.update_view_location() +def _kill_child_processes() -> None: + """SIGKILL all direct child processes of the current process via /proc.""" + import signal + + my_pid = os.getpid() + with suppress(FileNotFoundError, PermissionError): + for entry in os.scandir("/proc"): + if not entry.name.isdigit(): + continue + try: + with open(f"/proc/{entry.name}/status") as f: + for line in f: + if line.startswith("PPid:"): + if int(line.split()[1]) == my_pid: + os.kill(int(entry.name), signal.SIGKILL) + break + except (FileNotFoundError, PermissionError, ProcessLookupError, ValueError): + continue + + class SimulationAppContext: """Context manager for launching and closing a simulation app.""" @@ -110,11 +139,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): print("Closing simulation app") - # app_launcher.close() will terminate the whole process with exit code 0, i.e. preventing errors from being seen by the caller. There are seemingly no ways around this. - # As a workaround, we call os._exit(1) that terminates immediately. The downside is that any cleanup would be omitted - if exc_type is None: - self.app_launcher.app.close() - else: + if exc_type is not None: print(f"Exception caught in SimulationAppContext: {exc_type.__name__}: {exc_val}") print("Traceback:") traceback.print_exception(exc_type, exc_val, exc_tb) @@ -122,3 +147,22 @@ def __exit__(self, exc_type, exc_val, exc_tb): sys.stdout.flush() sys.stderr.flush() os._exit(1) + + # When launched as a test subprocess, skip app.close() which can hang + # indefinitely in Kit's shutdown path. + if os.environ.get("ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE") == "1": + print("Force-exiting subprocess (ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE=1)") + sys.stdout.flush() + sys.stderr.flush() + # SIGKILL orphaned Kit children (shader compiler, GPU workers, …) + # so they don't hold GPU resources and block the next test subprocess. + # We target each child individually via /proc to avoid signalling + # ourselves (Kit installs a C-level SIGTERM handler that overrides + # Python's SIG_IGN, so os.killpg is not safe here). + _kill_child_processes() + os._exit(0) + + # Normal interactive / non-test path: attempt a clean Kit shutdown. + # app.close() may terminate the process with exit code 0 regardless of + # errors — see the error branch above for the workaround. + self.app_launcher.app.close() diff --git a/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py b/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py index ac83f59d9..1be6ffbd5 100644 --- a/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py +++ b/isaaclab_arena_gr00t/tests/test_gr00t_closedloop_policy.py @@ -236,7 +236,7 @@ def test_g1_locomanip_gr00t_closedloop_policy_runner_eval_runner(gr00t_finetuned "name": "gr1_open_microwave_cracker_box", "arena_env_args": { "environment": "gr1_open_microwave", - "num_envs": 10, + "num_envs": 3, "object": "cracker_box", "embodiment": "gr1_joint", },