Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,22 +142,23 @@ jobs:
# To restore sanity here is a goal for Isaac Lab Arena v0.3.
- name: Run in-process Newton tests
run: |
/isaac-sim/python.sh -m pytest -sv -m with_newton \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m with_newton \
isaaclab_arena/tests/

- name: Run in-process PhysX tests without cameras
run: |
/isaac-sim/python.sh -m pytest -sv -m "not with_cameras and not with_subprocess and not with_newton" \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \
isaaclab_arena/tests/

- name: Run in-process PhysX tests with cameras
run: |
/isaac-sim/python.sh -m pytest -sv -m "with_cameras and not with_subprocess and not with_newton" \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m "with_cameras and not with_subprocess and not with_newton" \
isaaclab_arena/tests/

- name: Run subprocess-spawning PhysX tests
- name: Run subprocess-spawning PhysX tests
run: |
/isaac-sim/python.sh -m pytest -sv -m with_subprocess \
ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m with_subprocess \
isaaclab_arena/tests/


Expand Down Expand Up @@ -188,7 +189,9 @@ jobs:

# Run the policy (GR00T) related tests.
- name: Run policy-related pytest
run: /isaac-sim/python.sh -m pytest -sv isaaclab_arena_gr00t/tests/
run: |
ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \
/isaac-sim/python.sh -m pytest -sv --durations=0 isaaclab_arena_gr00t/tests/


build_docs_pre_merge:
Expand Down
44 changes: 19 additions & 25 deletions isaaclab_arena/tests/test_policy_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,18 @@ def test_zero_action_policy_press_button():


@pytest.mark.with_subprocess
def test_zero_action_policy_kitchen_pick_and_place():
@pytest.mark.parametrize("embodiment", ["franka_ik", "gr1_pink", "gr1_joint"])
@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can"])
def test_zero_action_policy_kitchen_pick_and_place(embodiment, object_name):
# TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments
# from a registry when we have one.
example_environment = "kitchen_pick_and_place"
embodiments = ["franka_ik", "gr1_pink", "gr1_joint"]
object_names = ["cracker_box", "tomato_soup_can"]
for embodiment in embodiments:
for object_name in object_names:
run_policy_runner(
policy_type="zero_action",
example_environment=example_environment,
embodiment=embodiment,
object_name=object_name,
num_steps=NUM_STEPS,
)
run_policy_runner(
policy_type="zero_action",
example_environment="kitchen_pick_and_place",
embodiment=embodiment,
object_name=object_name,
num_steps=NUM_STEPS,
)
Comment on lines +68 to +79
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!



@pytest.mark.with_subprocess
Expand All @@ -98,20 +95,17 @@ def test_zero_action_policy_galileo_pick_and_place():


@pytest.mark.with_subprocess
def test_zero_action_policy_gr1_open_microwave():
@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can", "mustard_bottle"])
def test_zero_action_policy_gr1_open_microwave(object_name):
# TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments
# from a registry when we have one.
example_environment = "gr1_open_microwave"
object_name = ["cracker_box", "tomato_soup_can", "mustard_bottle"]
for object_name in object_name:
run_policy_runner(
policy_type="zero_action",
example_environment=example_environment,
embodiment="gr1_pink",
background=None,
object_name=object_name,
num_steps=NUM_STEPS,
)
run_policy_runner(
policy_type="zero_action",
example_environment="gr1_open_microwave",
embodiment="gr1_pink",
object_name=object_name,
num_steps=NUM_STEPS,
)


@pytest.mark.with_subprocess
Expand Down
72 changes: 58 additions & 14 deletions isaaclab_arena/tests/utils/subprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,70 @@
_AT_LEAST_ONE_TEST_FAILED = False


def run_subprocess(cmd, env=None):
print(f"Running command: {cmd}")
_SUBPROCESS_TIMEOUT_SEC = int(os.environ.get("ISAACLAB_ARENA_SUBPROCESS_TIMEOUT", "600"))


def run_subprocess(
cmd,
env=None,
timeout_sec: int | None = None,
capture_output: bool = False,
) -> subprocess.CompletedProcess | None:
"""Run a command in a subprocess with timeout.

The child is launched with ``start_new_session=True`` so it lives in its
own process group. The child-side ``SimulationAppContext`` uses this to
SIGTERM its entire group before ``os._exit()``, preventing orphaned Kit
children (shader compiler, GPU workers, …) from holding GPU resources and
blocking the next subprocess.

Args:
cmd: Command to run (list of strings).
env: Optional environment dict. Defaults to inheriting the parent env.
timeout_sec: Per-subprocess wall-clock timeout in seconds.
Defaults to ``_SUBPROCESS_TIMEOUT_SEC`` (env ``ISAACLAB_ARENA_SUBPROCESS_TIMEOUT``, fallback 600).
capture_output: If True, capture stdout/stderr and return a
``CompletedProcess``. When False (default) output streams to
the parent process and the function returns None on success.

Returns:
``CompletedProcess`` when *capture_output* is True, else None.
"""
if timeout_sec is None:
timeout_sec = _SUBPROCESS_TIMEOUT_SEC

print(f"Running command (timeout={timeout_sec}s): {cmd}")
global _AT_LEAST_ONE_TEST_FAILED

if env is None:
env = os.environ.copy()
Comment on lines +69 to +70
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Caller's env dict is mutated in-place

If a caller passes their own env dict, it is mutated to include ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE=1. While this key is harmless to leave behind for the current callers, it can still cause surprising side-effects if the same dict is reused or inspected after the call. The env is None branch already copies os.environ, so the fix is to copy the caller-supplied dict as well.

Suggested change
if env is None:
env = os.environ.copy()
if env is None:
env = os.environ.copy()
else:
env = env.copy()
env["ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE"] = "1"

env["ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE"] = "1"

try:
result = subprocess.run(
cmd,
check=True,
env=env,
# Don't capture output, let it flow through in real-time
capture_output=False,
text=True,
# Explicitly set stdout and stderr to None to use parent process's pipes
stdout=None,
stderr=None,
timeout=timeout_sec,
capture_output=capture_output,
text=capture_output,
start_new_session=True,
)
print(f"Command completed with return code: {result.returncode}")
except subprocess.CalledProcessError as e:
sys.stderr.write(f"Command failed with return code {e.returncode}: {e}\n")
except subprocess.TimeoutExpired:
sys.stderr.write(f"\n[isaaclab-arena] Subprocess timed out after {timeout_sec}s\n")
_AT_LEAST_ONE_TEST_FAILED = True
raise e
raise subprocess.SubprocessError(f"Subprocess timed out after {timeout_sec}s: {cmd}")

print(f"Command completed with return code: {result.returncode}")
if result.returncode != 0:
sys.stderr.write(f"Command failed with return code {result.returncode}\n")
if capture_output and result.stderr:
sys.stderr.write(result.stderr)
_AT_LEAST_ONE_TEST_FAILED = True
raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)

if capture_output:
return result
return None


class _IsolatedArgv:
Expand Down Expand Up @@ -108,7 +152,7 @@ def get_persistent_simulation_app(headless: bool, enable_cameras: bool = False)
first_headless, first_enable_cameras = _PERSISTENT_INIT_ARGS
if (headless != first_headless) or (enable_cameras != first_enable_cameras):
print(
"[isaac-arena] Warning: persistent SimulationApp already initialized with "
"[isaaclab-arena] Warning: persistent SimulationApp already initialized with "
f"headless={first_headless}, enable_cameras={first_enable_cameras}. "
"Ignoring new values."
)
Expand Down
54 changes: 49 additions & 5 deletions isaaclab_arena/utils/isaaclab_utils/simulation_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,18 @@ def get_isaac_sim_version() -> str:
return omni.kit.app.get_app().get_app_version()


STARTUP_COMPLETE_MARKER = "[isaaclab-arena] AppLauncher initialization complete"


def get_app_launcher(args: argparse.Namespace) -> AppLauncher:
"""Get an app launcher."""
import time

t0 = time.monotonic()
app_launcher = AppLauncher(args)
elapsed = time.monotonic() - t0
sys.__stderr__.write(f"{STARTUP_COMPLETE_MARKER} ({elapsed:.1f}s)\n")
sys.__stderr__.flush()
return app_launcher


Expand Down Expand Up @@ -87,6 +96,26 @@ def reapply_viewer_cfg(env) -> None:
vcc.update_view_location()


def _kill_child_processes() -> None:
"""SIGKILL all direct child processes of the current process via /proc."""
import signal

my_pid = os.getpid()
with suppress(FileNotFoundError, PermissionError):
for entry in os.scandir("/proc"):
if not entry.name.isdigit():
continue
try:
with open(f"/proc/{entry.name}/status") as f:
for line in f:
if line.startswith("PPid:"):
if int(line.split()[1]) == my_pid:
os.kill(int(entry.name), signal.SIGKILL)
break
except (FileNotFoundError, PermissionError, ProcessLookupError, ValueError):
continue


class SimulationAppContext:
"""Context manager for launching and closing a simulation app."""

Expand All @@ -110,15 +139,30 @@ def __enter__(self):

def __exit__(self, exc_type, exc_val, exc_tb):
print("Closing simulation app")
# app_launcher.close() will terminate the whole process with exit code 0, i.e. preventing errors from being seen by the caller. There are seemingly no ways around this.
# As a workaround, we call os._exit(1) that terminates immediately. The downside is that any cleanup would be omitted
if exc_type is None:
self.app_launcher.app.close()
else:
if exc_type is not None:
print(f"Exception caught in SimulationAppContext: {exc_type.__name__}: {exc_val}")
print("Traceback:")
traceback.print_exception(exc_type, exc_val, exc_tb)
print("Killing the process without cleaning up")
sys.stdout.flush()
sys.stderr.flush()
os._exit(1)

# When launched as a test subprocess, skip app.close() which can hang
# indefinitely in Kit's shutdown path.
if os.environ.get("ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE") == "1":
print("Force-exiting subprocess (ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE=1)")
sys.stdout.flush()
sys.stderr.flush()
# SIGKILL orphaned Kit children (shader compiler, GPU workers, …)
# so they don't hold GPU resources and block the next test subprocess.
# We target each child individually via /proc to avoid signalling
# ourselves (Kit installs a C-level SIGTERM handler that overrides
# Python's SIG_IGN, so os.killpg is not safe here).
_kill_child_processes()
os._exit(0)

# Normal interactive / non-test path: attempt a clean Kit shutdown.
# app.close() may terminate the process with exit code 0 regardless of
# errors — see the error branch above for the workaround.
self.app_launcher.app.close()
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def test_g1_locomanip_gr00t_closedloop_policy_runner_eval_runner(gr00t_finetuned
"name": "gr1_open_microwave_cracker_box",
"arena_env_args": {
"environment": "gr1_open_microwave",
"num_envs": 10,
"num_envs": 3,
"object": "cracker_box",
"embodiment": "gr1_joint",
},
Expand Down
Loading