Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,22 +142,23 @@ jobs:
# To restore sanity here is a goal for Isaac Lab Arena v0.3.
- name: Run in-process Newton tests
run: |
/isaac-sim/python.sh -m pytest -sv -m with_newton \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m with_newton \
isaaclab_arena/tests/

- name: Run in-process PhysX tests without cameras
run: |
/isaac-sim/python.sh -m pytest -sv -m "not with_cameras and not with_subprocess and not with_newton" \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m "not with_cameras and not with_subprocess and not with_newton" \
isaaclab_arena/tests/

- name: Run in-process PhysX tests with cameras
run: |
/isaac-sim/python.sh -m pytest -sv -m "with_cameras and not with_subprocess and not with_newton" \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m "with_cameras and not with_subprocess and not with_newton" \
isaaclab_arena/tests/

- name: Run subprocess-spawning PhysX tests
- name: Run subprocess-spawning PhysX tests
run: |
/isaac-sim/python.sh -m pytest -sv -m with_subprocess \
ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \
/isaac-sim/python.sh -m pytest -sv --durations=0 -m with_subprocess \
isaaclab_arena/tests/


Expand Down Expand Up @@ -188,7 +189,9 @@ jobs:

# Run the policy (GR00T) related tests.
- name: Run policy-related pytest
run: /isaac-sim/python.sh -m pytest -sv isaaclab_arena_gr00t/tests/
run: |
ISAACLAB_ARENA_SUBPROCESS_TIMEOUT=900 \
/isaac-sim/python.sh -m pytest -sv --durations=0 isaaclab_arena_gr00t/tests/


build_docs_pre_merge:
Expand Down
44 changes: 19 additions & 25 deletions isaaclab_arena/tests/test_policy_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,18 @@ def test_zero_action_policy_press_button():


@pytest.mark.with_subprocess
def test_zero_action_policy_kitchen_pick_and_place():
@pytest.mark.parametrize("embodiment", ["franka_ik", "gr1_pink", "gr1_joint"])
@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can"])
def test_zero_action_policy_kitchen_pick_and_place(embodiment, object_name):
# TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments
# from a registry when we have one.
example_environment = "kitchen_pick_and_place"
embodiments = ["franka_ik", "gr1_pink", "gr1_joint"]
object_names = ["cracker_box", "tomato_soup_can"]
for embodiment in embodiments:
for object_name in object_names:
run_policy_runner(
policy_type="zero_action",
example_environment=example_environment,
embodiment=embodiment,
object_name=object_name,
num_steps=NUM_STEPS,
)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old code passed background=None explicitly. Worth verifying that run_policy_runner's default for background is already None — otherwise this parametrized version silently changes behavior.

run_policy_runner(
policy_type="zero_action",
example_environment="kitchen_pick_and_place",
embodiment=embodiment,
object_name=object_name,
num_steps=NUM_STEPS,
)


@pytest.mark.with_subprocess
Expand All @@ -98,20 +95,17 @@ def test_zero_action_policy_galileo_pick_and_place():


@pytest.mark.with_subprocess
def test_zero_action_policy_gr1_open_microwave():
@pytest.mark.parametrize("object_name", ["cracker_box", "tomato_soup_can", "mustard_bottle"])
def test_zero_action_policy_gr1_open_microwave(object_name):
# TODO(alexmillane, 2025.07.29): Get an exhaustive list of all scenes and embodiments
# from a registry when we have one.
example_environment = "gr1_open_microwave"
object_name = ["cracker_box", "tomato_soup_can", "mustard_bottle"]
for object_name in object_name:
run_policy_runner(
policy_type="zero_action",
example_environment=example_environment,
embodiment="gr1_pink",
background=None,
object_name=object_name,
num_steps=NUM_STEPS,
)
run_policy_runner(
policy_type="zero_action",
example_environment="gr1_open_microwave",
embodiment="gr1_pink",
object_name=object_name,
num_steps=NUM_STEPS,
)


@pytest.mark.with_subprocess
Expand Down
123 changes: 109 additions & 14 deletions isaaclab_arena/tests/utils/subprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import atexit
import os
import signal
import subprocess
import sys
import traceback
Expand All @@ -17,6 +18,54 @@
from isaaclab_arena.tests.conftest import PYTEST_SESSION
from isaaclab_arena.utils.isaaclab_utils.simulation_app import get_app_launcher, teardown_simulation_app

_ORPHAN_PATTERNS = ("omni.telemetry.transmitter", "omni.kit", "carb.windowing")


def _kill_kit_orphans():
"""Kill Kit child processes (telemetry transmitters, etc.) that survive app.close().

On CPU-constrained CI runners, even one orphan telemetry transmitter can starve
the next Kit subprocess launch, eventually deadlocking Kit startup entirely.
``_kill_child_processes()`` in simulation_app.py handles direct children via /proc,
but reparented orphans (e.g. telemetry transmitter adopted by PID 1) escape that
scan. This function catches them by matching process names.
"""
import re

try:
ps = subprocess.run(
["ps", "-eo", "pid,ppid,comm,args"],
capture_output=True,
text=True,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This matches against the entire system process table, not just descendants of the test runner. On shared dev machines this could kill legitimate Kit sessions. Consider adding a comment that this is intended for CI use only, or narrowing the scan to processes whose original parent was this test runner (though I understand reparented orphans make that hard).

timeout=5,
)
except Exception:
return

our_pid = os.getpid()
killed = []
for line in ps.stdout.splitlines()[1:]:
line_lower = line.lower()
if not any(pat in line_lower for pat in _ORPHAN_PATTERNS):
continue
m = re.match(r"\s*(\d+)", line)
if not m:
continue
pid = int(m.group(1))
if pid == our_pid:
continue
try:
os.kill(pid, signal.SIGKILL)
killed.append(pid)
except ProcessLookupError:
pass
except PermissionError:
pass

if killed:
print(f"[run_subprocess] Killed {len(killed)} orphan process(es): {killed}")


# NOTE(alexmillane): Isaac Sim makes testing complicated. During shutdown Isaac Sim will
# terminate the surrounding pytest process with exit code 0, regardless
# of whether the tests passed or failed.
Expand All @@ -30,27 +79,73 @@
_PERSISTENT_INIT_ARGS = None # store (headless, enable_cameras) used at first init
_AT_LEAST_ONE_TEST_FAILED = False

_SUBPROCESS_TIMEOUT_SEC = int(os.environ.get("ISAACLAB_ARENA_SUBPROCESS_TIMEOUT", "600"))


def run_subprocess(
cmd,
env=None,
timeout_sec: int | None = None,
capture_output: bool = False,
) -> subprocess.CompletedProcess | None:
"""Run a command in a subprocess with timeout and orphan cleanup.

def run_subprocess(cmd, env=None):
print(f"Running command: {cmd}")
Args:
cmd: Command to run (list of strings).
env: Optional environment dict. Defaults to inheriting the parent env.
timeout_sec: Per-subprocess wall-clock timeout in seconds.
Defaults to ``_SUBPROCESS_TIMEOUT_SEC`` (env ``ISAACLAB_ARENA_SUBPROCESS_TIMEOUT``, fallback 600).
capture_output: If True, capture stdout/stderr and return a
``CompletedProcess``. When False (default) output streams to
the parent process and the function returns None on success.

Returns:
``CompletedProcess`` when *capture_output* is True, else None.
"""
if timeout_sec is None:
timeout_sec = _SUBPROCESS_TIMEOUT_SEC

print(f"Running command (timeout={timeout_sec}s): {cmd}")
global _AT_LEAST_ONE_TEST_FAILED

if env is None:
env = os.environ.copy()
env["ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE"] = "1"

# start_new_session=True isolates the child into its own process group.
# The child-side SimulationAppContext uses this to SIGTERM its entire
# group before os._exit(), preventing orphaned Kit children (shader
# compiler, GPU workers, …) from holding GPU resources and blocking
# the next subprocess. After the child exits, _kill_kit_orphans()
# sweeps for any reparented Kit processes (e.g. telemetry transmitter)
# that escaped the child-side cleanup.
try:
result = subprocess.run(
cmd,
check=True,
env=env,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When TimeoutExpired is raised with start_new_session=True, subprocess.run kills the direct child but the process group may survive. Consider:

except subprocess.TimeoutExpired as e:
    # Kill the entire process group
    try:
        os.killpg(os.getpgid(e.pid), signal.SIGKILL)
    except (ProcessLookupError, PermissionError):
        pass
    ...

Also, raise SubprocessError(...) from e would preserve the exception chain.

# Don't capture output, let it flow through in real-time
capture_output=False,
text=True,
# Explicitly set stdout and stderr to None to use parent process's pipes
stdout=None,
stderr=None,
timeout=timeout_sec,
capture_output=capture_output,
text=capture_output,
start_new_session=True,
)
print(f"Command completed with return code: {result.returncode}")
except subprocess.CalledProcessError as e:
sys.stderr.write(f"Command failed with return code {e.returncode}: {e}\n")
except subprocess.TimeoutExpired:
sys.stderr.write(f"\n[isaaclab-arena] Subprocess timed out after {timeout_sec}s\n")
_AT_LEAST_ONE_TEST_FAILED = True
raise e
raise subprocess.SubprocessError(f"Subprocess timed out after {timeout_sec}s: {cmd}")
finally:
_kill_kit_orphans()

print(f"Command completed with return code: {result.returncode}")
if result.returncode != 0:
sys.stderr.write(f"Command failed with return code {result.returncode}\n")
if capture_output and result.stderr:
sys.stderr.write(result.stderr)
_AT_LEAST_ONE_TEST_FAILED = True
raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)

if capture_output:
return result
return None


class _IsolatedArgv:
Expand Down Expand Up @@ -108,7 +203,7 @@ def get_persistent_simulation_app(headless: bool, enable_cameras: bool = False)
first_headless, first_enable_cameras = _PERSISTENT_INIT_ARGS
if (headless != first_headless) or (enable_cameras != first_enable_cameras):
print(
"[isaac-arena] Warning: persistent SimulationApp already initialized with "
"[isaaclab-arena] Warning: persistent SimulationApp already initialized with "
f"headless={first_headless}, enable_cameras={first_enable_cameras}. "
"Ignoring new values."
)
Expand Down
45 changes: 40 additions & 5 deletions isaaclab_arena/utils/isaaclab_utils/simulation_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,26 @@ def reapply_viewer_cfg(env) -> None:
vcc.update_view_location()


def _kill_child_processes() -> None:
"""SIGKILL all direct child processes of the current process via /proc."""
import signal

my_pid = os.getpid()
with suppress(FileNotFoundError, PermissionError):
for entry in os.scandir("/proc"):
if not entry.name.isdigit():
continue
try:
with open(f"/proc/{entry.name}/status") as f:
for line in f:
if line.startswith("PPid:"):
if int(line.split()[1]) == my_pid:
os.kill(int(entry.name), signal.SIGKILL)
break
except (FileNotFoundError, PermissionError, ProcessLookupError, ValueError):
continue


class SimulationAppContext:
"""Context manager for launching and closing a simulation app."""

Expand All @@ -110,15 +130,30 @@ def __enter__(self):

def __exit__(self, exc_type, exc_val, exc_tb):
print("Closing simulation app")
# app_launcher.close() will terminate the whole process with exit code 0, i.e. preventing errors from being seen by the caller. There are seemingly no ways around this.
# As a workaround, we call os._exit(1) that terminates immediately. The downside is that any cleanup would be omitted
if exc_type is None:
self.app_launcher.app.close()
else:
if exc_type is not None:
print(f"Exception caught in SimulationAppContext: {exc_type.__name__}: {exc_val}")
print("Traceback:")
traceback.print_exception(exc_type, exc_val, exc_tb)
print("Killing the process without cleaning up")
sys.stdout.flush()
sys.stderr.flush()
os._exit(1)

# When launched as a test subprocess, skip app.close() which can hang
# indefinitely in Kit's shutdown path.
if os.environ.get("ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE") == "1":
print("Force-exiting subprocess (ISAACLAB_ARENA_FORCE_EXIT_ON_COMPLETE=1)")
sys.stdout.flush()
sys.stderr.flush()
# SIGKILL orphaned Kit children (shader compiler, GPU workers, …)
# so they don't hold GPU resources and block the next test subprocess.
# We target each child individually via /proc to avoid signalling
# ourselves (Kit installs a C-level SIGTERM handler that overrides
# Python's SIG_IGN, so os.killpg is not safe here).
_kill_child_processes()
os._exit(0)

# Normal interactive / non-test path: attempt a clean Kit shutdown.
# app.close() may terminate the process with exit code 0 regardless of
# errors — see the error branch above for the workaround.
self.app_launcher.app.close()
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def test_g1_locomanip_gr00t_closedloop_policy_runner_eval_runner(gr00t_finetuned
"name": "gr1_open_microwave_cracker_box",
"arena_env_args": {
"environment": "gr1_open_microwave",
"num_envs": 10,
"num_envs": 3,
"object": "cracker_box",
"embodiment": "gr1_joint",
},
Expand Down
Loading