18 changes: 18 additions & 0 deletions .github/workflows/test-multi-gpu.yaml
@@ -20,6 +20,8 @@ on:
      - "source/isaaclab/isaaclab/app/app_launcher.py"
      - "source/isaaclab_tasks/isaaclab_tasks/utils/sim_launcher.py"
      - "scripts/reinforcement_learning/**/train.py"
      - "source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py"
      - "source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py"
      - ".github/workflows/test-multi-gpu.yaml"
  workflow_dispatch:

@@ -28,6 +30,22 @@ concurrency:
  cancel-in-progress: true

jobs:
  test-fabric-multi-gpu:
    name: FabricFrameView multi-GPU unit tests
    runs-on: [self-hosted, linux, x64, gpu, multi-gpu]
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install Isaac Lab
        run: ./isaaclab.sh --install
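
      # Pre-flight check referenced by the test suite's skip logic. Sketch only:
      # it assumes "./isaaclab.sh -p" forwards its arguments to python, as the
      # pytest invocation below already relies on.
      - name: Verify runner exposes 2+ GPUs
        run: ./isaaclab.sh -p -c "import torch; assert torch.cuda.device_count() >= 2"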

      - name: Run FabricFrameView multi-GPU unit tests
        run: |
          ./isaaclab.sh -p -m pytest -m multi_gpu \
            source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py -v

  test-multi-gpu:
    name: Multi-GPU (${{ matrix.physics }}, ${{ matrix.renderer }})
    # Use dedicated multi-GPU runner to avoid blocking standard CI resources
1 change: 1 addition & 0 deletions pyproject.toml
@@ -139,6 +139,7 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty"

markers = [
    "isaacsim_ci: mark test to run in isaacsim ci",
    "multi_gpu: tests that require 2+ GPUs; skipped automatically on single-GPU machines",
]
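
The new marker is enforced at runtime by the ``_skip_if_unavailable`` helper in the test file below. An equivalent repo-wide enforcement could live in a ``conftest.py`` hook instead; a sketch of that alternative (hypothetical, not part of this PR):

    # conftest.py sketch: auto-skip multi_gpu tests when fewer than 2 GPUs are visible.
    import pytest
    import torch

    def pytest_collection_modifyitems(config, items):
        if torch.cuda.device_count() >= 2:
            return
        skip_marker = pytest.mark.skip(reason="requires 2+ GPUs")
        for item in items:
            if "multi_gpu" in item.keywords:
                item.add_marker(skip_marker)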

# Add pypi.nvidia.com so that `uv pip install isaaclab[isaacsim]` works without --extra-index-url.
27 changes: 27 additions & 0 deletions source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst
@@ -0,0 +1,27 @@
Changed
^^^^^^^

* Combined the initial USD→Fabric sync in
  :class:`~isaaclab_physx.sim.views.FabricFrameView` into a single Fabric
  write so ``PrepareForReuse`` is invoked exactly once per logical update
  (positions, orientations, and scales are composed in one kernel launch).
  This avoids the possibility of a second non-idempotent
  ``PrepareForReuse`` call masking a topology-change signal that should
  have triggered a fabricarray rebuild.

Fixed
^^^^^

* Fixed :class:`~isaaclab_physx.sim.views.FabricFrameView` falling back to
  the slow USD path on every CUDA device other than ``cuda:0``. USDRT
  ``SelectPrims`` now accepts any CUDA device index, so Fabric acceleration
  runs on the simulation device the view was constructed with (e.g.
  ``cuda:1``). This unblocks distributed training where each rank is
  pinned to a non-primary GPU.

* Fixed the topology-change invariant guard in
  :class:`~isaaclab_physx.sim.views.FabricFrameView` not surviving
  ``python -O``. The check now raises :class:`RuntimeError` instead of
  using ``assert``, so the prim-count mismatch between view and Fabric is
  reported at every optimization level rather than silently producing
  wrong poses or out-of-bounds kernel indices.
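
For context on why the guard moved off ``assert``: running Python with ``-O`` sets ``__debug__`` to ``False`` and strips ``assert`` statements at compile time, so an invariant expressed that way can never fire in optimized deployments. A minimal standalone illustration (hypothetical function names, not the view code):

    # Run with: python -O sketch.py
    def guard_with_assert(count: int, expected: int) -> None:
        assert count == expected, "compiled out under -O, so this never raises there"

    def guard_with_raise(count: int, expected: int) -> None:
        if count != expected:  # survives every optimization level
            raise RuntimeError(f"Prim count changed ({count} vs {expected}).")

    guard_with_assert(3, 2)  # under -O: passes silently
    guard_with_raise(3, 2)   # always raises RuntimeError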
100 changes: 29 additions & 71 deletions source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py
@@ -23,12 +23,6 @@

logger = logging.getLogger(__name__)

# TODO: extend this to ``cuda:N`` once we wire up multi-GPU support for the view.
# Recent Kit / USDRT releases do support multi-GPU ``SelectPrims``, but the
# rest of the FabricFrameView wiring (selections, indexed arrays, etc.) still
# assumes a single device — to be tackled in a follow-up.
_fabric_supported_devices = ("cpu", "cuda", "cuda:0")


def _to_float32_2d(a: wp.array | torch.Tensor) -> wp.array | torch.Tensor:
    """Ensure array is compatible with Fabric kernels (2-D float32).
@@ -92,15 +86,6 @@ def __init__(
        settings = SettingsManager.instance()
        self._use_fabric = bool(settings.get("/physics/fabricEnabled", False))

        if self._use_fabric and self._device not in _fabric_supported_devices:
            logger.warning(
                f"Fabric mode is not supported on device '{self._device}'. "
                "USDRT SelectPrims and Warp fabric arrays are currently "
                f"only supported on {', '.join(_fabric_supported_devices)}. "
                "Falling back to standard USD operations. This may impact performance."
            )
            self._use_fabric = False

        self._fabric_initialized = False
        self._fabric_usd_sync_done = False
        self._fabric_selection = None
@@ -149,43 +134,7 @@ def set_world_poses(self, positions=None, orientations=None, indices=None):
        if not self._use_fabric:
            self._usd_view.set_world_poses(positions, orientations, indices)
            return

        if not self._fabric_initialized:
            self._initialize_fabric()

        self._prepare_for_reuse()

        indices_wp = self._resolve_indices_wp(indices)
        count = indices_wp.shape[0]

        dummy = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
        positions_wp = _to_float32_2d(positions) if positions is not None else dummy
        orientations_wp = (
            _to_float32_2d(orientations)
            if orientations is not None
            else wp.zeros((0, 4), dtype=wp.float32, device=self._device)
        )

        wp.launch(
            kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays,
            dim=count,
            inputs=[
                self._fabric_world_matrices,
                positions_wp,
                orientations_wp,
                dummy,
                False,
                False,
                False,
                indices_wp,
                self._view_to_fabric,
            ],
            device=self._fabric_device,
        )
        wp.synchronize()

        self._fabric_hierarchy.update_world_xforms()
        self._fabric_usd_sync_done = True
        self._compose_fabric_transform(positions=positions, orientations=orientations, indices=indices)

    def get_world_poses(self, indices: wp.array | None = None) -> tuple[ProxyArray, ProxyArray]:
        if not self._use_fabric:
@@ -244,7 +193,15 @@ def set_scales(self, scales, indices=None):
        if not self._use_fabric:
            self._usd_view.set_scales(scales, indices)
            return
        self._compose_fabric_transform(scales=scales, indices=indices)

    def _compose_fabric_transform(self, positions=None, orientations=None, scales=None, indices=None):
        """Write the given subset of (position, orientation, scale) into Fabric in one kernel launch.

        Components left as ``None`` are skipped via empty input arrays — the kernel reads them
        from the existing Fabric matrix. Always invokes :meth:`_prepare_for_reuse` exactly once
        per write, even when multiple components are updated together.
        """
        if not self._fabric_initialized:
            self._initialize_fabric()

@@ -253,17 +210,19 @@
        indices_wp = self._resolve_indices_wp(indices)
        count = indices_wp.shape[0]

        dummy3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
        dummy4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
        scales_wp = _to_float32_2d(scales)
        empty3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
💡 Good consolidation: Merging the old set_world_poses Fabric path and set_scales Fabric path into _compose_fabric_transform is a clean refactor. The "empty array means skip" convention with the kernel is elegant.

One subtle point: _sync_fabric_from_usd_once now also calls _compose_fabric_transform (which calls _prepare_for_reuse). Previously it called set_world_poses + set_scales which would invoke _prepare_for_reuse twice. The consolidation to one call is strictly better — worth noting in the changelog (which you did ✓).
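
To make the "empty array means skip" convention concrete, here is a pure-Python stand-in for the control flow (illustrative only; the real logic lives in the Warp kernel ``compose_fabric_transformation_matrix_from_warp_arrays``):

    import numpy as np

    def compose(existing: dict, positions: np.ndarray, orientations: np.ndarray, scales: np.ndarray) -> dict:
        # A zero-length input array means "keep the component already in Fabric".
        if positions.shape[0] > 0:
            existing["translation"] = positions
        if orientations.shape[0] > 0:
            existing["rotation"] = orientations
        if scales.shape[0] > 0:
            existing["scale"] = scales
        return existing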

        empty4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
        positions_wp = _to_float32_2d(positions) if positions is not None else empty3
        orientations_wp = _to_float32_2d(orientations) if orientations is not None else empty4
        scales_wp = _to_float32_2d(scales) if scales is not None else empty3

        wp.launch(
            kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays,
            dim=count,
            inputs=[
                self._fabric_world_matrices,
                dummy3,
                dummy4,
                positions_wp,
                orientations_wp,
                scales_wp,
                False,
                False,
@@ -347,10 +306,11 @@ def _rebuild_fabric_arrays(self) -> None:
        pattern (via ``_usd_view.count``) and does not change when Fabric rearranges its
        internal memory layout. The check below guards this invariant.
        """
        assert self.count == self._default_view_indices.shape[0], (
            f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
            "Fabric topology change added/removed tracked prims — full re-initialization required."
        )
        if self.count != self._default_view_indices.shape[0]:
            raise RuntimeError(
                f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
                "Fabric topology change added/removed tracked prims — full re-initialization required."
            )
        self._view_to_fabric = wp.zeros((self.count,), dtype=wp.uint32, device=self._fabric_device)
        self._fabric_to_view = wp.fabricarray(self._fabric_selection, self._view_index_attr)
@@ -404,9 +364,6 @@ def _initialize_fabric(self) -> None:
        )
        wp.synchronize()

        # The constructor should have taken care of this, but double check here to avoid regressions
        assert self._device in _fabric_supported_devices

        self._fabric_selection = fabric_stage.SelectPrims(
            require_attrs=[
                (usdrt.Sdf.ValueTypeNames.UInt, self._view_index_attr, usdrt.Usd.Access.Read),
@@ -442,19 +399,20 @@ def _initialize_fabric(self) -> None:
    def _sync_fabric_from_usd_once(self) -> None:
        """Sync Fabric world matrices from USD once, on the first read.

        ``set_world_poses`` and ``set_scales`` each set ``_fabric_usd_sync_done``
        themselves, so no explicit flag assignment is needed here.
        Combines position/orientation/scale into a single Fabric write so
        :meth:`_prepare_for_reuse` (and its underlying ``PrepareForReuse``) is invoked
        exactly once across the full sync.
        """
        if not self._fabric_initialized:
            self._initialize_fabric()

        positions_usd_ta, orientations_usd_ta = self._usd_view.get_world_poses()
        positions_usd = positions_usd_ta.warp
        orientations_usd = orientations_usd_ta.warp
        scales_usd = self._usd_view.get_scales()

        self.set_world_poses(positions_usd, orientations_usd)
        self.set_scales(scales_usd)
        self._compose_fabric_transform(
            positions=positions_usd_ta.warp,
            orientations=orientations_usd_ta.warp,
            scales=scales_usd,
        )

    def _resolve_indices_wp(self, indices: wp.array | None) -> wp.array:
        """Resolve view indices as a Warp uint32 array."""
100 changes: 97 additions & 3 deletions source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py
@@ -17,6 +17,12 @@

💡 sys.argv stripping: This is a pragmatic workaround for Kit's argv parsing. Consider adding a note that this must remain before AppLauncher — if someone reorders imports or adds a CLI parser above, the protection breaks.

Alternatively, AppLauncher itself could strip unknown flags before passing to Kit (upstream fix), but that's outside this PR's scope.
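
A sketch of that upstream alternative, assuming an argparse-based entry point (hypothetical; AppLauncher's real argument handling may differ):

    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--headless", action="store_true")
    _, unknown = parser.parse_known_args()
    if unknown:
        # Drop flags Kit would misparse (e.g. pytest's "-m multi_gpu").
        sys.argv = [sys.argv[0]] + [a for a in sys.argv[1:] if a not in unknown]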

from isaaclab.app import AppLauncher

# Kit reads ``sys.argv`` directly during startup and segfaults on pytest flags
# (e.g. ``-m multi_gpu``) that collide with its own short options. Strip
# everything but ``argv[0]`` before booting the app — the test file takes no
# CLI arguments of its own.
sys.argv = sys.argv[:1]

simulation_app = AppLauncher(headless=True).app

import pytest # noqa: E402
@@ -44,8 +50,19 @@ def test_setup_teardown():


def _skip_if_unavailable(device: str):
    if device.startswith("cuda") and not torch.cuda.is_available():
    if not device.startswith("cuda"):
        return
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
🟡 Skip-vs-fail comment conflicts with implementation: The code comment says "Always skip rather than fail" and explains why. But the PR description says:

In CI (GITHUB_ACTIONS=true), a missing cuda:1 becomes pytest.fail

The actual code here always skips. Was the CI-fail behavior removed during development? The PR description should be updated to match the final implementation if it changed.
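
For reference, the fail-in-CI variant described in the PR text would look roughly like this (a sketch, not the shipped code; ``_require_or_skip`` is a hypothetical name):

    import os

    import pytest

    def _require_or_skip(device: str, idx: int, n: int) -> None:
        # Fail loudly on CI runners that advertise multi-GPU support but lack
        # the device; skip everywhere else.
        if idx >= n:
            msg = f"{device} not available (device_count={n})"
            if os.environ.get("GITHUB_ACTIONS") == "true":
                pytest.fail(msg)
            pytest.skip(msg)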

    idx = int(device.split(":")[1]) if ":" in device else 0
    n = torch.cuda.device_count()
    if idx >= n:
        # Always skip rather than fail: the dedicated multi-GPU workflow does its own
        # pre-flight ``torch.cuda.device_count() >= 2`` check before invoking pytest, so
        # a misconfigured multi-GPU runner is already caught there. Failing here would
        # only break the standard single-GPU CI runners that legitimately can't run
        # ``cuda:1+`` tests.
        pytest.skip(f"{device} not available (device_count={n}) — multi-GPU test skipped")


# ------------------------------------------------------------------
@@ -169,7 +186,7 @@ def test_fabric_set_world_does_not_write_back_to_usd(device, view_factory):

    # Verify Fabric has the new position
    fab_pos, _ = view.get_world_poses()
    pos_torch = wp.to_torch(fab_pos)
    pos_torch = fab_pos.torch
    assert torch.allclose(pos_torch, torch.tensor([[99.0, 99.0, 99.0]], device=device), atol=0.1), (
        f"Fabric should have new position, got {pos_torch}"
    )
@@ -230,6 +247,83 @@ def force_topology_changed():
    # Read back — proves the rebuilt _view_to_fabric and _fabric_world_matrices
    # are still consistent.
    ret_pos, _ = view.get_world_poses()
    pos_torch = wp.to_torch(ret_pos)
    pos_torch = ret_pos.torch
    expected = torch.tensor([[4.0, 5.0, 6.0], [4.0, 5.0, 6.0]], device=device)
    assert torch.allclose(pos_torch, expected, atol=1e-7), f"Read after rebuild failed on {device}: {pos_torch}"


# ------------------------------------------------------------------
# Multi-GPU tests (cuda:1) — skipped automatically on single-GPU workstations
# ------------------------------------------------------------------


@pytest.mark.multi_gpu
@pytest.mark.parametrize("device", ["cuda:1"])
def test_fabric_cuda1_world_pose_roundtrip(device, view_factory):
    """set_world_poses -> get_world_poses roundtrip works on cuda:1.

    Verifies that FabricFrameView operates correctly on a non-primary CUDA
    device without falling back to the USD path.
    """
    _skip_if_unavailable(device)  # enforce the documented auto-skip on single-GPU machines
    bundle = view_factory(2, device)
    view = bundle.view

    new_pos = wp.zeros((2, 3), dtype=wp.float32, device=device)
    wp.launch(kernel=_fill_position, dim=2, inputs=[new_pos, 10.0, 20.0, 30.0], device=device)
    view.set_world_poses(positions=new_pos)

    ret_pos, _ = view.get_world_poses()
    pos_torch = ret_pos.torch
    expected = torch.tensor([[10.0, 20.0, 30.0], [10.0, 20.0, 30.0]], device=device)
    assert torch.allclose(pos_torch, expected, atol=1e-7), f"Roundtrip failed on {device}: {pos_torch}"


@pytest.mark.multi_gpu
@pytest.mark.parametrize("device", ["cuda:1"])
def test_fabric_cuda1_no_usd_writeback(device, view_factory):
    """set_world_poses on cuda:1 does not write back to USD.

    Mirrors test_fabric_set_world_does_not_write_back_to_usd for the cuda:1
    device to confirm the no-writeback invariant holds across GPU indices.
    """
    _skip_if_unavailable(device)
    bundle = view_factory(1, device)
    view = bundle.view

    stage = sim_utils.get_current_stage()
    prim = stage.GetPrimAtPath(view.prim_paths[0])
    xform_cache = UsdGeom.XformCache()
    t_before = xform_cache.GetLocalToWorldTransform(prim).ExtractTranslation()
    orig_usd_pos = torch.tensor([float(t_before[0]), float(t_before[1]), float(t_before[2])])

    new_pos = wp.zeros((1, 3), dtype=wp.float32, device=device)
    wp.launch(kernel=_fill_position, dim=1, inputs=[new_pos, 99.0, 99.0, 99.0], device=device)
    view.set_world_poses(positions=new_pos)

    # USD must not have moved at all — equality, not approximate.
    t_after = UsdGeom.XformCache().GetLocalToWorldTransform(prim).ExtractTranslation()
    usd_pos_after = torch.tensor([float(t_after[0]), float(t_after[1]), float(t_after[2])])
    assert torch.allclose(usd_pos_after, orig_usd_pos, atol=0.0), (
        f"USD wrote back on {device}: expected {orig_usd_pos}, got {usd_pos_after}"
    )


@pytest.mark.multi_gpu
@pytest.mark.parametrize("device", ["cuda:1"])
def test_fabric_cuda1_scales_roundtrip(device, view_factory):
    """set_scales -> get_scales roundtrip works on cuda:1.

    Both write paths (``set_world_poses`` and ``set_scales``) call
    ``_prepare_for_reuse`` and launch on ``self._device``; this test covers
    the scales path on the non-primary CUDA device.
    """
    _skip_if_unavailable(device)
    bundle = view_factory(2, device)
    view = bundle.view

    new_scales = wp.zeros((2, 3), dtype=wp.float32, device=device)
    wp.launch(kernel=_fill_position, dim=2, inputs=[new_scales, 2.0, 3.0, 4.0], device=device)
    view.set_scales(new_scales)

    ret_scales = view.get_scales()
    scales_torch = wp.to_torch(ret_scales)
    expected = torch.tensor([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]], device=device)
    assert torch.allclose(scales_torch, expected, atol=1e-7), f"Scales roundtrip failed on {device}: {scales_torch}"
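
To run just these tests locally, use the same invocation as the new workflow job: ``./isaaclab.sh -p -m pytest -m multi_gpu source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py -v``. On a single-GPU workstation the three ``cuda:1`` tests report as skipped rather than failed.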