diff --git a/.github/workflows/test-multi-gpu.yaml b/.github/workflows/test-multi-gpu.yaml
index e9bee1c4ed2d..3158deafaf77 100644
--- a/.github/workflows/test-multi-gpu.yaml
+++ b/.github/workflows/test-multi-gpu.yaml
@@ -20,6 +20,8 @@ on:
       - "source/isaaclab/isaaclab/app/app_launcher.py"
       - "source/isaaclab_tasks/isaaclab_tasks/utils/sim_launcher.py"
       - "scripts/reinforcement_learning/**/train.py"
+      - "source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py"
+      - "source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py"
       - ".github/workflows/test-multi-gpu.yaml"
 
   workflow_dispatch:
@@ -28,6 +30,27 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  test-fabric-multi-gpu:
+    name: FabricFrameView multi-GPU unit tests
+    runs-on: [self-hosted, linux, x64, gpu, multi-gpu]
+    timeout-minutes: 30
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Isaac Lab
+        run: ./isaaclab.sh --install
+
+      - name: Verify runner exposes at least 2 GPUs
+        run: |
+          # Pre-flight check referenced by the skip note in test_views_xform_prim_fabric.py.
+          ./isaaclab.sh -p -c "import torch; assert torch.cuda.device_count() >= 2"
+
+      - name: Run FabricFrameView multi-GPU unit tests
+        run: |
+          ./isaaclab.sh -p -m pytest -m multi_gpu \
+            source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py -v
+
   test-multi-gpu:
     name: Multi-GPU (${{ matrix.physics }}, ${{ matrix.renderer }})
     # Use dedicated multi-GPU runner to avoid blocking standard CI resources
diff --git a/pyproject.toml b/pyproject.toml
index 513ce684d93f..ea5c556598c0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -139,6 +139,7 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty"
 
 markers = [
     "isaacsim_ci: mark test to run in isaacsim ci",
+    "multi_gpu: tests that require 2+ GPUs; skipped automatically on single-GPU machines",
 ]
 
 # Add pypi.nvidia.com so that `uv pip install isaaclab[isaacsim]` works without --extra-index-url.
diff --git a/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst b/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst
new file mode 100644
index 000000000000..b710cb27489d
--- /dev/null
+++ b/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst
@@ -0,0 +1,39 @@
+Changed
+^^^^^^^
+
+* Combined the initial USD→Fabric sync in
+  :class:`~isaaclab_physx.sim.views.FabricFrameView` into a single Fabric
+  write so ``PrepareForReuse`` is invoked exactly once per logical update
+  (positions, orientations, and scales are composed in one kernel launch).
+  This avoids the possibility of a second non-idempotent
+  ``PrepareForReuse`` call masking a topology-change signal that should
+  have triggered a fabricarray rebuild.
+
+Fixed
+^^^^^
+
+* Fixed :class:`~isaaclab_physx.sim.views.FabricFrameView` falling back to
+  the slow USD path on every CUDA device other than ``cuda:0``. USDRT
+  ``SelectPrims`` now accepts any CUDA device index, so Fabric acceleration
+  runs on the simulation device the view was constructed with (e.g.
+  ``cuda:1``). This unblocks distributed training where each rank is
+  pinned to a non-primary GPU.
+
+* Fixed the topology-change invariant guard in
+  :class:`~isaaclab_physx.sim.views.FabricFrameView` not surviving
+  ``python -O``. The check now raises :exc:`RuntimeError` instead of
+  using ``assert`` so the prim-count mismatch between view and Fabric is
+  reported at every optimization level rather than silently producing
+  wrong poses or out-of-bounds kernel indices.
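+
+A minimal sketch of the unblocked write path (the view construction is
+elided; ``view`` is assumed to be a ``FabricFrameView`` created with
+``device="cuda:1"``):
+
+.. code-block:: python
+
+    import warp as wp
+
+    # Positions live on cuda:1; the write now stays on the Fabric fast path.
+    positions = wp.zeros((view.count, 3), dtype=wp.float32, device="cuda:1")
+    view.set_world_poses(positions=positions)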
diff --git a/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py b/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py index 1bcff86d57ac..4cbb7161b2bc 100644 --- a/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py +++ b/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py @@ -23,12 +23,6 @@ logger = logging.getLogger(__name__) -# TODO: extend this to ``cuda:N`` once we wire up multi-GPU support for the view. -# Recent Kit / USDRT releases do support multi-GPU ``SelectPrims``, but the -# rest of the FabricFrameView wiring (selections, indexed arrays, etc.) still -# assumes a single device — to be tackled in a follow-up. -_fabric_supported_devices = ("cpu", "cuda", "cuda:0") - def _to_float32_2d(a: wp.array | torch.Tensor) -> wp.array | torch.Tensor: """Ensure array is compatible with Fabric kernels (2-D float32). @@ -92,15 +86,6 @@ def __init__( settings = SettingsManager.instance() self._use_fabric = bool(settings.get("/physics/fabricEnabled", False)) - if self._use_fabric and self._device not in _fabric_supported_devices: - logger.warning( - f"Fabric mode is not supported on device '{self._device}'. " - "USDRT SelectPrims and Warp fabric arrays are currently " - f"only supported on {', '.join(_fabric_supported_devices)}. " - "Falling back to standard USD operations. This may impact performance." - ) - self._use_fabric = False - self._fabric_initialized = False self._fabric_usd_sync_done = False self._fabric_selection = None @@ -149,43 +134,7 @@ def set_world_poses(self, positions=None, orientations=None, indices=None): if not self._use_fabric: self._usd_view.set_world_poses(positions, orientations, indices) return - - if not self._fabric_initialized: - self._initialize_fabric() - - self._prepare_for_reuse() - - indices_wp = self._resolve_indices_wp(indices) - count = indices_wp.shape[0] - - dummy = wp.zeros((0, 3), dtype=wp.float32, device=self._device) - positions_wp = _to_float32_2d(positions) if positions is not None else dummy - orientations_wp = ( - _to_float32_2d(orientations) - if orientations is not None - else wp.zeros((0, 4), dtype=wp.float32, device=self._device) - ) - - wp.launch( - kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays, - dim=count, - inputs=[ - self._fabric_world_matrices, - positions_wp, - orientations_wp, - dummy, - False, - False, - False, - indices_wp, - self._view_to_fabric, - ], - device=self._fabric_device, - ) - wp.synchronize() - - self._fabric_hierarchy.update_world_xforms() - self._fabric_usd_sync_done = True + self._compose_fabric_transform(positions=positions, orientations=orientations, indices=indices) def get_world_poses(self, indices: wp.array | None = None) -> tuple[ProxyArray, ProxyArray]: if not self._use_fabric: @@ -244,7 +193,15 @@ def set_scales(self, scales, indices=None): if not self._use_fabric: self._usd_view.set_scales(scales, indices) return + self._compose_fabric_transform(scales=scales, indices=indices) + + def _compose_fabric_transform(self, positions=None, orientations=None, scales=None, indices=None): + """Write the given subset of (position, orientation, scale) into Fabric in one kernel launch. + Components left as ``None`` are skipped via empty input arrays — the kernel reads them + from the existing Fabric matrix. Always invokes :meth:`_prepare_for_reuse` exactly once + per write, even when multiple components are updated together. 
+        """
         if not self._fabric_initialized:
             self._initialize_fabric()
 
@@ -253,17 +210,19 @@
         indices_wp = self._resolve_indices_wp(indices)
         count = indices_wp.shape[0]
 
-        dummy3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
-        dummy4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
-        scales_wp = _to_float32_2d(scales)
+        empty3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
+        empty4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
+        positions_wp = _to_float32_2d(positions) if positions is not None else empty3
+        orientations_wp = _to_float32_2d(orientations) if orientations is not None else empty4
+        scales_wp = _to_float32_2d(scales) if scales is not None else empty3
 
         wp.launch(
             kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays,
             dim=count,
             inputs=[
                 self._fabric_world_matrices,
-                dummy3,
-                dummy4,
+                positions_wp,
+                orientations_wp,
                 scales_wp,
                 False,
                 False,
@@ -347,10 +306,11 @@ def _rebuild_fabric_arrays(self) -> None:
         pattern (via ``_usd_view.count``) and does not change when Fabric rearranges
-        its internal memory layout. The assertion below guards this invariant.
+        its internal memory layout. The check below guards this invariant.
         """
-        assert self.count == self._default_view_indices.shape[0], (
-            f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
-            "Fabric topology change added/removed tracked prims — full re-initialization required."
-        )
+        if self.count != self._default_view_indices.shape[0]:
+            raise RuntimeError(
+                f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
+                "Fabric topology change added/removed tracked prims — full re-initialization required."
+            )
 
         self._view_to_fabric = wp.zeros((self.count,), dtype=wp.uint32, device=self._fabric_device)
         self._fabric_to_view = wp.fabricarray(self._fabric_selection, self._view_index_attr)
@@ -404,9 +364,6 @@ def _initialize_fabric(self) -> None:
         )
         wp.synchronize()
 
-        # The constructor should have taken care of this, but double check here to avoid regressions
-        assert self._device in _fabric_supported_devices
-
         self._fabric_selection = fabric_stage.SelectPrims(
             require_attrs=[
                 (usdrt.Sdf.ValueTypeNames.UInt, self._view_index_attr, usdrt.Usd.Access.Read),
@@ -442,19 +399,20 @@ def _sync_fabric_from_usd_once(self) -> None:
         """Sync Fabric world matrices from USD once, on the first read.
 
-        ``set_world_poses`` and ``set_scales`` each set ``_fabric_usd_sync_done``
-        themselves, so no explicit flag assignment is needed here.
+        Combines position/orientation/scale into a single Fabric write so
+        :meth:`_prepare_for_reuse` (and its underlying ``PrepareForReuse``) is invoked
+        exactly once across the full sync.
""" if not self._fabric_initialized: self._initialize_fabric() positions_usd_ta, orientations_usd_ta = self._usd_view.get_world_poses() - positions_usd = positions_usd_ta.warp - orientations_usd = orientations_usd_ta.warp scales_usd = self._usd_view.get_scales() - - self.set_world_poses(positions_usd, orientations_usd) - self.set_scales(scales_usd) + self._compose_fabric_transform( + positions=positions_usd_ta.warp, + orientations=orientations_usd_ta.warp, + scales=scales_usd, + ) def _resolve_indices_wp(self, indices: wp.array | None) -> wp.array: """Resolve view indices as a Warp uint32 array.""" diff --git a/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py b/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py index f0c18ccb98c7..a812be530c8e 100644 --- a/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py +++ b/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py @@ -17,6 +17,12 @@ from isaaclab.app import AppLauncher +# Kit reads ``sys.argv`` directly during startup and segfaults on pytest flags +# (e.g. ``-m multi_gpu``) that collide with its own short options. Strip +# everything but ``argv[0]`` before booting the app — the test file takes no +# CLI arguments of its own. +sys.argv = sys.argv[:1] + simulation_app = AppLauncher(headless=True).app import pytest # noqa: E402 @@ -44,8 +50,19 @@ def test_setup_teardown(): def _skip_if_unavailable(device: str): - if device.startswith("cuda") and not torch.cuda.is_available(): + if not device.startswith("cuda"): + return + if not torch.cuda.is_available(): pytest.skip("CUDA not available") + idx = int(device.split(":")[1]) if ":" in device else 0 + n = torch.cuda.device_count() + if idx >= n: + # Always skip rather than fail: the dedicated multi-GPU workflow does its own + # pre-flight ``torch.cuda.device_count() >= 2`` check before invoking pytest, so + # a misconfigured multi-GPU runner is already caught there. Failing here would + # only break the standard single-GPU CI runners that legitimately can't run + # ``cuda:1+`` tests. + pytest.skip(f"{device} not available (device_count={n}) — multi-GPU test skipped") # ------------------------------------------------------------------ @@ -169,7 +186,7 @@ def test_fabric_set_world_does_not_write_back_to_usd(device, view_factory): # Verify Fabric has the new position fab_pos, _ = view.get_world_poses() - pos_torch = wp.to_torch(fab_pos) + pos_torch = fab_pos.torch assert torch.allclose(pos_torch, torch.tensor([[99.0, 99.0, 99.0]], device=device), atol=0.1), ( f"Fabric should have new position, got {pos_torch}" ) @@ -230,6 +247,83 @@ def force_topology_changed(): # Read back — proves the rebuilt _view_to_fabric and _fabric_world_matrices # are still consistent. ret_pos, _ = view.get_world_poses() - pos_torch = wp.to_torch(ret_pos) + pos_torch = ret_pos.torch expected = torch.tensor([[4.0, 5.0, 6.0], [4.0, 5.0, 6.0]], device=device) assert torch.allclose(pos_torch, expected, atol=1e-7), f"Read after rebuild failed on {device}: {pos_torch}" + + +# ------------------------------------------------------------------ +# Multi-GPU tests (cuda:1) — skipped automatically on single-GPU workstations +# ------------------------------------------------------------------ + + +@pytest.mark.multi_gpu +@pytest.mark.parametrize("device", ["cuda:1"]) +def test_fabric_cuda1_world_pose_roundtrip(device, view_factory): + """set_world_poses -> get_world_poses roundtrip works on cuda:1. 
+
+    Verifies that FabricFrameView operates correctly on a non-primary CUDA
+    device without falling back to the USD path.
+    """
+    bundle = view_factory(2, device)
+    view = bundle.view
+
+    new_pos = wp.zeros((2, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=2, inputs=[new_pos, 10.0, 20.0, 30.0], device=device)
+    view.set_world_poses(positions=new_pos)
+
+    ret_pos, _ = view.get_world_poses()
+    pos_torch = ret_pos.torch
+    expected = torch.tensor([[10.0, 20.0, 30.0], [10.0, 20.0, 30.0]], device=device)
+    assert torch.allclose(pos_torch, expected, atol=1e-7), f"Roundtrip failed on {device}: {pos_torch}"
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_no_usd_writeback(device, view_factory):
+    """set_world_poses on cuda:1 does not write back to USD.
+
+    Mirrors test_fabric_set_world_does_not_write_back_to_usd for the cuda:1
+    device to confirm the no-writeback invariant holds across GPU indices.
+    """
+    bundle = view_factory(1, device)
+    view = bundle.view
+
+    stage = sim_utils.get_current_stage()
+    prim = stage.GetPrimAtPath(view.prim_paths[0])
+    xform_cache = UsdGeom.XformCache()
+    t_before = xform_cache.GetLocalToWorldTransform(prim).ExtractTranslation()
+    orig_usd_pos = torch.tensor([float(t_before[0]), float(t_before[1]), float(t_before[2])])
+
+    new_pos = wp.zeros((1, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=1, inputs=[new_pos, 99.0, 99.0, 99.0], device=device)
+    view.set_world_poses(positions=new_pos)
+
+    # USD must not have moved at all — equality, not approximate.
+    t_after = UsdGeom.XformCache().GetLocalToWorldTransform(prim).ExtractTranslation()
+    usd_pos_after = torch.tensor([float(t_after[0]), float(t_after[1]), float(t_after[2])])
+    assert torch.equal(usd_pos_after, orig_usd_pos), (
+        f"USD wrote back on {device}: expected {orig_usd_pos}, got {usd_pos_after}"
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_scales_roundtrip(device, view_factory):
+    """set_scales -> get_scales roundtrip works on cuda:1.
+
+    Both write paths (``set_world_poses`` and ``set_scales``) call
+    ``_prepare_for_reuse`` and launch on ``self._fabric_device``; this test
+    covers the scales path on the non-primary CUDA device.
+    """
+    bundle = view_factory(2, device)
+    view = bundle.view
+
+    new_scales = wp.zeros((2, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=2, inputs=[new_scales, 2.0, 3.0, 4.0], device=device)
+    view.set_scales(new_scales)
+
+    ret_scales = view.get_scales()
+    scales_torch = wp.to_torch(ret_scales)
+    expected = torch.tensor([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]], device=device)
+    assert torch.allclose(scales_torch, expected, atol=1e-7), f"Scales roundtrip failed on {device}: {scales_torch}"
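
A note on the ``multi_gpu`` marker: the description added to ``pyproject.toml``
promises that marked tests are "skipped automatically on single-GPU machines",
which this diff delivers through in-test device checks such as
``_skip_if_unavailable``. A repository-wide alternative is a pytest collection
hook; the following ``conftest.py`` sketch is hypothetical (not part of this
change set) and assumes no equivalent hook already exists:

```python
# conftest.py (hypothetical sketch, not part of the diff above): auto-skip
# tests marked ``multi_gpu`` when fewer than two CUDA devices are visible,
# matching the marker description in pyproject.toml.
import pytest
import torch


def pytest_collection_modifyitems(config, items):
    if torch.cuda.device_count() >= 2:
        return  # enough GPUs: run multi_gpu tests normally
    skip_multi_gpu = pytest.mark.skip(reason="requires 2+ GPUs")
    for item in items:
        if "multi_gpu" in item.keywords:
            item.add_marker(skip_multi_gpu)
```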