Commit a6cd73e

Enable FabricFrameView on non-primary GPUs
- Allow FabricFrameView to run on cuda:N for any N; USDRT SelectPrims no longer needs cuda:0.
- Refactor the Fabric write path into a single _compose_fabric_transform helper shared by set_world_poses, set_scales, and the initial USD->Fabric sync, collapsing the sync to one kernel launch with one PrepareForReuse.
- Replace the topology-invariant assert with RuntimeError so it survives python -O.
- Add multi_gpu pytest marker plus cuda:1 unit-test coverage for both Fabric write paths, and run them in the existing test-multi-gpu CI job (one extra step, no new job).
1 parent efd9d1e commit a6cd73e

5 files changed: 154 additions & 72 deletions

.github/workflows/test-multi-gpu.yaml

Lines changed: 9 additions & 0 deletions

@@ -20,6 +20,8 @@ on:
       - "source/isaaclab/isaaclab/app/app_launcher.py"
       - "source/isaaclab_tasks/isaaclab_tasks/utils/sim_launcher.py"
       - "scripts/reinforcement_learning/**/train.py"
+      - "source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py"
+      - "source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py"
       - ".github/workflows/test-multi-gpu.yaml"
   workflow_dispatch:

@@ -104,6 +106,13 @@ jobs:
             exit 1
           fi

+      - name: Run FabricFrameView multi-GPU unit tests
+        # Cheap (~tens of seconds) and only depends on torch/warp + 2 GPUs,
+        # so run once per matrix entry rather than carving out a separate job.
+        run: |
+          ./isaaclab.sh -p -m pytest -m multi_gpu \
+            source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py -v
+
       - name: Run multi-GPU training (${{ matrix.physics }}, ${{ matrix.renderer }})
         env:
           NCCL_DEBUG: WARN

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -139,6 +139,7 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty"
 
 markers = [
     "isaacsim_ci: mark test to run in isaacsim ci",
+    "multi_gpu: tests that require 2+ GPUs; skipped automatically on single-GPU machines",
 ]
 
 # Add pypi.nvidia.com so that `uv pip install isaaclab[isaacsim]` works without --extra-index-url.
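
Registering the marker under [tool.pytest.ini_options] keeps pytest from warning about an unknown mark when the new tests use it. For reference, this is standard pytest usage (an illustrative snippet, not code from this commit): a test opts in with the decorator, and the group is selected or excluded on the command line with -m.

import pytest

@pytest.mark.multi_gpu  # marker registered in pyproject.toml above
def test_requires_two_gpus():
    ...

CI selects the group with "-m multi_gpu"; a local run can exclude it with "-m 'not multi_gpu'".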
Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+Changed
+^^^^^^^
+
+* Combined the initial USD→Fabric sync in
+  :class:`~isaaclab_physx.sim.views.FabricFrameView` into a single Fabric
+  write so ``PrepareForReuse`` is invoked exactly once per logical update
+  (positions, orientations, and scales are composed in one kernel launch).
+  This avoids the possibility of a second non-idempotent
+  ``PrepareForReuse`` call masking a topology-change signal that should
+  have triggered a fabricarray rebuild.
+
+Fixed
+^^^^^
+
+* Fixed :class:`~isaaclab_physx.sim.views.FabricFrameView` falling back to
+  the slow USD path on every CUDA device other than ``cuda:0``. USDRT
+  ``SelectPrims`` now accepts any CUDA device index, so Fabric acceleration
+  runs on the simulation device the view was constructed with (e.g.
+  ``cuda:1``). This unblocks distributed training where each rank is
+  pinned to a non-primary GPU.
+
+* Fixed the topology-change invariant guard in
+  :class:`~isaaclab_physx.sim.views.FabricFrameView` not surviving
+  ``python -O``. The check now raises :class:`RuntimeError` instead of
+  using ``assert`` so the prim-count mismatch between view and Fabric is
+  reported at every optimisation level rather than silently producing
+  wrong poses or out-of-bounds kernel indices.
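
Background for the python -O entry above: the -O flag strips assert statements from the bytecode, so an assert-based invariant check simply disappears in optimised runs, while an explicit raise is ordinary control flow and always executes. A minimal stand-alone illustration (not code from this commit):

# guard_demo.py: compare `python guard_demo.py` with `python -O guard_demo.py`
view_count, fabric_count = 3, 4  # deliberately mismatched

# Fires under a normal run, but is compiled away entirely under -O,
# letting execution fall through to the explicit check below.
assert view_count == fabric_count, "prim count mismatch"

# Survives -O: plain control flow, so the mismatch is always reported.
if view_count != fabric_count:
    raise RuntimeError(f"Prim count changed ({view_count} vs {fabric_count}).")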

source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py

Lines changed: 29 additions & 71 deletions

@@ -23,12 +23,6 @@
 
 logger = logging.getLogger(__name__)
 
-# TODO: extend this to ``cuda:N`` once we wire up multi-GPU support for the view.
-# Recent Kit / USDRT releases do support multi-GPU ``SelectPrims``, but the
-# rest of the FabricFrameView wiring (selections, indexed arrays, etc.) still
-# assumes a single device — to be tackled in a follow-up.
-_fabric_supported_devices = ("cpu", "cuda", "cuda:0")
-
 
 def _to_float32_2d(a: wp.array | torch.Tensor) -> wp.array | torch.Tensor:
     """Ensure array is compatible with Fabric kernels (2-D float32).

@@ -92,15 +86,6 @@ def __init__(
         settings = SettingsManager.instance()
         self._use_fabric = bool(settings.get("/physics/fabricEnabled", False))
 
-        if self._use_fabric and self._device not in _fabric_supported_devices:
-            logger.warning(
-                f"Fabric mode is not supported on device '{self._device}'. "
-                "USDRT SelectPrims and Warp fabric arrays are currently "
-                f"only supported on {', '.join(_fabric_supported_devices)}. "
-                "Falling back to standard USD operations. This may impact performance."
-            )
-            self._use_fabric = False
-
         self._fabric_initialized = False
         self._fabric_usd_sync_done = False
         self._fabric_selection = None

@@ -149,43 +134,7 @@ def set_world_poses(self, positions=None, orientations=None, indices=None):
         if not self._use_fabric:
             self._usd_view.set_world_poses(positions, orientations, indices)
             return
-
-        if not self._fabric_initialized:
-            self._initialize_fabric()
-
-        self._prepare_for_reuse()
-
-        indices_wp = self._resolve_indices_wp(indices)
-        count = indices_wp.shape[0]
-
-        dummy = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
-        positions_wp = _to_float32_2d(positions) if positions is not None else dummy
-        orientations_wp = (
-            _to_float32_2d(orientations)
-            if orientations is not None
-            else wp.zeros((0, 4), dtype=wp.float32, device=self._device)
-        )
-
-        wp.launch(
-            kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays,
-            dim=count,
-            inputs=[
-                self._fabric_world_matrices,
-                positions_wp,
-                orientations_wp,
-                dummy,
-                False,
-                False,
-                False,
-                indices_wp,
-                self._view_to_fabric,
-            ],
-            device=self._fabric_device,
-        )
-        wp.synchronize()
-
-        self._fabric_hierarchy.update_world_xforms()
-        self._fabric_usd_sync_done = True
+        self._compose_fabric_transform(positions=positions, orientations=orientations, indices=indices)
 
     def get_world_poses(self, indices: wp.array | None = None) -> tuple[ProxyArray, ProxyArray]:
         if not self._use_fabric:

@@ -244,7 +193,15 @@ def set_scales(self, scales, indices=None):
         if not self._use_fabric:
             self._usd_view.set_scales(scales, indices)
             return
+        self._compose_fabric_transform(scales=scales, indices=indices)
+
+    def _compose_fabric_transform(self, positions=None, orientations=None, scales=None, indices=None):
+        """Write the given subset of (position, orientation, scale) into Fabric in one kernel launch.
 
+        Components left as ``None`` are skipped via empty input arrays — the kernel reads them
+        from the existing Fabric matrix. Always invokes :meth:`_prepare_for_reuse` exactly once
+        per write, even when multiple components are updated together.
+        """
         if not self._fabric_initialized:
             self._initialize_fabric()
 

@@ -253,17 +210,19 @@
         indices_wp = self._resolve_indices_wp(indices)
         count = indices_wp.shape[0]
 
-        dummy3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
-        dummy4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
-        scales_wp = _to_float32_2d(scales)
+        empty3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
+        empty4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
+        positions_wp = _to_float32_2d(positions) if positions is not None else empty3
+        orientations_wp = _to_float32_2d(orientations) if orientations is not None else empty4
+        scales_wp = _to_float32_2d(scales) if scales is not None else empty3
 
         wp.launch(
             kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays,
             dim=count,
             inputs=[
                 self._fabric_world_matrices,
-                dummy3,
-                dummy4,
+                positions_wp,
+                orientations_wp,
                 scales_wp,
                 False,
                 False,

@@ -347,10 +306,11 @@ def _rebuild_fabric_arrays(self) -> None:
         pattern (via ``_usd_view.count``) and does not change when Fabric rearranges its
         internal memory layout. The assertion below guards this invariant.
         """
-        assert self.count == self._default_view_indices.shape[0], (
-            f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
-            "Fabric topology change added/removed tracked prims — full re-initialization required."
-        )
+        if self.count != self._default_view_indices.shape[0]:
+            raise RuntimeError(
+                f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
+                "Fabric topology change added/removed tracked prims — full re-initialization required."
+            )
         self._view_to_fabric = wp.zeros((self.count,), dtype=wp.uint32, device=self._fabric_device)
         self._fabric_to_view = wp.fabricarray(self._fabric_selection, self._view_index_attr)
 

@@ -404,9 +364,6 @@ def _initialize_fabric(self) -> None:
         )
         wp.synchronize()
 
-        # The constructor should have taken care of this, but double check here to avoid regressions
-        assert self._device in _fabric_supported_devices
-
         self._fabric_selection = fabric_stage.SelectPrims(
             require_attrs=[
                 (usdrt.Sdf.ValueTypeNames.UInt, self._view_index_attr, usdrt.Usd.Access.Read),

@@ -442,19 +399,20 @@
     def _sync_fabric_from_usd_once(self) -> None:
         """Sync Fabric world matrices from USD once, on the first read.
 
-        ``set_world_poses`` and ``set_scales`` each set ``_fabric_usd_sync_done``
-        themselves, so no explicit flag assignment is needed here.
+        Combines position/orientation/scale into a single Fabric write so
+        :meth:`_prepare_for_reuse` (and its underlying ``PrepareForReuse``) is invoked
+        exactly once across the full sync.
         """
         if not self._fabric_initialized:
             self._initialize_fabric()
 
         positions_usd_ta, orientations_usd_ta = self._usd_view.get_world_poses()
-        positions_usd = positions_usd_ta.warp
-        orientations_usd = orientations_usd_ta.warp
         scales_usd = self._usd_view.get_scales()
-
-        self.set_world_poses(positions_usd, orientations_usd)
-        self.set_scales(scales_usd)
+        self._compose_fabric_transform(
+            positions=positions_usd_ta.warp,
+            orientations=orientations_usd_ta.warp,
+            scales=scales_usd,
+        )
 
     def _resolve_indices_wp(self, indices: wp.array | None) -> wp.array:
         """Resolve view indices as a Warp uint32 array."""

source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py

Lines changed: 88 additions & 1 deletion

@@ -10,6 +10,7 @@
 Camera prim type for Fabric SelectPrims compatibility).
 """
 
+import os
 import sys
 from pathlib import Path
 

@@ -44,8 +45,17 @@ def test_setup_teardown():
 
 
 def _skip_if_unavailable(device: str):
-    if device.startswith("cuda") and not torch.cuda.is_available():
+    if not device.startswith("cuda"):
+        return
+    if not torch.cuda.is_available():
         pytest.skip("CUDA not available")
+    idx = int(device.split(":")[1]) if ":" in device else 0
+    n = torch.cuda.device_count()
+    if idx >= n:
+        msg = f"{device} not available (device_count={n})"
+        if os.environ.get("GITHUB_ACTIONS") == "true":
+            pytest.fail(f"{msg} — multi-GPU runner is misconfigured")
+        pytest.skip(f"{msg} — multi-GPU test skipped on single-GPU machine")
 
 
 # ------------------------------------------------------------------

@@ -233,3 +243,80 @@ def force_topology_changed():
     pos_torch = wp.to_torch(ret_pos)
     expected = torch.tensor([[4.0, 5.0, 6.0], [4.0, 5.0, 6.0]], device=device)
     assert torch.allclose(pos_torch, expected, atol=1e-7), f"Read after rebuild failed on {device}: {pos_torch}"
+
+
+# ------------------------------------------------------------------
+# Multi-GPU tests (cuda:1) — skipped automatically on single-GPU workstations
+# ------------------------------------------------------------------
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_world_pose_roundtrip(device, view_factory):
+    """set_world_poses -> get_world_poses roundtrip works on cuda:1.
+
+    Verifies that FabricFrameView operates correctly on a non-primary CUDA
+    device without falling back to the USD path.
+    """
+    bundle = view_factory(2, device)
+    view = bundle.view
+
+    new_pos = wp.zeros((2, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=2, inputs=[new_pos, 10.0, 20.0, 30.0], device=device)
+    view.set_world_poses(positions=new_pos)
+
+    ret_pos, _ = view.get_world_poses()
+    pos_torch = wp.to_torch(ret_pos)
+    expected = torch.tensor([[10.0, 20.0, 30.0], [10.0, 20.0, 30.0]], device=device)
+    assert torch.allclose(pos_torch, expected, atol=1e-7), f"Roundtrip failed on {device}: {pos_torch}"
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_no_usd_writeback(device, view_factory):
+    """set_world_poses on cuda:1 does not write back to USD.
+
+    Mirrors test_fabric_set_world_does_not_write_back_to_usd for the cuda:1
+    device to confirm the no-writeback invariant holds across GPU indices.
+    """
+    bundle = view_factory(1, device)
+    view = bundle.view
+
+    stage = sim_utils.get_current_stage()
+    prim = stage.GetPrimAtPath(view.prim_paths[0])
+    xform_cache = UsdGeom.XformCache()
+    t_before = xform_cache.GetLocalToWorldTransform(prim).ExtractTranslation()
+    orig_usd_pos = torch.tensor([float(t_before[0]), float(t_before[1]), float(t_before[2])])
+
+    new_pos = wp.zeros((1, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=1, inputs=[new_pos, 99.0, 99.0, 99.0], device=device)
+    view.set_world_poses(positions=new_pos)
+
+    # USD must not have moved at all — equality, not approximate.
+    t_after = UsdGeom.XformCache().GetLocalToWorldTransform(prim).ExtractTranslation()
+    usd_pos_after = torch.tensor([float(t_after[0]), float(t_after[1]), float(t_after[2])])
+    assert torch.allclose(usd_pos_after, orig_usd_pos, atol=0.0), (
+        f"USD wrote back on {device}: expected {orig_usd_pos}, got {usd_pos_after}"
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_scales_roundtrip(device, view_factory):
+    """set_scales -> get_scales roundtrip works on cuda:1.
+
+    Both write paths (``set_world_poses`` and ``set_scales``) call
+    ``_prepare_for_reuse`` and launch on ``self._device``; this test covers
+    the scales path on the non-primary CUDA device.
+    """
+    bundle = view_factory(2, device)
+    view = bundle.view
+
+    new_scales = wp.zeros((2, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=2, inputs=[new_scales, 2.0, 3.0, 4.0], device=device)
+    view.set_scales(new_scales)
+
+    ret_scales = view.get_scales()
+    scales_torch = wp.to_torch(ret_scales)
+    expected = torch.tensor([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]], device=device)
+    assert torch.allclose(scales_torch, expected, atol=1e-7), f"Scales roundtrip failed on {device}: {scales_torch}"
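
These tests reuse a small _fill_position Warp kernel defined earlier in the file (outside this diff). From the call sites above (a (count, 3) float32 array plus three scalars), a kernel of the expected shape would look roughly like this; an assumed sketch, not the actual helper:

import warp as wp

@wp.kernel
def _fill_position(out: wp.array(dtype=wp.float32, ndim=2), x: float, y: float, z: float):
    # Write the same (x, y, z) row into every entry of an (n, 3) array,
    # one thread per row.
    i = wp.tid()
    out[i, 0] = x
    out[i, 1] = y
    out[i, 2] = z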
