Skip to content

Commit ce0aaa0

Browse files
committed
Enable FabricFrameView on non-primary GPUs
- Allow FabricFrameView to run on cuda:N for any N; USDRT SelectPrims no longer needs cuda:0. - Refactor the Fabric write path into a single _compose_fabric_transform helper shared by set_world_poses, set_scales, and the initial USD->Fabric sync, collapsing the sync to one kernel launch with one PrepareForReuse. - Replace the topology-invariant assert with RuntimeError so it survives python -O. - Add multi_gpu pytest marker plus cuda:1 unit-test coverage for both Fabric write paths. - Multi-GPU CI workflow gracefully degrades to ::warning:: on a single-GPU runner until a multi-GPU pool is provisioned.
1 parent efd9d1e commit ce0aaa0

7 files changed

Lines changed: 227 additions & 95 deletions

File tree

.github/workflows/test-multi-gpu.yaml

Lines changed: 69 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,28 @@
33
#
44
# SPDX-License-Identifier: BSD-3-Clause
55

6-
# Multi-GPU distributed training validation
6+
# Multi-GPU validation
77
#
8-
# This workflow validates that multi-GPU training works correctly across:
9-
# - Physics backends: PhysX, Newton
10-
# - Rendering backends: none (physics-only), Isaac RTX, Newton Warp
8+
# Two jobs:
119
#
12-
# Runs on a dedicated multi-GPU runner (separate from standard CI) to minimize costs.
13-
# Only triggered on PRs that touch distributed training code paths.
10+
# 1. test-fabric-multi-gpu — FabricFrameView unit tests on cuda:1.
11+
# Triggered by changes to FabricFrameView or its test file.
12+
#
13+
# 2. test-multi-gpu — distributed training validation across physics/renderer
14+
# combinations (PhysX, Newton × none, Isaac RTX, Newton Warp).
15+
# Triggered by changes to distributed training code paths.
16+
#
17+
# Both run on a dedicated multi-GPU runner to minimize costs.
1418

15-
name: Multi-GPU Training Tests
19+
name: Multi-GPU Tests
1620

1721
on:
1822
pull_request:
1923
paths:
24+
# Fabric multi-GPU unit tests
25+
- "source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py"
26+
- "source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py"
27+
# Distributed training tests
2028
- "source/isaaclab/isaaclab/app/app_launcher.py"
2129
- "source/isaaclab_tasks/isaaclab_tasks/utils/sim_launcher.py"
2230
- "scripts/reinforcement_learning/**/train.py"
@@ -28,11 +36,56 @@ concurrency:
2836
cancel-in-progress: true
2937

3038
jobs:
39+
test-fabric-multi-gpu:
40+
name: FabricFrameView multi-GPU unit tests
41+
# No dedicated multi-GPU runner pool is currently available, so we run on
42+
# any GPU runner. The ``cuda:1`` tests skip themselves on single-GPU
43+
# hosts via the ``@pytest.mark.multi_gpu`` marker plus runtime
44+
# device-count check, and we emit a workflow ``::warning::`` so the lack
45+
# of multi-GPU coverage is visible without failing the build. Switch
46+
# back to ``[self-hosted, linux, x64, gpu, multi-gpu]`` once a multi-GPU
47+
# runner is provisioned.
48+
runs-on: [self-hosted, gpu]
49+
timeout-minutes: 30
50+
steps:
51+
- name: Checkout repository
52+
uses: actions/checkout@v4
53+
54+
- name: Install Isaac Lab
55+
run: ./isaaclab.sh --install
56+
57+
- name: Detect available GPUs
58+
run: |
59+
GPU_COUNT=$(./isaaclab.sh -p -c "import torch; print(torch.cuda.device_count())")
60+
echo "Detected $GPU_COUNT GPU(s)"
61+
if [ "$GPU_COUNT" -lt 2 ]; then
62+
echo "::warning::Only $GPU_COUNT GPU(s) available — multi-GPU (cuda:1) tests will be skipped."
63+
fi
64+
65+
- name: Run Fabric multi-GPU unit tests
66+
run: |
67+
./isaaclab.sh -p -m pytest -m multi_gpu \
68+
source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py \
69+
-v --junitxml=reports/fabric-multi-gpu.xml
70+
71+
- name: Upload test results
72+
if: always()
73+
uses: actions/upload-artifact@v4
74+
with:
75+
name: fabric-multi-gpu-results
76+
path: reports/fabric-multi-gpu.xml
77+
if-no-files-found: ignore
78+
retention-days: 7
79+
3180
test-multi-gpu:
3281
name: Multi-GPU (${{ matrix.physics }}, ${{ matrix.renderer }})
33-
# Use dedicated multi-GPU runner to avoid blocking standard CI resources
34-
# Configure this label on a runner with 2+ GPUs (e.g., g5.12xlarge with 4x A10G)
35-
runs-on: [self-hosted, linux, x64, gpu, multi-gpu]
82+
# No dedicated multi-GPU runner pool is currently available, so we land
83+
# on any GPU runner. When fewer than 2 GPUs are present, the
84+
# distributed-training launch is replaced with a ``::warning::`` so the
85+
# lack of coverage is visible without failing the build. Switch back to
86+
# ``[self-hosted, linux, x64, gpu, multi-gpu]`` (e.g. a g5.12xlarge with
87+
# 4x A10G) once such a runner is provisioned.
88+
runs-on: [self-hosted, gpu]
3689
timeout-minutes: 30
3790
strategy:
3891
fail-fast: false
@@ -91,20 +144,22 @@ jobs:
91144
run: |
92145
./isaaclab.sh --install
93146
94-
- name: Verify multi-GPU availability
147+
- name: Detect available GPUs
148+
id: gpu_check
95149
run: |
96150
echo "=== GPU Info ==="
97151
nvidia-smi --query-gpu=index,name,memory.total --format=csv
98152
99153
GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")
100154
echo "Detected $GPU_COUNT GPU(s)"
155+
echo "gpu_count=$GPU_COUNT" >> "$GITHUB_OUTPUT"
101156
102157
if [ "$GPU_COUNT" -lt 2 ]; then
103-
echo "::error::At least 2 GPUs required for multi-GPU tests, found $GPU_COUNT"
104-
exit 1
158+
echo "::warning::Only $GPU_COUNT GPU(s) available — distributed-training validation skipped (matrix: ${{ matrix.physics }}, ${{ matrix.renderer }})."
105159
fi
106160
107161
- name: Run multi-GPU training (${{ matrix.physics }}, ${{ matrix.renderer }})
162+
if: steps.gpu_check.outputs.gpu_count >= '2'
108163
env:
109164
NCCL_DEBUG: WARN
110165
run: |
@@ -129,6 +184,7 @@ jobs:
129184
${{ matrix.extra_args }}
130185
131186
- name: Verify training completed
187+
if: steps.gpu_check.outputs.gpu_count >= '2'
132188
run: |
133189
# Find the most recent log directory
134190
LATEST_LOG=$(ls -td logs/*/*/*/ 2>/dev/null | head -1)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty"
139139

140140
markers = [
141141
"isaacsim_ci: mark test to run in isaacsim ci",
142+
"multi_gpu: tests that require 2+ GPUs; skipped automatically on single-GPU machines",
142143
]
143144

144145
# Add pypi.nvidia.com so that `uv pip install isaaclab[isaacsim]` works without --extra-index-url.

source/isaaclab/changelog.d/fix-fabric-prepare-for-reuse.rst

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ Changed
33

44
* Updated :class:`~isaaclab.sensors.camera.Camera` to construct its internal
55
:class:`~isaaclab.sim.views.FrameView` without the now-removed
6-
``sync_usd_on_fabric_write`` kwarg. USD attributes on camera prims are
7-
no longer kept in sync with Fabric writes; read poses through the view's
8-
getters instead.
6+
``sync_usd_on_fabric_write`` keyword (see the corresponding
7+
``isaaclab_physx`` entry). No user-facing migration is required —
8+
callers reading camera poses through the sensor's data buffers are
9+
unaffected. Direct readers of the camera prim's USD attributes during a
10+
simulation step should switch to the sensor getters, since USD attributes
11+
are no longer updated in lock-step with Fabric writes.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Fixed
2+
^^^^^
3+
4+
* Fixed :class:`~isaaclab_physx.sim.views.FabricFrameView` falling back to
5+
the slow USD path on every CUDA device other than ``cuda:0``. USDRT
6+
``SelectPrims`` now accepts any CUDA device index, so Fabric acceleration
7+
runs on the simulation device the view was constructed with (e.g.
8+
``cuda:1``). This unblocks distributed training where each rank is
9+
pinned to a non-primary GPU.

source/isaaclab_physx/changelog.d/fix-fabric-prepare-for-reuse.rst

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,27 @@ Changed
33

44
* **Breaking:** Removed the ``sync_usd_on_fabric_write`` keyword argument from
55
:class:`~isaaclab_physx.sim.views.FabricFrameView`. Fabric writes
6-
(``set_world_poses``, ``set_scales``) now notify the renderer via
7-
``PrepareForReuse()`` on the underlying ``PrimSelection`` instead of writing
8-
back to USD, which is ~200x faster and avoids the stale USD shadow state the
9-
old path produced. Callers passing ``sync_usd_on_fabric_write=True`` should
10-
remove the argument; if they relied on USD reflecting Fabric writes, they
11-
should now read Fabric poses directly via the view's getters or refresh USD
12-
explicitly.
6+
(:meth:`~isaaclab_physx.sim.views.FabricFrameView.set_world_poses`,
7+
:meth:`~isaaclab_physx.sim.views.FabricFrameView.set_scales`) now notify the
8+
renderer via ``PrepareForReuse`` on the underlying ``PrimSelection`` and
9+
detect Fabric topology changes, instead of writing back to USD. This is
10+
~200x faster and removes the stale USD shadow state the old path produced.
11+
Migration: drop the ``sync_usd_on_fabric_write=True`` argument; if you
12+
previously relied on USD reflecting Fabric writes, read poses through the
13+
view's getters or refresh USD explicitly at your sync point.
14+
15+
* Combined the initial USD→Fabric sync into a single Fabric write so
16+
``PrepareForReuse`` is invoked exactly once per logical update (positions,
17+
orientations, and scales are composed in one kernel launch). This avoids
18+
the possibility of a second non-idempotent ``PrepareForReuse`` call masking
19+
a topology-change signal that should have triggered a fabricarray rebuild.
20+
21+
Fixed
22+
^^^^^
23+
24+
* Fixed the topology-change invariant guard in
25+
:class:`~isaaclab_physx.sim.views.FabricFrameView` not surviving
26+
``python -O``. The check now raises :class:`RuntimeError` instead of using
27+
``assert`` so the prim-count mismatch between view and Fabric is reported
28+
at every optimisation level rather than silently producing wrong poses or
29+
out-of-bounds kernel indices.

source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py

Lines changed: 29 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,6 @@
2323

2424
logger = logging.getLogger(__name__)
2525

26-
# TODO: extend this to ``cuda:N`` once we wire up multi-GPU support for the view.
27-
# Recent Kit / USDRT releases do support multi-GPU ``SelectPrims``, but the
28-
# rest of the FabricFrameView wiring (selections, indexed arrays, etc.) still
29-
# assumes a single device — to be tackled in a follow-up.
30-
_fabric_supported_devices = ("cpu", "cuda", "cuda:0")
31-
3226

3327
def _to_float32_2d(a: wp.array | torch.Tensor) -> wp.array | torch.Tensor:
3428
"""Ensure array is compatible with Fabric kernels (2-D float32).
@@ -92,15 +86,6 @@ def __init__(
9286
settings = SettingsManager.instance()
9387
self._use_fabric = bool(settings.get("/physics/fabricEnabled", False))
9488

95-
if self._use_fabric and self._device not in _fabric_supported_devices:
96-
logger.warning(
97-
f"Fabric mode is not supported on device '{self._device}'. "
98-
"USDRT SelectPrims and Warp fabric arrays are currently "
99-
f"only supported on {', '.join(_fabric_supported_devices)}. "
100-
"Falling back to standard USD operations. This may impact performance."
101-
)
102-
self._use_fabric = False
103-
10489
self._fabric_initialized = False
10590
self._fabric_usd_sync_done = False
10691
self._fabric_selection = None
@@ -149,43 +134,7 @@ def set_world_poses(self, positions=None, orientations=None, indices=None):
149134
if not self._use_fabric:
150135
self._usd_view.set_world_poses(positions, orientations, indices)
151136
return
152-
153-
if not self._fabric_initialized:
154-
self._initialize_fabric()
155-
156-
self._prepare_for_reuse()
157-
158-
indices_wp = self._resolve_indices_wp(indices)
159-
count = indices_wp.shape[0]
160-
161-
dummy = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
162-
positions_wp = _to_float32_2d(positions) if positions is not None else dummy
163-
orientations_wp = (
164-
_to_float32_2d(orientations)
165-
if orientations is not None
166-
else wp.zeros((0, 4), dtype=wp.float32, device=self._device)
167-
)
168-
169-
wp.launch(
170-
kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays,
171-
dim=count,
172-
inputs=[
173-
self._fabric_world_matrices,
174-
positions_wp,
175-
orientations_wp,
176-
dummy,
177-
False,
178-
False,
179-
False,
180-
indices_wp,
181-
self._view_to_fabric,
182-
],
183-
device=self._fabric_device,
184-
)
185-
wp.synchronize()
186-
187-
self._fabric_hierarchy.update_world_xforms()
188-
self._fabric_usd_sync_done = True
137+
self._compose_fabric_transform(positions=positions, orientations=orientations, indices=indices)
189138

190139
def get_world_poses(self, indices: wp.array | None = None) -> tuple[ProxyArray, ProxyArray]:
191140
if not self._use_fabric:
@@ -244,7 +193,15 @@ def set_scales(self, scales, indices=None):
244193
if not self._use_fabric:
245194
self._usd_view.set_scales(scales, indices)
246195
return
196+
self._compose_fabric_transform(scales=scales, indices=indices)
197+
198+
def _compose_fabric_transform(self, positions=None, orientations=None, scales=None, indices=None):
199+
"""Write the given subset of (position, orientation, scale) into Fabric in one kernel launch.
247200
201+
Components left as ``None`` are skipped via empty input arrays — the kernel reads them
202+
from the existing Fabric matrix. Always invokes :meth:`_prepare_for_reuse` exactly once
203+
per write, even when multiple components are updated together.
204+
"""
248205
if not self._fabric_initialized:
249206
self._initialize_fabric()
250207

@@ -253,17 +210,19 @@ def set_scales(self, scales, indices=None):
253210
indices_wp = self._resolve_indices_wp(indices)
254211
count = indices_wp.shape[0]
255212

256-
dummy3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
257-
dummy4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
258-
scales_wp = _to_float32_2d(scales)
213+
empty3 = wp.zeros((0, 3), dtype=wp.float32, device=self._device)
214+
empty4 = wp.zeros((0, 4), dtype=wp.float32, device=self._device)
215+
positions_wp = _to_float32_2d(positions) if positions is not None else empty3
216+
orientations_wp = _to_float32_2d(orientations) if orientations is not None else empty4
217+
scales_wp = _to_float32_2d(scales) if scales is not None else empty3
259218

260219
wp.launch(
261220
kernel=fabric_utils.compose_fabric_transformation_matrix_from_warp_arrays,
262221
dim=count,
263222
inputs=[
264223
self._fabric_world_matrices,
265-
dummy3,
266-
dummy4,
224+
positions_wp,
225+
orientations_wp,
267226
scales_wp,
268227
False,
269228
False,
@@ -347,10 +306,11 @@ def _rebuild_fabric_arrays(self) -> None:
347306
pattern (via ``_usd_view.count``) and does not change when Fabric rearranges its
348307
internal memory layout. The assertion below guards this invariant.
349308
"""
350-
assert self.count == self._default_view_indices.shape[0], (
351-
f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
352-
"Fabric topology change added/removed tracked prims — full re-initialization required."
353-
)
309+
if self.count != self._default_view_indices.shape[0]:
310+
raise RuntimeError(
311+
f"Prim count changed ({self.count} vs {self._default_view_indices.shape[0]}). "
312+
"Fabric topology change added/removed tracked prims — full re-initialization required."
313+
)
354314
self._view_to_fabric = wp.zeros((self.count,), dtype=wp.uint32, device=self._fabric_device)
355315
self._fabric_to_view = wp.fabricarray(self._fabric_selection, self._view_index_attr)
356316

@@ -404,9 +364,6 @@ def _initialize_fabric(self) -> None:
404364
)
405365
wp.synchronize()
406366

407-
# The constructor should have taken care of this, but double check here to avoid regressions
408-
assert self._device in _fabric_supported_devices
409-
410367
self._fabric_selection = fabric_stage.SelectPrims(
411368
require_attrs=[
412369
(usdrt.Sdf.ValueTypeNames.UInt, self._view_index_attr, usdrt.Usd.Access.Read),
@@ -442,19 +399,20 @@ def _initialize_fabric(self) -> None:
442399
def _sync_fabric_from_usd_once(self) -> None:
443400
"""Sync Fabric world matrices from USD once, on the first read.
444401
445-
``set_world_poses`` and ``set_scales`` each set ``_fabric_usd_sync_done``
446-
themselves, so no explicit flag assignment is needed here.
402+
Combines position/orientation/scale into a single Fabric write so
403+
:meth:`_prepare_for_reuse` (and its underlying ``PrepareForReuse``) is invoked
404+
exactly once across the full sync.
447405
"""
448406
if not self._fabric_initialized:
449407
self._initialize_fabric()
450408

451409
positions_usd_ta, orientations_usd_ta = self._usd_view.get_world_poses()
452-
positions_usd = positions_usd_ta.warp
453-
orientations_usd = orientations_usd_ta.warp
454410
scales_usd = self._usd_view.get_scales()
455-
456-
self.set_world_poses(positions_usd, orientations_usd)
457-
self.set_scales(scales_usd)
411+
self._compose_fabric_transform(
412+
positions=positions_usd_ta.warp,
413+
orientations=orientations_usd_ta.warp,
414+
scales=scales_usd,
415+
)
458416

459417
def _resolve_indices_wp(self, indices: wp.array | None) -> wp.array:
460418
"""Resolve view indices as a Warp uint32 array."""

0 commit comments

Comments
 (0)