34 changes: 20 additions & 14 deletions docs/source/overview/core-concepts/sensors/camera.rst
@@ -162,8 +162,13 @@ Accessing camera data

.. code-block:: python

   import warp as wp

   tiled_camera = Camera(cfg.tiled_camera)
   data = tiled_camera.data.output["rgb"]  # shape: (num_cameras, H, W, 3), torch.uint8
   # data.output entries are wp.array values (e.g. wp.uint8); use wp.to_torch()
   # when Torch tensor operations are required.
   data_wp = tiled_camera.data.output["rgb"]  # shape: (num_cameras, H, W, 3), wp.uint8
   data = wp.to_torch(data_wp)  # zero-copy torch.uint8 view

The returned data has shape ``(num_cameras, height, width, num_channels)``; once lifted to a
Torch tensor it is ready to use directly as an observation in RL training.
@@ -207,11 +212,12 @@ RGB and RGBA
:figwidth: 100%
:alt: A scene captured in RGB

``rgb`` returns a 3-channel RGB image of type ``torch.uint8``, shape ``(B, H, W, 3)``.
``rgb`` returns a 3-channel RGB image of type ``wp.uint8``, shape ``(B, H, W, 3)``.

``rgba`` returns a 4-channel RGBA image of type ``torch.uint8``, shape ``(B, H, W, 4)``.
``rgba`` returns a 4-channel RGBA image of type ``wp.uint8``, shape ``(B, H, W, 4)``.

To convert to ``torch.float32``, divide by 255.0.
Use :func:`warp.to_torch` to obtain a zero-copy ``torch.uint8`` view; divide by ``255.0``
to convert to ``torch.float32``.
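
A minimal sketch of this conversion, reusing the ``tiled_camera`` handle from the example
above (the ``255.0`` divisor assumes 8-bit color data):

.. code-block:: python

   rgb_u8 = wp.to_torch(tiled_camera.data.output["rgb"])  # (B, H, W, 3), torch.uint8
   rgb_f32 = rgb_u8.float() / 255.0  # (B, H, W, 3), torch.float32 in [0.0, 1.0]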

Depth and Distances
~~~~~~~~~~~~~~~~~~~
@@ -222,10 +228,10 @@ Depth and Distances
:alt: A scene captured as depth

``distance_to_camera`` returns a single-channel depth image with distance to the camera optical
center, shape ``(B, H, W, 1)``, type ``torch.float32``.
center, shape ``(B, H, W, 1)``, type ``wp.float32``.

``distance_to_image_plane`` returns distances of 3D points from the camera plane along the Z-axis,
shape ``(B, H, W, 1)``, type ``torch.float32``.
shape ``(B, H, W, 1)``, type ``wp.float32``.

``depth`` is an alias for ``distance_to_image_plane``.
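
For example, a short sketch that lifts the depth buffer to Torch before masking invalid
pixels (this assumes far misses are rendered as ``inf``; verify the convention for your
renderer):

.. code-block:: python

   import torch

   depth = wp.to_torch(tiled_camera.data.output["distance_to_image_plane"])  # (B, H, W, 1)
   depth = depth.clone()  # copy so the shared render buffer is not mutated
   depth[torch.isinf(depth)] = 0.0  # zero out far misses before further processing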

@@ -238,14 +244,14 @@ Normals
:alt: A scene captured with surface normals

``normals`` returns local surface normal vectors at each pixel, shape ``(B, H, W, 3)`` containing
``(x, y, z)``, type ``torch.float32``.
``(x, y, z)``, type ``wp.float32``.

Motion Vectors
~~~~~~~~~~~~~~

``motion_vectors`` returns per-pixel motion vectors in image space between frames.
Shape ``(B, H, W, 2)``: ``x`` is horizontal motion (positive = left), ``y`` is vertical motion
(positive = up). Type ``torch.float32``.
(positive = up). Type ``wp.float32``.
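
As a small illustration (same ``tiled_camera`` handle as above), a per-pixel motion
magnitude can be derived after lifting the buffer to Torch:

.. code-block:: python

   import torch

   mv = wp.to_torch(tiled_camera.data.output["motion_vectors"])  # (B, H, W, 2)
   speed = torch.linalg.norm(mv, dim=-1)  # (B, H, W), motion magnitude in image space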

Semantic Segmentation
~~~~~~~~~~~~~~~~~~~~~
@@ -259,8 +265,8 @@ Semantic Segmentation
An ``info`` dictionary is available via ``tiled_camera.data.info['semantic_segmentation']``.

- If ``colorize_semantic_segmentation=True``: 4-channel RGBA image, shape ``(B, H, W, 4)``,
type ``torch.uint8``. The ``idToLabels`` dict maps color to semantic label.
- If ``colorize_semantic_segmentation=False``: shape ``(B, H, W, 1)``, type ``torch.int32``,
type ``wp.uint8``. The ``idToLabels`` dict maps color to semantic label.
- If ``colorize_semantic_segmentation=False``: shape ``(B, H, W, 1)``, type ``wp.int32``,
containing semantic IDs. The ``idToLabels`` dict maps ID to label.
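
A minimal sketch of reading the ID buffer together with its ``info`` mapping (this assumes
``colorize_semantic_segmentation=False``; the exact key/value layout of ``idToLabels`` can
vary by renderer version):

.. code-block:: python

   import torch

   seg = wp.to_torch(tiled_camera.data.output["semantic_segmentation"])  # (B, H, W, 1), torch.int32
   id_to_labels = tiled_camera.data.info["semantic_segmentation"]["idToLabels"]
   print(torch.unique(seg).tolist(), id_to_labels)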

Instance ID Segmentation
@@ -274,9 +280,9 @@ Instance ID Segmentation
``instance_id_segmentation_fast`` outputs per-pixel instance IDs, unique per USD prim path.
An ``info`` dictionary is available via ``tiled_camera.data.info['instance_id_segmentation_fast']``.

- If ``colorize_instance_id_segmentation=True``: shape ``(B, H, W, 4)``, type ``torch.uint8``.
- If ``colorize_instance_id_segmentation=True``: shape ``(B, H, W, 4)``, type ``wp.uint8``.
The ``idToLabels`` dict maps color to USD prim path.
- If ``colorize_instance_id_segmentation=False``: shape ``(B, H, W, 1)``, type ``torch.int32``.
- If ``colorize_instance_id_segmentation=False``: shape ``(B, H, W, 1)``, type ``wp.int32``.
The ``idToLabels`` dict maps instance ID to USD prim path.

Instance Segmentation
@@ -292,8 +298,8 @@ to the lowest level with semantic labels (unlike ``instance_id_segmentation_fast``, which
goes to the leaf prim).
An ``info`` dictionary is available via ``tiled_camera.data.info['instance_segmentation_fast']``.

- If ``colorize_instance_segmentation=True``: shape ``(B, H, W, 4)``, type ``torch.uint8``.
- If ``colorize_instance_segmentation=False``: shape ``(B, H, W, 1)``, type ``torch.int32``.
- If ``colorize_instance_segmentation=True``: shape ``(B, H, W, 4)``, type ``wp.uint8``.
- If ``colorize_instance_segmentation=False``: shape ``(B, H, W, 1)``, type ``wp.int32``.

The ``idToLabels`` dict maps color to USD prim path. The ``idToSemantics`` dict maps color to
semantic label.
19 changes: 11 additions & 8 deletions scripts/benchmarks/benchmark_cameras.py
@@ -256,6 +256,7 @@
import numpy as np
import psutil
import torch
import warp as wp

import isaaclab.sim as sim_utils
from isaaclab.assets import RigidObject, RigidObjectCfg
@@ -268,6 +269,7 @@
patterns,
)
from isaaclab.test.benchmark import BaseIsaacLabBenchmark, DictMeasurement, SingleMeasurement
from isaaclab.utils.array import convert_to_torch
from isaaclab.utils.math import orthogonalize_perspective_depth, unproject_depth

from isaaclab_tasks.utils import load_cfg_from_registry
@@ -635,7 +637,7 @@ def run_simulator(
# Set camera world poses
for camera_list in camera_lists:
for camera in camera_list:
num_cameras = camera.data.intrinsic_matrices.size(0)
num_cameras = camera.data.intrinsic_matrices.shape[0]
positions = torch.tensor([[2.5, 2.5, 2.5]], device=sim.device).repeat(num_cameras, 1)
targets = torch.tensor([[0.0, 0.0, 0.0]], device=sim.device).repeat(num_cameras, 1)
camera.set_world_poses_from_view(positions, targets)
@@ -675,23 +677,24 @@ def run_simulator(
# Only update the camera if it hasn't been updated as part of scene_entities.update ...
camera.update(dt=sim.get_physics_dt())

# Camera image outputs are raw wp.array; migrated metadata (e.g. intrinsics) is too,
# so lift buffers to torch before any Torch tensor operations.
intrinsics_torch = convert_to_torch(camera.data.intrinsic_matrices)

for data_type in data_types:
data_label = f"{label}_{cam_idx}_{data_type}"
output_torch = wp.to_torch(camera.data.output[data_type])

if depth_predicate(data_type): # is a depth image, want to create cloud
depth = camera.data.output[data_type]
depth = output_torch
depth_images[data_label + "_raw"] = depth
if perspective_depth_predicate(data_type) and convert_depth_to_camera_to_image_plane:
depth = orthogonalize_perspective_depth(
camera.data.output[data_type], camera.data.intrinsic_matrices
)
depth = orthogonalize_perspective_depth(output_torch, intrinsics_torch)
depth_images[data_label + "_undistorted"] = depth

pointcloud = unproject_depth(depth=depth, intrinsics=camera.data.intrinsic_matrices)
pointcloud = unproject_depth(depth=depth, intrinsics=intrinsics_torch)
clouds[data_label] = pointcloud
else: # rgb image, just save it
image = camera.data.output[data_type]
images[data_label] = image
images[data_label] = output_torch

# End timing for the step
step_end_time = time.time()
23 changes: 15 additions & 8 deletions scripts/demos/sensors/cameras.py
@@ -44,6 +44,7 @@
import matplotlib.pyplot as plt
import numpy as np
import torch
import warp as wp

import isaaclab.sim as sim_utils
from isaaclab.assets import ArticulationCfg, AssetBaseCfg
@@ -239,8 +240,15 @@ def run_simulator(sim: sim_utils.SimulationContext, scene: InteractiveScene):
# save every 10th image (for visualization purposes only)
# note: saving images will slow down the simulation
if count % 10 == 0:
# camera outputs are wp.array; lift to torch for image saving / matplotlib
camera_rgb = wp.to_torch(scene["camera"].data.output["rgb"])
tiled_rgb = wp.to_torch(scene["tiled_camera"].data.output["rgb"])
camera_depth = wp.to_torch(scene["camera"].data.output["distance_to_image_plane"])
tiled_depth = wp.to_torch(scene["tiled_camera"].data.output["distance_to_image_plane"])
raycast_depth = wp.to_torch(scene["raycast_camera"].data.output["distance_to_image_plane"])

# compare generated RGB images across different cameras
rgb_images = [scene["camera"].data.output["rgb"][0, ..., :3], scene["tiled_camera"].data.output["rgb"][0]]
rgb_images = [camera_rgb[0, ..., :3], tiled_rgb[0]]
save_images_grid(
rgb_images,
subtitles=["Camera"],
@@ -250,9 +258,9 @@ def run_simulator(sim: sim_utils.SimulationContext, scene: InteractiveScene):

# compare generated Depth images across different cameras
depth_images = [
scene["camera"].data.output["distance_to_image_plane"][0],
scene["tiled_camera"].data.output["distance_to_image_plane"][0, ..., 0],
scene["raycast_camera"].data.output["distance_to_image_plane"][0],
camera_depth[0],
tiled_depth[0, ..., 0],
raycast_depth[0],
]
save_images_grid(
depth_images,
@@ -263,16 +271,15 @@ def run_simulator(sim: sim_utils.SimulationContext, scene: InteractiveScene):
)

# save all tiled RGB images
tiled_images = scene["tiled_camera"].data.output["rgb"]
save_images_grid(
tiled_images,
subtitles=[f"Cam{i}" for i in range(tiled_images.shape[0])],
tiled_rgb,
subtitles=[f"Cam{i}" for i in range(tiled_rgb.shape[0])],
title="Tiled RGB Image",
filename=os.path.join(output_dir, "tiled_rgb", f"{count:04d}.jpg"),
)

# save all camera RGB images
cam_images = scene["camera"].data.output["rgb"][..., :3]
cam_images = camera_rgb[..., :3]
save_images_grid(
cam_images,
subtitles=[f"Cam{i}" for i in range(cam_images.shape[0])],
12 changes: 7 additions & 5 deletions scripts/tutorials/04_sensors/run_usd_camera.py
@@ -65,6 +65,7 @@

import numpy as np
import torch
import warp as wp
from isaaclab_physx.renderers import IsaacRtxRendererCfg

import omni.replicator.core as rep
@@ -76,6 +77,7 @@
from isaaclab.sensors.camera import Camera, CameraCfg
from isaaclab.sensors.camera.utils import create_pointcloud_from_depth
from isaaclab.utils import convert_dict_to_backend
from isaaclab.utils.array import convert_to_torch


def define_sensor() -> Camera:
@@ -254,12 +256,12 @@ def run_simulator(sim: sim_utils.SimulationContext, scene_entities: dict):
and args_cli.draw
and "distance_to_image_plane" in camera.data.output.keys()
):
# Derive pointcloud from camera at camera_index
# Derive pointcloud from camera at camera_index; lift buffers to torch.
pointcloud = create_pointcloud_from_depth(
intrinsic_matrix=camera.data.intrinsic_matrices[camera_index],
depth=camera.data.output["distance_to_image_plane"][camera_index],
position=camera.data.pos_w[camera_index],
orientation=camera.data.quat_w_ros[camera_index],
intrinsic_matrix=convert_to_torch(camera.data.intrinsic_matrices)[camera_index],
depth=wp.to_torch(camera.data.output["distance_to_image_plane"])[camera_index],
position=convert_to_torch(camera.data.pos_w)[camera_index],
orientation=convert_to_torch(camera.data.quat_w_ros)[camera_index],
device=sim.device,
)

48 changes: 48 additions & 0 deletions source/isaaclab/changelog.d/replace-torch-with-warp.minor.rst
@@ -0,0 +1,48 @@
Fixed
^^^^^

* Fixed :class:`~isaaclab.envs.mdp.observations.image` (and the equivalent
``image()`` helper in the Franka stack ``stack_ik_rel_blueprint`` config) to
lift the new ``wp.array`` camera outputs to Torch via :func:`warp.to_torch`
before applying Torch tensor operations, instead of calling them directly on
the Warp buffers.
* Fixed downstream consumers (camera tutorials, ``demos/sensors/cameras.py``,
``benchmarks/benchmark_cameras.py``, dexsuite ``vision_camera`` observation,
visualizer integration test, ``save_camera_output`` how-to and the camera
overview docs) to lift ``wp.array`` camera fields to torch tensors via
:func:`warp.to_torch` before performing Torch operations.
* Fixed the camera and test suites to use ``wp.array`` dtypes
(``wp.uint8``, ``wp.float32``, ``wp.int32``) and :func:`warp.to_torch` views
in assertions on ``CameraData.output``, ``CameraData.intrinsic_matrices``,
``CameraData.pos_w``/``quat_w_*`` and ``Camera.frame``.
* Fixed :class:`~isaaclab.renderers.NewtonWarpRenderer` to populate the ``rgb``
output buffer when both ``rgb`` and ``rgba`` are requested, restoring the
legacy "rgb mirrors rgba" behavior that broke when ``rgb`` and ``rgba``
became independent ``wp.array`` allocations.

Changed
^^^^^^^

* Tightened :class:`~isaaclab.renderers.RenderBufferSpec` ``dtype`` annotation
from ``Any`` to ``type`` to document that all renderers must publish Warp
scalar dtype classes (e.g. ``warp.uint8``).
* Removed the transitional ``torch.dtype → wp.dtype`` shim in
:meth:`~isaaclab.sensors.camera.CameraData.allocate` now that all in-tree
renderers publish ``wp`` dtypes via :class:`~isaaclab.renderers.RenderBufferSpec`.
* Documented the transitional Torch input fallback on
:func:`~isaaclab.utils.math.convert_camera_frame_orientation_convention` and
consolidated the redundant ``wp ↔ torch`` round-trips in
:meth:`~isaaclab.sensors.camera.Camera.set_world_poses` and
:meth:`~isaaclab.sensors.camera.Camera.set_world_poses_from_view`.
* Changed :class:`~isaaclab.sensors.camera.CameraData` array fields and
:attr:`~isaaclab.sensors.camera.CameraData.output` buffers to expose
``wp.array`` values instead of :class:`torch.Tensor` values. Use
:func:`warp.to_torch` where Torch tensor operations are required.
* Changed :class:`~isaaclab.sensors.camera.Camera` pose, intrinsic, and frame
array APIs to accept or return ``wp.array`` values instead of
:class:`torch.Tensor` values. Existing Torch inputs are still accepted during
the transition; prefer ``wp.array`` at public call sites.
* Changed :class:`~isaaclab.renderers.BaseRenderer` output and camera-update
APIs to exchange ``wp.array`` buffers with camera sensors.
* Changed :func:`~isaaclab.utils.math.convert_camera_frame_orientation_convention`
to accept and return ``wp.array`` quaternion arrays. Use :func:`warp.to_torch`
where Torch tensor operations are required.
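
A sketch of the call pattern this implies (the ``origin``/``target`` keyword names follow
the pre-existing Torch signature, and a ``camera`` sensor handle is assumed; treat it as
illustrative):

import warp as wp

from isaaclab.utils.math import convert_camera_frame_orientation_convention

quat_wp = camera.data.quat_w_world  # wp.array of (num_cameras, 4) quaternions
quat_ros = convert_camera_frame_orientation_convention(quat_wp, origin="world", target="ros")
quat_torch = wp.to_torch(quat_ros)  # lift only where Torch tensor ops are required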
7 changes: 4 additions & 3 deletions source/isaaclab/isaaclab/envs/mdp/observations.py
@@ -14,6 +14,7 @@
from typing import TYPE_CHECKING

import torch
import warp as wp

import isaaclab.utils.math as math_utils
from isaaclab.managers import SceneEntityCfg
@@ -402,12 +403,12 @@ def image(
# extract the used quantities (to enable type-hinting)
sensor: Camera | RayCasterCamera = env.scene.sensors[sensor_cfg.name]

# obtain the input image
images = sensor.data.output[data_type]
# obtain the input image; camera outputs are wp.array, lift to torch for tensor ops
images = wp.to_torch(sensor.data.output[data_type])

# depth image conversion
if (data_type == "distance_to_camera") and convert_perspective_to_orthogonal:
images = math_utils.orthogonalize_perspective_depth(images, sensor.data.intrinsic_matrices)
images = math_utils.orthogonalize_perspective_depth(images, wp.to_torch(sensor.data.intrinsic_matrices))

# rgb/depth/normals image normalization
if normalize:
8 changes: 4 additions & 4 deletions source/isaaclab/isaaclab/renderers/base_renderer.py
@@ -14,7 +14,7 @@
from .output_contract import RenderBufferKind, RenderBufferSpec

if TYPE_CHECKING:
import torch
import warp as wp

from isaaclab.sensors.camera.camera_data import CameraData

@@ -60,13 +60,13 @@ def create_render_data(self, spec: CameraRenderSpec) -> Any:
pass

@abstractmethod
def set_outputs(self, render_data: Any, output_data: dict[str, torch.Tensor]) -> None:
def set_outputs(self, render_data: Any, output_data: dict[str, wp.array]) -> None:
"""Store reference to output buffers for writing during render.

Args:
render_data: The render data object from :meth:`create_render_data`.
output_data: Dictionary mapping output names (e.g. ``"rgb"``, ``"depth"``)
to pre-allocated tensors where rendered data will be written.
to pre-allocated Warp arrays where rendered data will be written.
"""
pass

@@ -80,7 +80,7 @@ def update_transforms(self) -> None:

@abstractmethod
def update_camera(
self, render_data: Any, positions: torch.Tensor, orientations: torch.Tensor, intrinsics: torch.Tensor
self, render_data: Any, positions: wp.array, orientations: wp.array, intrinsics: wp.array
) -> None:
"""Update camera poses and intrinsics for the next render.

14 changes: 7 additions & 7 deletions source/isaaclab/isaaclab/renderers/output_contract.py
@@ -14,10 +14,6 @@

from dataclasses import dataclass
from enum import StrEnum
from typing import TYPE_CHECKING

if TYPE_CHECKING:
import torch


class RenderBufferKind(StrEnum):
@@ -48,7 +44,11 @@ class RenderBufferSpec:
"""Per-pixel layout (channels + dtype) for one render buffer kind."""

channels: int
"""Number of per-pixel channels (last dimension of the allocated tensor)."""
"""Number of per-pixel channels (last dimension of the allocated array)."""

dtype: torch.dtype
"""Torch dtype the renderer writes for this render buffer kind."""
dtype: type
"""Warp dtype the renderer writes for this render buffer kind.

Must be a Warp scalar dtype class (e.g. ``warp.uint8``, ``warp.float32``,
``warp.int32``) accepted by :func:`warp.zeros`.
"""