diff --git a/.gitignore b/.gitignore index b3a54b003..9119d3cea 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,9 @@ *.*~ TAGS +# Vim swap files +*.sw* + # Build files *build *install diff --git a/.vscode/settings.json b/.vscode/settings.json index 8e4075da1..de64c7d30 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,17 +2,28 @@ "python.analysis.extraPaths": [ "${workspaceFolder}", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab", + "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_tasks", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_assets", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_rl", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_mimic", + "${workspaceFolder}/submodules/Isaac-GR00T" + ], + "python.autoComplete.extraPaths": [ + "${workspaceFolder}", + "${workspaceFolder}/submodules/IsaacLab/source/isaaclab", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_tasks", + "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_assets", + "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_rl", + "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_mimic", + "${workspaceFolder}/submodules/Isaac-GR00T" ], "cursorpyright.analysis.extraPaths": [ "${workspaceFolder}", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab", + "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_tasks", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_assets", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_rl", "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_mimic", - "${workspaceFolder}/submodules/IsaacLab/source/isaaclab_tasks", + "${workspaceFolder}/submodules/Isaac-GR00T" ] } diff --git a/docker/Dockerfile.vln_server b/docker/Dockerfile.vln_server new file mode 100644 index 000000000..c63459fa5 --- /dev/null +++ b/docker/Dockerfile.vln_server @@ -0,0 +1,38 @@ +FROM nvcr.io/nvidia/pytorch:24.02-py3 + +WORKDIR /workspace + +RUN apt-get update && apt-get install -y git && rm 
-rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir pyzmq msgpack Pillow + +# NaVILA / LLaVA — clone and install per upstream instructions: +# https://github.com/yang-zj1026/NaVILA-Bench#vla-evaluation +RUN git clone https://github.com/AnjieCheng/NaVILA.git /workspace/NaVILA && \ + cd /workspace/NaVILA && \ + pip install --no-cache-dir -e . && \ + pip install --no-cache-dir -e ".[train]" && \ + pip install --no-cache-dir -e ".[eval]" && \ + pip install --no-cache-dir git+https://github.com/huggingface/transformers@v4.37.2 && \ + SITE_PKG=$(python -c 'import site; print(site.getsitepackages()[0])') && \ + cp -rv ./llava/train/transformers_replace/* "$SITE_PKG/transformers/" && \ + cp -rv ./llava/train/deepspeed_replace/* "$SITE_PKG/deepspeed/" + +# NaVILA pins torch==2.3.0 (PyPI release) which replaces the pre-release +# torch 2.3.0a0 from the base image, breaking any native extensions that +# were compiled against the pre-release. Remove them and rebuild. +RUN pip uninstall -y transformer-engine transformer-engine-extensions 2>/dev/null; true + +# flash-attn must be installed AFTER NaVILA so it compiles against the +# correct PyTorch version (2.3.0 release, not the pre-release 2.3.0a0). 
+RUN pip install --no-cache-dir flash-attn==2.5.8 || \ + echo "WARNING: flash-attn build failed, VLM will run without it" + +# Arena code last so that code-only changes rebuild in seconds +COPY isaaclab_arena/remote_policy /workspace/isaaclab_arena/remote_policy +COPY isaaclab_arena/__init__.py /workspace/isaaclab_arena/__init__.py +COPY isaaclab_arena_navila /workspace/isaaclab_arena_navila + +ENV PYTHONPATH=/workspace + +ENTRYPOINT ["python", "-u", "-m", "isaaclab_arena.remote_policy.remote_policy_server_runner"] diff --git a/docker/run_vln_server.sh b/docker/run_vln_server.sh new file mode 100644 index 000000000..715134a40 --- /dev/null +++ b/docker/run_vln_server.sh @@ -0,0 +1,200 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ------------------------- +# User-configurable defaults +# ------------------------- + +# Default mount directories on the host machine +DATASETS_DIR="${DATASETS_DIR:-$HOME/datasets}" +MODELS_DIR="${MODELS_DIR:-$HOME/models}" +EVAL_DIR="${EVAL_DIR:-$HOME/eval}" + +# Docker image name and tag for the VLN policy server +DOCKER_IMAGE_NAME="${DOCKER_IMAGE_NAME:-vln_policy_server}" +DOCKER_VERSION_TAG="${DOCKER_VERSION_TAG:-latest}" + +# Rebuild controls +FORCE_REBUILD="${FORCE_REBUILD:-false}" +NO_CACHE="" + +# Server parameters (can also be overridden via environment variables) +HOST="${HOST:-0.0.0.0}" +PORT="${PORT:-5555}" +API_TOKEN="${API_TOKEN:-}" +TIMEOUT_MS="${TIMEOUT_MS:-15000}" +POLICY_TYPE="${POLICY_TYPE:-isaaclab_arena_navila.navila_server_policy.NaVilaServerPolicy}" + +# GPU selection for docker --gpus (can also be overridden via environment variables) +# Examples: +# all -> use all GPUs +# 1 -> use 1 GPU (count) +# "device=0" -> use GPU 0 +# "device=0,1" -> use GPU 0 and 1 +GPUS="${GPUS:-all}" + +# ------------------------- +# Help message +# ------------------------- +usage() { + script_name=$(basename "$0") + cat < Path to datasets on the host. Default: "$DATASETS_DIR". + -m Path to models on the host. Default: "$MODELS_DIR". 
+ -e Path to evaluation data on the host. Default: "$EVAL_DIR". + -n Docker image name. Default: "$DOCKER_IMAGE_NAME". + -g GPU selection for docker --gpus. Default: "all". + Examples: "all", "1", "device=0", "device=0,1". + -r Force rebuilding of the Docker image. + -R Force rebuilding of the Docker image, without cache. + +Server-specific options (passed through to the policy server entrypoint): + --host HOST + --port PORT + --api_token TOKEN + --timeout_ms MS + --policy_type TYPE + --model_path PATH + --num_video_frames N + --conv_mode MODE + +Examples: + # Minimal: use all defaults (model at default path, all GPUs, port 5555) + bash $script_name + + # Custom model path and single GPU + bash $script_name -m /path/to/navila-checkpoint -g "device=0" + + # Custom port + bash $script_name --port 6000 +EOF +} + +# ------------------------- +# Parse all options +# ------------------------- +# Server parameters that can be overridden individually via --flags. +# Unset values will be filled with defaults after parsing. +MODEL_PATH="" + +while [[ $# -gt 0 ]]; do + case "$1" in + -v) set -x; shift 1 ;; + -d) DATASETS_DIR="$2"; shift 2 ;; + -m) MODELS_DIR="$2"; shift 2 ;; + -e) EVAL_DIR="$2"; shift 2 ;; + -n) DOCKER_IMAGE_NAME="$2"; shift 2 ;; + -g) GPUS="$2"; shift 2 ;; + -r) FORCE_REBUILD="true"; shift 1 ;; + -R) FORCE_REBUILD="true"; NO_CACHE="--no-cache"; shift 1 ;; + -h|--help) usage; exit 0 ;; + --host) HOST="$2"; shift 2 ;; + --port) PORT="$2"; shift 2 ;; + --api_token) API_TOKEN="$2"; shift 2 ;; + --timeout_ms) TIMEOUT_MS="$2"; shift 2 ;; + --policy_type) POLICY_TYPE="$2"; shift 2 ;; + --model_path) MODEL_PATH="$2"; shift 2 ;; + --num_video_frames) NUM_VIDEO_FRAMES="$2"; shift 2 ;; + --conv_mode) CONV_MODE="$2"; shift 2 ;; + --policy_device) POLICY_DEVICE="$2"; shift 2 ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +# Build server args: always include all parameters, using defaults for +# anything the user did not override. 
+SERVER_ARGS=( + --host "${HOST}" + --port "${PORT}" + --timeout_ms "${TIMEOUT_MS}" + --policy_type "${POLICY_TYPE}" + --model_path "${MODEL_PATH:-/models}" +) +[ -n "${API_TOKEN}" ] && SERVER_ARGS+=(--api_token "${API_TOKEN}") +[ -n "${NUM_VIDEO_FRAMES+x}" ] && SERVER_ARGS+=(--num_video_frames "${NUM_VIDEO_FRAMES}") +[ -n "${CONV_MODE+x}" ] && SERVER_ARGS+=(--conv_mode "${CONV_MODE}") +[ -n "${POLICY_DEVICE+x}" ] && SERVER_ARGS+=(--policy_device "${POLICY_DEVICE}") + +echo "Host paths:" +echo " DATASETS_DIR = ${DATASETS_DIR}" +echo " MODELS_DIR = ${MODELS_DIR}" +echo " EVAL_DIR = ${EVAL_DIR}" +echo "Docker image:" +echo " ${DOCKER_IMAGE_NAME}:${DOCKER_VERSION_TAG}" +echo "GPU:" +echo " --gpus ${GPUS}" +echo "Rebuild:" +echo " FORCE_REBUILD = ${FORCE_REBUILD}, NO_CACHE = '${NO_CACHE}'" +echo "Server args:" +printf ' %q ' "${SERVER_ARGS[@]}"; echo + +# ------------------------- +# 1) Build the Docker image +# ------------------------- + +IMAGE_TAG_FULL="${DOCKER_IMAGE_NAME}:${DOCKER_VERSION_TAG}" + +SHOULD_BUILD=false + +if [ "${FORCE_REBUILD}" = "true" ]; then + SHOULD_BUILD=true +else + if [ -z "$(docker images -q "${IMAGE_TAG_FULL}")" ]; then + SHOULD_BUILD=true + fi +fi + +if [ "${SHOULD_BUILD}" = "true" ]; then + echo "Building Docker image ${IMAGE_TAG_FULL}..." + # Use existing image layers as cache source (BuildKit may GC intermediate + # layer cache, but the final image layers can still be reused). + CACHE_FROM_ARGS="" + if [ -n "$(docker images -q "${IMAGE_TAG_FULL}" 2>/dev/null)" ]; then + CACHE_FROM_ARGS="--cache-from ${IMAGE_TAG_FULL}" + fi + docker build \ + ${NO_CACHE} \ + ${CACHE_FROM_ARGS} \ + --network host \ + -f docker/Dockerfile.vln_server \ + -t "${IMAGE_TAG_FULL}" \ + . +else + echo "Docker image ${IMAGE_TAG_FULL} already exists. Skipping rebuild." + echo "Use -r or -R to force rebuilding the image." 
+fi + +# ------------------------- +# 2) Run the container +# ------------------------- + +DOCKER_RUN_ARGS=( + --rm + --gpus "${GPUS}" + --net host + --name vln_policy_server_container + -v "${MODELS_DIR}":/models +) + +if [ -d "${DATASETS_DIR}" ]; then + DOCKER_RUN_ARGS+=(-v "${DATASETS_DIR}":/datasets) +fi + +if [ -d "${EVAL_DIR}" ]; then + DOCKER_RUN_ARGS+=(-v "${EVAL_DIR}":/eval) +fi + +docker run "${DOCKER_RUN_ARGS[@]}" \ + "${IMAGE_TAG_FULL}" \ + "${SERVER_ARGS[@]}" diff --git a/isaaclab_arena/assets/matterport_background.py b/isaaclab_arena/assets/matterport_background.py new file mode 100644 index 000000000..06a8f896b --- /dev/null +++ b/isaaclab_arena/assets/matterport_background.py @@ -0,0 +1,285 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Matterport 3D scene background for VLN tasks. + +This background can be spawned as a shared global scene at +``/World/matterport`` so the Matterport USD is not cloned per environment. +We keep the invisible ground plane as an optional fallback while validating +whether the referenced USD mesh participates in PhysX collision reliably. + +For collision experiments we support: + - Explicit colliders on the referenced visual USD. + - A legacy single collision overlay. + - Split floor / obstacle collision overlays built from a cleaner raw mesh. 
+""" + +from __future__ import annotations + +import isaaclab.sim as sim_utils +from isaaclab.assets import AssetBaseCfg +from isaaclab.sim.utils import clone +from pxr import Usd, UsdGeom + +from isaaclab_arena.assets.background_library import LibraryBackground +from isaaclab_arena.assets.register import register_asset +from isaaclab_arena.utils.pose import Pose + + +def _get_mesh_collision_cfg(mesh_collider_approximation: str): + """Return the IsaacLab mesh-collision config for the requested approximation.""" + if mesh_collider_approximation == "triangle": + return sim_utils.TriangleMeshPropertiesCfg() + if mesh_collider_approximation == "convex_decomposition": + return sim_utils.ConvexDecompositionPropertiesCfg() + if mesh_collider_approximation == "sdf": + return sim_utils.SDFMeshPropertiesCfg() + raise ValueError( + "Unsupported mesh collider approximation " + f"'{mesh_collider_approximation}'. Expected one of: triangle, convex_decomposition, sdf." + ) + + +def _iter_descendant_prims(root_prim): + """Yield descendant prims, including children hidden behind instance proxies.""" + for child in root_prim.GetFilteredChildren(Usd.TraverseInstanceProxies()): + yield child + yield from _iter_descendant_prims(child) + + +def _apply_explicit_mesh_colliders(stage, root_prim_path: str, physics_material_path: str, approximation: str) -> int: + """Apply explicit collider schemas to all descendant Mesh prims.""" + mesh_collision_cfg = _get_mesh_collision_cfg(approximation) + root_prim = stage.GetPrimAtPath(root_prim_path) + if not root_prim.IsValid(): + return 0 + mesh_count = 0 + for prim in _iter_descendant_prims(root_prim): + if not prim.IsA(UsdGeom.Mesh): + continue + prim_path = str(prim.GetPath()) + sim_utils.define_collision_properties( + prim_path, + sim_utils.CollisionPropertiesCfg(collision_enabled=True), + stage=stage, + ) + sim_utils.define_mesh_collision_properties(prim_path, mesh_collision_cfg, stage=stage) + sim_utils.bind_physics_material(prim_path, 
physics_material_path, stage=stage) + mesh_count += 1 + return mesh_count + + +def _get_collision_overlay_specs(cfg) -> list[tuple[str, str, str]]: + """Return collision overlay specs as ``(prim_name, usd_path, label)`` tuples.""" + combined_overlay = getattr(cfg, "collision_overlay_usd_path", None) + floor_overlay = getattr(cfg, "floor_collision_usd_path", None) + obstacle_overlay = getattr(cfg, "obstacle_collision_usd_path", None) + + if combined_overlay and (floor_overlay or obstacle_overlay): + raise ValueError( + "Legacy combined collision overlay cannot be used together with split " + "floor / obstacle collision overlays." + ) + + specs: list[tuple[str, str, str]] = [] + if combined_overlay: + specs.append(("collision", combined_overlay, "combined")) + if floor_overlay: + specs.append(("collisionFloor", floor_overlay, "floor")) + if obstacle_overlay: + specs.append(("collisionObstacle", obstacle_overlay, "obstacle")) + return specs + + +def _spawn_collision_overlay( + prim_path: str, + stage, + usd_path: str, + physics_material_path: str, + approximation: str, + overlay_prim_name: str, +) -> int: + """Spawn a hidden collision-only USD and apply mesh colliders to it.""" + from isaaclab.sim.spawners.from_files.from_files import _spawn_from_usd_file + + collision_prim_path = f"{prim_path}/{overlay_prim_name}" + collision_cfg = sim_utils.UsdFileCfg(usd_path=usd_path) + collision_prim = _spawn_from_usd_file(collision_prim_path, usd_path, collision_cfg) + if collision_prim.IsValid(): + UsdGeom.Imageable(collision_prim).MakeInvisible() + sim_utils.define_collision_properties( + collision_prim_path, + sim_utils.CollisionPropertiesCfg(collision_enabled=True), + stage=stage, + ) + sim_utils.bind_physics_material(collision_prim_path, physics_material_path, stage=stage) + return _apply_explicit_mesh_colliders( + stage=stage, + root_prim_path=collision_prim_path, + physics_material_path=physics_material_path, + approximation=approximation, + ) + + +@clone +def 
_spawn_matterport_with_ground(prim_path, cfg, *args, **kwargs): + """Spawn Matterport USD + optional ground plane + indoor lighting. + + Matterport 3D scans contain geometry and textures but no light + sources. This function adds everything needed to render and + simulate inside a Matterport scene: + - The USD scene itself (visual mesh with textures). + - Optional invisible ground plane for fallback collision support. + - Indoor lighting matching NaVILA-Bench: + DomeLight(500) — ambient fill under ceilings + DistantLight(1000) — directional light + DiskLight x2(10000) — area lights at ceiling height + """ + from isaaclab.sim.spawners.from_files.from_files import _spawn_from_usd_file + + prim = _spawn_from_usd_file(prim_path, cfg.usd_path, cfg, *args, **kwargs) + + # Try the same top-level collider setup used by NaVILA-Bench. + collider_cfg = sim_utils.CollisionPropertiesCfg(collision_enabled=True) + sim_utils.define_collision_properties(prim.GetPrimPath(), collider_cfg) + + physics_mat = sim_utils.RigidBodyMaterialCfg( + static_friction=1.0, + dynamic_friction=1.0, + friction_combine_mode="multiply", + restitution_combine_mode="multiply", + ) + mat_path = f"{prim_path}/matterportPhysicsMaterial" + physics_mat.func(mat_path, physics_mat) + sim_utils.bind_physics_material(prim.GetPrimPath(), mat_path) + + explicit_mesh_colliders = getattr(cfg, "explicit_mesh_colliders", False) + mesh_collider_approximation = getattr(cfg, "mesh_collider_approximation", "triangle") + explicit_mesh_count = 0 + if explicit_mesh_colliders: + explicit_mesh_count = _apply_explicit_mesh_colliders( + stage=prim.GetStage(), + root_prim_path=str(prim.GetPrimPath()), + physics_material_path=mat_path, + approximation=mesh_collider_approximation, + ) + + collision_overlay_stats: list[str] = [] + for overlay_prim_name, overlay_usd_path, overlay_label in _get_collision_overlay_specs(cfg): + collision_overlay_mesh_count = _spawn_collision_overlay( + prim_path=prim_path, + stage=prim.GetStage(), + 
usd_path=overlay_usd_path, + physics_material_path=mat_path, + approximation=mesh_collider_approximation, + overlay_prim_name=overlay_prim_name, + ) + collision_overlay_stats.append( + f"{overlay_label}-overlay({collision_overlay_mesh_count}, {mesh_collider_approximation})" + ) + + ground_plane_z = getattr(cfg, "ground_plane_z", 0.0) + if ground_plane_z is not None: + ground_cfg = sim_utils.GroundPlaneCfg( + physics_material=physics_mat, + visible=False, + ) + ground_cfg.func("/World/GroundPlane", ground_cfg) + + # Indoor lighting — Matterport scenes have no built-in lights. + # Matches NaVILA-Bench h1_matterport_base_cfg.py lighting setup. + sim_utils.DomeLightCfg(intensity=500.0, color=(1.0, 1.0, 1.0)).func( + "/World/MatterportDomeLight", sim_utils.DomeLightCfg(intensity=500.0, color=(1.0, 1.0, 1.0)) + ) + sim_utils.DistantLightCfg(intensity=1000.0, color=(1.0, 1.0, 1.0)).func( + "/World/MatterportDistantLight", sim_utils.DistantLightCfg(intensity=1000.0, color=(1.0, 1.0, 1.0)) + ) + disk_cfg = sim_utils.DiskLightCfg(intensity=10000.0, color=(1.0, 1.0, 1.0), radius=50.0) + disk_cfg.func("/World/MatterportDisk1", disk_cfg) + disk_cfg.func("/World/MatterportDisk2", disk_cfg) + # Position the disk lights at ceiling height + from pxr import Gf + stage = prim.GetStage() + for path, pos in [("/World/MatterportDisk1", (0.0, 0.0, 2.6)), ("/World/MatterportDisk2", (-1.0, 0.0, 2.6))]: + disk_prim = stage.GetPrimAtPath(path) + if disk_prim.IsValid(): + xformable = UsdGeom.Xformable(disk_prim) + ops = xformable.GetOrderedXformOps() + for op in ops: + if op.GetOpName() == "xformOp:translate": + op.Set(Gf.Vec3d(*pos)) + break + else: + xformable.AddTranslateOp().Set(Gf.Vec3d(*pos)) + + ground_str = f"ground(z={ground_plane_z})" if ground_plane_z is not None else "ground(disabled)" + collider_str = "top-level collision" + if explicit_mesh_colliders: + collider_str += f" + explicit-mesh({explicit_mesh_count}, {mesh_collider_approximation})" + if collision_overlay_stats: + 
collider_str += " + " + " + ".join(collision_overlay_stats) + print(f"[MatterportBackground] {prim_path} + {collider_str} + {ground_str} + lighting") + return prim + + +@register_asset +class MatterportBackground(LibraryBackground): + """Matterport 3D scene shared globally across environments.""" + + name = "matterport" + tags = ["background"] + usd_path = None + initial_pose = Pose.identity() + object_min_z = -0.5 + + def __init__( + self, + usd_path: str, + ground_plane_z: float | None = 0.0, + use_global_prim: bool = False, + explicit_mesh_colliders: bool = False, + mesh_collider_approximation: str = "triangle", + collision_overlay_usd_path: str | None = None, + floor_collision_usd_path: str | None = None, + obstacle_collision_usd_path: str | None = None, + ): + if collision_overlay_usd_path and (floor_collision_usd_path or obstacle_collision_usd_path): + raise ValueError( + "Use either the legacy combined collision overlay or the split floor / " + "obstacle overlays, not both." + ) + self.usd_path = usd_path + self.ground_plane_z = ground_plane_z + self.use_global_prim = use_global_prim + self.explicit_mesh_colliders = explicit_mesh_colliders + self.mesh_collider_approximation = mesh_collider_approximation + self.collision_overlay_usd_path = collision_overlay_usd_path + self.floor_collision_usd_path = floor_collision_usd_path + self.obstacle_collision_usd_path = obstacle_collision_usd_path + self.spawn_cfg_addon = {"func": _spawn_matterport_with_ground} + self.asset_cfg_addon = {"collision_group": -1} if use_global_prim else {} + prim_path = "/World/matterport" if use_global_prim else None + super().__init__(prim_path=prim_path) + + def _generate_base_cfg(self) -> AssetBaseCfg: + """Generate an AssetBaseCfg, optionally using a shared global prim path.""" + prim_path = "/World/matterport" if self.use_global_prim else self.prim_path + object_cfg = AssetBaseCfg( + prim_path=prim_path, + spawn=sim_utils.UsdFileCfg( + usd_path=self.usd_path, + scale=self.scale, + 
**self.spawn_cfg_addon, + ), + **self.asset_cfg_addon, + ) + object_cfg = self._add_initial_pose_to_cfg(object_cfg) + object_cfg.spawn.ground_plane_z = self.ground_plane_z + object_cfg.spawn.explicit_mesh_colliders = self.explicit_mesh_colliders + object_cfg.spawn.mesh_collider_approximation = self.mesh_collider_approximation + object_cfg.spawn.collision_overlay_usd_path = self.collision_overlay_usd_path + object_cfg.spawn.floor_collision_usd_path = self.floor_collision_usd_path + object_cfg.spawn.obstacle_collision_usd_path = self.obstacle_collision_usd_path + return object_cfg diff --git a/isaaclab_arena/embodiments/__init__.py b/isaaclab_arena/embodiments/__init__.py index 5f3052fd2..7ab4b7da6 100644 --- a/isaaclab_arena/embodiments/__init__.py +++ b/isaaclab_arena/embodiments/__init__.py @@ -9,3 +9,4 @@ from .g1.g1 import * from .galbot.galbot import * from .gr1t2.gr1t2 import * +from .h1.h1 import * diff --git a/isaaclab_arena/embodiments/h1/__init__.py b/isaaclab_arena/embodiments/h1/__init__.py new file mode 100644 index 000000000..d373688a3 --- /dev/null +++ b/isaaclab_arena/embodiments/h1/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/isaaclab_arena/embodiments/h1/h1.py b/isaaclab_arena/embodiments/h1/h1.py new file mode 100644 index 000000000..24f40516f --- /dev/null +++ b/isaaclab_arena/embodiments/h1/h1.py @@ -0,0 +1,143 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Standard H1 humanoid embodiment for IsaacLab Arena. + +Provides the Unitree H1 robot hardware configuration (articulation, +actuators, contact sensors) that can be shared across different tasks. +Task-specific extensions (observation layout, cameras, commands) are +defined in task-specific modules (e.g. ``isaaclab_arena.embodiments.h1.h1_vln``). 
+ +Usage:: + + from isaaclab_arena.embodiments.h1.h1 import H1SceneCfg, H1CameraCfg + + class MyH1Embodiment(EmbodimentBase): + def __init__(self): + self.scene_config = H1SceneCfg() + self.camera_config = H1CameraCfg() + ... +""" + +from __future__ import annotations + +from dataclasses import MISSING + +import isaaclab.sim as sim_utils +from isaaclab.assets.articulation.articulation_cfg import ArticulationCfg +from isaaclab.sensors import CameraCfg, ContactSensorCfg, TiledCameraCfg +from isaaclab.utils import configclass +from isaaclab_assets import H1_MINIMAL_CFG + +from isaaclab_arena.utils.pose import Pose + + +# ========================================================================== # +# Scene: H1 robot articulation + contact sensors # +# ========================================================================== # + + +@configclass +class H1SceneCfg: + """Scene configuration for the H1 humanoid. + + Uses the official ``H1_MINIMAL_CFG`` from ``isaaclab_assets`` to ensure + joint names, init state, and actuator PD gains are consistent across + all H1-based tasks. + """ + + robot: ArticulationCfg = H1_MINIMAL_CFG.replace( + prim_path="{ENV_REGEX_NS}/Robot", + ) + + contact_forces: ContactSensorCfg = ContactSensorCfg( + prim_path="{ENV_REGEX_NS}/Robot/.*", + history_length=3, + track_air_time=True, + debug_vis=False, + ) + + +# ========================================================================== # +# Camera: configurable head camera with optional follow camera # +# ========================================================================== # + +_DEFAULT_H1_CAMERA_OFFSET = Pose( + position_xyz=(0.1, 0.0, 0.5), + rotation_wxyz=(-0.5, 0.5, -0.5, 0.5), +) + +_DEFAULT_H1_FOLLOW_CAMERA_OFFSET = Pose( + position_xyz=(-1.0, 0.0, 0.57), + rotation_wxyz=(-0.5, 0.5, -0.5, 0.5), +) + + +@configclass +class H1CameraCfg: + """Configurable camera setup for the H1 humanoid. + + Supports: + - ``robot_head_cam``: first-person camera on pelvis (always created). 
+ - ``robot_follow_cam``: third-person follow camera behind the robot + (created only when ``_enable_follow_cam=True``). + + Both camera positions can be customized via ``_camera_offset`` and + ``_follow_camera_offset``. Use ``_is_tiled_camera=True`` for parallel + multi-env evaluation. + """ + + robot_head_cam: CameraCfg | TiledCameraCfg = MISSING + robot_follow_cam: CameraCfg | TiledCameraCfg | None = None + + def __post_init__(self): + is_tiled = getattr(self, "_is_tiled_camera", False) + cam_offset = getattr(self, "_camera_offset", _DEFAULT_H1_CAMERA_OFFSET) + enable_follow = getattr(self, "_enable_follow_cam", False) + follow_offset = getattr(self, "_follow_camera_offset", _DEFAULT_H1_FOLLOW_CAMERA_OFFSET) + head_camera_data_types = getattr(self, "_head_camera_data_types", ["rgb"]) + follow_camera_data_types = getattr(self, "_follow_camera_data_types", ["rgb"]) + + CameraClass = TiledCameraCfg if is_tiled else CameraCfg + OffsetClass = CameraClass.OffsetCfg + + cam_res = getattr(self, "_camera_resolution", 512) + + # First-person head camera + self.robot_head_cam = CameraClass( + prim_path="{ENV_REGEX_NS}/Robot/pelvis/HeadCamera", + offset=OffsetClass( + pos=cam_offset.position_xyz, + rot=cam_offset.rotation_wxyz, + convention="ros", + ), + update_period=0.0, + height=cam_res, + width=cam_res, + data_types=list(head_camera_data_types), + spawn=sim_utils.PinholeCameraCfg( + horizontal_aperture=54.0, + clipping_range=(0.1, 10.0), + ), + ) + + # Third-person follow camera (optional) + if enable_follow: + self.robot_follow_cam = CameraClass( + prim_path="{ENV_REGEX_NS}/Robot/pelvis/FollowCamera", + offset=OffsetClass( + pos=follow_offset.position_xyz, + rot=follow_offset.rotation_wxyz, + convention="ros", + ), + update_period=0.0, + height=cam_res, + width=cam_res, + data_types=list(follow_camera_data_types), + spawn=sim_utils.PinholeCameraCfg( + horizontal_aperture=100.0, + clipping_range=(0.1, 20.0), + ), + ) diff --git a/isaaclab_arena/embodiments/h1/h1_vln.py 
b/isaaclab_arena/embodiments/h1/h1_vln.py new file mode 100644 index 000000000..196a70cf8 --- /dev/null +++ b/isaaclab_arena/embodiments/h1/h1_vln.py @@ -0,0 +1,253 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""H1 humanoid embodiment configured for VLN navigation. + +Extends the standard H1 embodiment (``isaaclab_arena.embodiments.h1``) +with VLN-specific configuration: + - Observation layout matching the NaVILA low-level locomotion policy. + - Velocity command generator for VLM → LL policy integration. + - VLN-specific event handling. + +The observation vector layout is dictated by the pre-trained RSL-RL +locomotion checkpoint and **cannot be reordered**:: + + [0:3] base_lin_vel (3) + [3:6] base_ang_vel (3) + [6:9] projected_gravity (3) + [9:12] velocity_commands (3) ← VLM commands injected here + [12:31] joint_pos_rel (19) + [31:50] joint_vel_rel (19) + [50:69] last_action (19) + +Camera and lighting are provided by the standard H1 and Matterport +background respectively; this module only adds the observation/command +configuration. 
+""" + +from __future__ import annotations + +import math + +import isaaclab.envs.mdp as base_mdp +from isaaclab.managers import EventTermCfg as EventTerm +from isaaclab.managers import ObservationGroupCfg as ObsGroup +from isaaclab.managers import ObservationTermCfg as ObsTerm +from isaaclab.managers import SceneEntityCfg +from isaaclab.sensors import RayCasterCfg, patterns +from isaaclab.utils import configclass + +from isaaclab_arena.assets.register import register_asset +from isaaclab_arena.embodiments.embodiment_base import EmbodimentBase +from isaaclab_arena.embodiments.h1.h1 import ( + H1CameraCfg, + H1SceneCfg, + _DEFAULT_H1_CAMERA_OFFSET, +) +from isaaclab_arena.utils.pose import Pose + + +# ========================================================================== # +# VLN observations # +# ========================================================================== # +# Layout must match the NaVILA RSL-RL locomotion policy training config. +# See ``env.yaml`` in the training checkpoint directory for the canonical +# definition. Changing the order will break the pre-trained policy. 
+ + +@configclass +class H1VlnObservationsCfg: + """Observation groups for the H1 VLN embodiment.""" + + @configclass + class PolicyCfg(ObsGroup): + """Proprioceptive observations consumed by the low-level locomotion policy.""" + + base_lin_vel = ObsTerm(func=base_mdp.base_lin_vel) + base_ang_vel = ObsTerm(func=base_mdp.base_ang_vel) + projected_gravity = ObsTerm(func=base_mdp.projected_gravity) + velocity_commands = ObsTerm( + func=base_mdp.generated_commands, + params={"command_name": "base_velocity"}, + ) + joint_pos = ObsTerm(func=base_mdp.joint_pos_rel) + joint_vel = ObsTerm(func=base_mdp.joint_vel_rel) + actions = ObsTerm(func=base_mdp.last_action) + + def __post_init__(self): + self.enable_corruption = False + self.concatenate_terms = True + + @configclass + class ProprioCfg(ObsGroup): + """Duplicate proprio group for the history wrapper (NaVILA compatibility).""" + + base_lin_vel = ObsTerm(func=base_mdp.base_lin_vel) + base_ang_vel = ObsTerm(func=base_mdp.base_ang_vel) + projected_gravity = ObsTerm(func=base_mdp.projected_gravity) + velocity_commands = ObsTerm( + func=base_mdp.generated_commands, + params={"command_name": "base_velocity"}, + ) + joint_pos = ObsTerm(func=base_mdp.joint_pos_rel) + joint_vel = ObsTerm(func=base_mdp.joint_vel_rel) + actions = ObsTerm(func=base_mdp.last_action) + + def __post_init__(self): + self.enable_corruption = False + self.concatenate_terms = True + + policy: PolicyCfg = PolicyCfg() + proprio: ProprioCfg = ProprioCfg() + + +# ========================================================================== # +# VLN actions # +# ========================================================================== # + + +@configclass +class H1VlnActionCfg: + """Joint-position action space matching the NaVILA training config.""" + + joint_pos: base_mdp.JointPositionActionCfg = base_mdp.JointPositionActionCfg( + asset_name="robot", + joint_names=[".*"], + scale=0.5, + use_default_offset=True, + ) + + +# 
========================================================================== # +# VLN commands # +# ========================================================================== # + + +@configclass +class H1VlnCommandsCfg: + """Velocity command generator. + + Provides the ``base_velocity`` command term that the proprioceptive + observation ``velocity_commands`` reads from. During VLN evaluation + the actual velocity command is injected by ``VlnPolicy`` directly + into the observation buffer (indices 9:12). + """ + + base_velocity: base_mdp.UniformVelocityCommandCfg = base_mdp.UniformVelocityCommandCfg( + asset_name="robot", + resampling_time_range=(10.0, 10.0), + rel_standing_envs=0.02, + rel_heading_envs=1.0, + heading_command=True, + heading_control_stiffness=0.5, + debug_vis=False, + ranges=base_mdp.UniformVelocityCommandCfg.Ranges( + lin_vel_x=(-1.0, 1.0), + lin_vel_y=(-1.0, 1.0), + ang_vel_z=(-1.0, 1.0), + heading=(-math.pi, math.pi), + ), + ) + + +# ========================================================================== # +# VLN events # +# ========================================================================== # + + +@configclass +class H1VlnEventCfg: + """Reset events for the H1 VLN embodiment.""" + + reset_robot_joints: EventTerm = EventTerm( + func=base_mdp.reset_joints_by_offset, + mode="reset", + params={ + "position_range": (0.0, 0.0), + "velocity_range": (0.0, 0.0), + "asset_cfg": SceneEntityCfg("robot"), + }, + ) + + +@configclass +class H1VlnSceneCfg(H1SceneCfg): + """H1 scene config with optional experimental debug sensors.""" + + height_scanner: RayCasterCfg | None = None + + +# ========================================================================== # +# VLN Embodiment # +# ========================================================================== # + + +@register_asset +class H1VlnEmbodiment(EmbodimentBase): + """H1 humanoid embodiment for Vision-Language Navigation. 
+ + Extends the standard ``H1SceneCfg`` and ``H1CameraCfg`` from + ``isaaclab_arena.embodiments.h1`` with VLN-specific observations, + commands, and events. + + Args: + enable_cameras: Enable camera sensors. + initial_pose: Robot initial pose. + camera_offset: First-person camera position on pelvis. + enable_follow_camera: Add a third-person follow camera for + visualization. Default True for VLN. + follow_camera_offset: Follow camera position on pelvis. + use_tiled_camera: Use TiledCamera for parallel evaluation. + """ + + name = "h1_vln" + + def __init__( + self, + enable_cameras: bool = True, + initial_pose: Pose | None = None, + camera_offset: Pose | None = _DEFAULT_H1_CAMERA_OFFSET, + enable_follow_camera: bool = True, + follow_camera_offset: Pose | None = None, + use_tiled_camera: bool = False, + camera_resolution: int = 512, + enable_head_depth: bool = False, + enable_height_scanner: bool = False, + height_scanner_debug_vis: bool = False, + ): + super().__init__(enable_cameras=enable_cameras, initial_pose=initial_pose) + + # Robot hardware from standard H1 + self.scene_config = H1VlnSceneCfg() + if enable_height_scanner: + self.scene_config.height_scanner = RayCasterCfg( + prim_path="{ENV_REGEX_NS}/Robot/pelvis", + offset=RayCasterCfg.OffsetCfg(pos=(0.0, 0.0, 0.5)), + attach_yaw_only=True, + pattern_cfg=patterns.GridPatternCfg(resolution=0.1, size=[1.6, 1.0]), + debug_vis=height_scanner_debug_vis, + mesh_prim_paths=["/World/matterport"], + ) + + # Cameras: head (required) + follow (optional). + # Private attributes must be set BEFORE construction because + # @configclass __post_init__ reads them at init time. 
+ cam_cfg = H1CameraCfg.__new__(H1CameraCfg) + cam_cfg._is_tiled_camera = use_tiled_camera + cam_cfg._camera_resolution = camera_resolution + cam_cfg._camera_offset = camera_offset + cam_cfg._enable_follow_cam = enable_follow_camera + cam_cfg._head_camera_data_types = ["rgb", "distance_to_image_plane"] if enable_head_depth else ["rgb"] + cam_cfg._follow_camera_data_types = ["rgb"] + if follow_camera_offset is not None: + cam_cfg._follow_camera_offset = follow_camera_offset + cam_cfg.__init__() + self.camera_config = cam_cfg + + # VLN-specific configs + self.action_config = H1VlnActionCfg() + self.observation_config = H1VlnObservationsCfg() + self.event_config = H1VlnEventCfg() + self.command_config = H1VlnCommandsCfg() diff --git a/isaaclab_arena/evaluation/policy_runner.py b/isaaclab_arena/evaluation/policy_runner.py index dec7e2ea2..6d03d760f 100644 --- a/isaaclab_arena/evaluation/policy_runner.py +++ b/isaaclab_arena/evaluation/policy_runner.py @@ -25,28 +25,32 @@ def get_policy_cls(policy_type: str) -> type["PolicyBase"]: """Get the policy class for the given policy type name. Note that this function: - - first: checks for a registered policy type in the PolicyRegistry - - if not found, it tries to dynamically import the policy class, treating - the policy_type argument as a string representing the module path and class name. + - prefers dynamically importing dotted module paths directly, which avoids + forcing unrelated asset libraries to register + - otherwise: checks for a registered policy type in the PolicyRegistry """ - from isaaclab_arena.assets.asset_registry import PolicyRegistry - - policy_registry = PolicyRegistry() - if policy_registry.is_registered(policy_type): - return policy_registry.get_policy(policy_type) - else: - print(f"Policy {policy_type} is not registered. Dynamically importing from path: {policy_type}") + if "." in policy_type: + print(f"Dynamically importing policy from path: {policy_type}") assert "." 
in policy_type, ( "policy_type must be a dotted Python import path of the form 'module.submodule.ClassName', got:" f" {policy_type}" ) - # Dynamically import the class from the string path module_path, class_name = policy_type.rsplit(".", 1) module = import_module(module_path) policy_cls = getattr(module, class_name) return policy_cls + from isaaclab_arena.assets.asset_registry import PolicyRegistry + + policy_registry = PolicyRegistry() + if policy_registry.is_registered(policy_type): + return policy_registry.get_policy(policy_type) + + raise ValueError( + f"Policy {policy_type!r} is neither a registered policy name nor a dotted Python import path." + ) + def is_distributed(args_cli: argparse.Namespace) -> bool: return ( diff --git a/isaaclab_arena/metrics/vln_metrics.py b/isaaclab_arena/metrics/vln_metrics.py new file mode 100644 index 000000000..62b1308d9 --- /dev/null +++ b/isaaclab_arena/metrics/vln_metrics.py @@ -0,0 +1,427 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Standard Vision-Language Navigation metrics. + +Metrics implemented: + - PathLength: mean cumulative Euclidean distance traversed in XY. + - DistanceToGoal: approximate geodesic distance from the final robot + position to the goal, estimated via the ground-truth + reference path and a KDTree. + - OracleSuccess: fraction of episodes where the trajectory comes within + the success radius of the goal at least once. + - Success: fraction of episodes where the agent explicitly + issues STOP and the final DistanceToGoal < radius. + - SPL: Success weighted by path efficiency. + +All metrics rely on a single recorder that logs the robot base position +plus whether the agent has emitted STOP at every simulation step. 
+""" + +from __future__ import annotations + +from typing import List, Sequence + +import numpy as np +import torch +from scipy.spatial import KDTree + +from isaaclab.envs.manager_based_rl_env import ManagerBasedEnv +from isaaclab.managers.recorder_manager import RecorderTerm, RecorderTermCfg +from isaaclab.utils import configclass + +from isaaclab_arena.metrics.metric_base import MetricBase + + +# ========================================================================== # +# Recorder: robot position per step # +# ========================================================================== # + + +class RobotPositionRecorder(RecorderTerm): + """Record robot root position plus STOP state after each step. + + The exported tensor has shape ``[N, 4]`` and stores: + - ``[..., 0:3]``: world-frame root position ``[x, y, z]`` + - ``[..., 3]``: STOP flag as ``0.0`` or ``1.0`` + + Keeping the STOP state in the same recorder lets offline metrics follow + the Anderson et al. protocol: an episode is successful only if the agent + explicitly signals completion. 
+ """ + + name = "robot_position_w" + + def __init__(self, cfg: RecorderTermCfg, env: ManagerBasedEnv): + super().__init__(cfg, env) + + def record_post_step(self): + """Return (key, data) after each simulation step.""" + root_pos = self._env.scene["robot"].data.root_pos_w + stop_flag = self._env.extras.get("vln_stop_called") + if stop_flag is None: + stop_flag = np.zeros((self._env.num_envs,), dtype=np.float32) + if hasattr(stop_flag, "to"): + stop_tensor = stop_flag.to(device=root_pos.device, dtype=root_pos.dtype) + else: + stop_tensor = torch.as_tensor(stop_flag, device=root_pos.device, dtype=root_pos.dtype) + stop_tensor = stop_tensor.unsqueeze(-1) + return self.name, torch.cat((root_pos, stop_tensor), dim=-1).clone() + + +@configclass +class RobotPositionRecorderCfg(RecorderTermCfg): + """Config for :class:`RobotPositionRecorder`.""" + + class_type: type[RecorderTerm] = RobotPositionRecorder + + +# ========================================================================== # +# Helpers # +# ========================================================================== # + + +def _episode_state_array(episode_data: np.ndarray) -> np.ndarray: + """Normalize recorded per-step state data to ``[T, D]``.""" + arr = np.asarray(episode_data) + if arr.ndim == 3: + # [T, num_envs, D] -> take env 0 + arr = arr[:, 0, :] + return arr + + +def _episode_pos_array(episode_data: np.ndarray) -> np.ndarray: + """Extract recorded positions as an array of shape ``[T, 3]``.""" + arr = _episode_state_array(episode_data) + if arr.ndim != 2 or arr.shape[-1] < 3: + raise ValueError(f"Expected episode data with at least 3 columns, got shape {arr.shape}") + return arr[:, :3] + + +def _episode_stop_called(episode_data: np.ndarray) -> bool | None: + """Return whether STOP was emitted in this episode. + + Returns ``None`` when reading legacy recordings that do not contain the + STOP flag column. New recordings export STOP in column 3. 
+ """ + arr = _episode_state_array(episode_data) + if arr.ndim != 2 or arr.shape[0] == 0 or arr.shape[-1] < 4: + return None + return bool(np.any(arr[:, 3] > 0.5)) + + +def _xy(arr: np.ndarray) -> np.ndarray: + """Project positions to the XY (horizontal) plane. + + **Why XY only?** + The robot's ``root_pos_w`` reports the *pelvis* height (~0.9m above + ground), while the reference-path waypoints from the VLN-CE dataset + record *floor-level* positions (~0.17m). Using full 3-D distance + would inflate the KDTree matching error and make geodesic distances + unreliable. + + **Current dataset**: ``vln_ce_isaac_v1.json.gz`` contains only + single-floor navigation episodes (start-z values vary by < 0.1m + within each scene). XY distance is the standard metric for + single-floor VLN benchmarks (consistent with Habitat VLN-CE). + + **Future multi-floor support**: If episodes with stairs or + multiple floors are added, this function should be replaced by + a proper 3-D geodesic computation that accounts for the + pelvis-to-floor offset (subtract ~0.88m from robot z before + comparing with waypoint z). 
+ """ + return arr[..., :2] + + +def _path_length_xy(pos: np.ndarray) -> float: + """Return cumulative XY path length for one episode.""" + pos_xy = _xy(pos) + if pos_xy.shape[0] < 2: + return 0.0 + return float(np.linalg.norm(pos_xy[1:] - pos_xy[:-1], axis=-1).sum()) + + +def _reference_path_suffix_lengths(gt_wps_xy: np.ndarray) -> np.ndarray: + """Return suffix lengths from each waypoint to the goal.""" + if gt_wps_xy.shape[0] == 0: + return np.zeros((0,), dtype=np.float32) + suffix = np.zeros((gt_wps_xy.shape[0],), dtype=np.float32) + if gt_wps_xy.shape[0] > 1: + segment_lengths = np.linalg.norm(gt_wps_xy[1:] - gt_wps_xy[:-1], axis=-1) + suffix[:-1] = np.cumsum(segment_lengths[::-1], dtype=np.float32)[::-1] + return suffix + + +def _approx_goal_distances_along_reference(pos: np.ndarray, gt_wps: np.ndarray) -> np.ndarray: + """Approximate geodesic distances from episode positions to the goal. + + For each recorded position, we: + 1. Find the closest reference waypoint in XY via KDTree. + 2. Add the remaining reference-path suffix length from that waypoint. 
+ """ + pos_xy = _xy(pos) + if pos_xy.shape[0] == 0: + return np.zeros((0,), dtype=np.float32) + + gt_wps_xy = _xy(gt_wps) + if gt_wps_xy.shape[0] == 0: + return np.zeros((pos_xy.shape[0],), dtype=np.float32) + + tree = KDTree(gt_wps_xy) + suffix_lengths = _reference_path_suffix_lengths(gt_wps_xy) + closest_dist, closest_idx = tree.query(pos_xy) + closest_idx = np.asarray(closest_idx, dtype=np.int64) + return np.asarray(closest_dist, dtype=np.float32) + suffix_lengths[closest_idx] + + +# ========================================================================== # +# Path Length # +# ========================================================================== # + + +class PathLengthMetric(MetricBase): + """Mean cumulative Euclidean path length over recorded episodes.""" + + name = "path_length" + recorder_term_name = RobotPositionRecorder.name + + def get_recorder_term_cfg(self) -> RecorderTermCfg: + return RobotPositionRecorderCfg() + + def compute_metric_from_recording(self, recorded_metric_data: List[np.ndarray]) -> float: + if not recorded_metric_data: + return 0.0 + + lengths: List[float] = [] + for ep_data in recorded_metric_data: + pos = _episode_pos_array(ep_data) + lengths.append(_path_length_xy(pos)) + return float(np.mean(lengths)) + + +# ========================================================================== # +# Distance-To-Goal # +# ========================================================================== # + + +class DistanceToGoalMetric(MetricBase): + """Approximate geodesic distance from the final robot position to the goal. + + Habitat-Lab defines distance-to-goal as a simulator geodesic query. In the + current Isaac Sim benchmark we do not yet have an equivalent navmesh + backend, so we estimate the distance by: + 1. Finding the closest ground-truth waypoint to the robot via KDTree. + 2. Summing the segment lengths from that waypoint to the last waypoint. 
+ """ + + name = "distance_to_goal" + recorder_term_name = RobotPositionRecorder.name + + def __init__(self, gt_waypoints_per_episode: Sequence[Sequence[Sequence[float]]]): + """ + Args: + gt_waypoints_per_episode: For each episode ``i``, a list of 3-D + waypoints ``[[x, y, z], ...]`` describing the reference path. + """ + super().__init__() + self._gt_waypoints = [np.asarray(wps, dtype=np.float32) for wps in gt_waypoints_per_episode] + + def get_recorder_term_cfg(self) -> RecorderTermCfg: + return RobotPositionRecorderCfg() + + def compute_metric_from_recording(self, recorded_metric_data: List[np.ndarray]) -> float: + if not recorded_metric_data: + return 0.0 + num_eps = min(len(recorded_metric_data), len(self._gt_waypoints)) + + distances: List[float] = [] + for ep_idx in range(num_eps): + ep_data = recorded_metric_data[ep_idx] + pos = _episode_pos_array(ep_data) + dists = _approx_goal_distances_along_reference(pos, self._gt_waypoints[ep_idx]) + distances.append(float(dists[-1]) if dists.size > 0 else 0.0) + + return float(np.mean(distances)) + + +# ========================================================================== # +# Oracle Success # +# ========================================================================== # + + +class OracleSuccessMetric(MetricBase): + """Oracle Success from Habitat/VLN-CE-style distance-to-goal traces. + + This follows the standard embodied-navigation definition: + + ``oracle_success = 1[min_t d_t < r]`` + + where ``d_t`` is the per-step distance-to-goal measure and ``r`` is the + success radius. In this Arena benchmark, ``d_t`` currently comes from the + same approximate distance surrogate used by :class:`DistanceToGoalMetric`. 
+ """ + + name = "oracle_success" + recorder_term_name = RobotPositionRecorder.name + + def __init__( + self, + gt_waypoints_per_episode: Sequence[Sequence[Sequence[float]]], + success_radius_per_episode: Sequence[float], + ): + super().__init__() + assert len(gt_waypoints_per_episode) == len(success_radius_per_episode) + self._gt_waypoints = [np.asarray(wps, dtype=np.float32) for wps in gt_waypoints_per_episode] + self._radii = list(success_radius_per_episode) + + def get_recorder_term_cfg(self) -> RecorderTermCfg: + return RobotPositionRecorderCfg() + + def compute_metric_from_recording(self, recorded_metric_data: List[np.ndarray]) -> float: + if not recorded_metric_data: + return 0.0 + num_eps = min(len(recorded_metric_data), len(self._radii)) + + successes: List[float] = [] + for ep_idx in range(num_eps): + pos = _episode_pos_array(recorded_metric_data[ep_idx]) + dists = _approx_goal_distances_along_reference(pos, self._gt_waypoints[ep_idx]) + if dists.size == 0: + successes.append(0.0) + continue + oracle_dist = float(np.min(dists)) + successes.append(1.0 if oracle_dist < self._radii[ep_idx] else 0.0) + return float(np.mean(successes)) + + +# ========================================================================== # +# Success # +# ========================================================================== # + + +class SuccessMetric(MetricBase): + """Fraction of episodes that STOP near the goal. + + By default this follows the embodied-navigation recommendation from + Anderson et al.: an episode is successful only if the agent explicitly + emits STOP and its final distance-to-goal is within the success radius. + In this benchmark the final distance is currently the same approximate + surrogate used by :class:`DistanceToGoalMetric`. 
+ """ + + name = "success" + recorder_term_name = RobotPositionRecorder.name + + def __init__( + self, + gt_waypoints_per_episode: Sequence[Sequence[Sequence[float]]], + success_radius_per_episode: Sequence[float], + require_stop_signal: bool = True, + ): + super().__init__() + assert len(gt_waypoints_per_episode) == len(success_radius_per_episode) + self._dist_metric = DistanceToGoalMetric(gt_waypoints_per_episode) + self._radii = list(success_radius_per_episode) + self._require_stop_signal = require_stop_signal + + def get_recorder_term_cfg(self) -> RecorderTermCfg: + return self._dist_metric.get_recorder_term_cfg() + + def _compute_episode_success(self, ep_idx: int, ep_data: np.ndarray) -> float: + """Return success for one episode using the final recorded position.""" + stop_called = _episode_stop_called(ep_data) + if self._require_stop_signal and stop_called is False: + return 0.0 + + pos = _episode_pos_array(ep_data) + dists = _approx_goal_distances_along_reference(pos, self._dist_metric._gt_waypoints[ep_idx]) + if dists.size == 0: + return 0.0 + final_dist = float(dists[-1]) + return 1.0 if final_dist < self._radii[ep_idx] else 0.0 + + def compute_metric_from_recording(self, recorded_metric_data: List[np.ndarray]) -> float: + if not recorded_metric_data: + return 0.0 + num_eps = min(len(recorded_metric_data), len(self._radii)) + + successes: List[float] = [] + for ep_idx in range(num_eps): + successes.append(self._compute_episode_success(ep_idx, recorded_metric_data[ep_idx])) + return float(np.mean(successes)) + + +# ========================================================================== # +# SPL (Success weighted by Path Length) # +# ========================================================================== # + + +class SPLMetric(MetricBase): + r"""SPL = mean_i [ S_i * d_i / max(d_i, l_i) ]. 
+ + Where: + - S_i: binary success indicator + - d_i: shortest-path distance from episode start to goal + - l_i: actual path length + """ + + name = "spl" + recorder_term_name = RobotPositionRecorder.name + + def __init__( + self, + gt_waypoints_per_episode: Sequence[Sequence[Sequence[float]]], + success_radius_per_episode: Sequence[float], + shortest_path_distance_per_episode: Sequence[float] | None = None, + require_stop_signal: bool = True, + ): + super().__init__() + assert len(gt_waypoints_per_episode) == len(success_radius_per_episode) + + self._shortest: List[float] = [] + if shortest_path_distance_per_episode is not None: + assert len(shortest_path_distance_per_episode) == len(gt_waypoints_per_episode) + self._shortest = [max(0.0, float(d)) for d in shortest_path_distance_per_episode] + else: + # Fallback for datasets that do not expose the official start-goal + # geodesic distance. This uses the reference path length as a proxy. + for wps in gt_waypoints_per_episode: + arr = _xy(np.asarray(wps, dtype=np.float32)) + suffix_lengths = _reference_path_suffix_lengths(arr) + self._shortest.append(float(suffix_lengths[0]) if suffix_lengths.size > 0 else 0.0) + + self._success_metric = SuccessMetric( + gt_waypoints_per_episode, + success_radius_per_episode, + require_stop_signal=require_stop_signal, + ) + + def get_recorder_term_cfg(self) -> RecorderTermCfg: + return RobotPositionRecorderCfg() + + def compute_metric_from_recording(self, recorded_metric_data: List[np.ndarray]) -> float: + if not recorded_metric_data: + return 0.0 + num_eps = min(len(recorded_metric_data), len(self._shortest)) + + spl_values: List[float] = [] + for i in range(num_eps): + ep_data = recorded_metric_data[i] + pos = _episode_pos_array(ep_data) + + # Actual path length l_i (XY) + l_i = _path_length_xy(pos) + + # Success S_i + s_i = self._success_metric._compute_episode_success(i, ep_data) + + d_i = self._shortest[i] + if d_i <= 0.0: + spl_values.append(0.0) + else: + 
spl_values.append(float(s_i * d_i / max(d_i, l_i if l_i > 0.0 else d_i))) + + return float(np.mean(spl_values)) diff --git a/isaaclab_arena/policy/__init__.py b/isaaclab_arena/policy/__init__.py index 66caa51a1..bb2b811e8 100644 --- a/isaaclab_arena/policy/__init__.py +++ b/isaaclab_arena/policy/__init__.py @@ -3,8 +3,39 @@ # # SPDX-License-Identifier: Apache-2.0 +"""Policy package exports. + +This module avoids eager importing of all policy implementations because some +policies trigger asset registration side effects during import. Training code +that only needs lightweight helpers (for example `get_agent_cfg`) should not +pull in remote asset registry dependencies. +""" + +from __future__ import annotations + +from importlib import import_module + from .action_chunking import ActionChunkingState -from .action_chunking_client import * -from .replay_action_policy import * -from .rsl_rl_action_policy import * -from .zero_action_policy import * + +__all__ = [ + "ActionChunkingState", + "ActionChunkingClientSidePolicy", + "ReplayActionPolicy", + "RslRlActionPolicy", + "ZeroActionPolicy", +] + + +_LAZY_IMPORTS = { + "ActionChunkingClientSidePolicy": "isaaclab_arena.policy.action_chunking_client", + "ReplayActionPolicy": "isaaclab_arena.policy.replay_action_policy", + "RslRlActionPolicy": "isaaclab_arena.policy.rsl_rl_action_policy", + "ZeroActionPolicy": "isaaclab_arena.policy.zero_action_policy", +} + + +def __getattr__(name: str): + if name not in _LAZY_IMPORTS: + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + module = import_module(_LAZY_IMPORTS[name]) + return getattr(module, name) diff --git a/isaaclab_arena/policy/vln/__init__.py b/isaaclab_arena/policy/vln/__init__.py new file mode 100644 index 000000000..fb5d6f3b3 --- /dev/null +++ b/isaaclab_arena/policy/vln/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from isaaclab_arena.policy.vln.vln_vlm_locomotion_policy import VlnVlmLocomotionPolicy # noqa: F401 diff --git a/isaaclab_arena/policy/vln/pretrained/h1_navila_agent.yaml b/isaaclab_arena/policy/vln/pretrained/h1_navila_agent.yaml new file mode 100644 index 000000000..32ed38c5b --- /dev/null +++ b/isaaclab_arena/policy/vln/pretrained/h1_navila_agent.yaml @@ -0,0 +1,53 @@ +seed: 42 +device: cuda:0 +num_steps_per_env: 32 +max_iterations: 1000 +empirical_normalization: null +obs_groups: + policy: + - policy + critic: + - policy +clip_actions: null +save_interval: 50 +experiment_name: h1_base_rough +run_name: '' +logger: tensorboard +neptune_project: isaaclab +wandb_project: isaaclab +resume: false +load_run: .* +load_checkpoint: model_.*.pt +class_name: OnPolicyRunner +policy: + class_name: ActorCritic + init_noise_std: 1.0 + noise_std_type: scalar + actor_obs_normalization: false + critic_obs_normalization: false + actor_hidden_dims: + - 512 + - 256 + - 128 + critic_hidden_dims: + - 512 + - 256 + - 128 + activation: elu +algorithm: + class_name: PPO + num_learning_epochs: 5 + num_mini_batches: 4 + learning_rate: 0.001 + schedule: adaptive + gamma: 0.99 + lam: 0.95 + entropy_coef: 0.005 + desired_kl: 0.01 + max_grad_norm: 1.0 + value_loss_coef: 1.0 + use_clipped_value_loss: true + clip_param: 0.2 + normalize_advantage_per_mini_batch: false + rnd_cfg: null + symmetry_cfg: null diff --git a/isaaclab_arena/policy/vln/pretrained/h1_navila_env.yaml b/isaaclab_arena/policy/vln/pretrained/h1_navila_env.yaml new file mode 100644 index 000000000..31a8869e7 --- /dev/null +++ b/isaaclab_arena/policy/vln/pretrained/h1_navila_env.yaml @@ -0,0 +1,1962 @@ +viewer: + eye: !!python/tuple + - 7.5 + - 7.5 + - 7.5 + lookat: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + cam_prim_path: /OmniverseKit_Persp + resolution: !!python/tuple + - 1280 + - 720 + origin_type: world + env_index: 0 + asset_name: null + body_name: null +sim: + physics_prim_path: 
/physicsScene + device: cuda:0 + dt: 0.005 + render_interval: 4 + gravity: !!python/tuple + - 0.0 + - 0.0 + - -9.81 + enable_scene_query_support: false + use_fabric: true + physx: + solver_type: 1 + min_position_iteration_count: 1 + max_position_iteration_count: 255 + min_velocity_iteration_count: 0 + max_velocity_iteration_count: 255 + enable_ccd: false + enable_stabilization: true + enable_enhanced_determinism: false + bounce_threshold_velocity: 0.5 + friction_offset_threshold: 0.04 + friction_correlation_distance: 0.025 + gpu_max_rigid_contact_count: 8388608 + gpu_max_rigid_patch_count: 327680 + gpu_found_lost_pairs_capacity: 2097152 + gpu_found_lost_aggregate_pairs_capacity: 33554432 + gpu_total_aggregate_pairs_capacity: 2097152 + gpu_collision_stack_size: 67108864 + gpu_heap_capacity: 67108864 + gpu_temp_buffer_capacity: 16777216 + gpu_max_num_partitions: 8 + gpu_max_soft_body_contacts: 1048576 + gpu_max_particle_contacts: 1048576 + solve_articulation_contact_last: false + physics_material: + func: isaaclab.sim.spawners.materials.physics_materials:spawn_rigid_body_material + static_friction: 1.0 + dynamic_friction: 1.0 + restitution: 0.0 + friction_combine_mode: multiply + restitution_combine_mode: multiply + compliant_contact_stiffness: 0.0 + compliant_contact_damping: 0.0 + render: + enable_translucency: null + enable_reflections: null + enable_global_illumination: null + antialiasing_mode: null + enable_dlssg: null + enable_dl_denoiser: null + dlss_mode: null + enable_direct_lighting: null + samples_per_pixel: null + enable_shadows: null + enable_ambient_occlusion: null + dome_light_upper_lower_strategy: null + carb_settings: null + rendering_mode: null + create_stage_in_memory: false +ui_window_class_type: isaaclab.envs.ui.manager_based_rl_env_window:ManagerBasedRLEnvWindow +seed: 42 +decimation: 4 +scene: + num_envs: 4096 + env_spacing: 2.5 + lazy_sensor_update: true + replicate_physics: false + filter_collisions: true + clone_in_fabric: false + terrain: 
+ class_type: isaaclab.terrains.terrain_importer:TerrainImporter + collision_group: -1 + prim_path: /World/ground + num_envs: 4096 + terrain_type: generator + terrain_generator: + class_type: isaaclab.terrains.terrain_generator:TerrainGenerator + seed: null + curriculum: true + size: !!python/tuple + - 8.0 + - 8.0 + border_width: 20.0 + border_height: 1.0 + num_rows: 10 + num_cols: 20 + color_scheme: none + horizontal_scale: 0.1 + vertical_scale: 0.005 + slope_threshold: 0.75 + sub_terrains: + random_rough: + function: isaaclab.terrains.height_field.hf_terrains:random_uniform_terrain + proportion: 1.0 + size: !!python/tuple + - 8.0 + - 8.0 + flat_patch_sampling: null + border_width: 0.0 + horizontal_scale: 0.1 + vertical_scale: 0.005 + slope_threshold: 0.75 + noise_range: !!python/tuple + - 0.02 + - 0.1 + noise_step: 0.02 + downsampled_scale: null + difficulty_range: !!python/tuple + - 0.0 + - 1.0 + use_cache: false + cache_dir: /tmp/isaaclab/terrains + usd_path: null + env_spacing: 2.5 + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_from_mdl_file + mdl_path: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/IsaacLab/Materials/TilesMarbleSpiderWhiteBrickBondHoned/TilesMarbleSpiderWhiteBrickBondHoned.mdl + project_uvw: true + albedo_brightness: null + texture_scale: !!python/tuple + - 0.25 + - 0.25 + physics_material: + func: isaaclab.sim.spawners.materials.physics_materials:spawn_rigid_body_material + static_friction: 1.0 + dynamic_friction: 1.0 + restitution: 0.0 + friction_combine_mode: multiply + restitution_combine_mode: multiply + compliant_contact_stiffness: 0.0 + compliant_contact_damping: 0.0 + max_init_terrain_level: 5 + debug_vis: false + robot: + class_type: isaaclab.assets.articulation.articulation:Articulation + prim_path: /World/envs/env_.*/Robot + spawn: + func: isaaclab.sim.spawners.from_files.from_files:spawn_from_usd + visible: true + semantic_tags: null + copy_from_source: 
true + mass_props: null + deformable_props: null + rigid_props: + rigid_body_enabled: null + kinematic_enabled: null + disable_gravity: false + linear_damping: 0.0 + angular_damping: 0.0 + max_linear_velocity: 1000.0 + max_angular_velocity: 1000.0 + max_depenetration_velocity: 1.0 + max_contact_impulse: null + enable_gyroscopic_forces: null + retain_accelerations: false + solver_position_iteration_count: null + solver_velocity_iteration_count: null + sleep_threshold: null + stabilization_threshold: null + collision_props: null + activate_contact_sensors: true + scale: null + articulation_props: + articulation_enabled: null + enabled_self_collisions: false + solver_position_iteration_count: 4 + solver_velocity_iteration_count: 4 + sleep_threshold: null + stabilization_threshold: null + fix_root_link: null + fixed_tendons_props: null + spatial_tendons_props: null + joint_drive_props: null + visual_material_path: material + visual_material: null + usd_path: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/IsaacLab/Robots/Unitree/H1/h1_minimal.usd + variants: null + init_state: + pos: !!python/tuple + - 0.0 + - 0.0 + - 1.05 + rot: !!python/tuple + - 1.0 + - 0.0 + - 0.0 + - 0.0 + lin_vel: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + ang_vel: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + joint_pos: + .*_hip_yaw: 0.0 + .*_hip_roll: 0.0 + .*_hip_pitch: -0.28 + .*_knee: 0.79 + .*_ankle: -0.52 + torso: 0.0 + .*_shoulder_pitch: 0.28 + .*_shoulder_roll: 0.0 + .*_shoulder_yaw: 0.0 + .*_elbow: 0.52 + joint_vel: + .*: 0.0 + collision_group: 0 + debug_vis: false + articulation_root_prim_path: null + soft_joint_pos_limit_factor: 0.9 + actuators: + legs: + class_type: isaaclab.actuators.actuator_pd:ImplicitActuator + joint_names_expr: + - .*_hip_yaw + - .*_hip_roll + - .*_hip_pitch + - .*_knee + - torso + effort_limit: null + velocity_limit: null + effort_limit_sim: 300 + velocity_limit_sim: null + stiffness: + .*_hip_yaw: 150.0 + .*_hip_roll: 150.0 + 
.*_hip_pitch: 200.0 + .*_knee: 200.0 + torso: 200.0 + damping: + .*_hip_yaw: 5.0 + .*_hip_roll: 5.0 + .*_hip_pitch: 5.0 + .*_knee: 5.0 + torso: 5.0 + armature: null + friction: null + dynamic_friction: null + viscous_friction: null + feet: + class_type: isaaclab.actuators.actuator_pd:ImplicitActuator + joint_names_expr: + - .*_ankle + effort_limit: null + velocity_limit: null + effort_limit_sim: 100 + velocity_limit_sim: null + stiffness: + .*_ankle: 20.0 + damping: + .*_ankle: 4.0 + armature: null + friction: null + dynamic_friction: null + viscous_friction: null + arms: + class_type: isaaclab.actuators.actuator_pd:ImplicitActuator + joint_names_expr: + - .*_shoulder_pitch + - .*_shoulder_roll + - .*_shoulder_yaw + - .*_elbow + effort_limit: null + velocity_limit: null + effort_limit_sim: 300 + velocity_limit_sim: null + stiffness: + .*_shoulder_pitch: 40.0 + .*_shoulder_roll: 40.0 + .*_shoulder_yaw: 40.0 + .*_elbow: 40.0 + damping: + .*_shoulder_pitch: 10.0 + .*_shoulder_roll: 10.0 + .*_shoulder_yaw: 10.0 + .*_elbow: 10.0 + armature: null + friction: null + dynamic_friction: null + viscous_friction: null + actuator_value_resolution_debug_print: false + contact_forces: + class_type: isaaclab.sensors.contact_sensor.contact_sensor:ContactSensor + prim_path: /World/envs/env_.*/Robot/.* + update_period: 0.0 + history_length: 3 + debug_vis: false + track_pose: false + track_contact_points: false + max_contact_data_count_per_prim: 4 + track_air_time: true + force_threshold: 1.0 + filter_prim_paths_expr: [] + visualizer_cfg: + prim_path: /Visuals/ContactSensor + markers: + contact: + func: isaaclab.sim.spawners.shapes.shapes:spawn_sphere + visible: true + semantic_tags: null + copy_from_source: true + mass_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + visual_material_path: material + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 1.0 + - 0.0 
+ - 0.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + physics_material_path: material + physics_material: null + radius: 0.02 + no_contact: + func: isaaclab.sim.spawners.shapes.shapes:spawn_sphere + visible: false + semantic_tags: null + copy_from_source: true + mass_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + visual_material_path: material + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 0.0 + - 1.0 + - 0.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + physics_material_path: material + physics_material: null + radius: 0.02 + sky_light: + class_type: null + prim_path: /World/skyLight + spawn: + func: isaaclab.sim.spawners.lights.lights:spawn_light + visible: true + semantic_tags: null + copy_from_source: true + prim_type: DomeLight + color: !!python/tuple + - 1.0 + - 1.0 + - 1.0 + enable_color_temperature: false + color_temperature: 6500.0 + normalize: false + exposure: 0.0 + intensity: 750.0 + texture_file: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/Materials/Textures/Skies/PolyHaven/kloofendal_43d_clear_puresky_4k.hdr + texture_format: automatic + visible_in_primary_ray: true + init_state: + pos: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + rot: !!python/tuple + - 1.0 + - 0.0 + - 0.0 + - 0.0 + collision_group: 0 + debug_vis: false +recorders: + dataset_file_handler_class_type: isaaclab.utils.datasets.hdf5_dataset_file_handler:HDF5DatasetFileHandler + dataset_export_dir_path: /tmp/isaaclab/logs + dataset_filename: dataset + dataset_export_mode: + _value_: 1 + _name_: EXPORT_ALL + _sort_order_: 1 + export_in_record_pre_reset: true +observations: + policy: + concatenate_terms: true + concatenate_dim: -1 + enable_corruption: true + history_length: null + flatten_history_dim: 
true + base_lin_vel: + func: isaaclab.envs.mdp.observations:base_lin_vel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.1 + n_max: 0.1 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + base_ang_vel: + func: isaaclab.envs.mdp.observations:base_ang_vel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.2 + n_max: 0.2 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + projected_gravity: + func: isaaclab.envs.mdp.observations:projected_gravity + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.05 + n_max: 0.05 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + velocity_commands: + func: isaaclab.envs.mdp.observations:generated_commands + params: + command_name: base_velocity + modifiers: null + noise: null + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + joint_pos: + func: isaaclab.envs.mdp.observations:joint_pos_rel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.01 + n_max: 0.01 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + joint_vel: + func: isaaclab.envs.mdp.observations:joint_vel_rel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -1.5 + n_max: 1.5 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + actions: + func: isaaclab.envs.mdp.observations:last_action + params: {} + modifiers: null + noise: null + clip: null + scale: null + history_length: 0 + flatten_history_dim: true +actions: + joint_pos: + class_type: isaaclab.envs.mdp.actions.joint_actions:JointPositionAction + asset_name: robot + debug_vis: false + clip: null + joint_names: + 
- .* + scale: 0.5 + offset: 0.0 + preserve_order: false + use_default_offset: true +events: + physics_material: + func: isaaclab.envs.mdp.events:randomize_rigid_body_material + params: + asset_cfg: + name: robot + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .* + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + static_friction_range: !!python/tuple + - 0.8 + - 0.8 + dynamic_friction_range: !!python/tuple + - 0.6 + - 0.6 + restitution_range: !!python/tuple + - 0.0 + - 0.0 + num_buckets: 64 + mode: startup + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + add_base_mass: null + base_com: null + base_external_force_torque: + func: isaaclab.envs.mdp.events:apply_external_force_torque + params: + asset_cfg: + name: robot + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: + - .*torso_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + force_range: !!python/tuple + - 0.0 + - 0.0 + torque_range: !!python/tuple + - -0.0 + - 0.0 + mode: reset + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + reset_base: + func: isaaclab.envs.mdp.events:reset_root_state_uniform + params: + pose_range: + x: !!python/tuple + - -0.5 + - 0.5 + y: !!python/tuple + - -0.5 + - 0.5 + yaw: !!python/tuple + - -3.14 + - 3.14 + velocity_range: + x: 
!!python/tuple + - 0.0 + - 0.0 + y: !!python/tuple + - 0.0 + - 0.0 + z: !!python/tuple + - 0.0 + - 0.0 + roll: !!python/tuple + - 0.0 + - 0.0 + pitch: !!python/tuple + - 0.0 + - 0.0 + yaw: !!python/tuple + - 0.0 + - 0.0 + mode: reset + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + reset_robot_joints: + func: isaaclab.envs.mdp.events:reset_joints_by_scale + params: + position_range: !!python/tuple + - 1.0 + - 1.0 + velocity_range: !!python/tuple + - 0.0 + - 0.0 + mode: reset + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + push_robot: null +rerender_on_reset: false +wait_for_textures: false +xr: null +teleop_devices: null +export_io_descriptors: false +log_dir: /workspaces/isaaclab_arena/logs/rsl_rl/h1_base_rough/2026-03-06_10-34-21 +is_finite_horizon: false +episode_length_s: 20.0 +rewards: + track_lin_vel_xy_exp: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:track_lin_vel_xy_yaw_frame_exp + params: + command_name: base_velocity + std: 0.5 + weight: 1.0 + track_ang_vel_z_exp: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:track_ang_vel_z_world_exp + params: + command_name: base_velocity + std: 0.5 + weight: 1.0 + lin_vel_z_l2: null + ang_vel_xy_l2: + func: isaaclab.envs.mdp.rewards:ang_vel_xy_l2 + params: {} + weight: -0.05 + dof_torques_l2: + func: isaaclab.envs.mdp.rewards:joint_torques_l2 + params: {} + weight: 0.0 + dof_acc_l2: + func: isaaclab.envs.mdp.rewards:joint_acc_l2 + params: {} + weight: -1.25e-07 + action_rate_l2: + func: isaaclab.envs.mdp.rewards:action_rate_l2 + params: {} + weight: -0.005 + feet_air_time: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:feet_air_time_positive_biped + params: + command_name: base_velocity + sensor_cfg: + name: contact_forces + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: 
!!python/object/apply:builtins.slice + - null + - null + - null + body_names: .*ankle_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + threshold: 0.4 + weight: 0.25 + undesired_contacts: null + flat_orientation_l2: + func: isaaclab.envs.mdp.rewards:flat_orientation_l2 + params: {} + weight: -1.0 + dof_pos_limits: + func: isaaclab.envs.mdp.rewards:joint_pos_limits + params: + asset_cfg: + name: robot + joint_names: .*_ankle + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -1.0 + termination_penalty: + func: isaaclab.envs.mdp.rewards:is_terminated + params: {} + weight: -200.0 + feet_slide: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:feet_slide + params: + sensor_cfg: + name: contact_forces + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .*ankle_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + asset_cfg: + name: robot + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: 
.*ankle_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -0.25 + joint_deviation_hip: + func: isaaclab.envs.mdp.rewards:joint_deviation_l1 + params: + asset_cfg: + name: robot + joint_names: + - .*_hip_yaw + - .*_hip_roll + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -0.2 + joint_deviation_arms: + func: isaaclab.envs.mdp.rewards:joint_deviation_l1 + params: + asset_cfg: + name: robot + joint_names: + - .*_shoulder_.* + - .*_elbow + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -0.2 + joint_deviation_torso: + func: isaaclab.envs.mdp.rewards:joint_deviation_l1 + params: + asset_cfg: + name: robot + joint_names: torso + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + 
preserve_order: false + weight: -0.1 +terminations: + time_out: + func: isaaclab.envs.mdp.terminations:time_out + params: {} + time_out: true + base_contact: + func: isaaclab.envs.mdp.terminations:illegal_contact + params: + sensor_cfg: + name: contact_forces + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .*torso_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + threshold: 1.0 + time_out: false +curriculum: + terrain_levels: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.curriculums:terrain_levels_vel + params: {} +commands: + base_velocity: + class_type: isaaclab.envs.mdp.commands.velocity_command:UniformVelocityCommand + resampling_time_range: !!python/tuple + - 10.0 + - 10.0 + debug_vis: false + asset_name: robot + heading_command: true + heading_control_stiffness: 0.5 + rel_standing_envs: 0.02 + rel_heading_envs: 1.0 + ranges: + lin_vel_x: !!python/tuple + - 0.0 + - 1.0 + lin_vel_y: !!python/tuple + - 0.0 + - 0.0 + ang_vel_z: !!python/tuple + - -1.0 + - 1.0 + heading: !!python/tuple + - -3.141592653589793 + - 3.141592653589793 + goal_vel_visualizer_cfg: + prim_path: /Visuals/Command/velocity_goal + markers: + arrow: + func: isaaclab.sim.spawners.from_files.from_files:spawn_from_usd + visible: true + semantic_tags: null + copy_from_source: true + mass_props: null + deformable_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + scale: !!python/tuple + - 0.5 + - 0.5 + - 0.5 + articulation_props: null + fixed_tendons_props: null + spatial_tendons_props: null + joint_drive_props: null + visual_material_path: material + visual_material: + func: 
isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 0.0 + - 1.0 + - 0.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + usd_path: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/Props/UIElements/arrow_x.usd + variants: null + current_vel_visualizer_cfg: + prim_path: /Visuals/Command/velocity_current + markers: + arrow: + func: isaaclab.sim.spawners.from_files.from_files:spawn_from_usd + visible: true + semantic_tags: null + copy_from_source: true + mass_props: null + deformable_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + scale: !!python/tuple + - 0.5 + - 0.5 + - 0.5 + articulation_props: null + fixed_tendons_props: null + spatial_tendons_props: null + joint_drive_props: null + visual_material_path: material + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 0.0 + - 0.0 + - 1.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + usd_path: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/Props/UIElements/arrow_x.usd + variants: null +metrics: [] +isaaclab_arena_env: + name: h1_base_rough + scene: + assets: {} + observation_cfg: null + events_cfg: null + termination_cfg: null + rewards_cfg: null + curriculum_cfg: null + commands_cfg: null + embodiment: + enable_cameras: false + initial_pose: null + concatenate_observation_terms: false + arm_mode: null + scene_config: + terrain: + class_type: isaaclab.terrains.terrain_importer:TerrainImporter + collision_group: -1 + prim_path: /World/ground + num_envs: 1 + terrain_type: generator + terrain_generator: + class_type: isaaclab.terrains.terrain_generator:TerrainGenerator + seed: null + curriculum: true + size: !!python/tuple + - 8.0 + - 8.0 + 
border_width: 20.0 + border_height: 1.0 + num_rows: 10 + num_cols: 20 + color_scheme: none + horizontal_scale: 0.1 + vertical_scale: 0.005 + slope_threshold: 0.75 + sub_terrains: + random_rough: + function: isaaclab.terrains.height_field.hf_terrains:random_uniform_terrain + proportion: 1.0 + size: !!python/tuple + - 8.0 + - 8.0 + flat_patch_sampling: null + border_width: 0.0 + horizontal_scale: 0.1 + vertical_scale: 0.005 + slope_threshold: null + noise_range: !!python/tuple + - 0.02 + - 0.1 + noise_step: 0.02 + downsampled_scale: null + difficulty_range: !!python/tuple + - 0.0 + - 1.0 + use_cache: false + cache_dir: /tmp/isaaclab/terrains + usd_path: null + env_spacing: null + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_from_mdl_file + mdl_path: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/IsaacLab/Materials/TilesMarbleSpiderWhiteBrickBondHoned/TilesMarbleSpiderWhiteBrickBondHoned.mdl + project_uvw: true + albedo_brightness: null + texture_scale: !!python/tuple + - 0.25 + - 0.25 + physics_material: + func: isaaclab.sim.spawners.materials.physics_materials:spawn_rigid_body_material + static_friction: 1.0 + dynamic_friction: 1.0 + restitution: 0.0 + friction_combine_mode: multiply + restitution_combine_mode: multiply + compliant_contact_stiffness: 0.0 + compliant_contact_damping: 0.0 + max_init_terrain_level: 5 + debug_vis: false + robot: + class_type: isaaclab.assets.articulation.articulation:Articulation + prim_path: '{ENV_REGEX_NS}/Robot' + spawn: + func: isaaclab.sim.spawners.from_files.from_files:spawn_from_usd + visible: true + semantic_tags: null + copy_from_source: true + mass_props: null + deformable_props: null + rigid_props: + rigid_body_enabled: null + kinematic_enabled: null + disable_gravity: false + linear_damping: 0.0 + angular_damping: 0.0 + max_linear_velocity: 1000.0 + max_angular_velocity: 1000.0 + max_depenetration_velocity: 1.0 + max_contact_impulse: null + 
enable_gyroscopic_forces: null + retain_accelerations: false + solver_position_iteration_count: null + solver_velocity_iteration_count: null + sleep_threshold: null + stabilization_threshold: null + collision_props: null + activate_contact_sensors: true + scale: null + articulation_props: + articulation_enabled: null + enabled_self_collisions: false + solver_position_iteration_count: 4 + solver_velocity_iteration_count: 4 + sleep_threshold: null + stabilization_threshold: null + fix_root_link: null + fixed_tendons_props: null + spatial_tendons_props: null + joint_drive_props: null + visual_material_path: material + visual_material: null + usd_path: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/IsaacLab/Robots/Unitree/H1/h1_minimal.usd + variants: null + init_state: + pos: !!python/tuple + - 0.0 + - 0.0 + - 1.05 + rot: !!python/tuple + - 1.0 + - 0.0 + - 0.0 + - 0.0 + lin_vel: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + ang_vel: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + joint_pos: + .*_hip_yaw: 0.0 + .*_hip_roll: 0.0 + .*_hip_pitch: -0.28 + .*_knee: 0.79 + .*_ankle: -0.52 + torso: 0.0 + .*_shoulder_pitch: 0.28 + .*_shoulder_roll: 0.0 + .*_shoulder_yaw: 0.0 + .*_elbow: 0.52 + joint_vel: + .*: 0.0 + collision_group: 0 + debug_vis: false + articulation_root_prim_path: null + soft_joint_pos_limit_factor: 0.9 + actuators: + legs: + class_type: isaaclab.actuators.actuator_pd:ImplicitActuator + joint_names_expr: + - .*_hip_yaw + - .*_hip_roll + - .*_hip_pitch + - .*_knee + - torso + effort_limit: null + velocity_limit: null + effort_limit_sim: 300 + velocity_limit_sim: null + stiffness: + .*_hip_yaw: 150.0 + .*_hip_roll: 150.0 + .*_hip_pitch: 200.0 + .*_knee: 200.0 + torso: 200.0 + damping: + .*_hip_yaw: 5.0 + .*_hip_roll: 5.0 + .*_hip_pitch: 5.0 + .*_knee: 5.0 + torso: 5.0 + armature: null + friction: null + dynamic_friction: null + viscous_friction: null + feet: + class_type: isaaclab.actuators.actuator_pd:ImplicitActuator + 
joint_names_expr: + - .*_ankle + effort_limit: null + velocity_limit: null + effort_limit_sim: 100 + velocity_limit_sim: null + stiffness: + .*_ankle: 20.0 + damping: + .*_ankle: 4.0 + armature: null + friction: null + dynamic_friction: null + viscous_friction: null + arms: + class_type: isaaclab.actuators.actuator_pd:ImplicitActuator + joint_names_expr: + - .*_shoulder_pitch + - .*_shoulder_roll + - .*_shoulder_yaw + - .*_elbow + effort_limit: null + velocity_limit: null + effort_limit_sim: 300 + velocity_limit_sim: null + stiffness: + .*_shoulder_pitch: 40.0 + .*_shoulder_roll: 40.0 + .*_shoulder_yaw: 40.0 + .*_elbow: 40.0 + damping: + .*_shoulder_pitch: 10.0 + .*_shoulder_roll: 10.0 + .*_shoulder_yaw: 10.0 + .*_elbow: 10.0 + armature: null + friction: null + dynamic_friction: null + viscous_friction: null + actuator_value_resolution_debug_print: false + contact_forces: + class_type: isaaclab.sensors.contact_sensor.contact_sensor:ContactSensor + prim_path: '{ENV_REGEX_NS}/Robot/.*' + update_period: 0.0 + history_length: 3 + debug_vis: false + track_pose: false + track_contact_points: false + max_contact_data_count_per_prim: 4 + track_air_time: true + force_threshold: 1.0 + filter_prim_paths_expr: [] + visualizer_cfg: + prim_path: /Visuals/ContactSensor + markers: + contact: + func: isaaclab.sim.spawners.shapes.shapes:spawn_sphere + visible: true + semantic_tags: null + copy_from_source: true + mass_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + visual_material_path: material + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 1.0 + - 0.0 + - 0.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + physics_material_path: material + physics_material: null + radius: 0.02 + no_contact: + func: isaaclab.sim.spawners.shapes.shapes:spawn_sphere + visible: false + semantic_tags: null + 
copy_from_source: true + mass_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + visual_material_path: material + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 0.0 + - 1.0 + - 0.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + physics_material_path: material + physics_material: null + radius: 0.02 + sky_light: + class_type: null + prim_path: /World/skyLight + spawn: + func: isaaclab.sim.spawners.lights.lights:spawn_light + visible: true + semantic_tags: null + copy_from_source: true + prim_type: DomeLight + color: !!python/tuple + - 1.0 + - 1.0 + - 1.0 + enable_color_temperature: false + color_temperature: 6500.0 + normalize: false + exposure: 0.0 + intensity: 750.0 + texture_file: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/Materials/Textures/Skies/PolyHaven/kloofendal_43d_clear_puresky_4k.hdr + texture_format: automatic + visible_in_primary_ray: true + init_state: + pos: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + rot: !!python/tuple + - 1.0 + - 0.0 + - 0.0 + - 0.0 + collision_group: 0 + debug_vis: false + camera_config: null + action_config: + joint_pos: + class_type: isaaclab.envs.mdp.actions.joint_actions:JointPositionAction + asset_name: robot + debug_vis: false + clip: null + joint_names: + - .* + scale: 0.5 + offset: 0.0 + preserve_order: false + use_default_offset: true + observation_config: + policy: + concatenate_terms: true + concatenate_dim: -1 + enable_corruption: true + history_length: null + flatten_history_dim: true + base_lin_vel: + func: isaaclab.envs.mdp.observations:base_lin_vel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.1 + n_max: 0.1 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + base_ang_vel: + func: 
isaaclab.envs.mdp.observations:base_ang_vel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.2 + n_max: 0.2 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + projected_gravity: + func: isaaclab.envs.mdp.observations:projected_gravity + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.05 + n_max: 0.05 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + velocity_commands: + func: isaaclab.envs.mdp.observations:generated_commands + params: + command_name: base_velocity + modifiers: null + noise: null + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + joint_pos: + func: isaaclab.envs.mdp.observations:joint_pos_rel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -0.01 + n_max: 0.01 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + joint_vel: + func: isaaclab.envs.mdp.observations:joint_vel_rel + params: {} + modifiers: null + noise: + func: isaaclab.utils.noise.noise_model:uniform_noise + operation: add + n_min: -1.5 + n_max: 1.5 + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + actions: + func: isaaclab.envs.mdp.observations:last_action + params: {} + modifiers: null + noise: null + clip: null + scale: null + history_length: 0 + flatten_history_dim: true + event_config: null + reward_config: null + curriculum_config: null + command_config: null + mimic_env: null + xr: null + termination_cfg: null + task: + episode_length_s: 20.0 + task_description: Train a rough-terrain velocity tracking locomotion policy for + H1. 
+ _events_cfg: + physics_material: + func: isaaclab.envs.mdp.events:randomize_rigid_body_material + params: + asset_cfg: + name: robot + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .* + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + static_friction_range: !!python/tuple + - 0.8 + - 0.8 + dynamic_friction_range: !!python/tuple + - 0.6 + - 0.6 + restitution_range: !!python/tuple + - 0.0 + - 0.0 + num_buckets: 64 + mode: startup + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + add_base_mass: null + base_com: null + base_external_force_torque: + func: isaaclab.envs.mdp.events:apply_external_force_torque + params: + asset_cfg: + name: robot + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: + - .*torso_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + force_range: !!python/tuple + - 0.0 + - 0.0 + torque_range: !!python/tuple + - -0.0 + - 0.0 + mode: reset + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + reset_base: + func: isaaclab.envs.mdp.events:reset_root_state_uniform + params: + pose_range: + x: !!python/tuple + - -0.5 + - 0.5 + y: !!python/tuple + - -0.5 + - 0.5 + yaw: !!python/tuple + - -3.14 + - 3.14 + velocity_range: + x: !!python/tuple + - 0.0 + - 0.0 + y: !!python/tuple + - 0.0 + - 0.0 + z: 
!!python/tuple + - 0.0 + - 0.0 + roll: !!python/tuple + - 0.0 + - 0.0 + pitch: !!python/tuple + - 0.0 + - 0.0 + yaw: !!python/tuple + - 0.0 + - 0.0 + mode: reset + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + reset_robot_joints: + func: isaaclab.envs.mdp.events:reset_joints_by_scale + params: + position_range: !!python/tuple + - 1.0 + - 1.0 + velocity_range: !!python/tuple + - 0.0 + - 0.0 + mode: reset + interval_range_s: null + is_global_time: false + min_step_count_between_reset: 0 + push_robot: null + _rewards_cfg: + track_lin_vel_xy_exp: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:track_lin_vel_xy_yaw_frame_exp + params: + command_name: base_velocity + std: 0.5 + weight: 1.0 + track_ang_vel_z_exp: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:track_ang_vel_z_world_exp + params: + command_name: base_velocity + std: 0.5 + weight: 1.0 + lin_vel_z_l2: null + ang_vel_xy_l2: + func: isaaclab.envs.mdp.rewards:ang_vel_xy_l2 + params: {} + weight: -0.05 + dof_torques_l2: + func: isaaclab.envs.mdp.rewards:joint_torques_l2 + params: {} + weight: 0.0 + dof_acc_l2: + func: isaaclab.envs.mdp.rewards:joint_acc_l2 + params: {} + weight: -1.25e-07 + action_rate_l2: + func: isaaclab.envs.mdp.rewards:action_rate_l2 + params: {} + weight: -0.005 + feet_air_time: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:feet_air_time_positive_biped + params: + command_name: base_velocity + sensor_cfg: + name: contact_forces + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .*ankle_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + threshold: 
0.4 + weight: 0.25 + undesired_contacts: null + flat_orientation_l2: + func: isaaclab.envs.mdp.rewards:flat_orientation_l2 + params: {} + weight: -1.0 + dof_pos_limits: + func: isaaclab.envs.mdp.rewards:joint_pos_limits + params: + asset_cfg: + name: robot + joint_names: .*_ankle + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -1.0 + termination_penalty: + func: isaaclab.envs.mdp.rewards:is_terminated + params: {} + weight: -200.0 + feet_slide: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.rewards:feet_slide + params: + sensor_cfg: + name: contact_forces + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .*ankle_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + asset_cfg: + name: robot + joint_names: null + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .*ankle_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -0.25 + joint_deviation_hip: + func: isaaclab.envs.mdp.rewards:joint_deviation_l1 + 
params: + asset_cfg: + name: robot + joint_names: + - .*_hip_yaw + - .*_hip_roll + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -0.2 + joint_deviation_arms: + func: isaaclab.envs.mdp.rewards:joint_deviation_l1 + params: + asset_cfg: + name: robot + joint_names: + - .*_shoulder_.* + - .*_elbow + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -0.2 + joint_deviation_torso: + func: isaaclab.envs.mdp.rewards:joint_deviation_l1 + params: + asset_cfg: + name: robot + joint_names: torso + joint_ids: !!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: null + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + weight: -0.1 + _terminations_cfg: + time_out: + func: isaaclab.envs.mdp.terminations:time_out + params: {} + time_out: true + base_contact: + func: isaaclab.envs.mdp.terminations:illegal_contact + params: + sensor_cfg: + name: contact_forces + joint_names: null + joint_ids: 
!!python/object/apply:builtins.slice + - null + - null + - null + fixed_tendon_names: null + fixed_tendon_ids: !!python/object/apply:builtins.slice + - null + - null + - null + body_names: .*torso_link + body_ids: !!python/object/apply:builtins.slice + - null + - null + - null + object_collection_names: null + object_collection_ids: !!python/object/apply:builtins.slice + - null + - null + - null + preserve_order: false + threshold: 1.0 + time_out: false + _curriculum_cfg: + terrain_levels: + func: isaaclab_tasks.manager_based.locomotion.velocity.mdp.curriculums:terrain_levels_vel + params: {} + _commands_cfg: + base_velocity: + class_type: isaaclab.envs.mdp.commands.velocity_command:UniformVelocityCommand + resampling_time_range: !!python/tuple + - 10.0 + - 10.0 + debug_vis: false + asset_name: robot + heading_command: true + heading_control_stiffness: 0.5 + rel_standing_envs: 0.02 + rel_heading_envs: 1.0 + ranges: + lin_vel_x: !!python/tuple + - 0.0 + - 1.0 + lin_vel_y: !!python/tuple + - 0.0 + - 0.0 + ang_vel_z: !!python/tuple + - -1.0 + - 1.0 + heading: !!python/tuple + - -3.141592653589793 + - 3.141592653589793 + goal_vel_visualizer_cfg: + prim_path: /Visuals/Command/velocity_goal + markers: + arrow: + func: isaaclab.sim.spawners.from_files.from_files:spawn_from_usd + visible: true + semantic_tags: null + copy_from_source: true + mass_props: null + deformable_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + scale: !!python/tuple + - 0.5 + - 0.5 + - 0.5 + articulation_props: null + fixed_tendons_props: null + spatial_tendons_props: null + joint_drive_props: null + visual_material_path: material + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 0.0 + - 1.0 + - 0.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + usd_path: 
https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/Props/UIElements/arrow_x.usd + variants: null + current_vel_visualizer_cfg: + prim_path: /Visuals/Command/velocity_current + markers: + arrow: + func: isaaclab.sim.spawners.from_files.from_files:spawn_from_usd + visible: true + semantic_tags: null + copy_from_source: true + mass_props: null + deformable_props: null + rigid_props: null + collision_props: null + activate_contact_sensors: false + scale: !!python/tuple + - 0.5 + - 0.5 + - 0.5 + articulation_props: null + fixed_tendons_props: null + spatial_tendons_props: null + joint_drive_props: null + visual_material_path: material + visual_material: + func: isaaclab.sim.spawners.materials.visual_materials:spawn_preview_surface + diffuse_color: !!python/tuple + - 0.0 + - 0.0 + - 1.0 + emissive_color: !!python/tuple + - 0.0 + - 0.0 + - 0.0 + roughness: 0.5 + metallic: 0.0 + opacity: 1.0 + usd_path: https://omniverse-content-production.s3-us-west-2.amazonaws.com/Assets/Isaac/5.1/Isaac/Props/UIElements/arrow_x.usd + variants: null + teleop_device: null + orchestrator: null + env_cfg_callback: null diff --git a/isaaclab_arena/policy/vln/pretrained/h1_navila_locomotion.pt b/isaaclab_arena/policy/vln/pretrained/h1_navila_locomotion.pt new file mode 100644 index 000000000..7b45f45f3 Binary files /dev/null and b/isaaclab_arena/policy/vln/pretrained/h1_navila_locomotion.pt differ diff --git a/isaaclab_arena/policy/vln/vln_action_protocol.py b/isaaclab_arena/policy/vln/vln_action_protocol.py new file mode 100644 index 000000000..39f62297d --- /dev/null +++ b/isaaclab_arena/policy/vln/vln_action_protocol.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Re-export VlnVelocityActionProtocol from the Arena core package. 
+ +The VLN velocity-command ActionProtocol is defined in +``isaaclab_arena.remote_policy.action_protocol`` alongside the other +protocol types (ChunkingActionProtocol, etc.). This module re-exports +it for convenience. +""" + +from isaaclab_arena.remote_policy.action_protocol import ( # noqa: F401 + ActionMode, + VlnVelocityActionProtocol, +) diff --git a/isaaclab_arena/policy/vln/vln_vlm_locomotion_policy.py b/isaaclab_arena/policy/vln/vln_vlm_locomotion_policy.py new file mode 100644 index 000000000..472faa1fd --- /dev/null +++ b/isaaclab_arena/policy/vln/vln_vlm_locomotion_policy.py @@ -0,0 +1,588 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""VLN hierarchical policy: remote VLM + local RSL-RL locomotion. + +This composite policy has two layers: + - **High-level**: A remote VLM server (e.g. NaVILA, GR00T) that + produces velocity commands ``[vx, vy, yaw_rate]`` from RGB images. + - **Low-level**: A local RSL-RL locomotion policy that converts + velocity commands into joint-position actions. + +Works with Arena's ``policy_runner.py`` like any other policy. 
+ +Usage:: + + python -m isaaclab_arena.evaluation.policy_runner \\ + --policy_type isaaclab_arena.policy.vln.vln_vlm_locomotion_policy.VlnVlmLocomotionPolicy \\ + --remote_host localhost --remote_port 5555 \\ + --ll_checkpoint_path /path/to/rsl_rl/model.pt \\ + --ll_agent_cfg /path/to/agent.yaml \\ + --num_episodes 10 \\ + VLN_Benchmark \\ + --usd_path /path/to/scene.usd \\ + --r2r_dataset_path /path/to/dataset.json.gz +""" + +from __future__ import annotations + +import argparse +import os +import time +from typing import Any + +import gymnasium as gym +import numpy as np +import torch +from gymnasium.spaces.dict import Dict as GymSpacesDict +from PIL import Image as PILImage +from rsl_rl.runners import OnPolicyRunner + +from isaaclab.utils.io import load_yaml +from isaaclab_rl.rsl_rl import RslRlVecEnvWrapper + +from isaaclab_arena.policy.client_side_policy import ClientSidePolicy +from isaaclab_arena.remote_policy.action_protocol import VlnVelocityActionProtocol +from isaaclab_arena.remote_policy.remote_policy_config import RemotePolicyConfig + + +class VlnVlmLocomotionPolicy(ClientSidePolicy): + """VLN hierarchical policy: remote VLM + local RSL-RL locomotion. + + High-level layer queries a remote VLM server for velocity commands. + Low-level layer runs a pre-trained RSL-RL locomotion policy to + convert velocity commands into joint-position actions. + + Inherits from ``ClientSidePolicy`` to reuse the ZeroMQ handshake, + observation packing, and remote lifecycle management. 
+ """ + + def __init__( + self, + remote_config: RemotePolicyConfig, + ll_checkpoint_path: str, + ll_agent_cfg: str, + device: str = "cuda", + vel_cmd_obs_indices: tuple[int, int] = (9, 12), + warmup_steps: int = 200, + debug: bool = False, + save_interval: int = 0, + save_dir: str = "", + ignore_vlm_stop: bool = False, + min_vlm_stop_distance: float = -1.0, + ): + super().__init__( + config=None, + remote_config=remote_config, + protocol_cls=VlnVelocityActionProtocol, + ) + self._device = device + self._vel_cmd_indices = vel_cmd_obs_indices + self._warmup_steps = warmup_steps + self._debug = debug + self._save_interval = save_interval + self._save_dir = save_dir + self._ignore_vlm_stop = ignore_vlm_stop + self._min_vlm_stop_distance = min_vlm_stop_distance + + # RSL-RL low-level policy (loaded lazily in first get_action) + self._ll_checkpoint_path = ll_checkpoint_path + self._ll_agent_cfg = ll_agent_cfg + self._ll_policy = None + self._ll_obs_td = None + self._ll_vec_env = None + + # VLM scheduling state + self._step_count: int = 0 + self._target_step: int = 0 + self._last_vel_cmd = np.zeros(self.action_dim, dtype=np.float32) + self._env_dt: float | None = None + + # Track current instruction to detect episode changes + self._current_instruction: str | None = None + + self._vlm_query_count: int = 0 + + # ------------------------------------------------------------------ # + # PolicyBase interface # + # ------------------------------------------------------------------ # + + def get_action(self, env: gym.Env, observation: GymSpacesDict) -> torch.Tensor: + """Return joint-position actions for the environment. + + Internally: VLM query → velocity command → inject into obs → + RSL-RL forward pass → joint actions. 
+ """ + unwrapped = env.unwrapped if hasattr(env, "unwrapped") else env + self._last_unwrapped = unwrapped + + # Lazy init + if self._ll_policy is None: + self._load_low_level_policy(env) + if self._debug: + self._debug_save_dir = "/tmp/vln_debug_frames" + os.makedirs(self._debug_save_dir, exist_ok=True) + if self._save_interval > 0 and self._save_dir: + ts = time.strftime("%Y%m%d_%H%M%S") + self._save_dir = os.path.join(self._save_dir, ts) + os.makedirs(self._save_dir, exist_ok=True) + print(f"[VlnPolicy] Saving frames every {self._save_interval} steps to {self._save_dir}") + + if self._env_dt is None: + try: + self._env_dt = float(unwrapped.cfg.sim.dt * unwrapped.cfg.decimation) + except Exception: + self._env_dt = 0.02 + + # Detect per-episode instruction changes from env.extras + self._check_instruction_update(unwrapped) + + goal_dist = self._get_distance_to_goal(unwrapped) + + # Query VLM if scheduling says it's time + if self._step_count >= self._target_step: + packed_obs = self.pack_observation_for_server(observation) + resp = self.remote_client.get_action(observation=packed_obs) + + vel_cmd = np.asarray( + resp.get("action", np.zeros(self.action_dim)), dtype=np.float32 + ) + duration = float(resp.get("duration", self.protocol.default_duration)) + vlm_text = resp.get("vlm_text", "") + self._last_vel_cmd = vel_cmd + + self._vlm_query_count += 1 + if self._debug: + dist_str = f" dist={goal_dist:.2f}" if goal_dist is not None else "" + print( + f"[VlnPolicy VLM #{self._vlm_query_count}] step={self._step_count}" + f" cmd={vel_cmd} dur={duration:.1f}s{dist_str}" + f" vlm=\"{vlm_text[-80:] if vlm_text else ''}\"" + ) + + if self._env_dt > 0.0 and duration > 0.0: + steps_to_hold = max(1, int(duration / self._env_dt)) + else: + steps_to_hold = 1 + self._target_step = self._step_count + steps_to_hold + + # STOP: VLM returns zero velocity + zero duration + if np.allclose(vel_cmd, 0.0) and duration <= 0.0: + accept_stop, stop_reason = 
self._should_accept_vlm_stop(goal_dist) + if accept_stop: + extras = getattr(unwrapped, "extras", {}) + if "vln_stop_called" in extras: + extras["vln_stop_called"][:] = True + elif self._debug: + print( + f"[VlnPolicy STOP ignored] step={self._step_count} " + f"dist={goal_dist if goal_dist is not None else 'unknown'} " + f"reason={stop_reason}" + ) + + if self._debug: + self._save_debug_frame(observation, self._step_count) + if self._save_dir: + self._save_frame_to_dir(observation, unwrapped) + + if self._debug and self._step_count % 200 == 0: + self._save_debug_frame(observation, self._step_count) + + # Get the latest proprioceptive observation from the RSL-RL wrapper. + self._ll_obs_td = self._ll_vec_env.get_observations() + + # Inject velocity command into the policy observation in-place + i, j = self._vel_cmd_indices + cmd_tensor = torch.tensor(self._last_vel_cmd, device=self._device, dtype=torch.float32) + self._ll_obs_td["policy"][:, i:j] = cmd_tensor + + # Run low-level policy (pass full TensorDict, RSL-RL indexes by group) + with torch.inference_mode(): + joint_actions = self._ll_policy(self._ll_obs_td) + + if self._debug and self._step_count % 500 == 0: + try: + pos = unwrapped.scene["robot"].data.root_pos_w[0].cpu().numpy() + goal = unwrapped.extras.get("current_goal_pos") + dist_str = "" + if goal is not None: + goal_np = np.asarray(goal) + if goal_np.ndim == 2: + goal_np = goal_np[0] + dist_str = f" dist_to_goal={np.linalg.norm(pos - goal_np):.2f}" + ep_id = "" + ep_ids = unwrapped.extras.get("current_episode_id") + if ep_ids is not None: + ep_id = f" ep={ep_ids[0]}" + print(f"[VlnPolicy] step={self._step_count} robot=[{pos[0]:.1f},{pos[1]:.1f},{pos[2]:.2f}] cmd={self._last_vel_cmd}{dist_str}{ep_id}") + except Exception: + pass + + if self._save_interval > 0 and self._save_dir and self._step_count % self._save_interval == 0: + self._save_frame_to_dir(observation, unwrapped) + + self._step_count += 1 + return joint_actions + + def _save_debug_frame(self, 
observation, step): + """Save camera images with distance and instruction overlay.""" + try: + cam_dict = self._get_camera_obs_dict(observation, getattr(self, "_last_unwrapped", None)) + if cam_dict is None: + return + + goal_dist = self._get_distance_to_goal( + getattr(self, "_last_unwrapped", None) + ) + instruction = self._current_instruction or "" + + for cam_name, cam_data in cam_dict.items(): + if "rgb" not in cam_name: + continue + img_tensor = cam_data + if hasattr(img_tensor, 'cpu'): + img_np = img_tensor[0].cpu().numpy() if img_tensor.ndim == 4 else img_tensor.cpu().numpy() + else: + img_np = np.asarray(img_tensor) + if img_np.ndim == 4: + img_np = img_np[0] + if img_np.dtype in (np.float32, np.float64): + img_np = np.clip(img_np * 255.0 if img_np.max() <= 1.5 else img_np, 0, 255).astype(np.uint8) + else: + img_np = img_np.astype(np.uint8) + + img = PILImage.fromarray(img_np) + self._draw_overlay(img, goal_dist, instruction, step) + + tag = "head" if "head" in cam_name else "follow" + path = f"{self._debug_save_dir}/{tag}_{step:06d}.png" + img.save(path) + except Exception as e: + print(f"[VlnPolicy DEBUG] Failed to save frame: {e}") + + @staticmethod + def _get_overlay_fonts(img): + """Return readable overlay fonts scaled to the image resolution.""" + from PIL import ImageFont + + title_size = max(28, img.width // 28) + body_size = max(20, img.width // 45) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", title_size) + font_sm = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", body_size) + except OSError: + font = ImageFont.load_default() + font_sm = font + return font, font_sm + + @staticmethod + def _draw_text_box(draw, xy, text, font, text_fill, box_fill=(0, 0, 0), padding=6): + """Draw text with a solid background box for video readability.""" + left, top, right, bottom = draw.textbbox(xy, text, font=font) + draw.rectangle( + (left - padding, top - padding, right + padding, bottom + 
padding), + fill=box_fill, + ) + draw.text(xy, text, fill=text_fill, font=font) + return bottom + padding + + @staticmethod + def _wrap_overlay_text(draw, text: str, font, max_width: int) -> list[str]: + """Wrap text by rendered width so long instructions stay readable.""" + lines: list[str] = [] + paragraphs = text.replace("\r", "\n").split("\n") + for paragraph in paragraphs: + words = paragraph.split() + if not words: + continue + line = words[0] + for word in words[1:]: + candidate = f"{line} {word}" + if draw.textlength(candidate, font=font) <= max_width: + line = candidate + else: + lines.append(line) + line = word + lines.append(line) + return lines + + @staticmethod + def _draw_overlay(img, goal_dist, instruction, step, pos_str: str | None = None): + """Draw readable distance, step, position, and instruction overlays.""" + from PIL import ImageDraw + + draw = ImageDraw.Draw(img) + font, font_sm = VlnVlmLocomotionPolicy._get_overlay_fonts(img) + margin = max(10, img.width // 64) + padding = max(6, img.width // 170) + gap = max(8, img.width // 128) + + # Top-left status stack + dist_text = f"Dist: {goal_dist:.2f}m" if goal_dist is not None else "Dist: --" + y = margin + y = VlnVlmLocomotionPolicy._draw_text_box( + draw, (margin, y), dist_text, font, (255, 80, 80), padding=padding + ) + gap + y = VlnVlmLocomotionPolicy._draw_text_box( + draw, (margin, y), f"Step: {step}", font_sm, (255, 255, 255), padding=padding + ) + gap + if pos_str is not None: + VlnVlmLocomotionPolicy._draw_text_box( + draw, (margin, y), f"Pos: {pos_str}", font_sm, (255, 255, 255), padding=padding + ) + + # Bottom instruction panel + if instruction: + max_width = img.width - 2 * margin + lines = VlnVlmLocomotionPolicy._wrap_overlay_text(draw, instruction, font_sm, max_width - 2 * padding) + lines = lines[:4] + line_boxes = [draw.textbbox((0, 0), line, font=font_sm) for line in lines] + line_height = max((box[3] - box[1] for box in line_boxes), default=font_sm.size) + gap // 2 + 
panel_height = padding * 2 + line_height * len(lines) + panel_top = img.height - margin - panel_height + draw.rectangle( + (margin - padding, panel_top, img.width - margin + padding, img.height - margin + padding), + fill=(0, 0, 0), + ) + text_y = panel_top + padding + for line in lines: + draw.text((margin, text_y), line, fill=(255, 110, 110), font=font_sm) + text_y += line_height + + def _save_frame_to_dir(self, observation, unwrapped) -> None: + """Save camera frames to the configured save directory.""" + try: + cam_dict = self._get_camera_obs_dict(observation, unwrapped) + if cam_dict is None: + return + goal_dist = self._get_distance_to_goal(unwrapped) + instruction = self._current_instruction or "" + try: + pos = unwrapped.scene["robot"].data.root_pos_w[0].cpu().numpy() + pos_str = f"[{pos[0]:.1f},{pos[1]:.1f},{pos[2]:.2f}]" + except Exception: + pos_str = "?" + ep_id = 0 + ep_ids = getattr(unwrapped, "extras", {}).get("current_episode_id") + if ep_ids is not None: + ep_id = ep_ids[0] + for cam_name, cam_data in cam_dict.items(): + if "rgb" not in cam_name: + continue + img_tensor = cam_data + if hasattr(img_tensor, 'cpu'): + img_np = img_tensor[0].cpu().numpy() if img_tensor.ndim == 4 else img_tensor.cpu().numpy() + else: + img_np = np.asarray(img_tensor) + if img_np.ndim == 4: + img_np = img_np[0] + if img_np.dtype in (np.float32, np.float64): + img_np = np.clip(img_np * 255.0 if img_np.max() <= 1.5 else img_np, 0, 255).astype(np.uint8) + else: + img_np = img_np.astype(np.uint8) + img = PILImage.fromarray(img_np) + self._draw_overlay(img, goal_dist, instruction, self._step_count, pos_str=pos_str) + tag = "head" if "head" in cam_name else "follow" + path = os.path.join(self._save_dir, f"ep{ep_id}_{tag}_{self._step_count:06d}.png") + img.save(path) + except Exception as e: + print(f"[VlnPolicy] frame save error: {e}") + + def _get_camera_obs_dict(self, observation, unwrapped) -> dict[str, Any] | None: + """Return a camera_obs dict from either the step 
observation or the env.""" + if isinstance(observation, dict): + cam_dict = observation.get("camera_obs") + if isinstance(cam_dict, dict): + return cam_dict + + obs_manager = getattr(unwrapped, "observation_manager", None) + if obs_manager is None: + return None + + try: + cam_dict = obs_manager.compute_group("camera_obs", update_history=False) + except Exception: + return None + return cam_dict if isinstance(cam_dict, dict) else None + + def _should_accept_vlm_stop(self, goal_dist: float | None) -> tuple[bool, str]: + """Return whether a zero-duration stop should terminate the episode.""" + reasons: list[str] = [] + if self._ignore_vlm_stop: + reasons.append("ignore_vlm_stop") + if ( + self._min_vlm_stop_distance >= 0.0 + and goal_dist is not None + and goal_dist > self._min_vlm_stop_distance + ): + reasons.append( + f"goal_dist={goal_dist:.2f} > min_vlm_stop_distance={self._min_vlm_stop_distance:.2f}" + ) + return (not reasons, ", ".join(reasons) if reasons else "accepted") + + def reset(self, env_ids: torch.Tensor | None = None) -> None: + """Reset VLM scheduling state and notify server.""" + super().reset(env_ids) + self._step_count = 0 + self._target_step = 0 + self._last_vel_cmd[:] = 0.0 + self._current_instruction = None + + def set_task_description(self, task_description: str | None) -> str: + """Forward task description to the VLM server.""" + self.task_description = task_description + if task_description is not None: + self.remote_client.call_endpoint( + "set_task_description", + data={"task_description": task_description}, + requires_input=True, + ) + return self.task_description or "" + + # ------------------------------------------------------------------ # + # Low-level policy loading # + # ------------------------------------------------------------------ # + + def _load_low_level_policy(self, env) -> None: + """Load the RSL-RL locomotion policy and do warmup.""" + unwrapped = env.unwrapped if hasattr(env, "unwrapped") else env + + if isinstance(env, 
RslRlVecEnvWrapper): + vec_env = env + else: + vec_env = RslRlVecEnvWrapper(unwrapped) + self._ll_vec_env = vec_env + + agent_cfg_dict = load_yaml(self._ll_agent_cfg) + device = agent_cfg_dict.get("device", "cuda") + + runner = OnPolicyRunner(vec_env, agent_cfg_dict, log_dir=None, device=device) + runner.load(self._ll_checkpoint_path) + self._ll_policy = runner.get_inference_policy(device=vec_env.unwrapped.device) + + # Warmup: do forward-only dry runs so we don't mutate the first episode + # before metrics/recording start. The previous implementation stepped + # the environment here, which polluted path-length metrics and could + # physically bump the robot into nearby geometry before the first VLM + # query was issued. + self._ll_obs_td = vec_env.get_observations() + zero_cmd = torch.zeros(self.action_dim, device=self._device) + i, j = self._vel_cmd_indices + + print(f"[VlnPolicy] Warming up ({self._warmup_steps} steps)...") + for step in range(self._warmup_steps): + self._ll_obs_td["policy"][:, i:j] = zero_cmd + with torch.inference_mode(): + _ = self._ll_policy(self._ll_obs_td) + print("[VlnPolicy] Warmup complete.") + + # ------------------------------------------------------------------ # + # Distance helpers # + # ------------------------------------------------------------------ # + + def _get_distance_to_goal(self, unwrapped) -> float | None: + """Return Euclidean distance from robot to goal, or None.""" + try: + pos = unwrapped.scene["robot"].data.root_pos_w[0].cpu().numpy() + goal = unwrapped.extras.get("current_goal_pos") + if goal is None: + return None + goal_np = np.asarray(goal) + if goal_np.ndim == 2: + goal_np = goal_np[0] + return float(np.linalg.norm(pos - goal_np)) + except Exception: + return None + + # ------------------------------------------------------------------ # + # Instruction tracking # + # ------------------------------------------------------------------ # + + def _check_instruction_update(self, unwrapped) -> None: + """Detect 
per-episode instruction changes from env.extras.""" + extras = getattr(unwrapped, "extras", {}) + instruction = extras.get("current_instruction") + if instruction is None: + return + if isinstance(instruction, list): + instruction = instruction[0] + + if instruction != self._current_instruction: + self._current_instruction = instruction + self.remote_client.call_endpoint( + "set_task_description", + data={"task_description": instruction}, + requires_input=True, + ) + self._step_count = 0 + self._target_step = 0 + self._last_vel_cmd[:] = 0.0 + + # ------------------------------------------------------------------ # + # CLI helpers # + # ------------------------------------------------------------------ # + + @staticmethod + def add_args_to_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser = ClientSidePolicy.add_remote_args_to_parser(parser) + + ll = parser.add_argument_group("Low-Level Locomotion Policy") + ll.add_argument( + "--ll_checkpoint_path", type=str, required=True, + help="Path to the RSL-RL checkpoint (e.g. 
model_0.pt).", + ) + ll.add_argument( + "--ll_agent_cfg", type=str, required=True, + help="Path to the RSL-RL agent config YAML.", + ) + ll.add_argument( + "--warmup_steps", type=int, default=200, + help="Low-level policy warmup steps (default: 200).", + ) + ll.add_argument( + "--policy_device", type=str, default="cuda", + help="Device for policy inference (default: cuda).", + ) + ll.add_argument( + "--debug_vln", action="store_true", default=False, + help="Enable VLN debug logging and camera frame saving (default: off).", + ) + ll.add_argument( + "--save_interval", type=int, default=0, + help="Save camera frames every N steps to --save_dir (0=disabled).", + ) + ll.add_argument( + "--save_dir", type=str, default="", + help="Directory for saved camera frames.", + ) + ll.add_argument( + "--ignore_vlm_stop", + action="store_true", + default=False, + help="Diagnostic mode: never let VLM STOP terminate the episode.", + ) + ll.add_argument( + "--min_vlm_stop_distance", + type=float, + default=-1.0, + help="Diagnostic mode: ignore STOP while distance-to-goal is above this threshold; negative disables.", + ) + return parser + + @staticmethod + def from_args(args: argparse.Namespace) -> VlnVlmLocomotionPolicy: + remote_config = ClientSidePolicy.build_remote_config_from_args(args) + return VlnVlmLocomotionPolicy( + remote_config=remote_config, + ll_checkpoint_path=args.ll_checkpoint_path, + ll_agent_cfg=args.ll_agent_cfg, + device=getattr(args, "policy_device", "cuda"), + warmup_steps=getattr(args, "warmup_steps", 200), + debug=getattr(args, "debug_vln", False), + save_interval=getattr(args, "save_interval", 0), + save_dir=getattr(args, "save_dir", ""), + ignore_vlm_stop=getattr(args, "ignore_vlm_stop", False), + min_vlm_stop_distance=getattr(args, "min_vlm_stop_distance", -1.0), + ) diff --git a/isaaclab_arena/remote_policy/action_protocol.py b/isaaclab_arena/remote_policy/action_protocol.py index 5d4df2203..d6faf8b6a 100644 --- 
a/isaaclab_arena/remote_policy/action_protocol.py +++ b/isaaclab_arena/remote_policy/action_protocol.py @@ -14,11 +14,17 @@ class ActionMode(str, Enum): """Action output mode of a policy. - Currently only CHUNK is used. - Other modes can be added later if needed. + Attributes: + CHUNK: Action chunking mode (e.g. GR00T). The server returns a + sequence of ``action_horizon`` actions; the client consumes + ``action_chunk_length`` at a time. + VLN_VELOCITY: VLN velocity-command mode. The server returns a + single velocity command ``[vx, vy, yaw_rate]`` with a + ``duration`` (seconds) to hold it before re-querying. """ CHUNK = "chunk" + VLN_VELOCITY = "vln_velocity" @dataclass @@ -92,3 +98,36 @@ def to_dict(self) -> dict[str, Any]: "action_chunk_length": self.action_chunk_length, "action_horizon": self.action_horizon, } + + +@dataclass +class VlnVelocityActionProtocol(ActionProtocol): + """ActionProtocol for VLN_VELOCITY mode. + + The server returns a single velocity command ``[vx, vy, yaw_rate]`` and + a ``duration`` (seconds) for which the client should hold the command + before re-querying the server. + + default_duration: + Fallback duration when the server response does not include one. 
+ """ + + MODE: ClassVar[ActionMode] = ActionMode.VLN_VELOCITY + + default_duration: float = 0.5 + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> VlnVelocityActionProtocol: + return cls( + action_dim=int(data["action_dim"]), + observation_keys=list(data["observation_keys"]), + default_duration=float(data.get("default_duration", 0.5)), + ) + + def to_dict(self) -> dict[str, Any]: + return { + "action_mode": self.mode.value, + "action_dim": self.action_dim, + "observation_keys": self.observation_keys, + "default_duration": self.default_duration, + } diff --git a/isaaclab_arena/scripts/assets/build_matterport_collision_layers.py b/isaaclab_arena/scripts/assets/build_matterport_collision_layers.py new file mode 100644 index 000000000..4aa03d81c --- /dev/null +++ b/isaaclab_arena/scripts/assets/build_matterport_collision_layers.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Build collision-only USDA layers from a cleaned Matterport OBJ mesh. + +This script is intended for the recommended collision workflow: + +1. Start from a raw Matterport / MP3D mesh, not the already-converted visual USD. +2. Export or convert that raw mesh to a triangulated OBJ. +3. Optionally clean / simplify the OBJ in a mesh tool. +4. Run this script to emit floor-only, obstacle-only, and/or combined collision USDA assets. + +The floor split is heuristic and tuned for single-floor indoor scenes: + - triangle centroid z must lie within ``[floor_min_z, floor_max_z]`` + - triangle normal must point mostly upward + - tiny triangles can be filtered out via ``min_face_area`` + +For multi-floor scenes or heavily tilted geometry, treat this as a starting point +and adjust thresholds or manually clean the exported mesh. 
+""" + +from __future__ import annotations + +import argparse +import math +from pathlib import Path + + +def _parse_obj(obj_path: Path) -> tuple[list[tuple[float, float, float]], list[tuple[int, int, int]]]: + """Parse a triangulated OBJ file into vertices and triangle faces.""" + vertices: list[tuple[float, float, float]] = [] + faces: list[tuple[int, int, int]] = [] + + with obj_path.open("r", encoding="utf-8", errors="ignore") as file: + for line in file: + if line.startswith("v "): + _, x_str, y_str, z_str = line.strip().split()[:4] + vertices.append((float(x_str), float(y_str), float(z_str))) + elif line.startswith("f "): + tokens = line.strip().split()[1:] + face: list[int] = [] + for token in tokens: + vertex_index_str = token.split("/")[0] + if not vertex_index_str: + continue + vertex_index = int(vertex_index_str) + if vertex_index < 0: + vertex_index = len(vertices) + vertex_index + else: + vertex_index -= 1 + face.append(vertex_index) + if len(face) < 3: + continue + for i in range(1, len(face) - 1): + faces.append((face[0], face[i], face[i + 1])) + + if not vertices or not faces: + raise RuntimeError(f"OBJ parse produced no usable mesh data: {obj_path}") + + return vertices, faces + + +def _sub(a: tuple[float, float, float], b: tuple[float, float, float]) -> tuple[float, float, float]: + return (a[0] - b[0], a[1] - b[1], a[2] - b[2]) + + +def _cross(a: tuple[float, float, float], b: tuple[float, float, float]) -> tuple[float, float, float]: + return ( + a[1] * b[2] - a[2] * b[1], + a[2] * b[0] - a[0] * b[2], + a[0] * b[1] - a[1] * b[0], + ) + + +def _norm(v: tuple[float, float, float]) -> float: + return math.sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]) + + +def _triangle_area( + vertices: list[tuple[float, float, float]], face: tuple[int, int, int] +) -> float: + p0, p1, p2 = (vertices[idx] for idx in face) + return 0.5 * _norm(_cross(_sub(p1, p0), _sub(p2, p0))) + + +def _triangle_centroid_z( + vertices: list[tuple[float, float, float]], face: 
tuple[int, int, int] +) -> float: + p0, p1, p2 = (vertices[idx] for idx in face) + return (p0[2] + p1[2] + p2[2]) / 3.0 + + +def _triangle_normal_z( + vertices: list[tuple[float, float, float]], face: tuple[int, int, int] +) -> float: + p0, p1, p2 = (vertices[idx] for idx in face) + cross = _cross(_sub(p1, p0), _sub(p2, p0)) + length = _norm(cross) + if length <= 1e-12: + return 0.0 + return cross[2] / length + + +def _split_floor_and_obstacles( + vertices: list[tuple[float, float, float]], + faces: list[tuple[int, int, int]], + *, + floor_min_z: float, + floor_max_z: float, + floor_normal_min_z: float, + min_face_area: float, +) -> tuple[list[tuple[int, int, int]], list[tuple[int, int, int]]]: + """Split triangles into floor-like and obstacle-like sets.""" + floor_faces: list[tuple[int, int, int]] = [] + obstacle_faces: list[tuple[int, int, int]] = [] + + for face in faces: + area = _triangle_area(vertices, face) + if area < min_face_area: + continue + + centroid_z = _triangle_centroid_z(vertices, face) + normal_z = _triangle_normal_z(vertices, face) + is_floor_like = ( + floor_min_z <= centroid_z <= floor_max_z and normal_z >= floor_normal_min_z + ) + if is_floor_like: + floor_faces.append(face) + else: + obstacle_faces.append(face) + + return floor_faces, obstacle_faces + + +def _compact_mesh( + vertices: list[tuple[float, float, float]], faces: list[tuple[int, int, int]] +) -> tuple[list[tuple[float, float, float]], list[int], list[int]]: + """Compact a mesh selection into a new vertex array and face indices.""" + used_indices: dict[int, int] = {} + compact_vertices: list[tuple[float, float, float]] = [] + compact_face_indices: list[int] = [] + compact_face_counts: list[int] = [] + + for face in faces: + compact_face_counts.append(len(face)) + for old_idx in face: + if old_idx not in used_indices: + used_indices[old_idx] = len(compact_vertices) + compact_vertices.append(vertices[old_idx]) + compact_face_indices.append(used_indices[old_idx]) + + return 
compact_vertices, compact_face_counts, compact_face_indices + + +def _format_int_array(values: list[int]) -> str: + return "[" + ", ".join(str(value) for value in values) + "]" + + +def _format_points(vertices: list[tuple[float, float, float]]) -> str: + return "[" + ", ".join(f"({x:.8f}, {y:.8f}, {z:.8f})" for x, y, z in vertices) + "]" + + +def _write_usda( + usd_path: Path, + vertices: list[tuple[float, float, float]], + face_vertex_counts: list[int], + face_vertex_indices: list[int], + mesh_prim_path: str, +) -> None: + """Write a simple USDA stage containing a single mesh.""" + prim_parts = [part for part in mesh_prim_path.split("/") if part] + if len(prim_parts) < 2 or prim_parts[0] != "World": + raise RuntimeError("mesh_prim_path must start with /World and include a mesh name.") + + mesh_name = prim_parts[-1] + xform_parts = prim_parts[:-1] + + lines: list[str] = [ + "#usda 1.0", + "(", + ' defaultPrim = "World"', + ' upAxis = "Z"', + ")", + "", + ] + + indent = "" + for index, prim_name in enumerate(xform_parts): + lines.append(f'{indent}def Xform "{prim_name}"') + lines.append(f"{indent}" + "{") + indent += " " + if index == len(xform_parts) - 1: + lines.append(f'{indent}def Mesh "{mesh_name}"') + lines.append(f"{indent}" + "{") + lines.append(f"{indent} int[] faceVertexCounts = {_format_int_array(face_vertex_counts)}") + lines.append(f"{indent} int[] faceVertexIndices = {_format_int_array(face_vertex_indices)}") + lines.append(f"{indent} point3f[] points = {_format_points(vertices)}") + lines.append(f'{indent} uniform token subdivisionScheme = "none"') + lines.append(f"{indent}" + "}") + + for close_idx in range(len(xform_parts)): + close_indent = " " * (len(xform_parts) - close_idx - 1) + lines.append(f"{close_indent}" + "}") + + usd_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def _write_selection( + *, + output_path: Path | None, + vertices: list[tuple[float, float, float]], + faces: list[tuple[int, int, int]], + mesh_prim_path: str, 
+ label: str, +) -> None: + if output_path is None: + return + if not faces: + print(f"[build_matterport_collision_layers] skipping {label}: no faces selected") + return + + output_path.parent.mkdir(parents=True, exist_ok=True) + compact_vertices, face_vertex_counts, face_vertex_indices = _compact_mesh(vertices, faces) + _write_usda( + usd_path=output_path, + vertices=compact_vertices, + face_vertex_counts=face_vertex_counts, + face_vertex_indices=face_vertex_indices, + mesh_prim_path=mesh_prim_path, + ) + print( + "[build_matterport_collision_layers] wrote " + f"{label} -> {output_path} " + f"(vertices={len(compact_vertices)} faces={len(face_vertex_counts)})" + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Build floor / obstacle / combined collision USDA files from a cleaned OBJ mesh." + ) + parser.add_argument("--input_obj", type=Path, required=True, help="Input triangulated OBJ mesh path.") + parser.add_argument( + "--output_combined_usd", + type=Path, + default=None, + help="Optional path for a combined collision USDA output.", + ) + parser.add_argument( + "--output_floor_usd", + type=Path, + default=None, + help="Optional path for a floor-only collision USDA output.", + ) + parser.add_argument( + "--output_obstacle_usd", + type=Path, + default=None, + help="Optional path for an obstacle-only collision USDA output.", + ) + parser.add_argument( + "--mesh_prim_prefix", + type=str, + default="/World/CollisionProxy", + help="Prefix used for generated mesh prims.", + ) + parser.add_argument( + "--floor_min_z", + type=float, + default=-0.25, + help="Minimum triangle centroid z to be considered floor-like.", + ) + parser.add_argument( + "--floor_max_z", + type=float, + default=0.35, + help="Maximum triangle centroid z to be considered floor-like.", + ) + parser.add_argument( + "--floor_normal_min_z", + type=float, + default=0.9, + help="Minimum triangle normal z-component for floor classification.", + ) + parser.add_argument( + 
"--min_face_area", + type=float, + default=1e-6, + help="Drop tiny triangles below this area threshold before classification.", + ) + args = parser.parse_args() + + if not (args.output_combined_usd or args.output_floor_usd or args.output_obstacle_usd): + raise RuntimeError("At least one output path must be provided.") + + input_obj = args.input_obj.expanduser().resolve() + vertices, faces = _parse_obj(input_obj) + floor_faces, obstacle_faces = _split_floor_and_obstacles( + vertices, + faces, + floor_min_z=args.floor_min_z, + floor_max_z=args.floor_max_z, + floor_normal_min_z=args.floor_normal_min_z, + min_face_area=args.min_face_area, + ) + + print( + "[build_matterport_collision_layers] input=" + f"{input_obj} faces={len(faces)} floor_faces={len(floor_faces)} obstacle_faces={len(obstacle_faces)}" + ) + + prim_prefix = args.mesh_prim_prefix.rstrip("/") + _write_selection( + output_path=args.output_combined_usd.expanduser().resolve() if args.output_combined_usd else None, + vertices=vertices, + faces=faces, + mesh_prim_path=f"{prim_prefix}/combined", + label="combined", + ) + _write_selection( + output_path=args.output_floor_usd.expanduser().resolve() if args.output_floor_usd else None, + vertices=vertices, + faces=floor_faces, + mesh_prim_path=f"{prim_prefix}/floor", + label="floor", + ) + _write_selection( + output_path=args.output_obstacle_usd.expanduser().resolve() if args.output_obstacle_usd else None, + vertices=vertices, + faces=obstacle_faces, + mesh_prim_path=f"{prim_prefix}/obstacle", + label="obstacle", + ) + + +if __name__ == "__main__": + main() diff --git a/isaaclab_arena/scripts/assets/debug_matterport_collision_stage.py b/isaaclab_arena/scripts/assets/debug_matterport_collision_stage.py new file mode 100644 index 000000000..d9eae1646 --- /dev/null +++ b/isaaclab_arena/scripts/assets/debug_matterport_collision_stage.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +"""Build a debug stage for inspecting Matterport collision setup in Isaac Sim. + +This script mirrors the runtime collision wiring used by +`MatterportBackground` but is focused on inspection instead of evaluation. +It can: + +1. Spawn the visual Matterport USD. +2. Optionally enable explicit child-mesh colliders. +3. Optionally attach legacy or split collision overlays. +4. Make hidden collision overlays visible for viewport inspection. +5. Save the resulting stage to USD so it can be reopened in Isaac Sim. +""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +from isaaclab.app import AppLauncher + + +def _path_or_none(path: Path | None) -> str | None: + if path is None: + return None + return str(path.expanduser().resolve()) + + +parser = argparse.ArgumentParser(description="Create a debug USD stage for Matterport collision inspection.") +parser.add_argument("--visual_usd_path", type=Path, required=True, help="Path to the visual Matterport USD.") +parser.add_argument( + "--output_usd_path", + type=Path, + default=None, + help="Optional path to export the composed debug stage.", +) +parser.add_argument( + "--combined_collision_usd_path", + type=Path, + default=None, + help="Optional legacy combined collision overlay USD.", +) +parser.add_argument( + "--floor_collision_usd_path", + type=Path, + default=None, + help="Optional floor-only collision overlay USD.", +) +parser.add_argument( + "--obstacle_collision_usd_path", + type=Path, + default=None, + help="Optional obstacle-only collision overlay USD.", +) +parser.add_argument( + "--prim_path", + type=str, + default="/World/matterport", + help="Prim path where the Matterport scene will be spawned.", +) +parser.add_argument( + "--mesh_collider_type", + type=str, + default="triangle", + choices=("triangle", "convex_decomposition", "sdf"), + help="Approximation used for explicit child-mesh colliders and overlays.", +) 
+parser.add_argument( + "--enable_child_mesh_colliders", + action="store_true", + default=False, + help="Apply explicit collision schemas to descendant visual USD meshes.", +) +parser.add_argument( + "--disable_ground_plane", + action="store_true", + default=False, + help="Do not spawn the fallback invisible z=0 ground plane.", +) +parser.add_argument( + "--show_collision_prims", + action="store_true", + default=False, + help="Make hidden collision overlay prims visible after spawning them.", +) +parser.add_argument( + "--show_ground_plane", + action="store_true", + default=False, + help="Make the fallback ground plane visible if it is spawned.", +) +parser.add_argument( + "--flatten_output", + action="store_true", + default=False, + help="Export a flattened stage instead of only the root layer.", +) +parser.add_argument( + "--settle_frames", + type=int, + default=4, + help="Number of app update frames after spawning before export.", +) +AppLauncher.add_app_launcher_args(parser) +args_cli = parser.parse_args() + +app_launcher = AppLauncher(args_cli) +simulation_app = app_launcher.app + + +def _iter_descendant_prims(root_prim): + from pxr import Usd + + for child in root_prim.GetFilteredChildren(Usd.TraverseInstanceProxies()): + yield child + yield from _iter_descendant_prims(child) + + +def _count_meshes(stage, prim_path: str) -> int: + from pxr import UsdGeom + + prim = stage.GetPrimAtPath(prim_path) + if not prim.IsValid(): + return 0 + return sum(1 for descendant in _iter_descendant_prims(prim) if descendant.IsA(UsdGeom.Mesh)) + + +def _set_visible(stage, prim_path: str) -> bool: + from pxr import UsdGeom + + prim = stage.GetPrimAtPath(prim_path) + if not prim.IsValid(): + return False + UsdGeom.Imageable(prim).MakeVisible() + return True + + +def _overlay_prim_names() -> list[str]: + names: list[str] = [] + if args_cli.combined_collision_usd_path is not None: + names.append("collision") + if args_cli.floor_collision_usd_path is not None: + 
names.append("collisionFloor") + if args_cli.obstacle_collision_usd_path is not None: + names.append("collisionObstacle") + return names + + +def main() -> None: + import omni.usd + import isaaclab.sim as sim_utils + + from isaaclab_arena.assets.matterport_background import _spawn_matterport_with_ground + + if args_cli.combined_collision_usd_path and ( + args_cli.floor_collision_usd_path or args_cli.obstacle_collision_usd_path + ): + raise ValueError("Use either the combined collision overlay or split floor/obstacle overlays, not both.") + + visual_usd_path = str(args_cli.visual_usd_path.expanduser().resolve()) + output_usd_path = args_cli.output_usd_path.expanduser().resolve() if args_cli.output_usd_path else None + + usd_context = omni.usd.get_context() + usd_context.new_stage() + stage = usd_context.get_stage() + if stage is None: + raise RuntimeError("Failed to create a fresh USD stage.") + + cfg = sim_utils.UsdFileCfg(usd_path=visual_usd_path) + cfg.ground_plane_z = None if args_cli.disable_ground_plane else 0.0 + cfg.explicit_mesh_colliders = args_cli.enable_child_mesh_colliders + cfg.mesh_collider_approximation = args_cli.mesh_collider_type + cfg.collision_overlay_usd_path = _path_or_none(args_cli.combined_collision_usd_path) + cfg.floor_collision_usd_path = _path_or_none(args_cli.floor_collision_usd_path) + cfg.obstacle_collision_usd_path = _path_or_none(args_cli.obstacle_collision_usd_path) + + _spawn_matterport_with_ground(args_cli.prim_path, cfg) + + for _ in range(max(args_cli.settle_frames, 0)): + simulation_app.update() + + visible_overlays: list[str] = [] + if args_cli.show_collision_prims: + for overlay_name in _overlay_prim_names(): + overlay_path = f"{args_cli.prim_path}/{overlay_name}" + if _set_visible(stage, overlay_path): + visible_overlays.append(overlay_path) + + ground_plane_visible = False + if args_cli.show_ground_plane: + ground_plane_visible = _set_visible(stage, "/World/GroundPlane") + + visual_mesh_count = _count_meshes(stage, 
args_cli.prim_path) + print(f"[debug_matterport_collision_stage] visual_root={args_cli.prim_path} meshes={visual_mesh_count}") + for overlay_name in _overlay_prim_names(): + overlay_path = f"{args_cli.prim_path}/{overlay_name}" + print(f"[debug_matterport_collision_stage] overlay={overlay_path} meshes={_count_meshes(stage, overlay_path)}") + + if visible_overlays: + print("[debug_matterport_collision_stage] visible_overlays=" + ", ".join(visible_overlays)) + if ground_plane_visible: + print("[debug_matterport_collision_stage] ground_plane made visible at /World/GroundPlane") + + if output_usd_path is not None: + output_usd_path.parent.mkdir(parents=True, exist_ok=True) + if args_cli.flatten_output: + stage.Export(str(output_usd_path)) + export_mode = "flattened stage" + else: + stage.GetRootLayer().Export(str(output_usd_path)) + export_mode = "root layer" + print(f"[debug_matterport_collision_stage] exported {export_mode} to {output_usd_path}") + + if not args_cli.headless: + print("[debug_matterport_collision_stage] GUI mode active. Inspect the stage and close Isaac Sim when done.") + while simulation_app.is_running(): + simulation_app.update() + + +if __name__ == "__main__": + try: + main() + finally: + simulation_app.close() diff --git a/isaaclab_arena/scripts/assets/export_matterport_collision_proxy.py b/isaaclab_arena/scripts/assets/export_matterport_collision_proxy.py new file mode 100644 index 000000000..47362212c --- /dev/null +++ b/isaaclab_arena/scripts/assets/export_matterport_collision_proxy.py @@ -0,0 +1,178 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Export a scene asset into a mesh file for collision authoring workflows. + +This script runs inside Isaac Sim so it can use Omniverse's asset converter. +The intended workflow is: + +1. Export a raw Matterport / MP3D mesh (preferred) or a visual USD fallback to OBJ. +2. 
Simplify / clean the OBJ in an external mesh tool if needed. +3. Convert the cleaned OBJ into one or more collision-only USDA files. +4. Feed those USDA files to the Matterport collision overlay CLI flags. +""" + +from __future__ import annotations + +import argparse +import asyncio +from pathlib import Path + +from isaaclab.app import AppLauncher + + +parser = argparse.ArgumentParser( + description="Export a scene asset (raw mesh or USD) to OBJ or USD via Omniverse asset converter." +) +parser.add_argument( + "--input_path", + type=Path, + required=True, + help="Input asset path (raw Matterport mesh, USD, or another asset-converter supported format).", +) +parser.add_argument("--output_path", type=Path, required=True, help="Output asset path (e.g. .obj or .usd).") +parser.add_argument( + "--merge_all_meshes", + action="store_true", + default=False, + help="Request Omniverse asset converter to merge meshes when supported.", +) +parser.add_argument( + "--ignore_materials", + action="store_true", + default=False, + help="Drop materials during export when supported by the converter.", +) +parser.add_argument( + "--export_hidden_props", + action="store_true", + default=False, + help="Include hidden prims in the exported asset when supported.", +) +AppLauncher.add_app_launcher_args(parser) +args_cli = parser.parse_args() + +app_launcher = AppLauncher(args_cli) +simulation_app = app_launcher.app + +try: + import omni.kit.asset_converter as asset_converter # noqa: E402 +except ModuleNotFoundError: + asset_converter = None + +from pxr import Gf, Usd, UsdGeom # noqa: E402 + + +async def _convert_asset(input_path: Path, output_path: Path) -> None: + """Convert an asset using Omniverse asset converter.""" + input_path = input_path.expanduser().resolve() + output_path = output_path.expanduser().resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + + converter = asset_converter.get_instance() + ctx = asset_converter.AssetConverterContext() + ctx.ignore_materials = 
args_cli.ignore_materials + ctx.export_hidden_props = args_cli.export_hidden_props + ctx.merge_all_meshes = args_cli.merge_all_meshes + + def _progress_callback(progress: float, total_steps: float) -> bool: + print(f"[asset_converter] progress={progress:.3f} total_steps={total_steps}") + return True + + task = converter.create_converter_task( + str(input_path), + str(output_path), + _progress_callback, + ctx, + ) + success = await task.wait_until_finished() + if not success: + raise RuntimeError( + f"Asset conversion failed: status={task.get_status()} error={task.get_error_message()}" + ) + print(f"[asset_converter] exported: {output_path}") + + +def _iter_mesh_prims(stage: Usd.Stage): + """Yield mesh prims from a USD stage.""" + for prim in stage.Traverse(): + if prim.IsA(UsdGeom.Mesh): + yield prim + + +def _transform_points(matrix: Gf.Matrix4d, points) -> list[tuple[float, float, float]]: + """Transform USD mesh points into world-space XYZ tuples.""" + xyz_points: list[tuple[float, float, float]] = [] + for point in points: + world = matrix.Transform(Gf.Vec3d(point[0], point[1], point[2])) + xyz_points.append((world[0], world[1], world[2])) + return xyz_points + + +def _write_obj_from_usd(input_path: Path, output_path: Path) -> None: + """Fallback exporter: compose the USD stage and dump all meshes to OBJ.""" + stage = Usd.Stage.Open(str(input_path)) + if stage is None: + raise RuntimeError(f"Failed to open USD stage: {input_path}") + + output_path.parent.mkdir(parents=True, exist_ok=True) + xform_cache = UsdGeom.XformCache() + vertex_offset = 1 + mesh_count = 0 + + with output_path.open("w", encoding="utf-8") as obj_file: + obj_file.write(f"# Exported from USD: {input_path}\n") + for prim in _iter_mesh_prims(stage): + mesh = UsdGeom.Mesh(prim) + points = mesh.GetPointsAttr().Get() + face_counts = mesh.GetFaceVertexCountsAttr().Get() + face_indices = mesh.GetFaceVertexIndicesAttr().Get() + if not points or not face_counts or not face_indices: + continue + + 
local_to_world = xform_cache.GetLocalToWorldTransform(prim) + world_points = _transform_points(local_to_world, points) + if not world_points: + continue + + mesh_count += 1 + obj_file.write(f"o mesh_{mesh_count}\n") + for x, y, z in world_points: + obj_file.write(f"v {x:.8f} {y:.8f} {z:.8f}\n") + + cursor = 0 + for face_count in face_counts: + face = [face_indices[cursor + i] + vertex_offset for i in range(face_count)] + cursor += face_count + if len(face) < 3: + continue + # Fan triangulation for non-triangle faces. + for i in range(1, len(face) - 1): + obj_file.write(f"f {face[0]} {face[i]} {face[i + 1]}\n") + + vertex_offset += len(world_points) + + if mesh_count == 0: + raise RuntimeError(f"No meshes found in USD stage: {input_path}") + print(f"[pxr_obj_export] exported {mesh_count} meshes to: {output_path}") + + +def main() -> None: + try: + if asset_converter is not None: + asyncio.get_event_loop().run_until_complete(_convert_asset(args_cli.input_path, args_cli.output_path)) + else: + if args_cli.output_path.suffix.lower() != ".obj": + raise RuntimeError( + "Asset converter is unavailable in this Isaac Sim image. " + "Fallback exporter currently supports only OBJ output." 
+ ) + _write_obj_from_usd(args_cli.input_path, args_cli.output_path) + finally: + simulation_app.close() + + +if __name__ == "__main__": + main() diff --git a/isaaclab_arena/scripts/assets/filter_collision_usda.py b/isaaclab_arena/scripts/assets/filter_collision_usda.py new file mode 100644 index 000000000..2a6f35b70 --- /dev/null +++ b/isaaclab_arena/scripts/assets/filter_collision_usda.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 + +"""Filter simple collision USDA meshes and optionally add viewport colors.""" + +from __future__ import annotations + +import argparse +import math +import re +from collections import defaultdict, deque +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class MeshData: + mesh_name: str + vertices: list[tuple[float, float, float]] + faces: list[tuple[int, ...]] + + +@dataclass +class ComponentStats: + face_indices: list[int] + min_z: float + max_z: float + diag: float + + +def _parse_usda_mesh(path: Path) -> MeshData: + text = path.read_text(encoding="utf-8") + mesh_name_match = re.search(r'def Mesh "([^"]+)"', text) + points_match = re.search(r"point3f\[\] points = \[(.*?)\]\n", text, re.S) + counts_match = re.search(r"int\[\] faceVertexCounts = \[(.*?)\]\n", text, re.S) + indices_match = re.search(r"int\[\] faceVertexIndices = \[(.*?)\]\n", text, re.S) + if not (mesh_name_match and points_match and counts_match and indices_match): + raise RuntimeError(f"Failed to parse simple collision USDA mesh: {path}") + + vertices = [ + tuple(map(float, match)) + for match in re.findall(r"\(([-0-9.eE]+),\s*([-0-9.eE]+),\s*([-0-9.eE]+)\)", points_match.group(1)) + ] + face_counts = [int(value) for value in re.findall(r"-?\d+", counts_match.group(1))] + face_indices_flat = [int(value) for value in re.findall(r"-?\d+", indices_match.group(1))] + + faces: list[tuple[int, ...]] = [] + cursor = 0 + for face_count in face_counts: + faces.append(tuple(face_indices_flat[cursor : cursor + face_count])) + cursor += face_count + + 
return MeshData(mesh_name=mesh_name_match.group(1), vertices=vertices, faces=faces) + + +def _iter_connected_components(mesh: MeshData) -> list[ComponentStats]: + triangle_faces = [tuple(face) for face in mesh.faces if len(face) == 3] + vertex_to_faces: dict[int, list[int]] = defaultdict(list) + for face_index, face in enumerate(triangle_faces): + for vertex_index in face: + vertex_to_faces[vertex_index].append(face_index) + + seen = [False] * len(triangle_faces) + components: list[ComponentStats] = [] + for start_index in range(len(triangle_faces)): + if seen[start_index]: + continue + + queue = deque([start_index]) + seen[start_index] = True + component_faces: list[int] = [] + component_vertices: set[int] = set() + while queue: + face_index = queue.popleft() + component_faces.append(face_index) + for vertex_index in triangle_faces[face_index]: + component_vertices.add(vertex_index) + for neighbor_face_index in vertex_to_faces[vertex_index]: + if not seen[neighbor_face_index]: + seen[neighbor_face_index] = True + queue.append(neighbor_face_index) + + xs = [mesh.vertices[index][0] for index in component_vertices] + ys = [mesh.vertices[index][1] for index in component_vertices] + zs = [mesh.vertices[index][2] for index in component_vertices] + dx = max(xs) - min(xs) + dy = max(ys) - min(ys) + dz = max(zs) - min(zs) + components.append( + ComponentStats( + face_indices=component_faces, + min_z=min(zs), + max_z=max(zs), + diag=math.sqrt(dx * dx + dy * dy + dz * dz), + ) + ) + + return components + + +def _should_remove_ground_component(component: ComponentStats, args: argparse.Namespace) -> bool: + return ( + args.ground_max_faces > 0 + and len(component.face_indices) <= args.ground_max_faces + and component.max_z <= args.ground_max_top_z + and component.diag <= args.ground_max_diag + ) + + +def _should_remove_floating_component(component: ComponentStats, args: argparse.Namespace) -> bool: + return ( + args.floating_max_faces > 0 + and len(component.face_indices) <= 
args.floating_max_faces + and component.min_z >= args.floating_min_bottom_z + and (args.floating_max_top_z <= 0 or component.max_z <= args.floating_max_top_z) + and component.diag <= args.floating_max_diag + ) + + +def _compact_mesh( + vertices: list[tuple[float, float, float]], faces: list[tuple[int, ...]] +) -> tuple[list[tuple[float, float, float]], list[int], list[int]]: + used_indices: dict[int, int] = {} + compact_vertices: list[tuple[float, float, float]] = [] + compact_face_counts: list[int] = [] + compact_face_indices: list[int] = [] + + for face in faces: + compact_face_counts.append(len(face)) + for old_index in face: + if old_index not in used_indices: + used_indices[old_index] = len(compact_vertices) + compact_vertices.append(vertices[old_index]) + compact_face_indices.append(used_indices[old_index]) + + return compact_vertices, compact_face_counts, compact_face_indices + + +def _format_points(vertices: list[tuple[float, float, float]]) -> str: + return "[" + ", ".join(f"({x:.8f}, {y:.8f}, {z:.8f})" for x, y, z in vertices) + "]" + + +def _format_int_array(values: list[int]) -> str: + return "[" + ", ".join(str(value) for value in values) + "]" + + +def _build_output_text( + mesh_name: str, + vertices: list[tuple[float, float, float]], + face_counts: list[int], + face_indices: list[int], + display_color: tuple[float, float, float] | None, + display_opacity: float | None, +) -> str: + lines = [ + "#usda 1.0", + "(", + ' defaultPrim = "World"', + ' upAxis = "Z"', + ")", + "", + 'def Xform "World"', + "{", + ' def Xform "CollisionProxy"', + " {", + f' def Mesh "{mesh_name}"', + " {", + f" int[] faceVertexCounts = {_format_int_array(face_counts)}", + f" int[] faceVertexIndices = {_format_int_array(face_indices)}", + f" point3f[] points = {_format_points(vertices)}", + ] + if display_color is not None: + r, g, b = display_color + lines.append(f" color3f[] primvars:displayColor = [({r:.4f}, {g:.4f}, {b:.4f})]") + lines.append(' uniform token 
primvars:displayColor:interpolation = "constant"') + if display_opacity is not None: + lines.append(f" float[] primvars:displayOpacity = [{display_opacity:.4f}]") + lines.append(' uniform token primvars:displayOpacity:interpolation = "constant"') + lines.extend( + [ + ' uniform token subdivisionScheme = "none"', + " }", + " }", + "}", + "", + ] + ) + return "\n".join(lines) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Filter simple collision USDA meshes and optionally add display color.") + parser.add_argument("--input_usd", type=Path, required=True, help="Input USDA produced by the collision pipeline.") + parser.add_argument("--output_usd", type=Path, required=True, help="Output USDA path.") + parser.add_argument( + "--display_color", + type=float, + nargs=3, + default=None, + metavar=("R", "G", "B"), + help="Optional constant displayColor for viewport visualization.", + ) + parser.add_argument( + "--display_opacity", + type=float, + default=None, + help="Optional constant displayOpacity for viewport visualization.", + ) + parser.add_argument( + "--ground_max_faces", + type=int, + default=0, + help="Remove connected components at ground height with face counts at or below this value (0 disables).", + ) + parser.add_argument( + "--ground_max_top_z", + type=float, + default=0.0, + help="Maximum component top z for ground-fragment removal.", + ) + parser.add_argument( + "--ground_max_diag", + type=float, + default=0.0, + help="Maximum component diagonal length for ground-fragment removal.", + ) + parser.add_argument( + "--floating_max_faces", + type=int, + default=0, + help="Remove floating connected components with face counts at or below this value (0 disables).", + ) + parser.add_argument( + "--floating_min_bottom_z", + type=float, + default=0.0, + help="Minimum component bottom z for floating-fragment removal.", + ) + parser.add_argument( + "--floating_max_diag", + type=float, + default=0.0, + help="Maximum component diagonal length 
for floating-fragment removal.", + ) + parser.add_argument( + "--floating_max_top_z", + type=float, + default=0.0, + help="Optional maximum component top z for floating-fragment removal (0 disables).", + ) + args = parser.parse_args() + + input_usd = args.input_usd.expanduser().resolve() + output_usd = args.output_usd.expanduser().resolve() + mesh = _parse_usda_mesh(input_usd) + components = _iter_connected_components(mesh) + + removed_face_indices: set[int] = set() + ground_removed = 0 + floating_removed = 0 + for component in components: + if _should_remove_ground_component(component, args): + removed_face_indices.update(component.face_indices) + ground_removed += 1 + continue + if _should_remove_floating_component(component, args): + removed_face_indices.update(component.face_indices) + floating_removed += 1 + + triangle_faces = [tuple(face) for face in mesh.faces if len(face) == 3] + kept_faces = [triangle_faces[index] for index in range(len(triangle_faces)) if index not in removed_face_indices] + compact_vertices, face_counts, face_indices = _compact_mesh(mesh.vertices, kept_faces) + output_usd.parent.mkdir(parents=True, exist_ok=True) + output_usd.write_text( + _build_output_text( + mesh_name=mesh.mesh_name, + vertices=compact_vertices, + face_counts=face_counts, + face_indices=face_indices, + display_color=tuple(args.display_color) if args.display_color is not None else None, + display_opacity=args.display_opacity, + ), + encoding="utf-8", + ) + + print( + "[filter_collision_usda] " + f"input={input_usd} output={output_usd} " + f"kept_faces={len(face_counts)} removed_faces={len(removed_face_indices)} " + f"removed_ground_components={ground_removed} removed_floating_components={floating_removed}" + ) + + +if __name__ == "__main__": + main() diff --git a/isaaclab_arena/scripts/assets/obj_to_usd_mesh.py b/isaaclab_arena/scripts/assets/obj_to_usd_mesh.py new file mode 100644 index 000000000..4417f1b8f --- /dev/null +++ 
b/isaaclab_arena/scripts/assets/obj_to_usd_mesh.py @@ -0,0 +1,139 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Convert a triangulated OBJ mesh into a simple USD mesh stage. + +This utility is intended for collision-proxy workflows where we already have an +OBJ export from the Matterport visual USD and want a lightweight USD mesh back +for ``--matterport_collision_usd_path``. +""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +def _parse_obj(obj_path: Path) -> tuple[list[tuple[float, float, float]], list[int], list[int]]: + """Parse vertex and face data from a simple OBJ file.""" + vertices: list[tuple[float, float, float]] = [] + face_vertex_counts: list[int] = [] + face_vertex_indices: list[int] = [] + + with obj_path.open("r", encoding="utf-8", errors="ignore") as file: + for line in file: + if line.startswith("v "): + _, x_str, y_str, z_str = line.strip().split()[:4] + vertices.append((float(x_str), float(y_str), float(z_str))) + elif line.startswith("f "): + tokens = line.strip().split()[1:] + face: list[int] = [] + for token in tokens: + vertex_index_str = token.split("/")[0] + if not vertex_index_str: + continue + vertex_index = int(vertex_index_str) + if vertex_index < 0: + vertex_index = len(vertices) + vertex_index + else: + vertex_index -= 1 + face.append(vertex_index) + if len(face) < 3: + continue + face_vertex_counts.append(len(face)) + face_vertex_indices.extend(face) + + if not vertices or not face_vertex_counts: + raise RuntimeError(f"OBJ parse produced no usable mesh data: {obj_path}") + + return vertices, face_vertex_counts, face_vertex_indices + + +def _format_int_array(values: list[int]) -> str: + """Format a USD int array.""" + return "[" + ", ".join(str(value) for value in values) + "]" + + +def _format_points(vertices: list[tuple[float, float, float]]) -> str: + """Format a USD point3f array.""" + return "[" 
+ ", ".join(f"({x:.8f}, {y:.8f}, {z:.8f})" for x, y, z in vertices) + "]" + + +def _write_usda( + usd_path: Path, + vertices: list[tuple[float, float, float]], + face_vertex_counts: list[int], + face_vertex_indices: list[int], + mesh_prim_path: str, +) -> None: + """Write a simple USDA stage containing a single mesh.""" + prim_parts = [part for part in mesh_prim_path.split("/") if part] + if len(prim_parts) < 2 or prim_parts[0] != "World": + raise RuntimeError("mesh_prim_path must start with /World and include a mesh name.") + + mesh_name = prim_parts[-1] + xform_parts = prim_parts[:-1] + + lines: list[str] = [ + "#usda 1.0", + "(", + ' defaultPrim = "World"', + ' upAxis = "Z"', + ")", + "", + ] + + indent = "" + for index, prim_name in enumerate(xform_parts): + lines.append(f'{indent}def Xform "{prim_name}"') + lines.append(f"{indent}" + "{") + indent += " " + if index == len(xform_parts) - 1: + lines.append(f'{indent}def Mesh "{mesh_name}"') + lines.append(f"{indent}" + "{") + lines.append(f"{indent} int[] faceVertexCounts = {_format_int_array(face_vertex_counts)}") + lines.append(f"{indent} int[] faceVertexIndices = {_format_int_array(face_vertex_indices)}") + lines.append(f"{indent} point3f[] points = {_format_points(vertices)}") + lines.append(f'{indent} uniform token subdivisionScheme = "none"') + lines.append(f"{indent}" + "}") + + for close_idx in range(len(xform_parts)): + close_indent = " " * (len(xform_parts) - close_idx - 1) + lines.append(f"{close_indent}" + "}") + + usd_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Convert OBJ mesh to a simple USD mesh.") + parser.add_argument("--input_obj", type=Path, required=True, help="Input OBJ mesh path.") + parser.add_argument("--output_usd", type=Path, required=True, help="Output USD path.") + parser.add_argument( + "--mesh_prim_path", + type=str, + default="/World/CollisionProxy/mesh", + help="Prim path for the generated 
mesh.", + ) + args = parser.parse_args() + + input_obj = args.input_obj.expanduser().resolve() + output_usd = args.output_usd.expanduser().resolve() + output_usd.parent.mkdir(parents=True, exist_ok=True) + + vertices, face_vertex_counts, face_vertex_indices = _parse_obj(input_obj) + _write_usda( + usd_path=output_usd, + vertices=vertices, + face_vertex_counts=face_vertex_counts, + face_vertex_indices=face_vertex_indices, + mesh_prim_path=args.mesh_prim_path, + ) + print( + f"[obj_to_usd_mesh] wrote {output_usd} " + f"(vertices={len(vertices)} faces={len(face_vertex_counts)})" + ) + + +if __name__ == "__main__": + main() diff --git a/isaaclab_arena/scripts/assets/open_usd_stage.py b/isaaclab_arena/scripts/assets/open_usd_stage.py new file mode 100644 index 000000000..265dc37e1 --- /dev/null +++ b/isaaclab_arena/scripts/assets/open_usd_stage.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Open a USD stage in Isaac Sim and keep the app alive for inspection.""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +from isaaclab.app import AppLauncher + + +parser = argparse.ArgumentParser(description="Open a USD stage in Isaac Sim for manual inspection.") +parser.add_argument("--usd_path", type=Path, required=True, help="USD or USDA file to open in Isaac Sim.") +parser.add_argument( + "--wait_frames", + type=int, + default=4, + help="Number of app update frames after opening the stage before reporting success.", +) +AppLauncher.add_app_launcher_args(parser) +args_cli = parser.parse_args() + +app_launcher = AppLauncher(args_cli) +simulation_app = app_launcher.app + + +def main() -> None: + import omni.usd + + usd_path = args_cli.usd_path.expanduser().resolve() + if not usd_path.exists(): + raise FileNotFoundError(f"USD stage does not exist: {usd_path}") + + usd_context = omni.usd.get_context() + opened = 
usd_context.open_stage(str(usd_path)) + if not opened: + raise RuntimeError(f"Failed to open stage: {usd_path}") + + for _ in range(max(args_cli.wait_frames, 0)): + simulation_app.update() + + stage = usd_context.get_stage() + default_prim = stage.GetDefaultPrim().GetPath().pathString if stage and stage.GetDefaultPrim() else "" + print(f"[open_usd_stage] opened={usd_path}") + print(f"[open_usd_stage] default_prim={default_prim}") + + if not args_cli.headless: + print("[open_usd_stage] GUI mode active. Inspect the stage and close Isaac Sim when done.") + while simulation_app.is_running(): + simulation_app.update() + + +if __name__ == "__main__": + try: + main() + finally: + simulation_app.close() diff --git a/isaaclab_arena/scripts/reinforcement_learning/train.py b/isaaclab_arena/scripts/reinforcement_learning/train.py index a7ad52391..4e5a0c72a 100644 --- a/isaaclab_arena/scripts/reinforcement_learning/train.py +++ b/isaaclab_arena/scripts/reinforcement_learning/train.py @@ -33,9 +33,6 @@ default=Path("isaaclab_arena/policy/rl_policy/generic_policy.json"), help="Path to the RL agent configuration file.", ) -parser.add_argument( - "--distributed", action="store_true", default=False, help="Run training with multiple GPUs or nodes." -) parser.add_argument("--export_io_descriptors", action="store_true", default=False, help="Export IO descriptors.") # append RSL-RL cli arguments cli_args.add_rsl_rl_args(parser) diff --git a/isaaclab_arena/tasks/vln_r2r_matterport_task.py b/isaaclab_arena/tasks/vln_r2r_matterport_task.py new file mode 100644 index 000000000..3a18489d0 --- /dev/null +++ b/isaaclab_arena/tasks/vln_r2r_matterport_task.py @@ -0,0 +1,541 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""VLN navigation task for IsaacLab Arena. + +This module provides: + - ``VlnEpisodeCfg``: configuration for a single VLN episode. 
+ - ``VlnBenchmarkCfg``: sequential episode sampler (supports multi-env). + - ``VlnR2rMatterportTask``: TaskBase implementation that wires episodes, + terminations, events, and metrics together. + - Event / termination helpers that reset the robot pose and check goal + arrival at runtime. + + Multi-env support: + When ``num_envs > 1``, each environment gets its own episode, goal + position, and instruction. Episode data is stored in ``env.extras`` + with per-env arrays (e.g. ``current_goal_pos`` has shape ``[N, 3]``). + Note that each env also loads a full copy of the Matterport scene, + which may require significant GPU memory. + """ + + from __future__ import annotations + + import gzip + import json + import os + from typing import Any, Dict, List, Optional, Sequence, Tuple + + import numpy as np + import torch + from dataclasses import MISSING + + import isaaclab.envs.mdp as mdp_isaac_lab + from isaaclab.envs.common import ViewerCfg + from isaaclab.managers import EventTermCfg, SceneEntityCfg, TerminationTermCfg + from isaaclab.utils import configclass + + from isaaclab_arena.assets.asset import Asset + from isaaclab_arena.environments.isaaclab_arena_manager_based_env import IsaacLabArenaManagerBasedRLEnvCfg + from isaaclab_arena.metrics.metric_base import MetricBase + from isaaclab_arena.tasks.task_base import TaskBase + + from isaaclab_arena.metrics.vln_metrics import ( + DistanceToGoalMetric, + OracleSuccessMetric, + PathLengthMetric, + SPLMetric, + SuccessMetric, + ) + + + # ========================================================================== # + # Dataset loading # + # ========================================================================== # + + + def read_episodes(r2r_dataset_path: str) -> List[Dict[str, Any]]: + """Read R2R-style episodes from a gzipped JSON file. + + Expected format:: + + {"episodes": [ {episode_dict}, ...
]} + """ + with gzip.open(r2r_dataset_path, "rt", encoding="utf-8") as f: + data = json.load(f) + return data["episodes"] + + +# ========================================================================== # +# Episode configuration # +# ========================================================================== # + + +@configclass +class VlnEpisodeCfg: + """Configuration for a single VLN episode.""" + + scene_id: str = MISSING + episode_id: int = MISSING + start_pos: Tuple[float, float, float] = MISSING + start_quat_wxyz: Tuple[float, float, float, float] = MISSING + goal_pos: Tuple[float, float, float] = MISSING + goal_radius: float = MISSING + geodesic_distance: float = MISSING + instruction_text: str = MISSING + reference_path: List[Tuple[float, float, float]] = MISSING + + +@configclass +class VlnBenchmarkCfg: + """Sequential episode sampler wrapping a list of :class:`VlnEpisodeCfg`. + + Supports multi-env: ``sample_episodes(n)`` returns ``n`` episodes, + one per environment, advancing the internal index sequentially. 
+ """ + + episodes: list[VlnEpisodeCfg] = MISSING + current_index: int = 0 + robot_root_height_offset: float = 0.0 + + def sample_episode(self) -> VlnEpisodeCfg: + """Sample a single episode (for single-env or per-env calls).""" + ep = self.episodes[self.current_index] + self.current_index = (self.current_index + 1) % len(self.episodes) + return ep + + def sample_episodes(self, n: int) -> list[VlnEpisodeCfg]: + """Sample ``n`` episodes sequentially (one per env).""" + return [self.sample_episode() for _ in range(n)] + + +def build_vln_episode_from_raw(raw_ep: dict) -> VlnEpisodeCfg: + """Convert a raw R2R-style episode dict into :class:`VlnEpisodeCfg`.""" + scene_id = os.path.splitext(os.path.basename(raw_ep["scene_id"]))[0] + start_pos = tuple(raw_ep["start_position"]) + start_quat = tuple(raw_ep["start_rotation"]) + goals = raw_ep.get("goals") or [] + if goals: + goal_pos = tuple(goals[0]["position"]) + goal_radius = float(goals[0].get("radius", 3.0)) + else: + goal_pos = tuple(raw_ep["reference_path"][-1]) + goal_radius = 3.0 + geodesic_distance = float(raw_ep.get("info", {}).get("geodesic_distance", 0.0)) + instruction_text = raw_ep["instruction"]["instruction_text"] + reference_path = [tuple(p) for p in raw_ep["reference_path"]] + + return VlnEpisodeCfg( + scene_id=scene_id, + episode_id=int(raw_ep["episode_id"]), + start_pos=start_pos, + start_quat_wxyz=start_quat, + goal_pos=goal_pos, + goal_radius=goal_radius, + geodesic_distance=geodesic_distance, + instruction_text=instruction_text, + reference_path=reference_path, + ) + + +# ========================================================================== # +# Termination helpers # +# ========================================================================== # + + +def _log_episode_end(env, reason: str, env_ids: torch.Tensor) -> None: + """Log distance-to-goal when an episode ends.""" + if not env_ids.any(): + return + try: + ids = env_ids.nonzero(as_tuple=False).flatten().tolist() + root_pos = 
env.scene["robot"].data.root_pos_w.cpu().numpy() + goal_pos = env.extras.get("current_goal_pos") + ep_ids = env.extras.get("current_episode_id") + for i in ids: + pos = root_pos[i] + goal_str = "" + if goal_pos is not None: + g = np.asarray(goal_pos[i]) + dist = float(np.linalg.norm(pos - g)) + goal_str = f" goal=[{g[0]:.1f},{g[1]:.1f},{g[2]:.2f}] dist={dist:.2f}" + ep_str = f" ep={ep_ids[i]}" if ep_ids is not None else "" + print(f"[VlnNavTask] EPISODE END ({reason}) env={i}{ep_str} robot=[{pos[0]:.1f},{pos[1]:.1f},{pos[2]:.2f}]{goal_str}") + except Exception: + pass + + +def vln_stop_term(env) -> torch.Tensor: + """Termination term: True when the VLM policy signals STOP. + + The policy sets ``env.extras["vln_stop_called"]`` (a bool tensor of + shape ``[N]``) when the VLM outputs a stop command (zero velocity + and zero duration). + """ + flag = env.extras.get("vln_stop_called") + if flag is None: + return torch.zeros(env.num_envs, dtype=torch.bool, device=env.device) + result = torch.as_tensor(flag, device=env.device, dtype=torch.bool) + if result.any(): + _log_episode_end(env, "STOP", result) + return result + + +def vln_stuck_term(env, velocity_threshold: float = 0.1, patience: int = 1000) -> torch.Tensor: + """Termination term: True when the robot has been stuck for too long. + + A robot is considered stuck when its linear velocity magnitude stays + below ``velocity_threshold`` for ``patience`` consecutive steps. + Per-env counters are stored in ``env.extras["_vln_stuck_counter"]``. 
+ """ + num_envs = env.num_envs + device = env.device + + root_vel = env.scene["robot"].data.root_vel_w[:num_envs, :3] # [N, 3] + speed = torch.norm(root_vel, dim=-1) # [N] + + if "_vln_stuck_counter" not in env.extras: + env.extras["_vln_stuck_counter"] = torch.zeros(num_envs, dtype=torch.long, device=device) + + counter = env.extras["_vln_stuck_counter"] + is_slow = speed < velocity_threshold + counter = torch.where(is_slow, counter + 1, torch.zeros_like(counter)) + env.extras["_vln_stuck_counter"] = counter + + stuck_mask = counter >= patience + if stuck_mask.any(): + _log_episode_end(env, f"STUCK({patience}steps)", stuck_mask) + return stuck_mask + + +def vln_success_term(env, position_tolerance: float = 0.3) -> torch.Tensor: + """Termination term: True when the robot is within *position_tolerance* of the goal. + + Supports multi-env: ``env.extras["current_goal_pos"]`` should have + shape ``[N, 3]`` where ``N = num_envs``. + """ + num_envs = env.num_envs + root_state = env.scene["robot"].data.root_state_w # [N, 13] + goal_pos_np = env.extras.get("current_goal_pos") + if goal_pos_np is None: + return torch.zeros(num_envs, dtype=torch.bool, device=root_state.device) + + goal_pos = torch.as_tensor(goal_pos_np, device=root_state.device, dtype=root_state.dtype) + # Ensure shape [N, 3] + if goal_pos.ndim == 1: + goal_pos = goal_pos.unsqueeze(0).expand(num_envs, -1) + elif goal_pos.shape[0] == 1 and num_envs > 1: + goal_pos = goal_pos.expand(num_envs, -1) + + robot_pos = root_state[..., :3] + dist = torch.linalg.norm(robot_pos - goal_pos, dim=-1) + result = dist < position_tolerance + if result.any(): + _log_episode_end(env, f"SUCCESS(d<{position_tolerance}m)", result) + return result + + +def vln_disabled_success_term(env, position_tolerance: float = 0.0) -> torch.Tensor: + """Disabled proximity-success termination. 
+ + When following the embodied-navigation evaluation protocol from + Anderson et al., episodes should only be deemed successful if the + agent explicitly emits STOP/DONE. We therefore keep the termination + slot but disable auto-success by default. + """ + _ = position_tolerance + return torch.zeros(env.num_envs, dtype=torch.bool, device=env.device) + + +def vln_time_out_term(env) -> torch.Tensor: + """Wraps standard time_out to add EPISODE END logging.""" + result = mdp_isaac_lab.time_out(env) + if result.any(): + _log_episode_end(env, "TIME_OUT", result) + return result + + +@configclass +class VlnTerminationsCfg: + """Termination terms for the VLN task.""" + + time_out: TerminationTermCfg = TerminationTermCfg(func=vln_time_out_term, time_out=True) + success: TerminationTermCfg = MISSING + stop: TerminationTermCfg = TerminationTermCfg(func=vln_stop_term) + stuck: TerminationTermCfg = MISSING + + +# ========================================================================== # +# Event helpers (reset robot pose from dataset) # +# ========================================================================== # + + +def reset_robot_and_goal_from_vln_dataset( + env, + env_ids: torch.Tensor, + robot_cfg: SceneEntityCfg, + vln_cfg: VlnBenchmarkCfg, +): + """Event callback executed on environment reset. + + Supports multi-env: only the environments in ``env_ids`` are reset. + Each reset env gets a new episode sampled sequentially from the dataset. + Episode metadata is stored per-env in ``env.extras``. 
+ + Storage layout in ``env.extras``: + - ``current_goal_pos``: np.ndarray shape ``[N, 3]`` + - ``current_instruction``: list[str] length ``N`` + - ``current_reference_path``: list[np.ndarray] length ``N`` + - ``current_scene_id``: list[str] length ``N`` + - ``current_episode_id``: list[int] length ``N`` + """ + num_envs = env.num_envs + env_ids_list = env_ids.cpu().tolist() + num_reset = len(env_ids_list) + + # Initialize per-env storage on first call + if "current_goal_pos" not in env.extras: + env.extras["current_goal_pos"] = np.zeros((num_envs, 3), dtype=np.float32) + env.extras["current_instruction"] = [""] * num_envs + env.extras["current_reference_path"] = [np.zeros((1, 3), dtype=np.float32)] * num_envs + env.extras["current_scene_id"] = [""] * num_envs + env.extras["current_episode_id"] = [0] * num_envs + env.extras["vln_stop_called"] = torch.zeros(num_envs, dtype=torch.bool, device=env.device) + env.extras["_vln_stuck_counter"] = torch.zeros(num_envs, dtype=torch.long, device=env.device) + + # Clear stop/stuck flags for reset envs + if "vln_stop_called" in env.extras: + env.extras["vln_stop_called"][env_ids] = False + if "_vln_stuck_counter" in env.extras: + env.extras["_vln_stuck_counter"][env_ids] = 0 + + # Sample one episode per reset env + episodes = vln_cfg.sample_episodes(num_reset) + + # Build batched start poses for all reset envs + start_poses = torch.zeros(num_reset, 7, dtype=torch.float32, device=env.device) + for i, ep in enumerate(episodes): + start_poses[i, :3] = torch.tensor(ep.start_pos) + start_poses[i, 2] += vln_cfg.robot_root_height_offset + start_poses[i, 3:7] = torch.tensor(ep.start_quat_wxyz) + + # Teleport all reset robots and zero velocities in one batched call + robot = env.scene[robot_cfg.name] + robot.write_root_pose_to_sim(start_poses, env_ids=env_ids) + zero_vel = torch.zeros(num_reset, 6, dtype=torch.float32, device=env.device) + robot.write_root_velocity_to_sim(zero_vel, env_ids=env_ids) + + # Update per-env metadata + for i, 
env_id in enumerate(env_ids_list): + ep = episodes[i] + env.extras["current_goal_pos"][env_id] = np.array(ep.goal_pos, dtype=np.float32) + env.extras["current_instruction"][env_id] = ep.instruction_text + env.extras["current_reference_path"][env_id] = np.array(ep.reference_path, dtype=np.float32) + env.extras["current_scene_id"][env_id] = ep.scene_id + env.extras["current_episode_id"][env_id] = ep.episode_id + print( + f"[VlnNavTask] NEW EPISODE env={env_id} ep={ep.episode_id} scene={ep.scene_id}" + f" start=[{ep.start_pos[0]:.1f},{ep.start_pos[1]:.1f},{ep.start_pos[2]:.2f}]" + f" root_z_offset={vln_cfg.robot_root_height_offset:.2f}" + f" goal=[{ep.goal_pos[0]:.1f},{ep.goal_pos[1]:.1f},{ep.goal_pos[2]:.2f}]" + f" instruction={ep.instruction_text[:80]}..." + ) + + +@configclass +class VlnEventsCfg: + """Events configuration for the VLN task.""" + + reset_robot_and_goal: EventTermCfg = MISSING + + +# ========================================================================== # +# VLN Navigation Task # +# ========================================================================== # + + +class VlnR2rMatterportTask(TaskBase): + """VLN task using R2R (Room-to-Room) episodes in Matterport scenes. + + This task: + - Reads episodes from a gzipped VLN-CE R2R dataset. + - Provides termination (time-out, goal-reached, VLM stop, stuck). + - Registers VLN metrics (Oracle Success, SPL, Success, PathLength, DistanceToGoal). + - On reset, teleports the robot to the episode start pose and stores + the instruction / goal in ``env.extras``. + + For other VLN datasets (RxR, REVERIE) or scene types (Gibson), + create a new task class following this pattern. 
+ """ + + def __init__( + self, + robot: Asset, + r2r_dataset_path: str, + episode_indices: Optional[Sequence[int]] = None, + episode_length_s: float = 60.0, + success_radius: float = 3.0, + scene_filter: Optional[str] = None, + robot_root_height_offset: float = 0.0, + require_stop_for_success: bool = True, + ): + """ + Args: + robot: The robot embodiment asset. + r2r_dataset_path: Path to the gzipped R2R dataset JSON file. + episode_indices: Optional subset of episode indices to evaluate. + episode_length_s: Maximum episode duration in seconds. + success_radius: Fallback distance threshold (meters) for goal + success when the dataset episode does not define a goal radius. + scene_filter: If set, only load episodes whose scene_id contains + this string (e.g. ``"zsNo4HB9uLZ"``). This is critical + because the dataset contains episodes from many scenes but + only one scene USD is loaded at a time. + robot_root_height_offset: Added to dataset start_position.z when + teleporting the robot root on reset. Useful when dataset + positions are floor-level but the robot root is pelvis-level. + require_stop_for_success: If True, only STOP/DONE can end a + successful episode. If False, the benchmark falls back to + proximity-based success termination. + """ + super().__init__( + episode_length_s=episode_length_s, + task_description="Navigate to the target location following the instruction.", + ) + self.robot = robot + self.success_radius = success_radius + self.require_stop_for_success = require_stop_for_success + + # Load episodes: filter by scene FIRST, then slice by index range. + # This ensures --episode_start/end refer to indices within the + # filtered (scene-specific) episode list, not the global dataset. 
+ raw_episodes = read_episodes(r2r_dataset_path) + + if scene_filter is not None: + before = len(raw_episodes) + raw_episodes = [ep for ep in raw_episodes if scene_filter in ep.get("scene_id", "")] + print( + f"[VlnR2rMatterportTask] Scene filter '{scene_filter}': {before} -> {len(raw_episodes)} episodes" + ) + if not raw_episodes: + raise ValueError( + f"No episodes match scene_filter='{scene_filter}'. " + f"Check that --usd_path matches the dataset scenes." + ) + + if episode_indices is not None: + raw_episodes = [raw_episodes[i] for i in episode_indices if i < len(raw_episodes)] + print(f"[VlnR2rMatterportTask] Episode indices: {len(raw_episodes)} episodes selected") + + vln_episodes = [build_vln_episode_from_raw(ep) for ep in raw_episodes] + self.vln_benchmark_cfg = VlnBenchmarkCfg( + episodes=vln_episodes, + robot_root_height_offset=robot_root_height_offset, + ) + + # Pre-compute data for metrics + self._gt_waypoints_per_episode: list[list[list[float]]] = [ + [list(p) for p in ep.reference_path] for ep in vln_episodes + ] + self._success_radius_per_episode: list[float] = [ + float(ep.goal_radius) if ep.goal_radius > 0.0 else success_radius + for ep in vln_episodes + ] + self._shortest_path_distance_per_episode: list[float] | None = None + if all(ep.geodesic_distance > 0.0 for ep in vln_episodes): + self._shortest_path_distance_per_episode = [ + float(ep.geodesic_distance) for ep in vln_episodes + ] + + # ------------------------------------------------------------------ # + # TaskBase interface # + # ------------------------------------------------------------------ # + + def get_scene_cfg(self): + """VLN scene assets (Matterport background) are added by the Environment, not the Task.""" + return None + + def get_termination_cfg(self) -> VlnTerminationsCfg: + success_term_func = vln_disabled_success_term if self.require_stop_for_success else vln_success_term + return VlnTerminationsCfg( + success=TerminationTermCfg( + func=success_term_func, + 
params={"position_tolerance": self.success_radius}, + ), + stuck=TerminationTermCfg( + func=vln_stuck_term, + params={"velocity_threshold": 0.1, "patience": 1000}, + ), + ) + + def get_events_cfg(self) -> VlnEventsCfg: + cfg = VlnEventsCfg( + reset_robot_and_goal=EventTermCfg( + func=reset_robot_and_goal_from_vln_dataset, + mode="reset", + params={ + "robot_cfg": SceneEntityCfg("robot"), + "vln_cfg": self.vln_benchmark_cfg, + }, + ), + ) + return cfg + + def get_mimic_env_cfg(self, arm_mode): + # VLN does not use mimic / imitation learning datagen + return None + + def get_metrics(self) -> list[MetricBase]: + return [ + PathLengthMetric(), + DistanceToGoalMetric(self._gt_waypoints_per_episode), + OracleSuccessMetric( + self._gt_waypoints_per_episode, + self._success_radius_per_episode, + ), + SuccessMetric( + self._gt_waypoints_per_episode, + self._success_radius_per_episode, + require_stop_signal=self.require_stop_for_success, + ), + SPLMetric( + self._gt_waypoints_per_episode, + self._success_radius_per_episode, + shortest_path_distance_per_episode=self._shortest_path_distance_per_episode, + require_stop_signal=self.require_stop_for_success, + ), + ] + + def get_task_description(self) -> str | None: + """Return a static task description for the VLN benchmark. + + This returns a generic description set at init time. It does NOT + return the per-episode navigation instruction, because: + + 1. Arena's ``policy_runner`` calls ``get_task_description()`` once + at startup via ``task.get_task_description()``, before any + episode is loaded. + 2. VLN instructions change every episode and are per-env. + + Per-episode instructions are stored in + ``env.extras["current_instruction"]`` (a list of length ``num_envs``) + and are automatically picked up by ``VlnPolicy`` on each step via + ``_check_instruction_update()``. + """ + return self.task_description + + def get_viewer_cfg(self) -> ViewerCfg: + # Third-person camera behind and above the robot, looking slightly ahead. 
+ # Kept close to avoid clipping through Matterport walls. + return ViewerCfg( + eye=(-1.5, 0.0, 1.5), + lookat=(1.0, 0.0, 0.8), + origin_type="asset_root", + asset_name="robot", + env_index=0, + ) + + def modify_env_cfg(self, env_cfg: IsaacLabArenaManagerBasedRLEnvCfg) -> IsaacLabArenaManagerBasedRLEnvCfg: + return env_cfg diff --git a/isaaclab_arena/tests/test_vln_metrics_protocol.py b/isaaclab_arena/tests/test_vln_metrics_protocol.py new file mode 100644 index 000000000..4208f6094 --- /dev/null +++ b/isaaclab_arena/tests/test_vln_metrics_protocol.py @@ -0,0 +1,260 @@ +import sys +import types + +import numpy as np +import torch + + +def _install_lightweight_stubs() -> None: + """Install minimal module stubs so pure-logic tests can import VLN code.""" + + def _identity_configclass(cls): + return cls + + class _RecorderTerm: + def __init__(self, cfg=None, env=None): + self.cfg = cfg + self._env = env + + class _RecorderTermCfg: + class_type = None + + class _ManagerBasedEnv: + pass + + class _DummyClientSidePolicy: + def __init__(self, *args, **kwargs): + pass + + @staticmethod + def add_remote_args_to_parser(parser): + return parser + + class _DummyProtocol: + default_duration = 1.0 + + class _DummyRemotePolicyConfig: + pass + + class _DummyRslRlVecEnvWrapper: + pass + + class _DummyOnPolicyRunner: + def __init__(self, *args, **kwargs): + pass + + sys.modules.setdefault("isaaclab", types.ModuleType("isaaclab")) + sys.modules.setdefault("isaaclab.envs", types.ModuleType("isaaclab.envs")) + sys.modules.setdefault("isaaclab.managers", types.ModuleType("isaaclab.managers")) + sys.modules.setdefault("isaaclab.utils", types.ModuleType("isaaclab.utils")) + sys.modules.setdefault("isaaclab.utils.io", types.ModuleType("isaaclab.utils.io")) + sys.modules.setdefault("isaaclab_rl", types.ModuleType("isaaclab_rl")) + sys.modules.setdefault("isaaclab_rl.rsl_rl", types.ModuleType("isaaclab_rl.rsl_rl")) + sys.modules.setdefault("rsl_rl", types.ModuleType("rsl_rl")) + 
sys.modules.setdefault("rsl_rl.runners", types.ModuleType("rsl_rl.runners")) + + env_mod = types.ModuleType("isaaclab.envs.manager_based_rl_env") + env_mod.ManagerBasedEnv = _ManagerBasedEnv + sys.modules["isaaclab.envs.manager_based_rl_env"] = env_mod + + recorder_mod = types.ModuleType("isaaclab.managers.recorder_manager") + recorder_mod.RecorderTerm = _RecorderTerm + recorder_mod.RecorderTermCfg = _RecorderTermCfg + sys.modules["isaaclab.managers.recorder_manager"] = recorder_mod + + utils_mod = sys.modules["isaaclab.utils"] + utils_mod.configclass = _identity_configclass + + io_mod = sys.modules["isaaclab.utils.io"] + io_mod.load_yaml = lambda _path: {"device": "cpu"} + + rsl_wrapper_mod = sys.modules["isaaclab_rl.rsl_rl"] + rsl_wrapper_mod.RslRlVecEnvWrapper = _DummyRslRlVecEnvWrapper + + runners_mod = sys.modules["rsl_rl.runners"] + runners_mod.OnPolicyRunner = _DummyOnPolicyRunner + + client_policy_mod = types.ModuleType("isaaclab_arena.policy.client_side_policy") + client_policy_mod.ClientSidePolicy = _DummyClientSidePolicy + sys.modules["isaaclab_arena.policy.client_side_policy"] = client_policy_mod + + protocol_mod = types.ModuleType("isaaclab_arena.remote_policy.action_protocol") + protocol_mod.VlnVelocityActionProtocol = _DummyProtocol + sys.modules["isaaclab_arena.remote_policy.action_protocol"] = protocol_mod + + config_mod = types.ModuleType("isaaclab_arena.remote_policy.remote_policy_config") + config_mod.RemotePolicyConfig = _DummyRemotePolicyConfig + sys.modules["isaaclab_arena.remote_policy.remote_policy_config"] = config_mod + + +_install_lightweight_stubs() + +from isaaclab_arena.metrics.vln_metrics import OracleSuccessMetric, PathLengthMetric, SPLMetric, SuccessMetric +from isaaclab_arena.policy.vln.vln_vlm_locomotion_policy import VlnVlmLocomotionPolicy + + +def test_success_and_spl_require_stop_signal(): + gt_waypoints = [[[0.0, 0.0, 0.0], [1.0, 0.0, 0.0]]] + success_radius = [0.5] + + # Episode reaches the goal region but never emits STOP. 
+ no_stop_episode = [ + np.array( + [ + [0.0, 0.0, 0.0, 0.0], + [0.8, 0.0, 0.0, 0.0], + [1.1, 0.0, 0.0, 0.0], + ], + dtype=np.float32, + ) + ] + success_metric = SuccessMetric(gt_waypoints, success_radius, require_stop_signal=True) + spl_metric = SPLMetric(gt_waypoints, success_radius, require_stop_signal=True) + assert success_metric.compute_metric_from_recording(no_stop_episode) == 0.0 + assert spl_metric.compute_metric_from_recording(no_stop_episode) == 0.0 + + # Episode emits STOP near the goal and should count as successful. + stop_success_episode = [ + np.array( + [ + [0.0, 0.0, 0.0, 0.0], + [0.5, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + ] + assert success_metric.compute_metric_from_recording(stop_success_episode) == 1.0 + + # Same shortest path (1m), but a longer actual path (3m) before STOP. + stop_long_path_episode = [ + np.array( + [ + [0.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [1.0, 1.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + ] + assert np.isclose(spl_metric.compute_metric_from_recording(stop_long_path_episode), 1.0 / 3.0) + + +def test_oracle_success_uses_best_trajectory_point_without_stop(): + gt_waypoints = [[[0.0, 0.0, 0.0], [2.0, 0.0, 0.0]]] + success_radius = [0.5] + oracle_metric = OracleSuccessMetric(gt_waypoints, success_radius) + + # The agent reaches the goal area once, then walks away and never emits STOP. + pass_then_leave_episode = [ + np.array( + [ + [0.0, 0.0, 0.0, 0.0], + [2.0, 0.0, 0.0, 0.0], + [4.0, 0.0, 0.0, 0.0], + ], + dtype=np.float32, + ) + ] + assert oracle_metric.compute_metric_from_recording(pass_then_leave_episode) == 1.0 + + # The agent never gets close enough to the goal region. 
+ miss_episode = [ + np.array( + [ + [0.0, 0.0, 0.0, 0.0], + [0.8, 0.0, 0.0, 0.0], + [1.2, 0.0, 0.0, 0.0], + ], + dtype=np.float32, + ) + ] + assert oracle_metric.compute_metric_from_recording(miss_episode) == 0.0 + + +def test_spl_prefers_dataset_geodesic_distance_when_provided(): + gt_waypoints = [[[0.0, 0.0, 0.0], [3.0, 0.0, 0.0]]] + success_radius = [1.1] + spl_metric = SPLMetric( + gt_waypoints, + success_radius, + shortest_path_distance_per_episode=[2.0], + require_stop_signal=True, + ) + + stop_success_episode = [ + np.array( + [ + [0.0, 0.0, 0.0, 0.0], + [2.0, 0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + ] + assert np.isclose(spl_metric.compute_metric_from_recording(stop_success_episode), 1.0) + + +def test_path_length_ignores_stop_flag_column(): + metric = PathLengthMetric() + episode = [ + np.array( + [ + [0.0, 0.0, 0.0, 0.0], + [3.0, 4.0, 0.0, 0.0], + [6.0, 8.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + ] + assert metric.compute_metric_from_recording(episode) == 10.0 + + +def test_low_level_warmup_does_not_step_environment(monkeypatch): + class DummyVecEnv: + def __init__(self): + self.unwrapped = types.SimpleNamespace(device="cpu") + self.step_calls = 0 + + def get_observations(self): + return {"policy": torch.zeros((1, 69), dtype=torch.float32)} + + def step(self, _actions): + self.step_calls += 1 + raise AssertionError("Warmup should not step the environment") + + class DummyRunner: + def __init__(self, vec_env, _cfg, log_dir=None, device="cpu"): + self._vec_env = vec_env + + def load(self, _checkpoint_path): + return None + + def get_inference_policy(self, device="cpu"): + def _policy(obs_td): + return torch.zeros((1, 19), dtype=torch.float32) + + return _policy + + monkeypatch.setattr("isaaclab_arena.policy.vln.vln_vlm_locomotion_policy.load_yaml", lambda _path: {"device": "cpu"}) + monkeypatch.setattr("isaaclab_arena.policy.vln.vln_vlm_locomotion_policy.OnPolicyRunner", DummyRunner) + 
monkeypatch.setattr("isaaclab_arena.policy.vln.vln_vlm_locomotion_policy.RslRlVecEnvWrapper", DummyVecEnv) + + dummy_vec_env = DummyVecEnv() + dummy_policy = types.SimpleNamespace( + _ll_agent_cfg="unused.yaml", + _ll_checkpoint_path="unused.pt", + _device="cpu", + action_dim=3, + _vel_cmd_indices=(9, 12), + _warmup_steps=5, + _ll_vec_env=None, + _ll_policy=None, + _ll_obs_td=None, + ) + + VlnVlmLocomotionPolicy._load_low_level_policy(dummy_policy, dummy_vec_env) + + assert dummy_vec_env.step_calls == 0 + assert dummy_policy._ll_vec_env is dummy_vec_env + assert dummy_policy._ll_policy is not None diff --git a/isaaclab_arena/utils/configclass.py b/isaaclab_arena/utils/configclass.py index 28c69835d..427912e85 100644 --- a/isaaclab_arena/utils/configclass.py +++ b/isaaclab_arena/utils/configclass.py @@ -127,7 +127,7 @@ def get_field_info(field: dataclasses.Field) -> tuple[str, type, Any]: if field.default is not dataclasses.MISSING: field_info += (field.default,) elif field.default_factory is not dataclasses.MISSING: - field_info += (field.default_factory,) + field_info += (dataclasses.field(default_factory=field.default_factory),) return field_info diff --git a/isaaclab_arena_environments/cli.py b/isaaclab_arena_environments/cli.py index 81df53cd8..7b2b58e20 100644 --- a/isaaclab_arena_environments/cli.py +++ b/isaaclab_arena_environments/cli.py @@ -22,6 +22,7 @@ from isaaclab_arena_environments.lift_object_environment import LiftObjectEnvironment from isaaclab_arena_environments.press_button_environment import PressButtonEnvironment from isaaclab_arena_environments.tabletop_place_upright_environment import TableTopPlaceUprightEnvironment +from isaaclab_arena_environments.vln_environment import VLNBenchmarkEnvironment # NOTE(alexmillane, 2025.09.04): There is an issue with type annotation in this file. 
# We cannot annotate types which require the simulation app to be started in order to @@ -44,6 +45,7 @@ LiftObjectEnvironment.name: LiftObjectEnvironment, TableTopPlaceUprightEnvironment.name: TableTopPlaceUprightEnvironment, Gr1TurnStandMixerKnobEnvironment.name: Gr1TurnStandMixerKnobEnvironment, + VLNBenchmarkEnvironment.name: VLNBenchmarkEnvironment, } diff --git a/isaaclab_arena_environments/vln_environment.py b/isaaclab_arena_environments/vln_environment.py new file mode 100644 index 000000000..51c2d3c55 --- /dev/null +++ b/isaaclab_arena_environments/vln_environment.py @@ -0,0 +1,392 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""VLN benchmark environment builder. + +This module defines the ``VLNBenchmarkEnvironment`` that integrates: + - A Matterport 3D background scene. + - The H1 humanoid embodiment configured for VLN. + - The VLN navigation task with R2R episode management. + +It follows the ``ExampleEnvironmentBase`` pattern used by all IsaacLab Arena +environments, so it plugs into the CLI and ``ArenaEnvBuilder`` seamlessly. + +Usage (CLI):: + + python -m isaaclab_arena.evaluation.policy_runner \\ + --policy_type isaaclab_arena.policy.vln.vln_vlm_locomotion_policy.VlnVlmLocomotionPolicy \\ + --remote_host localhost --remote_port 5555 \\ + --num_episodes 10 \\ + VLN_Benchmark \\ + --usd_path /path/to/matterport.usd \\ + --r2r_dataset_path /path/to/vln_ce_isaac_v1.json.gz +""" + +from __future__ import annotations + +import argparse +import os + +from isaaclab_arena_environments.example_environment_base import ExampleEnvironmentBase + + +def _patch_usd_resolved_path(): + """Monkey-patch omni.usd functions that fail with ``Ar.ResolvedPath``. 
+ + Isaac Sim 5.1.0's ``is_usd_crate_file`` and + ``is_usd_crate_file_version_supported`` internally call + ``Sdf.FileFormat.GetFileExtension`` and ``Usd.CrateInfo.Open`` with + ``Ar.ResolvedPath`` objects, but those C++ APIs only accept ``str``. + Wrapping the outer function does not help because the ``ResolvedPath`` + is created as a *local variable* inside the function body. + + The fix replaces both functions with clean re-implementations that + convert every path to ``str`` before passing to C++ APIs. + """ + try: + import omni.usd + import omni.usd._impl.utils as _usd_utils + from pxr import Ar, Sdf, Usd + + def _fixed_is_usd_crate_file(filepath): + ext = Sdf.FileFormat.GetFileExtension(str(filepath)) + return ext in ("usdc", "usd") + + def _fixed_is_usd_crate_file_version_supported(filepath): + filepath = str(filepath) + if not _fixed_is_usd_crate_file(filepath): + return True + try: + resolved = Ar.GetResolver().Resolve(filepath) + resolved_str = str(resolved) if resolved else filepath + if not resolved_str: + return False + crate_info = Usd.CrateInfo.Open(resolved_str) + return crate_info is not None + except Exception: + return True + + _usd_utils.is_usd_crate_file = _fixed_is_usd_crate_file + _usd_utils.is_usd_crate_file_version_supported = _fixed_is_usd_crate_file_version_supported + omni.usd.is_usd_crate_file = _fixed_is_usd_crate_file + omni.usd.is_usd_crate_file_version_supported = _fixed_is_usd_crate_file_version_supported + print("[VLN] Patched omni.usd ResolvedPath compatibility (full replacement).") + except Exception as e: + print(f"[VLN] Warning: could not patch omni.usd: {e}") + + +class VLNBenchmarkEnvironment(ExampleEnvironmentBase): + """IsaacLab Arena environment for VLN benchmarking.""" + + name: str = "h1_vln_matterport" + + def get_env(self, args_cli: argparse.Namespace): + """Build and return the VLN environment. + + Multi-env note: + When ``num_envs > 1``, each env gets a full copy of the Matterport + scene. 
The ``env_spacing`` should be large enough that scenes don't + overlap (Matterport houses are typically 20-50m wide). The default + is overridden to 100m if the user hasn't set it explicitly. + """ + # Currently only num_envs=1 is supported. Multi-env requires + # per-env instruction tracking in the VLM server and dynamic scene + # switching, which are not yet implemented. + num_envs = getattr(args_cli, "num_envs", 1) + if num_envs != 1: + raise ValueError( + f"VLN benchmark currently only supports num_envs=1 (got {num_envs}). " + f"Multi-env support requires per-env VLM instruction tracking " + f"and is planned for a future release." + ) + + # Fix Isaac Sim 5.1.0 USD ResolvedPath compatibility issue + _patch_usd_resolved_path() + + # Matterport scenes are large — override default env_spacing if user + # hasn't set a custom value (the Arena default is 30m, too small). + if not hasattr(args_cli, "_env_spacing_set_by_user"): + if getattr(args_cli, "num_envs", 1) > 1 and args_cli.env_spacing < 100.0: + print( + f"[VLN] Overriding env_spacing from {args_cli.env_spacing}m to 100m " + f"for Matterport scenes (num_envs={args_cli.num_envs})." 
+ ) + args_cli.env_spacing = 100.0 + + # Delayed imports — require simulation app to be running + import isaaclab.sim as sim_utils + from isaaclab_arena.environments.isaaclab_arena_environment import IsaacLabArenaEnvironment + from isaaclab_arena.scene.scene import Scene + + from isaaclab_arena.assets.matterport_background import MatterportBackground + from isaaclab_arena.embodiments.h1.h1 import _DEFAULT_H1_CAMERA_OFFSET, _DEFAULT_H1_FOLLOW_CAMERA_OFFSET + from isaaclab_arena.embodiments.h1.h1_vln import H1VlnEmbodiment + from isaaclab_arena.tasks.vln_r2r_matterport_task import VlnR2rMatterportTask + from isaaclab_arena.utils.pose import Pose + + # 1) Background: Matterport 3D scene + ground_plane_z = None if getattr(args_cli, "disable_matterport_ground_plane", False) else 0.0 + background = MatterportBackground( + usd_path=args_cli.usd_path, + ground_plane_z=ground_plane_z, + use_global_prim=getattr(args_cli, "use_global_matterport_prim", False), + explicit_mesh_colliders=getattr(args_cli, "enable_matterport_child_mesh_colliders", False), + mesh_collider_approximation=getattr(args_cli, "matterport_mesh_collider_type", "triangle"), + collision_overlay_usd_path=getattr(args_cli, "matterport_collision_usd_path", None), + floor_collision_usd_path=getattr(args_cli, "matterport_floor_collision_usd_path", None), + obstacle_collision_usd_path=getattr(args_cli, "matterport_obstacle_collision_usd_path", None), + ) + + # 2) Embodiment: H1 humanoid with camera + use_tiled = getattr(args_cli, "use_tiled_camera", False) + enable_follow = not getattr(args_cli, "no_follow_camera", False) + cam_res = getattr(args_cli, "camera_resolution", 512) + head_camera_offset = _DEFAULT_H1_CAMERA_OFFSET + follow_camera_offset = _DEFAULT_H1_FOLLOW_CAMERA_OFFSET + if getattr(args_cli, "head_camera_offset_xyz", None) is not None: + head_camera_offset = Pose( + position_xyz=tuple(args_cli.head_camera_offset_xyz), + rotation_wxyz=_DEFAULT_H1_CAMERA_OFFSET.rotation_wxyz, + ) + if getattr(args_cli, 
"follow_camera_offset_xyz", None) is not None: + follow_camera_offset = Pose( + position_xyz=tuple(args_cli.follow_camera_offset_xyz), + rotation_wxyz=_DEFAULT_H1_FOLLOW_CAMERA_OFFSET.rotation_wxyz, + ) + if getattr(args_cli, "enable_height_scanner", False) and not getattr(args_cli, "use_global_matterport_prim", False): + raise ValueError("Height scanner currently requires --use_global_matterport_prim so it can raycast /World/matterport.") + embodiment = H1VlnEmbodiment( + enable_cameras=True, + camera_offset=head_camera_offset, + use_tiled_camera=use_tiled, + enable_follow_camera=enable_follow, + follow_camera_offset=follow_camera_offset, + camera_resolution=cam_res, + enable_head_depth=getattr(args_cli, "enable_head_camera_depth", False), + enable_height_scanner=getattr(args_cli, "enable_height_scanner", False), + height_scanner_debug_vis=getattr(args_cli, "height_scanner_debug_vis", False), + ) + + # 3) Task: VLN navigation with R2R episodes + # Extract scene_id from the USD path to filter episodes. + usd_scene_id = os.path.splitext(os.path.basename(args_cli.usd_path))[0] + + episode_indices = None + if hasattr(args_cli, "episode_start") and args_cli.episode_start is not None: + end = getattr(args_cli, "episode_end", args_cli.episode_start + 1) + episode_indices = list(range(args_cli.episode_start, end)) + + task = VlnR2rMatterportTask( + robot=embodiment, + r2r_dataset_path=args_cli.r2r_dataset_path, + episode_indices=episode_indices, + episode_length_s=getattr(args_cli, "episode_length_s", 60.0), + success_radius=getattr(args_cli, "success_radius", 3.0), + scene_filter=usd_scene_id, + robot_root_height_offset=getattr(args_cli, "robot_root_height_offset", 1.0), + require_stop_for_success=not getattr(args_cli, "allow_success_without_stop", False), + ) + + # 4) Scene: Matterport background + scene = Scene(assets=[background]) + + # 5) Simulation parameters callback + # These MUST match the low-level locomotion policy training config. 
+ # Default values come from NaVILA-Bench: + # h1_matterport_base_cfg.py -> H1MatterportBaseCfg.__post_init__() + sim_dt = getattr(args_cli, "sim_dt", 0.005) + decimation = getattr(args_cli, "sim_decimation", 4) + + def vln_sim_cfg_callback(env_cfg): + env_cfg.sim.dt = sim_dt # 200 Hz physics + env_cfg.decimation = decimation # 50 Hz control (200/4) + env_cfg.sim.render_interval = decimation + env_cfg.sim.disable_contact_processing = True + env_cfg.sim.physics_material = sim_utils.RigidBodyMaterialCfg( + static_friction=1.0, + dynamic_friction=1.0, + friction_combine_mode="max", + restitution_combine_mode="max", + ) + return env_cfg + + # 6) Compose the Arena environment + arena_env = IsaacLabArenaEnvironment( + name=self.name, + scene=scene, + embodiment=embodiment, + task=task, + env_cfg_callback=vln_sim_cfg_callback, + ) + return arena_env + + @staticmethod + def add_cli_args(parser: argparse.ArgumentParser) -> None: + """Add VLN-specific CLI arguments.""" + group = parser.add_argument_group("VLN Benchmark", "VLN benchmark environment arguments") + group.add_argument( + "--usd_path", + type=str, + required=True, + help="Path to the Matterport USD scene file.", + ) + group.add_argument( + "--r2r_dataset_path", + type=str, + required=True, + help="Path to the R2R VLN dataset (e.g. vln_ce_isaac_v1.json.gz).", + ) + group.add_argument( + "--episode_start", + type=int, + default=None, + help="Starting episode index (inclusive). If None, use all episodes.", + ) + group.add_argument( + "--episode_end", + type=int, + default=None, + help="Ending episode index (exclusive). 
Used with --episode_start.", + ) + group.add_argument( + "--episode_length_s", + type=float, + default=60.0, + help="Maximum episode duration in seconds (default: 60).", + ) + group.add_argument( + "--success_radius", + type=float, + default=3.0, + help="Distance threshold for goal success (default: 3.0m).", + ) + group.add_argument( + "--use_tiled_camera", + action="store_true", + default=False, + help="Use TiledCamera for parallel evaluation (default: False).", + ) + group.add_argument( + "--no_follow_camera", + action="store_true", + default=False, + help="Disable the third-person follow camera (default: enabled).", + ) + group.add_argument( + "--camera_resolution", + type=int, + default=512, + help="Camera resolution in pixels (default: 512, use 1024 for high-quality demo).", + ) + group.add_argument( + "--head_camera_offset_xyz", + type=float, + nargs=3, + default=None, + metavar=("X", "Y", "Z"), + help="Override head camera XYZ offset in pelvis frame.", + ) + group.add_argument( + "--follow_camera_offset_xyz", + type=float, + nargs=3, + default=None, + metavar=("X", "Y", "Z"), + help="Override follow camera XYZ offset in pelvis frame.", + ) + group.add_argument( + "--enable_head_camera_depth", + action="store_true", + default=False, + help="Expose head camera depth alongside RGB for debugging; not sent to the VLM by default.", + ) + group.add_argument( + "--enable_height_scanner", + action="store_true", + default=False, + help="Enable a NaVILA-Bench-style height scanner sensor for debugging or future experiments.", + ) + group.add_argument( + "--height_scanner_debug_vis", + action="store_true", + default=False, + help="Visualize height-scanner rays when the height scanner is enabled.", + ) + group.add_argument( + "--disable_matterport_ground_plane", + action="store_true", + default=False, + help="Disable the fallback invisible ground plane and rely on Matterport mesh collision only.", + ) + group.add_argument( + "--use_global_matterport_prim", + 
action="store_true", + default=False, + help="Spawn Matterport at /World/matterport with collision_group=-1 for collision experiments.", + ) + group.add_argument( + "--enable_matterport_child_mesh_colliders", + action="store_true", + default=False, + help="Explicitly apply collision and mesh-collider schemas to descendant Matterport Mesh prims.", + ) + group.add_argument( + "--matterport_mesh_collider_type", + type=str, + default="triangle", + choices=("triangle", "sdf", "convex_decomposition"), + help="Approximation used when explicit Matterport child-mesh colliders are enabled.", + ) + group.add_argument( + "--matterport_collision_usd_path", + type=str, + default=None, + help="Legacy combined collision-only USD layered under the visual Matterport scene.", + ) + group.add_argument( + "--matterport_floor_collision_usd_path", + type=str, + default=None, + help="Optional floor-only collision USD layered under the visual Matterport scene.", + ) + group.add_argument( + "--matterport_obstacle_collision_usd_path", + type=str, + default=None, + help="Optional obstacle-only collision USD layered under the visual Matterport scene.", + ) + group.add_argument( + "--robot_root_height_offset", + type=float, + default=1.0, + help="Added to dataset start_position.z when resetting the robot root (default tuned for H1 pelvis root).", + ) + group.add_argument( + "--allow_success_without_stop", + action="store_true", + default=False, + help=( + "Legacy mode: allow proximity-only success without requiring VLM STOP/DONE. " + "By default the benchmark follows STOP-gated success." 
+ ), + ) + + # Simulation parameters — must match the low-level policy training config + sim_group = parser.add_argument_group( + "Simulation", "Physics simulation parameters (must match low-level policy training)" + ) + sim_group.add_argument( + "--sim_dt", + type=float, + default=0.005, + help="Physics simulation timestep in seconds (default: 0.005 = 200Hz).", + ) + sim_group.add_argument( + "--sim_decimation", + type=int, + default=4, + help="Number of physics steps per policy step (default: 4, giving 50Hz control).", + ) + diff --git a/isaaclab_arena_navila/README.md b/isaaclab_arena_navila/README.md new file mode 100644 index 000000000..df6582186 --- /dev/null +++ b/isaaclab_arena_navila/README.md @@ -0,0 +1,549 @@ +# NaVILA VLN Benchmark for IsaacLab Arena + +**Vision-Language Navigation (VLN)** benchmark using the [NaVILA](https://github.com/AnjieCheng/NaVILA) VLM on the H1 humanoid robot in Matterport 3D indoor scenes. + +## Architecture + +``` + ISAAC SIM CONTAINER NAVILA SERVER CONTAINER + ══════════════════ ═══════════════════════ + + ┌──────────────┐ ┌───────────────────┐ + │ Matterport │ │ VLN-CE R2R Data │ + │ Scene (.usd) │ │ (.json.gz) │ + └──────┬───────┘ └───────┬───────────┘ + │ │ + ▼ ▼ + ┌──────────────┐ ┌───────────────────┐ + │ H1 Robot │ │VlnR2rMatterport │ + │ + Head Cam │ │Task │ + │ + Follow Cam │ │ scene_filter │ + └──────┬───────┘ │ termination │ + │ │ metrics (SPL...) 
│ + │ └───────┬───────────┘ + │ │ + ▼ ▼ + ┌─────────────────────────────────┐ + │ VlnVlmLocomotionPolicy │ + │ │ + │ ┌─ High-level ──────────────┐ │ + │ │ Send 1 RGB frame ────────┼──┼──► ZeroMQ ──►┐ + │ │ Receive vel_cmd ◄────────┼──┼──◄ ZeroMQ ◄──┤ + │ └────────────┬──────────────┘ │ │ + │ │ │ │ + │ ┌────────────▼──────────────┐ │ │ + │ │ Low-level: RSL-RL │ │ │ + │ │ Inject vel_cmd → obs │ │ │ + │ │ NN forward → joint act │ │ │ + │ └────────────┬──────────────┘ │ │ + └───────────────┼─────────────────┘ │ + │ │ + ▼ │ + ┌─────────────────────────────┐ ┌─────────────▼────────────┐ + │ PhysX GPU Simulation │ │ NaVilaServerPolicy │ + │ joint act → robot moves │ │ │ + │ → new camera RGB │ │ 1. Receive 1 RGB frame │ + └─────────────────────────────┘ │ 2. Append to history │ + │ 3. Uniform sample 8 │ + │ from full history │ + │ 4. VLM inference │ + │ 5. Parse → vel_cmd │ + │ 6. Return vel_cmd + dur │ + └──────────────────────────┘ +``` + +## Quick Start + +### 1. Start the NaVILA VLM Server + +```bash +# Option A: Use the launch script (rebuilds Docker if needed) +bash docker/run_vln_server.sh -m ~/models/navila/navila-llama3-8b-8f --port 5555 + +# Option B: Manual start with code mounted (no rebuild) +docker run --rm -d --gpus all --net host \ + --name vln_policy_server_container \ + -v ~/models:/models \ + -v ~/IsaacLab-Arena/isaaclab_arena_navila:/workspace/isaaclab_arena_navila \ + -v ~/IsaacLab-Arena/isaaclab_arena/remote_policy:/workspace/isaaclab_arena/remote_policy \ + vln_policy_server:latest \ + --host 0.0.0.0 --port 5555 --timeout_ms 15000 \ + --policy_type isaaclab_arena_navila.navila_server_policy.NaVilaServerPolicy \ + --model_path /models/navila/navila-llama3-8b-8f +``` + +Wait for: `listening on tcp://0.0.0.0:5555` + +### 2. 
Run VLN Evaluation (Client) + +```bash +# Enter Isaac Sim container +bash docker/run_docker.sh + +# Inside the container: +/isaac-sim/python.sh -u -m isaaclab_arena.evaluation.policy_runner \ + --enable_cameras --num_envs 1 \ + --policy_type isaaclab_arena.policy.vln.vln_vlm_locomotion_policy.VlnVlmLocomotionPolicy \ + --remote_host localhost --remote_port 5555 \ + --ll_checkpoint_path isaaclab_arena/policy/vln/pretrained/h1_navila_locomotion.pt \ + --ll_agent_cfg isaaclab_arena/policy/vln/pretrained/h1_navila_agent.yaml \ + --num_episodes 10 \ + h1_vln_matterport \ + --usd_path /path/to/matterport_usd/zsNo4HB9uLZ/zsNo4HB9uLZ.usd \ + --r2r_dataset_path /path/to/vln_ce_isaac_v1.json.gz +``` + +Add `--headless` for batch evaluation without GUI. + +## Matterport Scene Collision + +The default VLN-CE-Isaac Matterport USD scenes are primarily **visual meshes**. +They render correctly, but they do not provide reliable PhysX collision by +default. The current benchmark therefore uses an **overlay collision** approach: + +- keep the original Matterport visual USD for rendering +- generate one or more collision-only USDA assets +- attach those collision USDA assets under `/World/matterport` +- let PhysX collide with the overlay mesh instead of trusting the visual mesh + +By default, if no collision overlay is provided, an invisible ground plane at +`z=0` provides basic support so the robot can stand and walk. + +### Why the default Matterport USD is not enough + +The common failure mode is that the visual USD is a poor proxy for stable +collision: + +1. raw scan / mesh -> visual USD +2. visual USD -> OBJ +3. OBJ -> collision USDA + +If geometric detail is already lost or distorted in step 1, the later collision +asset inherits that loss. 
In practice this leads to: + +- thin door-frame shells +- floating or near-ground fragments +- furniture-adjacent blue shells that do not match the visible structure +- floor noise that destabilizes humanoid locomotion + +For better collision fidelity, prefer starting from a **cleaner raw Matterport / +MP3D mesh** instead of relying only on the already-converted visual USD. + +### Collision overlay modes + +The runtime currently supports three collision-overlay styles: + +- **legacy combined overlay** + - one collision-only USDA under the visual scene +- **split overlay** + - `floor` collision for locomotion support + - `obstacle` collision for walls, cabinets, sofas, tables, and door frames +- **explicit child-mesh colliders** + - apply collision schemas directly to the visual USD descendant meshes + +The split `floor + obstacle` route is usually the most practical for debugging +and tuning. + +### Recommended asset layout + +For each scene, keep: + +- visual scene: + - `matterport_usd//.usd` +- floor collision: + - `collision/_floor_collision.usda` +- obstacle collision: + - `collision/_obstacle_collision.usda` +- optional combined collision: + - `collision/_combined_collision.usda` + +Put intermediate OBJ files and temporary logs under `claude_tmp/`. + +### Build workflow + +#### 1. Export raw mesh to OBJ + +If you have the original raw MP3D / Matterport mesh, export or convert it to +OBJ first: + +```bash +/isaac-sim/python.sh isaaclab_arena/scripts/assets/export_matterport_collision_proxy.py \ + --input_path /path/to/raw_mp3d_mesh.glb \ + --output_path /tmp/scene_raw.obj +``` + +If you do not have the raw mesh yet, you can still use the current visual USD as +a fallback, but collision quality will usually be worse. + +#### 2. 
Clean / simplify the OBJ externally + +Before building collision assets, the most useful cleanup is: + +- remove tiny floating fragments +- remove ceiling-only details that do not matter to locomotion +- smooth or rebuild the floor if it is noisy or broken +- simplify small decorative clutter +- keep large walls, cabinets, sofas, tables, and door frames + +Try to make the floor cleaner than the obstacle mesh. For humanoid locomotion, +floor quality usually matters more than visual fidelity. + +#### 3. Build split floor / obstacle USDA files + +Use the builder: + +```bash +python3 isaaclab_arena/scripts/assets/build_matterport_collision_layers.py \ + --input_obj /tmp/scene_clean.obj \ + --output_floor_usd /path/to/collision/scene_floor_collision.usda \ + --output_obstacle_usd /path/to/collision/scene_obstacle_collision.usda \ + --output_combined_usd /path/to/collision/scene_combined_collision.usda \ + --floor_min_z -0.25 \ + --floor_max_z 0.35 \ + --floor_normal_min_z 0.9 +``` + +This split is heuristic and scene-dependent. The key idea is: + +- floor-like triangles: low enough and upward-facing +- everything else: obstacle + +#### 4. Filter obstacle fragments + +After the initial split, a collision mesh is often still too noisy. 
The new +`filter_collision_usda.py` script removes connected components that look like: + +- tiny near-ground shards +- small floating fragments +- optionally, medium-sized suspended shells after a second pass + +Example first pass: + +```bash +python3 isaaclab_arena/scripts/assets/filter_collision_usda.py \ + --input_usd /path/to/collision/scene_obstacle_collision.usda \ + --output_usd /path/to/collision/scene_obstacle_collision_clean.usda \ + --ground_max_faces 20 \ + --ground_max_top_z 0.30 \ + --ground_max_diag 0.35 \ + --floating_max_faces 10 \ + --floating_min_bottom_z 0.35 \ + --floating_max_diag 0.25 +``` + +This stage is what removes the obvious “bad collision geometry” that makes the +robot snag, trip, or appear to hit empty space. + +### Runtime configurations + +#### A. Stable baseline + +Keep the invisible ground plane, add only obstacle collision: + +```bash +/isaac-sim/python.sh -u -m isaaclab_arena.evaluation.policy_runner \ + ... \ + h1_vln_matterport \ + --usd_path /path/to/scene.usd \ + --r2r_dataset_path /path/to/vln_ce_isaac_v1.json.gz \ + --use_global_matterport_prim \ + --matterport_obstacle_collision_usd_path /path/to/scene_obstacle_collision.usda +``` + +Use this first when debugging torso clipping, arm clipping, or corridor contact +without destabilizing the feet. + +#### B. Split full collision + +Disable the fallback ground plane and use both floor + obstacle collision: + +```bash +/isaac-sim/python.sh -u -m isaaclab_arena.evaluation.policy_runner \ + ... \ + h1_vln_matterport \ + --usd_path /path/to/scene.usd \ + --r2r_dataset_path /path/to/vln_ce_isaac_v1.json.gz \ + --use_global_matterport_prim \ + --disable_matterport_ground_plane \ + --matterport_floor_collision_usd_path /path/to/scene_floor_collision.usda \ + --matterport_obstacle_collision_usd_path /path/to/scene_obstacle_collision.usda +``` + +Use this after the floor mesh looks smooth enough to replace the fallback ground +plane. + +#### C. 
Legacy combined overlay + +This mode is still supported: + +```bash +/isaac-sim/python.sh -u -m isaaclab_arena.evaluation.policy_runner \ + ... \ + h1_vln_matterport \ + --usd_path /path/to/scene.usd \ + --r2r_dataset_path /path/to/vln_ce_isaac_v1.json.gz \ + --use_global_matterport_prim \ + --disable_matterport_ground_plane \ + --matterport_collision_usd_path /path/to/scene_combined_collision.usda +``` + +Do not mix the legacy combined overlay with the split `floor / obstacle` +overlays in the same run. + +### Visual inspection in Isaac Sim + +The recommended local inspection tool is: + +- `isaaclab_arena/scripts/assets/debug_matterport_collision_stage.py` + +Use it to export a debug stage and reopen it in Isaac Sim. When checking the +result, verify: + +1. Stage paths exist: + - `/World/matterport` + - `/World/matterport/collision` + - `/World/matterport/collisionFloor` + - `/World/matterport/collisionObstacle` + - `/World/GroundPlane` +2. Collision visualization is enabled: + - eye icon -> `Show by type` -> `Physics Mesh` -> `All` + - `Tools` -> `Physics API Editor` + - `Window` -> `Simulation` -> `Debug` +3. Geometry alignment looks reasonable: + - obstacle shells are not drifting away from walls or furniture + - floor meshes are not floating above or below the visible floor + - door frames are not dominated by noisy thin slivers + - the fallback ground plane does not fight a custom floor mesh + +### Validation ladder + +Use this order when debugging a new scene: + +1. Visual USD + ground plane only +2. Ground plane + obstacle collision +3. 
Floor + obstacle collision, no ground plane + +This isolates whether failures come from: + +- obstacle mesh quality +- floor mesh quality +- low-level locomotion mismatch + +### Common failure modes + +#### Robot clips through furniture + +Likely causes: + +- obstacle collision is too coarse +- only the ground plane is active +- upper-body posture is too wide for corridor / furniture clearance + +What to try: + +- enable obstacle-only collision first +- keep large furniture and wall boundaries in the obstacle mesh +- reduce decorative clutter that adds noisy contacts but does not help body clearance + +#### Robot gets stuck and stops moving + +Likely causes: + +- floor mesh is noisy or non-manifold +- floor overlay and ground plane both support the robot in conflicting ways +- `robot_root_height_offset` is wrong +- the low-level locomotion policy was not trained for contact-rich indoor mesh geometry + +What to try: + +- keep the ground plane and add only obstacle collision first +- rebuild a cleaner floor mesh +- tune `--robot_root_height_offset` +- compare against a ground-plane-only baseline + +#### Robot snags on door frames or thin geometry + +Likely causes: + +- obstacle collision is too detailed for the current locomotion controller +- the robot body is wider than the assumed corridor + +What to try: + +- simplify sharp mesh details near doors +- test `triangle` vs `convex_decomposition` +- compare with and without obstacle collision + +### Handoff checklist + +If another machine or another person needs to continue the work, keep these +unchanged: + +- the visual Matterport USD +- the VLN episode file such as `vln_ce_isaac_v1.json.gz` +- the existing Arena / NaVILA evaluation pipeline + +Replace or improve only: + +- the collision-only assets used by PhysX + +For the other machine, the most useful assets are: + +- `matterport_usd//.usd` +- `vln_ce_isaac_v1.json.gz` +- a cleaner raw MP3D / Matterport mesh +- generated collision outputs: + - 
`_floor_collision.usda` + - `_obstacle_collision.usda` + - optional `_combined_collision.usda` + +In this branch, the collision-related code and docs to carry forward are: + +- `isaaclab_arena/assets/matterport_background.py` +- `isaaclab_arena_environments/vln_environment.py` +- `isaaclab_arena/scripts/assets/export_matterport_collision_proxy.py` +- `isaaclab_arena/scripts/assets/build_matterport_collision_layers.py` +- `isaaclab_arena/scripts/assets/filter_collision_usda.py` +- `isaaclab_arena/scripts/assets/debug_matterport_collision_stage.py` +- `isaaclab_arena/scripts/assets/open_usd_stage.py` + +Collision-related CLI flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--use_global_matterport_prim` | off | Spawn Matterport at `/World/matterport` with `collision_group=-1`. Required for collision overlay and height scanner. | +| `--disable_matterport_ground_plane` | off | Remove the invisible z=0 ground plane. Use when a collision overlay provides floor support. | +| `--matterport_collision_usd_path` | None | Legacy combined collision-only USD layered under the visual scene. | +| `--matterport_floor_collision_usd_path` | None | Floor-only collision USD layered under the visual scene. | +| `--matterport_obstacle_collision_usd_path` | None | Obstacle-only collision USD layered under the visual scene. | +| `--enable_matterport_child_mesh_colliders` | off | Explicitly apply collision schemas to descendant Mesh prims of the visual USD. | +| `--matterport_mesh_collider_type` | `triangle` | Mesh collider approximation: `triangle`, `sdf`, or `convex_decomposition`. | +| `--robot_root_height_offset` | 1.0 | Added to dataset `start_position.z` when resetting the robot (H1 pelvis is ~1m above the floor). | + +## Training Your Own Low-Level Locomotion Policy + +The VLN pipeline uses a pre-trained RSL-RL locomotion policy that converts +velocity commands `[vx, vy, yaw_rate]` into joint-position actions. 
A default +checkpoint is included at `isaaclab_arena/policy/vln/pretrained/`. + +To train a replacement checkpoint with the built-in rough-terrain environment: + +```bash +docker run --rm --gpus all --net host --entrypoint /bin/bash \ + -v "$(pwd):/workspaces/isaaclab_arena" \ + -v "/home/$USER:/home/$USER" \ + isaaclab_arena:latest -lc " + cd /workspaces/isaaclab_arena && \ + /isaac-sim/python.sh -u isaaclab_arena/scripts/reinforcement_learning/train.py \ + --headless \ + --num_envs 4096 \ + --max_iterations 1000 \ + --save_interval 50 \ + --experiment_name h1_base_rough \ + --agent_cfg_path isaaclab_arena/policy/rl_policy/h1_base_rough_policy.json \ + h1_base_rough + " +``` + +Checkpoints are saved under `logs/rsl_rl/h1_base_rough//`. +To use a newly trained checkpoint for VLN evaluation, point to it with: + +```bash +--ll_checkpoint_path logs/rsl_rl/h1_base_rough//model_.pt +--ll_agent_cfg logs/rsl_rl/h1_base_rough//params/agent.yaml +``` + +The training environment preserves the same 69-dim proprioceptive layout as the +VLN evaluation, so the velocity-command injection at `obs[:, 9:12]` remains +compatible without code changes. + +## Sensor Configuration + +### Camera Positions (CLI-configurable) + +| Camera | Default Position (pelvis frame) | FOV | CLI Override | +|--------|--------------------------------|-----|-------------| +| Head camera | `(0.1, 0.0, 0.5)` | 54 deg | `--head_camera_offset_xyz X Y Z` | +| Follow camera | `(-1.0, 0.0, 0.57)` | 100 deg | `--follow_camera_offset_xyz X Y Z` | + +### Optional Sensors + +| Flag | Description | +|------|-------------| +| `--enable_head_camera_depth` | Add depth output to the head camera (not sent to VLM by default). | +| `--enable_height_scanner` | Enable a raycast-based height scanner on the pelvis. Requires `--use_global_matterport_prim`. | +| `--height_scanner_debug_vis` | Visualize the height-scanner rays in the viewport. | +| `--camera_resolution N` | Camera resolution in pixels (default: 512). 
| +| `--no_follow_camera` | Disable the third-person follow camera. | +| `--use_tiled_camera` | Use TiledCamera for parallel multi-env evaluation. | + +### STOP Diagnostic Flags + +| Flag | Description | +|------|-------------| +| `--ignore_vlm_stop` | Never let the VLM STOP command terminate an episode. Useful for diagnosing early-stop behaviour. | +| `--min_vlm_stop_distance D` | Ignore VLM STOP when distance-to-goal exceeds `D` meters. Negative value (default) disables. | +| `--debug_vln` | Enable verbose VLM query logging and debug frame saving. | +| `--save_interval N` | Save camera frames every N steps to `--save_dir`. | +| `--save_dir DIR` | Directory for saved camera frames. | + +## CLI Parameters + +### Client Parameters (before `h1_vln_matterport`) + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `--num_envs` | 1 | Must be 1 (multi-env not yet supported). | +| `--policy_type` | (required) | `isaaclab_arena.policy.vln.vln_vlm_locomotion_policy.VlnVlmLocomotionPolicy` | +| `--remote_host` | localhost | VLM server hostname. | +| `--remote_port` | 5555 | VLM server port. | +| `--ll_checkpoint_path` | (required) | RSL-RL locomotion checkpoint (.pt). | +| `--ll_agent_cfg` | (required) | RSL-RL agent config (.yaml). | +| `--warmup_steps` | 200 | Low-level policy warmup steps. | +| `--num_episodes` | (required) | Number of episodes to evaluate. | + +### Environment Parameters (after `h1_vln_matterport`) + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `--usd_path` | (required) | Path to Matterport USD scene file. | +| `--r2r_dataset_path` | (required) | Path to VLN-CE R2R dataset (.json.gz). | +| `--episode_start` | None | Start index within filtered episodes. | +| `--episode_end` | None | End index (exclusive) within filtered episodes. | +| `--episode_length_s` | 60 | Max episode duration in seconds. | +| `--success_radius` | 3.0 | Success distance threshold (meters). 
| + +### Server Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `--model_path` | (required) | Path to NaVILA / LLaVA model checkpoint. | +| `--port` | 5555 | Server listening port. | +| `--num_video_frames` | 8 | Number of frames sent to VLM per query. | +| `--conv_mode` | llama_3 | LLaVA conversation template. | +| `--history_padding_mode` | repeat_first | Early-frame padding: `repeat_first` or `black` (NaVILA-Bench style). | +| `--max_history_frames` | 200 | Maximum frames kept in server-side history. | +| `--max_new_tokens` | 80 | Maximum generated tokens per VLM query. | + +## Metrics + +| Metric | Description | +|--------|-------------| +| **Success** | Fraction of episodes where the agent emits `STOP` and the final distance-to-goal is below `success_radius`. | +| **SPL** | Mean over episodes of `S_i * l_i / max(p_i, l_i)`, where `S_i` is STOP-gated success, `l_i` is shortest-path distance, and `p_i` is actual path length. | +| **Distance-to-Goal** | Approximate geodesic distance from the final position to the goal in the XY plane, estimated from the reference path. | +| **Path Length** | Total distance traversed (XY only). | + +The benchmark follows a `DONE/STOP`-style evaluation protocol: without an explicit `STOP`, an episode is counted as unsuccessful even if the agent passes near the goal. + +## Known Limitations + +- **`num_envs` must be 1.** Multi-env requires per-env VLM instruction tracking. +- **Scene switching requires restart.** Matterport USD is loaded at initialization. +- **Some episode start positions are outside the building envelope.** The + Matterport scan may have incomplete or visually degraded geometry at these + locations, which can confuse the VLM and cause the robot to get stuck. + Consider filtering edge-case episodes or adjusting `--episode_start` / `--episode_end`. +- **Collision proxy is experimental.** The exported proxy mesh may not perfectly + match the original building geometry. 
Use the default ground plane for stable + baseline evaluation. diff --git a/isaaclab_arena_navila/__init__.py b/isaaclab_arena_navila/__init__.py new file mode 100644 index 000000000..2ec11e967 --- /dev/null +++ b/isaaclab_arena_navila/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""NaVILA VLM server policy for VLN navigation. + +This package provides the NaVILA-specific server-side policy that wraps +the NaVILA/LLaVA model for generating navigation velocity commands. +Similar in structure to ``isaaclab_arena_gr00t``. +""" + +from isaaclab_arena_navila.navila_server_policy import NaVilaServerPolicy # noqa: F401 diff --git a/isaaclab_arena_navila/navila_server_policy.py b/isaaclab_arena_navila/navila_server_policy.py new file mode 100644 index 000000000..edfd49987 --- /dev/null +++ b/isaaclab_arena_navila/navila_server_policy.py @@ -0,0 +1,358 @@ +# Copyright (c) 2025-2026, The Isaac Lab Arena Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""NaVILA server-side policy for VLN navigation. + +This policy runs on the **server** side (typically a GPU machine with the +NaVILA / LLaVA model loaded). It receives RGB image observations from the +client, runs VLM inference, parses the text output into a velocity command, +and returns it. + +To add a different VLM backend (e.g. GR00T), create a new server policy +in ``isaaclab_arena_navila/`` following this pattern. 
+
+Launch via the standard Arena remote policy server runner::
+
+    python -m isaaclab_arena.remote_policy.remote_policy_server_runner \\
+        --policy_type isaaclab_arena_navila.navila_server_policy.NaVilaServerPolicy \\
+        --model_path /path/to/navila/checkpoint \\
+        --port 5555
+"""
+
+from __future__ import annotations
+
+import argparse
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+import torch
+from PIL import Image
+
+from llava.constants import IMAGE_TOKEN_INDEX
+from llava.conversation import SeparatorStyle, conv_templates
+from llava.mm_utils import (
+    KeywordsStoppingCriteria,
+    get_model_name_from_path,
+    process_image,
+    tokenizer_image_token,
+)
+from llava.model.builder import load_pretrained_model
+
+from isaaclab_arena.remote_policy.action_protocol import ActionProtocol, VlnVelocityActionProtocol
+from isaaclab_arena.remote_policy.server_side_policy import ServerSidePolicy
+
+
+# ========================================================================== #
+# VLM command parsing                                                        #
+# ========================================================================== #
+
+
+def parse_vlm_output_to_velocity(text: str) -> tuple[np.ndarray, float]:
+    """Parse VLM text output into a velocity command and duration.
+
+    Recognized commands (case-insensitive, matched by substring):
+    - "stop"                    -> [0, 0, 0], duration 0.0
+    - "turn left [15|30|45]"    -> [0, 0, +pi/6], duration 0.5-1.5s
+    - "turn right [15|30|45]"   -> [0, 0, -pi/6], duration 0.5-1.5s
+    - "move forward [25|50|75]" -> [0.5, 0, 0], duration 0.5-1.5s
+
+    "stop" is checked FIRST: outputs such as "stop moving" or
+    "move forward and then stop" must terminate the episode rather than
+    be parsed as motion (the broad bare-"move" fallback below would
+    otherwise swallow them, breaking STOP-gated success).
+
+    Unrecognized text falls back to a short forward step.
+
+    Returns:
+        (vel_cmd, duration) where vel_cmd is [vx, vy, yaw_rate].
+    """
+    t = text.lower().strip()
+
+    # STOP must take priority over every motion keyword (see docstring).
+    if "stop" in t:
+        return np.array([0.0, 0.0, 0.0], dtype=np.float32), 0.0
+
+    if "turn left" in t:
+        vx, vy, wz = 0.0, 0.0, np.pi / 6.0
+        # Larger commanded angle -> hold the yaw rate longer.
+        if "45" in t:
+            dur = 1.5
+        elif "30" in t:
+            dur = 1.0
+        else:
+            dur = 0.5
+        return np.array([vx, vy, wz], dtype=np.float32), dur
+
+    if "turn right" in t:
+        vx, vy, wz = 0.0, 0.0, -np.pi / 6.0
+        if "45" in t:
+            dur = 1.5
+        elif "30" in t:
+            dur = 1.0
+        else:
+            dur = 0.5
+        return np.array([vx, vy, wz], dtype=np.float32), dur
+
+    if "move forward" in t or "move" in t:
+        vx, vy, wz = 0.5, 0.0, 0.0
+        # Larger commanded distance -> hold the forward velocity longer.
+        if "75" in t:
+            dur = 1.5
+        elif "50" in t:
+            dur = 1.0
+        elif "25" in t:
+            dur = 0.5
+        else:
+            dur = 0.5
+        return np.array([vx, vy, wz], dtype=np.float32), dur
+
+    # Unrecognized output: take a conservative short forward step.
+    return np.array([0.5, 0.0, 0.0], dtype=np.float32), 0.5
+
+
+# ========================================================================== #
+# Server-side policy                                                         #
+# ========================================================================== #
+
+
+@dataclass
+class NaVilaServerPolicyConfig:
+    """Configuration for :class:`NaVilaServerPolicy`."""
+
+    # Path to the NaVILA / LLaVA checkpoint directory.
+    model_path: str = ""
+    # Device for VLM inference.
+    device: str = "cuda"
+    # Number of frames sent to the VLM per query (history + current).
+    num_video_frames: int = 8
+    # LLaVA conversation template name.
+    conv_mode: str = "llama_3"
+    # Early-history padding: "repeat_first" or "black" ("black" matches NaVILA-Bench).
+    history_padding_mode: str = "repeat_first"
+    # Maximum number of RGB frames retained in the server-side episode history.
+    max_history_frames: int = 200
+    # Maximum tokens generated per VLM query.
+    max_new_tokens: int = 80
+
+
+class NaVilaServerPolicy(ServerSidePolicy):
+    """NaVILA VLM server policy for VLN navigation.
+
+    Wraps the NaVILA (LLaVA-based) model to generate navigation
+    velocity commands from RGB image observations. Maintains a
+    full episode image history and uses uniform sampling (matching
+    the NaVILA-Bench evaluation protocol) so the VLM can determine
+    task completion and output "stop".
+    """
+
+    config_class = NaVilaServerPolicyConfig
+
+    def __init__(self, config: NaVilaServerPolicyConfig) -> None:
+        """Validate the config and load the VLM checkpoint onto the device.
+
+        Raises:
+            ValueError: if ``history_padding_mode`` is unknown, or
+                ``max_history_frames`` / ``max_new_tokens`` are non-positive.
+        """
+        super().__init__(config)
+        # Updated per-episode via set_task_description().
+        self._instruction: str = "Navigate to the target."
+        # Full episode RGB history (bounded by _max_history_frames).
+        self._image_history: list[Image.Image] = []
+        self._num_video_frames: int = config.num_video_frames
+        self._device = config.device
+        self._conv_mode = config.conv_mode
+        self._model_path = config.model_path
+        self._history_padding_mode = config.history_padding_mode
+        self._max_history_frames = config.max_history_frames
+        self._max_new_tokens = config.max_new_tokens
+        # Fail fast on invalid configuration before paying the model-load cost.
+        if self._history_padding_mode not in {"repeat_first", "black"}:
+            raise ValueError(
+                "history_padding_mode must be one of: repeat_first, black. "
+                f"Got: {self._history_padding_mode!r}"
+            )
+        if self._max_history_frames <= 0:
+            raise ValueError(f"max_history_frames must be > 0. Got: {self._max_history_frames}")
+        if self._max_new_tokens <= 0:
+            raise ValueError(f"max_new_tokens must be > 0. Got: {self._max_new_tokens}")
+
+        print(f"[NaVilaServerPolicy] Loading VLM model from: {self._model_path}")
+        model_name = get_model_name_from_path(self._model_path)
+        tokenizer, model, image_processor, _ = load_pretrained_model(
+            self._model_path, model_name, None
+        )
+        self._tokenizer = tokenizer
+        self._model = model.to(self._device)
+        self._image_processor = image_processor
+        print("[NaVilaServerPolicy] VLM model loaded successfully.")
+
+    # ------------------------------------------------------------------ #
+    # Protocol                                                           #
+    # ------------------------------------------------------------------ #
+
+    def _build_protocol(self) -> ActionProtocol:
+        """Declare the wire format: a 3-DOF velocity action from one RGB camera."""
+        return VlnVelocityActionProtocol(
+            action_dim=3,
+            observation_keys=["camera_obs.robot_head_cam_rgb"],
+            default_duration=0.5,
+        )
+
+    # ------------------------------------------------------------------ #
+    # RPC methods                                                        #
+    # ------------------------------------------------------------------ #
+
+    def set_task_description(self, task_description: str | None) -> dict[str, Any]:
+        """Set the navigation instruction for the current episode.
+
+        Falls back to a generic instruction when the client sends None/empty.
+        """
+        self._instruction = task_description or "Navigate to the target."
+        print(f"[NaVilaServerPolicy] Instruction: {self._instruction[:80]}...")
+        return {"task_description": self._instruction}
+
+    def get_action(
+        self, observation: dict[str, Any], options: dict[str, Any] | None = None
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Append the latest RGB frame to history, query the VLM, return a velocity.
+
+        Returns:
+            (action, info): ``action`` holds the [vx, vy, yaw_rate] command and
+            its hold duration; ``info`` carries the raw VLM text for logging.
+        """
+        nested = self.unpack_observation(observation)
+        rgb_np = None
+        if "camera_obs" in nested:
+            cam_obs = nested["camera_obs"]
+            # Only the first camera entry is consumed (single-camera protocol).
+            for cam_key in cam_obs:
+                cam_data = cam_obs[cam_key]
+                if isinstance(cam_data, dict) and "rgb" in cam_data:
+                    rgb_np = cam_data["rgb"]
+                elif isinstance(cam_data, np.ndarray):
+                    rgb_np = cam_data
+                break
+
+        if rgb_np is not None:
+            # Squeeze a leading batch dim (num_envs == 1) if present.
+            if rgb_np.ndim == 4:
+                rgb_np = rgb_np[0]
+            # Drop any alpha channel; PIL expects uint8 HWC.
+            img = Image.fromarray(rgb_np[:, :, :3].astype(np.uint8))
+            self._image_history.append(img)
+            # Bound memory: keep only the most recent frames.
+            if len(self._image_history) > self._max_history_frames:
+                self._image_history = self._image_history[-self._max_history_frames:]
+
+        vlm_text = self._run_vlm_inference()
+        vel_cmd, duration = parse_vlm_output_to_velocity(vlm_text)
+
+        action = {"action": vel_cmd, "duration": duration}
+        info = {"vlm_text": vlm_text}
+        return action, info
+
+    def reset(self, env_ids=None, reset_options=None) -> dict[str, Any]:
+        """Clear the episode frame history (instruction is set separately)."""
+        self._image_history.clear()
+        return {"status": "reset_success"}
+
+    # ------------------------------------------------------------------ #
+    # VLM inference                                                      #
+    # ------------------------------------------------------------------ #
+
+    def _run_vlm_inference(self) -> str:
+        """Run VLM forward pass on the image history + instruction.
+
+        Matches NaVILA-Bench: uniformly sample ``num_video_frames - 1``
+        frames from the *full* episode history, plus the latest frame.
+        This gives the VLM global temporal context so it can determine
+        when the navigation task is complete and output "stop".
+        """
+        if not self._image_history:
+            # No observation received yet; emit a harmless default command.
+            return "move forward 50"
+
+        n = self._num_video_frames
+        num_images = len(self._image_history)
+
+        if num_images < n:
+            if self._history_padding_mode == "black":
+                # Match NaVILA-Bench exactly for apples-to-apples diagnosis.
+                pad_img = Image.new("RGB", self._image_history[-1].size, (0, 0, 0))
+            else:
+                # Repeating the first real image often gives a steadier early
+                # rollout than feeding synthetic black frames.
+                pad_img = self._image_history[0]
+            frames = [pad_img] * (n - num_images) + list(self._image_history)
+        else:
+            # Uniform sample n-1 from full history + latest (matching NaVILA).
+            n_hist = n - 1
+            indices = [int(i * (num_images - 1) / n_hist) for i in range(n_hist)]
+            frames = [self._image_history[i] for i in indices]
+            frames.append(self._image_history[-1])
+
+        # process_image() reads the processor off the model config.
+        self._model.config.image_processor = self._image_processor
+        processed = []
+        for img in frames:
+            p = process_image(img, self._model.config, None)
+            processed.append(p)
+        if all(x.shape == processed[0].shape for x in processed):
+            image_tensor = torch.stack(processed, dim=0).to(self._device, dtype=torch.float16)
+        else:
+            # Mixed shapes cannot be stacked; degrade to the first frame only.
+            image_tensor = processed[0].unsqueeze(0).to(self._device, dtype=torch.float16)
+
+        conv = conv_templates[self._conv_mode].copy()
+        # NOTE(review): the placeholder must be LLaVA's literal "<image>" token.
+        # tokenizer_image_token() splits the prompt on "<image>" and inserts
+        # IMAGE_TOKEN_INDEX there; a bare "\n" placeholder would encode zero
+        # image tokens while `images=` is still passed to generate().
+        image_token = "<image>\n"
+        n_hist = self._num_video_frames - 1
+        qs = (
+            f"Imagine you are a robot programmed for navigation tasks. You have been given a video "
+            f'of historical observations {image_token * n_hist}, and current observation <image>\n. '
+            f'Your assigned task is: "{self._instruction}" '
+            f"Analyze this series of images to decide your next action, which could be turning left "
+            f"or right by a specific degree, moving forward a certain distance, or stop if the task "
+            f"is completed."
+        )
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        input_ids = tokenizer_image_token(
+            prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+        ).unsqueeze(0).to(self._device)
+
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        stopping_criteria = KeywordsStoppingCriteria([stop_str], self._tokenizer, input_ids)
+
+        with torch.inference_mode():
+            output_ids = self._model.generate(
+                input_ids,
+                images=[image_tensor],
+                do_sample=False,
+                temperature=0,
+                top_p=None,
+                num_beams=1,
+                max_new_tokens=self._max_new_tokens,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria],
+            )
+
+        output_text = self._tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+        # Trim a trailing stop string if the tokenizer did not treat it as a
+        # special token (standard LLaVA eval post-processing).
+        if stop_str and output_text.endswith(stop_str):
+            output_text = output_text[: -len(stop_str)].strip()
+        print(f"[VLM] history={len(self._image_history)} output=\"{output_text[-120:]}\"")
+        return output_text
+
+    # ------------------------------------------------------------------ #
+    # CLI helpers                                                        #
+    # ------------------------------------------------------------------ #
+
+    @staticmethod
+    def add_args_to_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        """Register this policy's CLI options on the server runner's parser."""
+        group = parser.add_argument_group("VLN Server-Side Policy")
+        group.add_argument(
+            "--model_path", type=str, required=True,
+            help="Path to the NaVILA / LLaVA model checkpoint.",
+        )
+        group.add_argument(
+            "--policy_device", type=str, default="cuda",
+            help="Device for VLM inference (default: cuda).",
+        )
+        group.add_argument(
+            "--num_video_frames", type=int, default=8,
+            help="Number of video frames to send to the VLM (default: 8).",
+        )
+        group.add_argument(
+            "--conv_mode", type=str, default="llama_3",
+            help="Conversation template mode for LLaVA (default: llama_3).",
+        )
+        group.add_argument(
+            "--history_padding_mode",
+            type=str,
+            default="repeat_first",
+            choices=("repeat_first", "black"),
+            help="How to pad early frame history before enough frames are collected.",
+        )
+        group.add_argument(
+            "--max_history_frames",
+            type=int,
+            default=200,
+            help="Maximum number of RGB frames kept in server-side history.",
+        )
+        group.add_argument(
+            "--max_new_tokens",
+            type=int,
+            default=80,
+            help="Maximum generated tokens per VLM query.",
+        )
+        return parser
+
+    @staticmethod
+    def from_args(args: argparse.Namespace) -> NaVilaServerPolicy:
+        """Build a policy instance from parsed CLI args (getattr guards optional flags)."""
+        config = NaVilaServerPolicyConfig(
+            model_path=args.model_path,
+            device=getattr(args, "policy_device", "cuda"),
+            num_video_frames=getattr(args, "num_video_frames", 8),
+            conv_mode=getattr(args, "conv_mode", "llama_3"),
+            history_padding_mode=getattr(args, "history_padding_mode", "repeat_first"),
+            max_history_frames=getattr(args, "max_history_frames", 200),
+            max_new_tokens=getattr(args, "max_new_tokens", 80),
+        )
+        return NaVilaServerPolicy(config)
diff --git a/setup.py b/setup.py
index b05b9afed..b903cfcf0 100644
--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,7 @@
             "isaaclab_arena_examples*",
             "isaaclab_arena_g1*",
             "isaaclab_arena_gr00t*",
+            "isaaclab_arena_navila*",
         ]
     ),
     python_requires=">=3.10",