19 changes: 6 additions & 13 deletions scripts/benchmarks/benchmark_rlgames.py
@@ -103,13 +103,9 @@
 from scripts.benchmarks.utils import (
     get_backend_type,
     get_preset_string,
-    get_success_rate_log,
     log_app_start_time,
-    log_convergence,
     log_python_imports_time,
-    log_rl_policy_episode_lengths,
-    log_rl_policy_rewards,
-    log_rl_policy_success_rates,
+    log_rl_training_metrics,
     log_runtime_step_times,
     log_scene_creation_time,
     log_simulation_start_time,
@@ -288,15 +284,12 @@ def main(
     log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
     log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
     log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
-    log_rl_policy_rewards(benchmark, log_data["rewards/iter"])
-    log_rl_policy_episode_lengths(benchmark, log_data["episode_lengths/iter"])
-    success_rates = get_success_rate_log(log_data)
-    if success_rates is not None:
-        log_rl_policy_success_rates(benchmark, success_rates)
-    log_convergence(
+    log_rl_training_metrics(
         benchmark,
-        log_data["rewards/iter"],
-        args_cli.task,
+        log_data,
+        reward_tag="rewards/iter",
+        episode_length_tag="episode_lengths/iter",
+        task=args_cli.task,
         workflow="rl_games",
         should_check_convergence=args_cli.check_convergence,
         reward_threshold=args_cli.reward_threshold,
20 changes: 6 additions & 14 deletions scripts/benchmarks/benchmark_rsl_rl.py
@@ -105,13 +105,9 @@
 from scripts.benchmarks.utils import (
     get_backend_type,
     get_preset_string,
-    get_success_rate_log,
     log_app_start_time,
-    log_convergence,
     log_python_imports_time,
-    log_rl_policy_episode_lengths,
-    log_rl_policy_rewards,
-    log_rl_policy_success_rates,
+    log_rl_training_metrics,
     log_runtime_step_times,
     log_scene_creation_time,
     log_simulation_start_time,
@@ -287,16 +283,12 @@ def main(
     log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
     log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
     log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
-    log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
-    log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
-    success_rates = get_success_rate_log(log_data)
-    if success_rates is not None:
-        log_rl_policy_success_rates(benchmark, success_rates)
-
-    log_convergence(
+    log_rl_training_metrics(
         benchmark,
-        log_data["Train/mean_reward"],
-        args_cli.task,
+        log_data,
+        reward_tag="Train/mean_reward",
+        episode_length_tag="Train/mean_episode_length",
+        task=args_cli.task,
         workflow="rsl_rl",
         should_check_convergence=args_cli.check_convergence,
🔵 Suggestion: For consistency with the success_rates is not None check a few lines below, consider using if rewards is not None: here (and for episode_lengths). Both work correctly in practice — parse_tf_logs returns None for missing keys — but is not None makes the intent clearer and matches the existing pattern in this file.

Non-blocking — the truthiness check is also safe since an empty list from TensorBoard is practically impossible, and log_rl_policy_rewards would crash on max([]) anyway.

         reward_threshold=args_cli.reward_threshold,
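A minimal standalone sketch of the reviewer's point (illustrative only, not part of this PR): the two checks diverge only when a tag exists but holds an empty list, since a missing tag yields None from dict.get():

# Illustrative only; `log_data` stands in for parsed TensorBoard scalars.
log_data: dict[str, list[float]] = {"Train/mean_reward": []}

rewards = log_data.get("Train/mean_reward")
assert rewards is not None  # the tag exists, so an `is not None` check would proceed
assert not rewards          # the truthiness check skips the empty list instead,
                            # avoiding the ValueError that max([]) would raise downstream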
90 changes: 90 additions & 0 deletions scripts/benchmarks/test/test_training_metrics.py
@@ -0,0 +1,90 @@
# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""Unit tests for benchmark training-metric logging helpers."""

from __future__ import annotations

import pytest

from scripts.benchmarks.utils import SUCCESS_RATE_LOG_TAGS, log_rl_training_metrics


class _FakeBenchmark:
    """Collect benchmark measurements without initializing benchmark backends."""

    def __init__(self):
        self.measurements: list[tuple[str, str, object, str]] = []

    def add_measurement(self, phase, measurement):
        self.measurements.append((phase, measurement.name, measurement.value, getattr(measurement, "unit", "")))

    def measurement_by_name(self, name: str):
        return next(m for m in self.measurements if m[1] == name)


@pytest.mark.parametrize(
    "workflow,reward_tag,episode_length_tag",
    [
        ("rl_games", "rewards/iter", "episode_lengths/iter"),
        ("rsl_rl", "Train/mean_reward", "Train/mean_episode_length"),
    ],
)
def test_log_rl_training_metrics_skips_missing_short_run_scalars(
    workflow: str, reward_tag: str, episode_length_tag: str, capsys: pytest.CaptureFixture[str]
):
    """Short benchmark runs may finish before reward and episode-length scalars are emitted."""
    benchmark = _FakeBenchmark()

    log_rl_training_metrics(
        benchmark,
        log_data={},
        reward_tag=reward_tag,
        episode_length_tag=episode_length_tag,
        task="Isaac-Ant-v0",
        workflow=workflow,
        should_check_convergence=True,
    )

    assert benchmark.measurements == []
    output = capsys.readouterr().out
    assert f"TensorBoard log is missing '{reward_tag}'" in output
    assert f"TensorBoard log is missing '{episode_length_tag}'" in output
    assert f"Cannot check convergence because '{reward_tag}' was not logged" in output


@pytest.mark.parametrize(
    "workflow,reward_tag,episode_length_tag",
    [
        ("rl_games", "rewards/iter", "episode_lengths/iter"),
        ("rsl_rl", "Train/mean_reward", "Train/mean_episode_length"),
    ],
)
def test_log_rl_training_metrics_logs_present_normal_run_scalars(
    workflow: str, reward_tag: str, episode_length_tag: str, capsys: pytest.CaptureFixture[str]
):
    """Normal runs with reward and episode-length scalars should log train metrics."""
    benchmark = _FakeBenchmark()

    log_rl_training_metrics(
        benchmark,
        log_data={
            reward_tag: [1.0, 2.0, 3.0],
            episode_length_tag: [10.0, 11.0],
            SUCCESS_RATE_LOG_TAGS[0]: [0.25, 0.5],
        },
        reward_tag=reward_tag,
        episode_length_tag=episode_length_tag,
        task="Isaac-Ant-v0",
        workflow=workflow,
    )

    assert benchmark.measurement_by_name("Rewards")[2] == [1.0, 2.0, 3.0]
    assert benchmark.measurement_by_name("Max Rewards")[2] == 3.0
    assert benchmark.measurement_by_name("Episode Lengths")[2] == [10.0, 11.0]
    assert benchmark.measurement_by_name("Max Episode Lengths")[2] == 11.0
    assert benchmark.measurement_by_name("Success Rates")[2] == [0.25, 0.5]
    assert benchmark.measurement_by_name("success_rate")[2] == 0.5
    assert "TensorBoard log is missing" not in capsys.readouterr().out
46 changes: 46 additions & 0 deletions scripts/benchmarks/utils.py
@@ -295,6 +295,52 @@ def log_success(benchmark, tracker, framework_iteration_count: int | None = None
)


def log_rl_training_metrics(
    benchmark: BaseIsaacLabBenchmark,
    log_data: dict[str, list[float]],
    reward_tag: str,
    episode_length_tag: str,
    task: str,
    workflow: str,
    should_check_convergence: bool = False,
    reward_threshold: float | None = None,
    convergence_config: str = "full",
) -> None:
    """Log optional RL training metrics from TensorBoard data.

    Short smoke-test runs can finish before the RL framework emits reward or
    episode-length scalars. Missing tags should skip those measurements instead
    of failing the whole benchmark.
    """
    rewards = log_data.get(reward_tag)
    episode_lengths = log_data.get(episode_length_tag)
    if rewards:
        log_rl_policy_rewards(benchmark, rewards)
    else:
        print(f"[WARNING] TensorBoard log is missing '{reward_tag}'; skipping reward benchmark metrics.")
    if episode_lengths:
        log_rl_policy_episode_lengths(benchmark, episode_lengths)
    else:
        print(f"[WARNING] TensorBoard log is missing '{episode_length_tag}'; skipping episode-length metrics.")

    success_rates = get_success_rate_log(log_data)
    if success_rates is not None:
        log_rl_policy_success_rates(benchmark, success_rates)

    if rewards:
        log_convergence(
            benchmark,
            rewards,
            task,
            workflow=workflow,
            should_check_convergence=should_check_convergence,
            reward_threshold=reward_threshold,
            convergence_config=convergence_config,
        )
    elif should_check_convergence:
        print(f"[WARNING] Cannot check convergence because '{reward_tag}' was not logged.")

def parse_cprofile_stats(
    profile: cProfile.Profile,
    isaaclab_prefixes: list[str],
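For completeness, a hedged usage sketch of the short-run path the docstring describes. The stub below is hypothetical and mirrors the _FakeBenchmark from the new test file; a real BaseIsaacLabBenchmark would need the full benchmark backend:

# Hypothetical driver (assumes the imports in scripts.benchmarks.utils resolve
# in the current environment).
from scripts.benchmarks.utils import log_rl_training_metrics

class _StubBenchmark:
    """Stand-in for BaseIsaacLabBenchmark; fails if anything gets recorded."""

    def add_measurement(self, phase, measurement):
        raise AssertionError("no measurements expected for an empty log")

log_rl_training_metrics(
    _StubBenchmark(),
    log_data={},  # short smoke run: no scalars flushed to TensorBoard yet
    reward_tag="Train/mean_reward",
    episode_length_tag="Train/mean_episode_length",
    task="Isaac-Ant-v0",
    workflow="rsl_rl",
    should_check_convergence=True,
)
# Prints three [WARNING] lines instead of raising the KeyError that the old
# log_data["Train/mean_reward"] indexing would have produced.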