Evaluator API (#100)

dbellicoso-bdai · exploy-bot · commit 05eb5276287c · 2026-03-27T12:27:35.000Z
### What change is being made

Split evaluation into episodes.

### Why this change is being made

Clean up how resets are handled when evaluating policies.

### Tested

Covered by existing tests.

GitOrigin-RevId: 93670bfb7a3f7dbc979f2851a1ef9c4a8dc3caa0
diff --git a/docs/tutorial/exporter/exporter_tutorial.md b/docs/tutorial/exporter/exporter_tutorial.md
@@ -33,7 +33,7 @@ import torch
 
 from exploy.exporter.core.actor import ExportableActor, add_actor_memory
 from exploy.exporter.core.context_manager import Group, Input, Memory, Output
-from exploy.exporter.core.evaluator import evaluate
+from exploy.exporter.core.evaluator import evaluate, evaluate_episode
 from exploy.exporter.core.exportable_environment import ExportableEnvironment
 from exploy.exporter.core.exporter import export_environment_as_onnx
 from exploy.exporter.core.session_wrapper import SessionWrapper
@@ -212,7 +212,7 @@ class ExportableEnv(ExportableEnvironment):
     def metadata(self) -> dict:
         return {"env_name": "Env", "version": "1.0"}
 
-    def register_evaluation_hooks(self, update, reset, evaluate_substep):
+    def register_evaluation_hooks(self, update, evaluate_substep):
         pass
 
     def get_observation_names(self) -> list[str]:
@@ -497,7 +497,8 @@ with torch.inference_mode():
         env=exp_env,
         context_manager=exp_env.context_manager(),
         session_wrapper=session_wrapper,
-        num_steps=20,
+        num_episodes=1,
+        max_episode_steps=20,
         verbose=True,
         pause_on_failure=False,
     )
@@ -511,6 +512,23 @@ with torch.inference_mode():
 If `export_ok` is `False`, the evaluator prints a detailed diagnostic showing which outputs
 diverged and at which step.
 
+Under the hood, {py:func}`evaluate() <exploy.exporter.core.evaluator.evaluate>` calls
+{py:func}`evaluate_episode() <exploy.exporter.core.evaluator.evaluate_episode>` once per episode.
+You can call `evaluate_episode()` directly when you want finer control — for example, to run and
+inspect a single episode in a tight debugging loop without the outer episode iteration:
+
+```python
+with torch.inference_mode():
+    episode_ok, observations = evaluate_episode(
+        env=exp_env,
+        context_manager=exp_env.context_manager(),
+        session_wrapper=session_wrapper,
+        max_num_steps=20,
+        verbose=True,
+        pause_on_failure=False,
+    )
+```
+
 ---
 
 ## Advanced: Using Torch Modules in Observations
@@ -649,7 +667,8 @@ def export_and_evaluate(
     exp_env: ExportableEnv,
     actor: ExportableActor,
     onnx_file_name: str,
-    num_eval_steps: int,
+    num_eval_episodes: int,
+    max_eval_steps_per_episode: int,
 ) -> bool:
     # Register inputs, outputs, and memory.
     exp_env.context_manager().add_components(
@@ -714,7 +733,8 @@ def export_and_evaluate(
                 env=exp_env,
                 context_manager=exp_env.context_manager(),
                 session_wrapper=session_wrapper,
-                num_steps=num_eval_steps,
+                num_episodes=num_eval_episodes,
+                max_episode_steps=max_eval_steps_per_episode,
                 verbose=False,
                 pause_on_failure=False,
             )
@@ -730,7 +750,7 @@ env = Environment(data_source=data_source)
 exp_env = ExportableEnv(env=env)
 actor = Actor(num_obs=env.num_obs, num_act=env.num_act).eval()
 
-assert export_and_evaluate(exp_env, actor, "policy.onnx", num_eval_steps=20)
+assert export_and_evaluate(exp_env, actor, "policy.onnx", num_eval_episodes=1, max_eval_steps_per_episode=20)
 ```
 
 ### Environment with a torch module
@@ -742,7 +762,7 @@ exp_env = ExportableEnv(env=env)
 actor = Actor(num_obs=env.num_obs, num_act=env.num_act).eval()
 exp_env.context_manager().add_module(env.module)
 
-assert export_and_evaluate(exp_env, actor, "policy_with_module.onnx", num_eval_steps=20)
+assert export_and_evaluate(exp_env, actor, "policy_with_module.onnx", num_eval_episodes=1, max_eval_steps_per_episode=20)
 ```
 
 ### Environment with a torch module and an RNN actor
@@ -760,5 +780,5 @@ add_actor_memory(
     get_hidden_states_func=actor.get_state,
 )
 
-assert export_and_evaluate(exp_env, actor, "policy_with_rnn.onnx", num_eval_steps=20)
+assert export_and_evaluate(exp_env, actor, "policy_with_rnn.onnx", num_eval_episodes=1, max_eval_steps_per_episode=20)
 ```
diff --git a/examples/exporter_scripts/isaaclab/export_isaaclab.py b/examples/exporter_scripts/isaaclab/export_isaaclab.py
@@ -164,13 +164,15 @@ def export_isaaclab(
     )
 
     # Evaluate.
-    evaluate_steps = 200
+    evaluate_episodes = 2
+    evaluate_steps = 100
     with torch.inference_mode():
         export_ok, _ = evaluate(
             env=exportable_env,
             context_manager=exportable_env.context_manager(),
             session_wrapper=session_wrapper,
-            num_steps=evaluate_steps,
+            num_episodes=evaluate_episodes,
+            max_episode_steps=evaluate_steps,
             verbose=True,
             pause_on_failure=pause_on_failure,
         )
diff --git a/python/exploy/exporter/core/evaluator.py b/python/exploy/exporter/core/evaluator.py
@@ -18,7 +18,6 @@ def _print_progress_bar(
     num_steps: int,
     failed_steps: int,
     step_export_ok: bool,
-    is_reset_step: bool,
     inference_times: list[float],
 ) -> None:
     """Print progress bar with step information.
@@ -28,7 +27,6 @@ def _print_progress_bar(
         num_steps: Total number of steps.
         failed_steps: Number of failed steps so far.
         step_export_ok: Whether current step passed validation.
-        is_reset_step: Whether environment was reset this step.
         inference_times: List of inference times.
     """
     status_emoji = "🔴" if not step_export_ok else "🟢"
@@ -41,8 +39,6 @@ def _print_progress_bar(
     mean_time = np.mean(inference_times) * 1.0e3
     std_time = np.std(inference_times) * 1.0e3
     extra_info.append(f"⏱️  μ={mean_time:.3f}ms σ={std_time:.3f}ms")
-    if is_reset_step:
-        extra_info.append("RESET")
     extra_str = " | ".join(extra_info)
 
     print(
@@ -189,7 +185,8 @@ def evaluate(
     env: ExportableEnvironment,
     context_manager: ContextManager,
     session_wrapper: SessionWrapper,
-    num_steps: int,
+    num_episodes: int,
+    max_episode_steps: int | None = None,
     verbose: bool = True,
     reset_from_onnx_counter_steps: int = 50,
     atol: float = 1.0e-5,
@@ -198,15 +195,16 @@ def evaluate(
 ) -> tuple[bool, torch.Tensor]:
     """Evaluate an ONNX exported model against an `ExportableEnvironment` stepped through a `SessionWrapper`.
 
-    This function runs the simulation for a specified number of steps and compares the
-    outputs of the ONNX model with the environment's state and actor's actions at each step.
-    This is useful for verifying the correctness of the ONNX export.
+    This function runs the simulation for a specified number of episodes, each with a maximum number
+    of steps, and compares the outputs of the ONNX model with the environment's state and actor's
+    actions at each step. This is useful for verifying the correctness of the ONNX export.
 
     Args:
         env: The environment to run the evaluation in.
         context_manager: The context manager handling inputs and outputs.
         session_wrapper: An ONNX session wrapper.
-        num_steps: The number of steps to run the evaluation for.
+        num_episodes: The number of episodes to run the evaluation for.
+        max_episode_steps: The maximum number of steps per episode. If ``None``, run until a reset.
         verbose: Whether to print verbose output during evaluation. Defaults to True.
         reset_from_onnx_counter_steps: Set after how many steps we should set memory inputs from ONNX instead of using
             the environment's state.
@@ -218,14 +216,76 @@ def evaluate(
             Note: this value is chosen arbitrarily.
         atol: Absolute tolerance used to compare tensors.
         rtol: Relative tolerance used to compare tensors.
+        pause_on_failure: Whether to pause on each failed step and wait for user input before
+            continuing. Defaults to True.
 
     Returns:
         A tuple containing a boolean indicating if the evaluation was successful and
         the final observations tensor.
     """
+    if verbose:
+        print("Starting evaluation...")
+
+    for i_episode in range(num_episodes):
+        if verbose:
+            print(f"\nStarting episode {i_episode + 1}/{num_episodes}...")
+        export_ok, final_obs = evaluate_episode(
+            env=env,
+            context_manager=context_manager,
+            session_wrapper=session_wrapper,
+            max_num_steps=max_episode_steps,
+            verbose=verbose,
+            reset_from_onnx_counter_steps=reset_from_onnx_counter_steps,
+            atol=atol,
+            rtol=rtol,
+            pause_on_failure=pause_on_failure,
+        )
+    return export_ok, final_obs
+
+
+def evaluate_episode(
+    env: ExportableEnvironment,
+    context_manager: ContextManager,
+    session_wrapper: SessionWrapper,
+    max_num_steps: int | None = None,
+    verbose: bool = True,
+    reset_from_onnx_counter_steps: int = 50,
+    atol: float = 1.0e-5,
+    rtol: float = 1.0e-5,
+    pause_on_failure: bool = True,
+):
+    """Run evaluation for a single episode, comparing the ONNX model outputs against the environment.
+
+    Steps the environment and the ONNX session in lockstep, comparing observations, actions, and
+    outputs at each step. Useful for fine-grained inspection of a single episode, e.g. in a
+    debugging loop, without the outer episode iteration of :func:`evaluate`.
+
+    Args:
+        env: The environment to run the evaluation in.
+        context_manager: The context manager handling inputs and outputs.
+        session_wrapper: An ONNX session wrapper.
+        max_num_steps: The maximum number of steps to run. If ``None``, runs until the environment
+            signals a reset.
+        verbose: Whether to print verbose output during evaluation. Defaults to True.
+        reset_from_onnx_counter_steps: Set after how many steps we should set memory inputs from
+            ONNX instead of using the environment's state.
+
+            Note: we do this to avoid numerical error accumulation that would occur if we only
+            ever use the ONNX inference outputs as memory fed back as ONNX inference inputs,
+            while all other inputs are set directly from the environment's state.
+
+            Note: this value is chosen arbitrarily.
+        atol: Absolute tolerance used to compare tensors.
+        rtol: Relative tolerance used to compare tensors.
+        pause_on_failure: Whether to pause on each failed step and wait for user input before
+            continuing. Defaults to True.
 
-    # Reset both the environment and the actor.
+    Returns:
+        A tuple containing a boolean indicating if the episode evaluation was successful and
+        the final observations tensor.
+    """
     obs = env.observations_reset()
+    session_wrapper.reset()
 
     actor = session_wrapper.get_actor()
     if actor is None:
@@ -273,15 +333,8 @@ def update():
         """
         context_manager.read_inputs()
 
-    def reset():
-        """Callback passed to the environment's evaluation hooks to reset the
-        context manager's inputs from the environment's state at each reset.
-        """
-        context_manager.read_inputs()
-
     env.register_evaluation_hooks(
         update=update,
-        reset=reset,
         evaluate_substep=evaluate_substep,
     )
 
@@ -291,7 +344,7 @@ def reset():
     reset_memory_from_env = False
     env.context_manager().read_inputs()
 
-    while step_ctr < num_steps:
+    while step_ctr < max_num_steps if max_num_steps is not None else True:
         reset_memory_from_env = (
             reset_memory_from_env or (step_ctr % reset_from_onnx_counter_steps) == 0
         )
@@ -301,18 +354,9 @@ def reset():
 
         # Check if the environment was reset.
         if is_reset_step:
-            # Re-read the ONNX inputs from the environment after a reset to avoid mismatch between
-            # ONNX inputs and environment state after reset.
-            env.context_manager().read_inputs()
-
-            # Reset the actor state.
-            actor.reset(torch.tensor([is_reset_step], device=env_actions.device))
-
-            # We need to reset the memory inputs from the environment after a reset.
-            reset_memory_from_env = True
-
-            # Reset the session wrapper results to avoid using stale outputs.
-            session_wrapper._results = None
+            if verbose:
+                print(f"\n🔄 Environment reset at step {step_ctr + 1}.")
+            break
 
         # Get onnx outputs if the session has been run.
         ort_outputs = (
@@ -383,16 +427,13 @@ def reset():
 
         # Display progress bar.
         if verbose:
-            if step_ctr == 0:
-                print("\n\nStarting evaluation...")
             if not step_export_ok:
                 print(msg)
             _print_progress_bar(
                 step_ctr=step_ctr,
-                num_steps=num_steps,
+                num_steps=max_num_steps,
                 failed_steps=failed_steps,
                 step_export_ok=step_export_ok,
-                is_reset_step=is_reset_step,
                 inference_times=inference_times,
             )
 
diff --git a/python/exploy/exporter/core/exportable_environment.py b/python/exploy/exporter/core/exportable_environment.py
@@ -59,7 +59,6 @@ def decimation(self) -> int:
     def register_evaluation_hooks(
         self,
         update: Callable[[], None],
-        reset: Callable[[], None],
         evaluate_substep: Callable[[int], None],
     ):
         """Register evaluation hooks for this environment."""
diff --git a/python/exploy/exporter/core/session_wrapper.py b/python/exploy/exporter/core/session_wrapper.py
@@ -2,7 +2,6 @@
 
 import pathlib
 
-import numpy as np
 import onnxruntime as ort
 
 from exploy.exporter.core.actor import ExportableActor
@@ -105,4 +104,4 @@ def get_output_value(self, output_name: str):
 
     def reset(self):
-        """Reset the internal results to zeros to avoid stale data at environment reset."""
+        """Reset the internal results to None to avoid stale data at environment reset."""
-        self._results = [np.zeros_like(output) for output in self._results]
+        self._results = None
diff --git a/python/exploy/exporter/core/tests/test_export_environment.py b/python/exploy/exporter/core/tests/test_export_environment.py
@@ -213,7 +213,7 @@ def get_observation_names(self) -> list[str]:
     def observations_reset(self) -> torch.Tensor:
         return self.env.compute_obs()
 
-    def register_evaluation_hooks(self, update, reset, evaluate_substep):
+    def register_evaluation_hooks(self, update, evaluate_substep):
         pass
 
     def metadata(self) -> dict:
@@ -309,7 +309,8 @@ def export_and_evaluate_env(
     exp_env: ExportableEnv,
     actor: ExportableActor,
     onnx_file_name: str,
-    num_eval_steps: int,
+    num_eval_episodes: int,
+    max_eval_steps_per_episode: int,
 ) -> bool:
     """Helper function to export an environment and evaluate it using the exported ONNX graph."""
     exp_env.context_manager().add_components(
@@ -372,13 +373,13 @@ def export_and_evaluate_env(
         )
 
         # Evaluate.
-        evaluate_steps = num_eval_steps
         with torch.inference_mode():
             export_ok, _ = evaluate(
                 env=exp_env,
                 context_manager=exp_env.context_manager(),
                 session_wrapper=session_wrapper,
-                num_steps=evaluate_steps,
+                num_episodes=num_eval_episodes,
+                max_episode_steps=max_eval_steps_per_episode,
                 verbose=False,
                 pause_on_failure=False,
             )
@@ -397,7 +398,8 @@ def test_env(self):
             exp_env=exp_env,
             actor=actor,
             onnx_file_name="test_export_env.onnx",
-            num_eval_steps=20,
+            num_eval_episodes=2,
+            max_eval_steps_per_episode=20,
         )
         assert export_ok, "ONNX export validation failed"
 
@@ -414,7 +416,8 @@ def test_env_with_module(self):
             exp_env=exp_env,
             actor=actor,
             onnx_file_name="test_export_env_with_module.onnx",
-            num_eval_steps=20,
+            num_eval_episodes=2,
+            max_eval_steps_per_episode=20,
         )
         assert export_ok, "ONNX export validation failed"
 
@@ -440,6 +443,7 @@ def test_env_with_module_and_rnn_actor(self):
             exp_env=exp_env,
             actor=actor,
             onnx_file_name="test_export_env_with_rnn_actor.onnx",
-            num_eval_steps=20,
+            num_eval_episodes=2,
+            max_eval_steps_per_episode=20,
         )
         assert export_ok, "ONNX export validation failed"
diff --git a/python/exploy/exporter/core/tests/test_session_wrapper.py b/python/exploy/exporter/core/tests/test_session_wrapper.py
diff --git a/python/exploy/exporter/core/utils/math.py b/python/exploy/exporter/core/utils/math.py
diff --git a/python/exploy/exporter/frameworks/isaaclab/env.py b/python/exploy/exporter/frameworks/isaaclab/env.py