avg_suc_time only cares about solve time, not policy execution time (#591)

ronuchit · tomsilver · web-flow · commit a6241a772998 · 2022-03-14T13:44:55.000-04:00
Co-authored-by: Tom Silver <tomssilver@gmail.com>
diff --git a/src/datasets/demo_only.py b/src/datasets/demo_only.py
@@ -34,7 +34,7 @@ def create_demo_data(env: BaseEnv, train_tasks: List[Task]) -> Dataset:
             # get_last_plan(). We do this because we want to run the full plan.
             plan = oracle_approach.get_last_plan()
             # Stop run_policy() when OptionPlanExhausted() is hit.
-            traj = utils.run_policy(
+            traj, _ = utils.run_policy(
                 utils.option_plan_to_policy(plan),
                 env,
                 "train",
diff --git a/src/main.py b/src/main.py
@@ -209,7 +209,7 @@ def _generate_interaction_results(
     for request in requests:
         monitor = TeacherInteractionMonitorWithVideo(env.render, request,
                                                      teacher)
-        traj = utils.run_policy(
+        traj, _ = utils.run_policy(
             request.act_policy,
             env,
             "train",
@@ -240,7 +240,7 @@ def _run_testing(env: BaseEnv, approach: BaseApproach) -> Metrics:
     total_num_execution_failures = 0
     video_prefix = utils.get_config_path_str()
     for test_task_idx, task in enumerate(test_tasks):
-        start = time.time()
+        solve_start = time.time()
         try:
             policy = approach.solve(task, timeout=CFG.timeout)
         except (ApproachTimeout, ApproachFailure) as e:
@@ -253,20 +253,23 @@ def _run_testing(env: BaseEnv, approach: BaseApproach) -> Metrics:
                 outfile = f"{video_prefix}__task{test_task_idx+1}_failure.mp4"
                 utils.save_video(outfile, video)
             continue
+        solve_time = time.time() - solve_start
         num_found_policy += 1
         try:
             if CFG.make_test_videos:
                 monitor = utils.VideoMonitor(env.render)
             else:
                 monitor = None
-            traj = utils.run_policy(policy,
-                                    env,
-                                    "test",
-                                    test_task_idx,
-                                    task.goal_holds,
-                                    max_num_steps=CFG.horizon,
-                                    monitor=monitor)
+            traj, execution_metrics = utils.run_policy(
+                policy,
+                env,
+                "test",
+                test_task_idx,
+                task.goal_holds,
+                max_num_steps=CFG.horizon,
+                monitor=monitor)
             solved = task.goal_holds(traj.states[-1])
+            solve_time += execution_metrics["policy_call_time"]
         except utils.EnvironmentFailure as e:
             logging.info(f"Task {test_task_idx+1} / {len(test_tasks)}: "
                          f"Environment failed with error: {e}")
@@ -280,7 +283,7 @@ def _run_testing(env: BaseEnv, approach: BaseApproach) -> Metrics:
         if solved:
             logging.info(f"Task {test_task_idx+1} / {len(test_tasks)}: SOLVED")
             num_solved += 1
-            total_suc_time += (time.time() - start)
+            total_suc_time += solve_time
         else:
             logging.info(f"Task {test_task_idx+1} / {len(test_tasks)}: Policy "
                          f"failed to reach goal")
diff --git a/src/utils.py b/src/utils.py
@@ -10,6 +10,7 @@
 import logging
 import os
 import subprocess
+import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Callable, Collection, Dict, FrozenSet, \
@@ -31,7 +32,7 @@
 from predicators.src.structs import NSRT, Action, Array, DummyOption, \
     EntToEntSub, GroundAtom, GroundAtomTrajectory, \
     GroundNSRTOrSTRIPSOperator, Image, LiftedAtom, LiftedOrGroundAtom, \
-    LowLevelTrajectory, NSRTOrSTRIPSOperator, Object, OptionSpec, \
+    LowLevelTrajectory, Metrics, NSRTOrSTRIPSOperator, Object, OptionSpec, \
     ParameterizedOption, Predicate, Segment, State, STRIPSOperator, Task, \
     Type, VarToObjSub, Video, _GroundNSRT, _GroundSTRIPSOperator, _Option, \
     _TypedEntity
@@ -443,15 +444,16 @@ def observe(self, state: State, action: Optional[Action]) -> None:
         raise NotImplementedError("Override me!")
 
 
-def run_policy(policy: Callable[[State], Action],
-               env: BaseEnv,
-               train_or_test: str,
-               task_idx: int,
-               termination_function: Callable[[State], bool],
-               max_num_steps: int,
-               exceptions_to_break_on: Optional[Set[
-                   TypingType[Exception]]] = None,
-               monitor: Optional[Monitor] = None) -> LowLevelTrajectory:
+def run_policy(
+        policy: Callable[[State], Action],
+        env: BaseEnv,
+        train_or_test: str,
+        task_idx: int,
+        termination_function: Callable[[State], bool],
+        max_num_steps: int,
+        exceptions_to_break_on: Optional[Set[TypingType[Exception]]] = None,
+        monitor: Optional[Monitor] = None
+) -> Tuple[LowLevelTrajectory, Metrics]:
     """Execute a policy starting from the initial state of a train or test task
     in the environment. The task's goal is not used.
 
@@ -465,10 +467,14 @@ def run_policy(policy: Callable[[State], Action],
     state = env.reset(train_or_test, task_idx)
     states = [state]
     actions: List[Action] = []
+    metrics: Metrics = defaultdict(float)
+    metrics["policy_call_time"] = 0.0
     if not termination_function(state):
         for _ in range(max_num_steps):
             try:
+                start_time = time.time()
                 act = policy(state)
+                metrics["policy_call_time"] += time.time() - start_time
             except Exception as e:
                 if exceptions_to_break_on is not None and \
                    type(e) in exceptions_to_break_on:
@@ -484,7 +490,7 @@ def run_policy(policy: Callable[[State], Action],
     if monitor is not None:
         monitor.observe(state, None)
     traj = LowLevelTrajectory(states, actions)
-    return traj
+    return traj, metrics
 
 
 def run_policy_with_simulator(
@@ -500,10 +506,10 @@ def run_policy_with_simulator(
     *** This function should not be used with any core code, because we want
     to avoid the assumption of a simulator when possible. ***
 
-    This is similar to run_policy, with two major differences:
+    This is similar to run_policy, with three major differences:
     (1) The initial state `init_state` can be any state, not just the initial
     state of a train or test task. (2) A simulator (function that takes state
-    as input) is assumed.
+    as input) is assumed. (3) Metrics are not returned.
 
     Note that the environment internal state is NOT updated.
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -240,21 +240,23 @@ def test_run_policy():
     env = CoverEnv()
     policy = lambda _: Action(env.action_space.sample())
     task = env.get_task("test", 0)
-    traj = utils.run_policy(policy,
-                            env,
-                            "test",
-                            0,
-                            task.goal_holds,
-                            max_num_steps=5)
+    traj, metrics = utils.run_policy(policy,
+                                     env,
+                                     "test",
+                                     0,
+                                     task.goal_holds,
+                                     max_num_steps=5)
     assert not task.goal_holds(traj.states[-1])
     assert len(traj.states) == 6
     assert len(traj.actions) == 5
-    traj2 = utils.run_policy(policy,
-                             env,
-                             "test",
-                             0,
-                             lambda s: True,
-                             max_num_steps=5)
+    assert "policy_call_time" in metrics
+    assert metrics["policy_call_time"] > 0.0
+    traj2, _ = utils.run_policy(policy,
+                                env,
+                                "test",
+                                0,
+                                lambda s: True,
+                                max_num_steps=5)
     assert not task.goal_holds(traj2.states[-1])
     assert len(traj2.states) == 1
     assert len(traj2.actions) == 0
@@ -266,12 +268,12 @@ def _onestep_terminal(_):
         executed = True
         return terminate
 
-    traj3 = utils.run_policy(policy,
-                             env,
-                             "test",
-                             0,
-                             _onestep_terminal,
-                             max_num_steps=5)
+    traj3, _ = utils.run_policy(policy,
+                                env,
+                                "test",
+                                0,
+                                _onestep_terminal,
+                                max_num_steps=5)
     assert not task.goal_holds(traj3.states[-1])
     assert len(traj3.states) == 2
     assert len(traj3.actions) == 1
@@ -288,15 +290,28 @@ def _policy(_):
                          task.goal_holds,
                          max_num_steps=5)
     assert "mock error" in str(e)
-    traj4 = utils.run_policy(_policy,
-                             env,
-                             "test",
-                             0,
-                             task.goal_holds,
-                             max_num_steps=5,
-                             exceptions_to_break_on={ValueError})
+    traj4, _ = utils.run_policy(_policy,
+                                env,
+                                "test",
+                                0,
+                                task.goal_holds,
+                                max_num_steps=5,
+                                exceptions_to_break_on={ValueError})
     assert len(traj4.states) == 1
 
+    # Test policy call time.
+    def _policy(_):
+        time.sleep(0.1)
+        return Action(env.action_space.sample())
+
+    traj, metrics = utils.run_policy(_policy,
+                                     env,
+                                     "test",
+                                     0,
+                                     task.goal_holds,
+                                     max_num_steps=3)
+    assert metrics["policy_call_time"] >= 3 * 0.1
+
 
 def test_run_policy_with_simulator():
     """Tests for run_policy_with_simulator()."""
@@ -1754,13 +1769,13 @@ def test_VideoMonitor():
     monitor = utils.VideoMonitor(env.render)
     policy = lambda _: Action(env.action_space.sample())
     task = env.get_task("test", 0)
-    traj = utils.run_policy(policy,
-                            env,
-                            "test",
-                            0,
-                            task.goal_holds,
-                            max_num_steps=2,
-                            monitor=monitor)
+    traj, _ = utils.run_policy(policy,
+                               env,
+                               "test",
+                               0,
+                               task.goal_holds,
+                               max_num_steps=2,
+                               monitor=monitor)
     assert not task.goal_holds(traj.states[-1])
     assert len(traj.states) == 3
     assert len(traj.actions) == 2
@@ -1774,13 +1789,13 @@ def test_SimulateVideoMonitor():
     task = env.get_task("test", 0)
     monitor = utils.SimulateVideoMonitor(task, env.render_state)
     policy = lambda _: Action(env.action_space.sample())
-    traj = utils.run_policy(policy,
-                            env,
-                            "test",
-                            0,
-                            task.goal_holds,
-                            max_num_steps=2,
-                            monitor=monitor)
+    traj, _ = utils.run_policy(policy,
+                               env,
+                               "test",
+                               0,
+                               task.goal_holds,
+                               max_num_steps=2,
+                               monitor=monitor)
     assert not task.goal_holds(traj.states[-1])
     assert len(traj.states) == 3
     assert len(traj.actions) == 2