Test speedups (#689)

ernestum · web-flow · commit 19e4b9b4baea · 2023-04-26T14:52:02.000-07:00
* Reduce duration of dagger (test_trainer_makes_progress) tests to ~20% of initial duration.

* Set n_epochs of the rl.fast configuration to 1 to speed up tests.

* Reduce duration of BC (test_smoke_bc_training) tests to ~10% of initial duration by adapting test parameters.

* Reduce duration of BC (test_density_with_other_trajectory_types) tests to ~5% of initial duration by adapting test parameters.

* Switch to proper statistical testing of reward improvement in the preference comparisons improvement test and cut its testing time to 50%.

* Try increasing number of parallel runners.

* Fix comment regarding the number of iterations needed to get a measurable improvement in DAgger.

* Try using the proper number of runners again
diff --git a/src/imitation/scripts/ingredients/rl.py b/src/imitation/scripts/ingredients/rl.py
@@ -55,9 +55,13 @@ def config_hook(config, command_name, logger):
 @rl_ingredient.named_config
 def fast():
     batch_size = 2
-    # SB3 RL seems to need batch size of 2, otherwise it runs into numeric
-    # issues when computing multinomial distribution during predict()
-    rl_kwargs = dict(batch_size=2)
+    rl_kwargs = dict(
+        # SB3 RL seems to need batch size of 2, otherwise it runs into numeric
+        # issues when computing multinomial distribution during predict()
+        batch_size=2,
+        # Setting n_epochs=1 speeds things up a lot
+        n_epochs=1,
+    )
     locals()  # quieten flake8
 
 
@@ -133,6 +137,11 @@ def make_rl_algo(
             f"num_envs={venv.num_envs} must evenly divide batch_size={batch_size}.",
         )
     rl_kwargs = dict(rl_kwargs)
+
+    # TODO: this is a hack and an indicator that the rl ingredient should be refactored
+    if rl_cls == sb3.SAC:
+        del rl_kwargs["n_epochs"]
+
     # If on-policy, collect `batch_size` many timesteps each update.
     # If off-policy, train on `batch_size` many timesteps each update.
     # These are different notion of batches, but this seems the closest
@@ -180,6 +189,11 @@ def load_rl_algo_from_path(
     relabel_reward_fn: Optional[RewardFn] = None,
 ) -> base_class.BaseAlgorithm:
     rl_kwargs = dict(rl_kwargs)
+
+    # TODO: this is a hack and an indicator that the rl ingredient should be refactored
+    if rl_cls == sb3.SAC:
+        del rl_kwargs["n_epochs"]
+
     if issubclass(rl_cls, off_policy_algorithm.OffPolicyAlgorithm):
         rl_kwargs = _maybe_add_relabel_buffer(
             rl_kwargs=rl_kwargs,
diff --git a/tests/algorithms/test_bc.py b/tests/algorithms/test_bc.py
@@ -149,7 +149,7 @@ def test_smoke_bc_creation(
     expert_data_type=expert_data_types,
     rng=rngs,
 )
-@hypothesis.settings(deadline=20000, max_examples=50)
+@hypothesis.settings(deadline=20000, max_examples=15)
 def test_smoke_bc_training(
     env_name: str,
     bc_args: dict,
@@ -169,7 +169,7 @@ def test_smoke_bc_training(
             expert_data_type=expert_data_type,
             env_name=env_name,
             rng=rng,
-            num_trajectories=3,  # Only use 3 trajectories to speed up the test
+            num_trajectories=2,  # Only use 2 trajectories to speed up the test
         ),
     )
     # WHEN
diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py
@@ -366,21 +366,21 @@ def test_trainer_makes_progress(init_trainer_fn, pendulum_venv, pendulum_expert_
         novice_rewards, _ = evaluation.evaluate_policy(
             trainer.policy,
             pendulum_venv,
-            15,
-            deterministic=False,
+            25,
+            deterministic=True,
             return_episode_rewards=True,
         )
         # note a randomly initialised policy does well for some seeds -- so may
         # want to adjust this check if changing seed. Pendulum return can range
         # from -1,200 to -130 (approx.), per Figure 3 in this PDF (on page 3):
         # https://arxiv.org/pdf/2106.09556.pdf
         assert np.mean(novice_rewards) < -1000
-        # Train for 10 iterations. (6 or less causes test to fail on some configs.)
+        # Train for 5 iterations. (4 or fewer causes test to fail on some configs.)
         # see https://github.com/HumanCompatibleAI/imitation/issues/580 for details
-        for i in range(10):
+        for i in range(5):
             # roll out a few trajectories for dataset, then train for a few steps
             collector = trainer.create_trajectory_collector()
-            for _ in range(5):
+            for _ in range(4):
                 obs = collector.reset()
                 dones = [False] * pendulum_venv.num_envs
                 while not np.any(dones):
@@ -394,7 +394,8 @@ def test_trainer_makes_progress(init_trainer_fn, pendulum_venv, pendulum_expert_
         rewards_after_training, _ = evaluation.evaluate_policy(
             trainer.policy,
             pendulum_venv,
-            15,
+            25,
+            deterministic=True,
             return_episode_rewards=True,
         )
 
diff --git a/tests/algorithms/test_density_baselines.py b/tests/algorithms/test_density_baselines.py
@@ -110,7 +110,12 @@ def test_density_with_other_trajectory_types(
     pendulum_venv,
     rng,
 ):
-    rl_algo = stable_baselines3.PPO(policies.ActorCriticPolicy, pendulum_venv)
+    rl_algo = stable_baselines3.PPO(
+        policies.ActorCriticPolicy,
+        pendulum_venv,
+        n_steps=10,  # small value to make test faster
+        n_epochs=2,  # small value to make test faster
+    )
     rollouts = pendulum_expert_trajectories[:2]
     transitions = rollout.flatten_trajectories_with_rew(rollouts)
     transitions_mappings = [
diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py
@@ -21,6 +21,7 @@
 from imitation.data.types import TrajectoryWithRew
 from imitation.regularization import regularizers, updaters
 from imitation.rewards import reward_nets
+from imitation.testing import reward_improvement
 from imitation.util import networks, util
 
 UNCERTAINTY_ON = ["logit", "probability", "label"]
@@ -1064,7 +1065,7 @@ def test_that_trainer_improves(
     novice_agent_rewards, _ = evaluation.evaluate_policy(
         agent_trainer.algorithm.policy,
         action_is_reward_venv,
-        15,
+        25,
         return_episode_rewards=True,
     )
 
@@ -1073,7 +1074,7 @@ def test_that_trainer_improves(
     # after this training, and thus `later_rewards` should have lower loss.
     first_reward_network_stats = main_trainer.train(20, 20)
 
-    later_reward_network_stats = main_trainer.train(1000, 20)
+    later_reward_network_stats = main_trainer.train(50, 20)
     assert (
         first_reward_network_stats["reward_loss"]
         > later_reward_network_stats["reward_loss"]
@@ -1083,8 +1084,11 @@ def test_that_trainer_improves(
     trained_agent_rewards, _ = evaluation.evaluate_policy(
         agent_trainer.algorithm.policy,
         action_is_reward_venv,
-        15,
+        25,
         return_episode_rewards=True,
     )
 
-    assert np.mean(trained_agent_rewards) > np.mean(novice_agent_rewards)
+    assert reward_improvement.is_significant_reward_improvement(
+        novice_agent_rewards,
+        trained_agent_rewards,
+    )