Skip to content

Commit 19e4b9b

Browse files
authored
Test speedups (#689)
* Reduce duration of DAgger (test_trainer_makes_progress) tests to ~20% of initial duration. * Set n_epochs of the rl.fast configuration to 1 to speed up tests. * Reduce duration of BC (test_smoke_bc_training) tests to ~10% of initial duration by adapting test parameters. * Reduce duration of density baseline (test_density_with_other_trajectory_types) tests to ~5% of initial duration by adapting test parameters. * Switch to proper statistical testing of reward improvement in preference comparison improvement testing and cut testing time to 50%. * Try increasing the number of parallel runners. * Fix comment regarding the number of iterations needed to get a measurable improvement in DAgger. * Try using the proper number of runners again
1 parent 1b0436b commit 19e4b9b

5 files changed

Lines changed: 40 additions & 16 deletions

File tree

src/imitation/scripts/ingredients/rl.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,13 @@ def config_hook(config, command_name, logger):
5555
@rl_ingredient.named_config
5656
def fast():
5757
batch_size = 2
58-
# SB3 RL seems to need batch size of 2, otherwise it runs into numeric
59-
# issues when computing multinomial distribution during predict()
60-
rl_kwargs = dict(batch_size=2)
58+
rl_kwargs = dict(
59+
# SB3 RL seems to need batch size of 2, otherwise it runs into numeric
60+
# issues when computing multinomial distribution during predict()
61+
batch_size=2,
62+
# Setting n_epochs=1 speeds up things a lot
63+
n_epochs=1,
64+
)
6165
locals() # quieten flake8
6266

6367

@@ -133,6 +137,11 @@ def make_rl_algo(
133137
f"num_envs={venv.num_envs} must evenly divide batch_size={batch_size}.",
134138
)
135139
rl_kwargs = dict(rl_kwargs)
140+
141+
# TODO: this is a hack and an indicator that the rl ingredient should be refactored
142+
if rl_cls == sb3.SAC:
143+
del rl_kwargs["n_epochs"]
144+
136145
# If on-policy, collect `batch_size` many timesteps each update.
137146
# If off-policy, train on `batch_size` many timesteps each update.
138147
# These are different notions of batches, but this seems the closest
@@ -180,6 +189,11 @@ def load_rl_algo_from_path(
180189
relabel_reward_fn: Optional[RewardFn] = None,
181190
) -> base_class.BaseAlgorithm:
182191
rl_kwargs = dict(rl_kwargs)
192+
193+
# TODO: this is a hack and an indicator that the rl ingredient should be refactored
194+
if rl_cls == sb3.SAC:
195+
del rl_kwargs["n_epochs"]
196+
183197
if issubclass(rl_cls, off_policy_algorithm.OffPolicyAlgorithm):
184198
rl_kwargs = _maybe_add_relabel_buffer(
185199
rl_kwargs=rl_kwargs,

tests/algorithms/test_bc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def test_smoke_bc_creation(
149149
expert_data_type=expert_data_types,
150150
rng=rngs,
151151
)
152-
@hypothesis.settings(deadline=20000, max_examples=50)
152+
@hypothesis.settings(deadline=20000, max_examples=15)
153153
def test_smoke_bc_training(
154154
env_name: str,
155155
bc_args: dict,
@@ -169,7 +169,7 @@ def test_smoke_bc_training(
169169
expert_data_type=expert_data_type,
170170
env_name=env_name,
171171
rng=rng,
172-
num_trajectories=3, # Only use 3 trajectories to speed up the test
172+
num_trajectories=2, # Only use 2 trajectories to speed up the test
173173
),
174174
)
175175
# WHEN

tests/algorithms/test_dagger.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -366,21 +366,21 @@ def test_trainer_makes_progress(init_trainer_fn, pendulum_venv, pendulum_expert_
366366
novice_rewards, _ = evaluation.evaluate_policy(
367367
trainer.policy,
368368
pendulum_venv,
369-
15,
370-
deterministic=False,
369+
25,
370+
deterministic=True,
371371
return_episode_rewards=True,
372372
)
373373
# note a randomly initialised policy does well for some seeds -- so may
374374
# want to adjust this check if changing seed. Pendulum return can range
375375
# from -1,200 to -130 (approx.), per Figure 3 in this PDF (on page 3):
376376
# https://arxiv.org/pdf/2106.09556.pdf
377377
assert np.mean(novice_rewards) < -1000
378-
# Train for 10 iterations. (6 or less causes test to fail on some configs.)
378+
# Train for 5 iterations. (4 or fewer causes test to fail on some configs.)
379379
# see https://github.com/HumanCompatibleAI/imitation/issues/580 for details
380-
for i in range(10):
380+
for i in range(5):
381381
# roll out a few trajectories for dataset, then train for a few steps
382382
collector = trainer.create_trajectory_collector()
383-
for _ in range(5):
383+
for _ in range(4):
384384
obs = collector.reset()
385385
dones = [False] * pendulum_venv.num_envs
386386
while not np.any(dones):
@@ -394,7 +394,8 @@ def test_trainer_makes_progress(init_trainer_fn, pendulum_venv, pendulum_expert_
394394
rewards_after_training, _ = evaluation.evaluate_policy(
395395
trainer.policy,
396396
pendulum_venv,
397-
15,
397+
25,
398+
deterministic=True,
398399
return_episode_rewards=True,
399400
)
400401

tests/algorithms/test_density_baselines.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,12 @@ def test_density_with_other_trajectory_types(
110110
pendulum_venv,
111111
rng,
112112
):
113-
rl_algo = stable_baselines3.PPO(policies.ActorCriticPolicy, pendulum_venv)
113+
rl_algo = stable_baselines3.PPO(
114+
policies.ActorCriticPolicy,
115+
pendulum_venv,
116+
n_steps=10, # small value to make test faster
117+
n_epochs=2, # small value to make test faster
118+
)
114119
rollouts = pendulum_expert_trajectories[:2]
115120
transitions = rollout.flatten_trajectories_with_rew(rollouts)
116121
transitions_mappings = [

tests/algorithms/test_preference_comparisons.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from imitation.data.types import TrajectoryWithRew
2222
from imitation.regularization import regularizers, updaters
2323
from imitation.rewards import reward_nets
24+
from imitation.testing import reward_improvement
2425
from imitation.util import networks, util
2526

2627
UNCERTAINTY_ON = ["logit", "probability", "label"]
@@ -1064,7 +1065,7 @@ def test_that_trainer_improves(
10641065
novice_agent_rewards, _ = evaluation.evaluate_policy(
10651066
agent_trainer.algorithm.policy,
10661067
action_is_reward_venv,
1067-
15,
1068+
25,
10681069
return_episode_rewards=True,
10691070
)
10701071

@@ -1073,7 +1074,7 @@ def test_that_trainer_improves(
10731074
# after this training, and thus `later_rewards` should have lower loss.
10741075
first_reward_network_stats = main_trainer.train(20, 20)
10751076

1076-
later_reward_network_stats = main_trainer.train(1000, 20)
1077+
later_reward_network_stats = main_trainer.train(50, 20)
10771078
assert (
10781079
first_reward_network_stats["reward_loss"]
10791080
> later_reward_network_stats["reward_loss"]
@@ -1083,8 +1084,11 @@ def test_that_trainer_improves(
10831084
trained_agent_rewards, _ = evaluation.evaluate_policy(
10841085
agent_trainer.algorithm.policy,
10851086
action_is_reward_venv,
1086-
15,
1087+
25,
10871088
return_episode_rewards=True,
10881089
)
10891090

1090-
assert np.mean(trained_agent_rewards) > np.mean(novice_agent_rewards)
1091+
assert reward_improvement.is_significant_reward_improvement(
1092+
novice_agent_rewards,
1093+
trained_agent_rewards,
1094+
)

0 commit comments

Comments
 (0)