Fix non-determinism in DAgger (fixes #643) (#649)

hacobe · web-flow · commit 8c1839715c3b · 2023-01-04T17:31:26.000-08:00
* Fix non-determinism in DAgger reported by #643. The issue was that DAgger demonstrations were loaded from disk for training in a different order each run. This was because the filenames for the saved demonstrations changed each run and that changed the order in which os.listdir returned the filenames. The filenames changed each run, because they included a timestamp and the first 6 characters of a UUID generated without fixing a random seed. This PR fixes the non-determinism by making the filenames the same each run as long as the same random seed is used. It does so by removing the timestamp from the filename and fixing the seed of the UUID. Because the timestamp is removed, the PR introduces a trajectory index in the filename, so that a user can tell the order in which trajectories were created. It also includes the entire UUID instead of just the first 6 characters. Finally, it sorts the filenames returned by os.listdir. listdir returns filenames in an arbitrary order that depends on the file system implementation (https://stackoverflow.com/questions/31534583/is-os-listdir-deterministic). We sort the filenames to ensure the order is consistent across file systems. Why include a UUID in the filename at all? If we removed the UUID from the filename, then the DAgger trainers would not overwrite filenames, because they take care to write to a new directory each round. However, if the InteractiveTrajectoryCollector is used independently of those trainers, then it can end up overwriting filenames without the UUID. Do we need to shuffle the filenames returned by os.listdir after sorting? We could, but the demonstrations loaded from the files are passed to a DataLoader, which shuffles them. That seems like the right place to handle the shuffling rather than making it the responsibility of the utility function that returns the filenames. * Assert that the DAgger demonstration file does not already exist before saving.
* Minor clean-up: Shorten list comprehension. * Make the reproducibility tests more thorough. This PR makes test_trainer_reproducible and test_traj_collector_reproducible more thorough. For test_trainer_reproducible, it tests that the trajectories from rolling out the trained policy are the same each run (instead of just testing that the rewards achieved by the trained policy are the same). For test_traj_collector_reproducible, it tests that the filenames for the files storing DAgger demonstrations are the same each run and that each file in the first run stores the same trajectory as the file with the same filename in the second run (instead of just testing that the observations from the trajectories are the same). * Reduce the number of training iterations in test_trainer_reproducible. This PR reduces the number of training iterations in test_trainer_reproducible, because the previous number of iterations used was for testing that the policy improved with training, but that's not needed to test reproducibility.
diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py
@@ -10,6 +10,7 @@
 import logging
 import os
 import pathlib
+import uuid
 from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union
 
 import numpy as np
@@ -21,7 +22,6 @@
 from imitation.algorithms import base, bc
 from imitation.data import rollout, types
 from imitation.util import logger as imit_logger
-from imitation.util import util
 
 
 class BetaSchedule(abc.ABC):
@@ -99,16 +99,21 @@ def reconstruct_trainer(
 
 def _save_dagger_demo(
     trajectory: types.Trajectory,
+    trajectory_index: int,  # index supplied by the caller; embedded in the filename
     save_dir: types.AnyPath,
+    rng: np.random.Generator,  # source of the random bits for the UUID suffix
     prefix: str = "",
 ) -> None:
     save_dir = types.parse_path(save_dir)
     assert isinstance(trajectory, types.Trajectory)
     actual_prefix = f"{prefix}-" if prefix else ""
-    timestamp = util.make_unique_timestamp()
-    filename = f"{actual_prefix}dagger-demo-{timestamp}.npz"
-
+    randbits = int.from_bytes(rng.bytes(16), "big")  # 16 bytes = 128 bits taken from the caller's rng
+    random_uuid = uuid.UUID(int=randbits, version=4).hex  # version-4 UUID built from those bits, as hex
+    filename = f"{actual_prefix}dagger-demo-{trajectory_index}-{random_uuid}.npz"
     npz_path = save_dir / filename
+    assert (  # refuse to overwrite a demonstration that is already on disk
+        not npz_path.exists()  # NOTE(review): assert statements are skipped under `python -O`
+    ), "The following DAgger demonstration path already exists: {0}".format(npz_path)
     types.save(npz_path, [trajectory])
     logging.info(f"Saved demo at '{npz_path}'")
 
@@ -246,8 +251,8 @@ def step_wait(self) -> VecEnvStepReturn:
             infos=infos,
             dones=dones,
         )
-        for traj in fresh_demos:
-            _save_dagger_demo(traj, self.save_dir)
+        for traj_index, traj in enumerate(fresh_demos):
+            _save_dagger_demo(traj, traj_index, self.save_dir, self.rng)
 
         return next_obs, rews, dones, infos
 
@@ -372,7 +377,13 @@ def _load_all_demos(self) -> Tuple[types.Transitions, List[int]]:
         return demo_transitions, num_demos_by_round
 
     def _get_demo_paths(self, round_dir: pathlib.Path) -> List[pathlib.Path]:
-        return [round_dir / p for p in os.listdir(round_dir) if p.endswith(".npz")]
+        # os.listdir returns filenames in an arbitrary order that depends on the
+        # file system implementation:
+        # https://stackoverflow.com/questions/31534583/is-os-listdir-deterministic
+        # To ensure the order is consistent across runs and file systems,
+        # we sort by the filename.
+        filenames = sorted(os.listdir(round_dir))
+        return [round_dir / f for f in filenames if f.endswith(".npz")]  # only .npz entries, joined onto round_dir
 
     def _demo_dir_path_for_round(self, round_num: Optional[int] = None) -> pathlib.Path:
         if round_num is None:
@@ -570,10 +581,12 @@ def __init__(
         if expert_trajs is not None:
             # Save each initial expert trajectory into the "round 0" demonstration
             # data directory.
-            for traj in expert_trajs:
+            for traj_index, traj in enumerate(expert_trajs):
                 _save_dagger_demo(
                     traj,
+                    traj_index,
                     self._demo_dir_path_for_round(),
+                    self.rng,
                     prefix="initial_data",
                 )
 
diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py
@@ -105,6 +105,63 @@ def get_random_acts(obs):
     assert nonzero_acts == 0
 
 
+def test_traj_collector_reproducible(tmpdir, pendulum_venv):
+    # We run the collector twice with the same random seeds and
+    # check that the following 2 properties hold:
+    # 1) The files written in the first run have the same filenames
+    #    as the files written in the second run.
+    # 2) Each file in the first run stores the same trajectory as
+    #    the file with the same filename in the second run.
+    filename_to_traj_dicts = []
+    with torch.random.fork_rng():  # global torch RNG state is restored on exit
+        for run_idx in range(2):
+            # Use a separate output directory per run and reset the random seeds.
+            save_dir = os.path.join(tmpdir, "run{0}".format(run_idx))
+            rng = np.random.default_rng(12345)
+            pendulum_venv.seed(12345)
+            pendulum_venv.action_space.seed(12345)
+
+            # Run the collector.
+            collector = dagger.InteractiveTrajectoryCollector(
+                venv=pendulum_venv,
+                get_robot_acts=lambda o: [
+                    pendulum_venv.action_space.sample() for _ in range(len(o))
+                ],
+                beta=0.5,
+                save_dir=save_dir,
+                rng=rng,
+            )
+            collector.reset()
+            zero_acts = np.zeros(
+                (pendulum_venv.num_envs,) + pendulum_venv.action_space.shape,
+                dtype=pendulum_venv.action_space.dtype,
+            )
+            for i in range(1000):
+                _, _, dones, _ = collector.step(zero_acts)
+
+            # Load every trajectory the collector saved, keyed by its filename.
+            file_paths = glob.glob(os.path.join(save_dir, "dagger-demo-*.npz"))
+            filename_to_traj_dict = {}
+            for fp in file_paths:
+                traj = types.load_with_rewards(fp)[0]
+                # For the purposes of testing, we remove `infos` from the
+                # trajectory, because `infos` contains the time that it
+                # takes to complete an episode, which we expect to differ
+                # slightly between runs.
+                traj_without_infos = types.TrajectoryWithRew(
+                    obs=traj.obs,
+                    acts=traj.acts,
+                    infos=None,
+                    terminal=traj.terminal,
+                    rews=traj.rews,
+                )
+                filename = os.path.basename(fp)
+                filename_to_traj_dict[filename] = traj_without_infos
+            filename_to_traj_dicts.append(filename_to_traj_dict)
+
+    assert filename_to_traj_dicts[0] == filename_to_traj_dicts[1]  # keys: filenames, values: trajectories
+
+
 def _build_dagger_trainer(
     tmpdir,
     venv,
@@ -325,6 +382,67 @@ def test_trainer_makes_progress(init_trainer_fn, pendulum_venv, pendulum_expert_
     )
 
 
+@pytest.mark.parametrize(
+    "init_trainer_fn",
+    [_build_dagger_trainer, _build_simple_dagger_trainer],
+)
+def test_trainer_reproducible(
+    init_trainer_fn,
+    tmpdir,
+    pendulum_venv,
+    pendulum_expert_policy,
+    custom_logger,
+):
+    # Check that we get the same results if we run the trainer
+    # twice with the same random seeds.
+    # In particular, check that the trajectories from rolling out
+    # the trained policy are the same in each run.
+    run_trajs = []
+    with torch.random.fork_rng():  # global torch RNG state is restored on exit
+        for run_idx in range(2):
+            # Use a separate output directory per run and reset the random seeds.
+            run_dir = os.path.join(tmpdir, "run{0}".format(run_idx))
+            torch.random.manual_seed(12345)
+            rng = np.random.default_rng(12345)
+            pendulum_venv.seed(12345)
+            pendulum_venv.action_space.seed(12345)
+
+            beta_schedule = None
+            maybe_pendulum_expert_trajectories = None
+            trainer = init_trainer_fn(
+                run_dir,
+                pendulum_venv,
+                beta_schedule,
+                pendulum_expert_policy,
+                maybe_pendulum_expert_trajectories,
+                custom_logger,
+                rng,
+            )
+
+            for i in range(2):  # two rounds of collect-then-train
+                collector = trainer.create_trajectory_collector()
+                obs = collector.reset()
+                dones = [False] * pendulum_venv.num_envs
+                while not np.any(dones):  # step until at least one env reports done
+                    expert_actions, _ = pendulum_expert_policy.predict(
+                        obs,
+                        deterministic=True,
+                    )
+                    obs, _, dones, _ = collector.step(expert_actions)
+                trainer.extend_and_update(dict(n_epochs=1))  # presumably one training epoch per round; confirm
+
+            trajs = rollout.rollout(
+                trainer.policy,
+                pendulum_venv,
+                rollout.make_sample_until(min_episodes=2),
+                rng,
+            )
+            run_trajs.append(trajs)
+
+    assert len(run_trajs) == 2
+    assert run_trajs[0] == run_trajs[1]  # rollouts of the trained policy must match across runs
+
+
 def test_trainer_save_reload(tmpdir, init_trainer_fn, pendulum_venv):
     trainer = init_trainer_fn()
     trainer.round_num = 3