Merge branch 'master' into upgrade_pytype

AdamGleave authored Oct 8, 2023
2 parents e9e0fc6 + aca4c07 commit 648a8a3
Showing 40 changed files with 945 additions and 234 deletions.
17 changes: 14 additions & 3 deletions .circleci/config.yml
@@ -138,7 +138,7 @@ commands:
# Download and cache dependencies
- restore_cache:
keys:
- v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}
- v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}

- run:
name: install python and binary dependencies
@@ -168,21 +168,29 @@ commands:
- save_cache:
paths:
- .\venv
key: v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}
key: v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}

- run:
name: install imitation
command: |
.\venv\Scripts\activate
pip install --upgrade --force-reinstall --no-deps .
shell: powershell.exe

- run:
name: print installed packages
command: |
.\venv\Scripts\activate
pip freeze --all
shell: powershell.exe

- run:
name: enable long path
command: |
New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" `
-Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force
shell: powershell.exe

restore-pytest-cache:
description: "Restore .pytest_cache from CircleCI cache."
steps:
@@ -350,7 +358,10 @@ jobs:
name: run tests
command: |
source ~/venv/bin/activate
TESTFILES=$(circleci tests glob tests/**/test*.py | circleci tests split --split-by=timings)
ALL_TESTFILES=$(circleci tests glob "tests/**/test*.py")
echo "All testfiles: ${ALL_TESTFILES}"
TESTFILES=$(circleci tests glob "tests/**/test*.py" | circleci tests split --split-by=timings)
echo "This shard testing: ${TESTFILES}"
pytest -n auto --junitxml=/tmp/test-reports/junit.xml -vv $TESTFILES
- save-pytest-cache
3 changes: 3 additions & 0 deletions docs/conf.py
@@ -55,6 +55,9 @@
nb_execution_timeout = 120
nb_merge_streams = True
nb_output_stderr = "remove"
nb_execution_raise_on_error = True
nb_execution_show_tb = True

# Enable LaTeX macros in markdown cells
myst_enable_extensions = [
"amsmath",
2 changes: 0 additions & 2 deletions docs/requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion docs/tutorials/3_train_gail.ipynb
@@ -126,7 +126,7 @@
")\n",
"\n",
"# train the learner and evaluate again\n",
"gail_trainer.train(800_000)\n",
"gail_trainer.train(200_000)\n",
"env.seed(SEED)\n",
"learner_rewards_after_training, _ = evaluate_policy(\n",
" learner, env, 100, return_episode_rewards=True\n",
26 changes: 13 additions & 13 deletions docs/tutorials/4_train_airl.ipynb
@@ -23,8 +23,8 @@
"metadata": {},
"outputs": [],
"source": [
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"import numpy as np\n",
"import gymnasium as gym\n",
"from imitation.policies.serialize import load_policy\n",
"from imitation.util.util import make_vec_env\n",
"from imitation.data.wrappers import RolloutInfoWrapper\n",
@@ -34,11 +34,11 @@
"FAST = True\n",
"\n",
"if FAST:\n",
" N_RL_TRAIN_STEPS = 800_000\n",
" N_RL_TRAIN_STEPS = 300_000\n",
"else:\n",
" N_RL_TRAIN_STEPS = 2_000_000\n",
"\n",
"env = make_vec_env(\n",
"venv = make_vec_env(\n",
" \"seals/CartPole-v0\",\n",
" rng=np.random.default_rng(SEED),\n",
" n_envs=8,\n",
@@ -50,7 +50,7 @@
" \"ppo-huggingface\",\n",
" organization=\"HumanCompatibleAI\",\n",
" env_name=\"seals-CartPole-v0\",\n",
" venv=env,\n",
" venv=venv,\n",
")"
]
},
@@ -71,7 +71,7 @@
"\n",
"rollouts = rollout.rollout(\n",
" expert,\n",
" env,\n",
" venv,\n",
" rollout.make_sample_until(min_timesteps=None, min_episodes=60),\n",
" rng=np.random.default_rng(SEED),\n",
")"
@@ -101,7 +101,7 @@
"\n",
"\n",
"learner = PPO(\n",
" env=env,\n",
" env=venv,\n",
" policy=MlpPolicy,\n",
" batch_size=64,\n",
" ent_coef=0.0,\n",
@@ -113,28 +113,28 @@
" seed=SEED,\n",
")\n",
"reward_net = BasicShapedRewardNet(\n",
" observation_space=env.observation_space,\n",
" action_space=env.action_space,\n",
" observation_space=venv.observation_space,\n",
" action_space=venv.action_space,\n",
" normalize_input_layer=RunningNorm,\n",
")\n",
"airl_trainer = AIRL(\n",
" demonstrations=rollouts,\n",
" demo_batch_size=2048,\n",
" gen_replay_buffer_capacity=512,\n",
" n_disc_updates_per_round=16,\n",
" venv=env,\n",
" venv=venv,\n",
" gen_algo=learner,\n",
" reward_net=reward_net,\n",
")\n",
"\n",
"env.reset(seed=SEED)\n",
"venv.seed(SEED)\n",
"learner_rewards_before_training, _ = evaluate_policy(\n",
" learner, env, 100, return_episode_rewards=True\n",
" learner, venv, 100, return_episode_rewards=True\n",
")\n",
"airl_trainer.train(N_RL_TRAIN_STEPS)\n",
"env.seed(SEED)\n",
"venv.seed(SEED)\n",
"learner_rewards_after_training, _ = evaluate_policy(\n",
" learner, env, 100, return_episode_rewards=True\n",
" learner, venv, 100, return_episode_rewards=True\n",
")"
]
},
6 changes: 4 additions & 2 deletions docs/tutorials/5a_train_preference_comparisons_with_cnn.ipynb
@@ -29,7 +29,7 @@
"source": [
"import torch as th\n",
"import gymnasium as gym\n",
"from gym.wrappers import TimeLimit\n",
"from gymnasium.wrappers import TimeLimit\n",
"import numpy as np\n",
"\n",
"from seals.util import AutoResetWrapper\n",
@@ -64,7 +64,9 @@
"\n",
"# For real training, you will want a vectorized environment with 8 environments in parallel.\n",
"# This can be done by passing in n_envs=8 as an argument to make_vec_env.\n",
"venv = make_vec_env(constant_length_asteroids, env_kwargs={\"num_steps\": 100})\n",
"# The seed needs to be set to 1 for reproducibility and also to avoid win32\n",
"# np.random.randint high bound error.\n",
"venv = make_vec_env(constant_length_asteroids, env_kwargs={\"num_steps\": 100}, seed=1)\n",
"venv = VecFrameStack(venv, n_stack=4)\n",
"\n",
"reward_net = CnnRewardNet(\n",
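The comment in the hunk above recommends n_envs=8 for real training runs and pins seed=1 for reproducibility. A rough sketch of what that fuller call could look like — assuming make_vec_env here is stable-baselines3's helper (its env_kwargs/seed signature matches the call in the diff) and using a simplified stand-in for the notebook's constant_length_asteroids factory:

import gymnasium as gym
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack

def constant_length_asteroids(num_steps: int) -> gym.Env:
    # Simplified stand-in for the notebook's factory; the real cell also
    # applies seals.util.AutoResetWrapper before the time limit.
    env = gym.make("AsteroidsNoFrameskip-v4")
    return gym.wrappers.TimeLimit(env, max_episode_steps=num_steps)

venv = make_vec_env(
    constant_length_asteroids,
    env_kwargs={"num_steps": 100},
    n_envs=8,  # parallel environments for a real training run
    seed=1,    # fixed seed: reproducible rollouts, avoids the win32 randint issue noted above
)
venv = VecFrameStack(venv, n_stack=4)  # stack frames so the CNN reward net can see motion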
9 changes: 4 additions & 5 deletions docs/tutorials/8a_train_sqil_sac.ipynb
@@ -7,14 +7,14 @@
"[download this notebook here](https://github.com/HumanCompatibleAI/imitation/blob/master/docs/tutorials/8a_train_sqil_sac.ipynb)\n",
"# Train an Agent using Soft Q Imitation Learning with SAC\n",
"\n",
"In the previous tutorial, we used Soft Q Imitation Learning ([SQIL](https://arxiv.org/abs/1905.11108)) on top of the DQN base algorithm. In fact, SQIL can be combined with any off-policy algorithm from `stable_baselines3`. Here, we train a HalfCheetah agent using SQIL + SAC."
"In the previous tutorial, we used Soft Q Imitation Learning ([SQIL](https://arxiv.org/abs/1905.11108)) on top of the DQN base algorithm. In fact, SQIL can be combined with any off-policy algorithm from `stable_baselines3`. Here, we train a Pendulum agent using SQIL + SAC."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, we need some expert trajectories in our environment (`seals/HalfCheetah-v0`).\n",
"First, we need some expert trajectories in our environment (`Pendulum-v1`).\n",
"Note that you can use other environments, but the action space must be continuous."
]
},
@@ -28,7 +28,7 @@
"from imitation.data import huggingface_utils\n",
"\n",
"# Download some expert trajectories from the HuggingFace Datasets Hub.\n",
"dataset = datasets.load_dataset(\"HumanCompatibleAI/ppo-seals-HalfCheetah-v0\")\n",
"dataset = datasets.load_dataset(\"HumanCompatibleAI/ppo-Pendulum-v1\")\n",
"\n",
"# Convert the dataset to a format usable by the imitation library.\n",
"expert_trajectories = huggingface_utils.TrajectoryDatasetSequence(dataset[\"train\"])"
@@ -75,12 +75,11 @@
"from imitation.util.util import make_vec_env\n",
"import numpy as np\n",
"from stable_baselines3 import sac\n",
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"\n",
"SEED = 42\n",
"\n",
"venv = make_vec_env(\n",
" \"seals/HalfCheetah-v0\",\n",
" \"Pendulum-v1\",\n",
" rng=np.random.default_rng(seed=SEED),\n",
")\n",
"\n",
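The markdown cell above notes that SQIL can wrap any off-policy algorithm from stable_baselines3. For orientation, a minimal sketch of how the rest of this tutorial plugs SAC into imitation's SQIL trainer — the keyword names (rl_algo_class, rl_kwargs) and the train() call are assumptions carried over from the DQN-based tutorial, not verbatim notebook code:

from imitation.algorithms import sqil
from stable_baselines3 import sac

sqil_trainer = sqil.SQIL(
    venv=venv,                       # the Pendulum-v1 vec env built above
    demonstrations=expert_trajectories,
    policy="MlpPolicy",
    rl_algo_class=sac.SAC,           # any SB3 off-policy algorithm can be dropped in here
    rl_kwargs=dict(seed=SEED),
)
sqil_trainer.train(total_timesteps=1_000)  # tiny budget for the docs; increase for real training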
2 changes: 0 additions & 2 deletions readthedocs.yml
@@ -12,8 +12,6 @@ build:

python:
install:
# TODO(GH#707): remove docs/requirements.txt once Gym upgraded
- requirements: docs/requirements.txt
- method: pip
path: .
extra_requirements:
4 changes: 4 additions & 0 deletions setup.cfg
@@ -36,6 +36,10 @@ filterwarnings =
markers =
expensive: mark a test as expensive (deselect with '-m "not expensive"')

# Terminate the test just before CircleCI's 10-minute timeout so we see the test failure
# instead of a timeout.
timeout = 590

[coverage:run]
source = imitation
include=
5 changes: 3 additions & 2 deletions setup.py
@@ -48,6 +48,7 @@
"pytest~=7.1.2",
"pytest-cov~=3.0.0",
"pytest-notebook==0.8.0",
"pytest-timeout~=2.1.0",
"pytest-xdist~=2.5.0",
"scipy~=1.9.0",
"wandb==0.12.21",
@@ -187,7 +188,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
# encode only known incompatibilities here. This prevents nasty dependency issues
# for our users.
install_requires=[
"gymnasium[classic-control]~=0.28.1",
"gymnasium[classic-control]~=0.29",
"matplotlib",
"numpy>=1.15",
"torch>=1.4.0",
@@ -219,7 +220,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
"docs": DOCS_REQUIRE,
"parallel": PARALLEL_REQUIRE,
"mujoco": [
"gymnasium[classic-control,mujoco]~=0.28.1",
"gymnasium[classic-control,mujoco]~=0.29",
],
"atari": ATARI_REQUIRE,
},
40 changes: 33 additions & 7 deletions src/imitation/algorithms/bc.py
@@ -9,6 +9,7 @@
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
Mapping,
@@ -22,7 +23,7 @@
import numpy as np
import torch as th
import tqdm
from stable_baselines3.common import policies, utils, vec_env
from stable_baselines3.common import policies, torch_layers, utils, vec_env

from imitation.algorithms import base as algo_base
from imitation.data import rollout, types
@@ -99,7 +100,12 @@ class BehaviorCloningLossCalculator:
def __call__(
self,
policy: policies.ActorCriticPolicy,
obs: Union[th.Tensor, np.ndarray],
obs: Union[
types.AnyTensor,
types.DictObs,
Dict[str, np.ndarray],
Dict[str, th.Tensor],
],
acts: Union[th.Tensor, np.ndarray],
) -> BCTrainingMetrics:
"""Calculate the supervised learning loss used to train the behavioral clone.
@@ -113,9 +119,18 @@ def __call__(
A BCTrainingMetrics object with the loss and all the components it
consists of.
"""
obs = util.safe_to_tensor(obs)
tensor_obs = types.map_maybe_dict(
util.safe_to_tensor,
types.maybe_unwrap_dictobs(obs),
)
acts = util.safe_to_tensor(acts)
_, log_prob, entropy = policy.evaluate_actions(obs, acts)

# policy.evaluate_actions's type signatures are incorrect.
# See https://github.com/DLR-RM/stable-baselines3/issues/1679
(_, log_prob, entropy) = policy.evaluate_actions(
tensor_obs, # type: ignore[arg-type]
acts,
)
prob_true_act = th.exp(log_prob).mean()
log_prob = log_prob.mean()
entropy = entropy.mean() if entropy is not None else None
@@ -324,12 +339,18 @@ def __init__(
self.rng = rng

if policy is None:
extractor = (
torch_layers.CombinedExtractor
if isinstance(observation_space, gym.spaces.Dict)
else torch_layers.FlattenExtractor
)
policy = policy_base.FeedForward32Policy(
observation_space=observation_space,
action_space=action_space,
# Set lr_schedule to max value to force error if policy.optimizer
# is used by mistake (should use self.optimizer instead).
lr_schedule=lambda _: th.finfo(th.float32).max,
features_extractor_class=extractor,
)
self._policy = policy.to(utils.get_device(device))
# TODO(adam): make policy mandatory and delete observation/action space params?
@@ -464,9 +485,14 @@ def process_batch():
minibatch_size,
num_samples_so_far,
), batch in batches_with_stats:
obs = th.as_tensor(batch["obs"], device=self.policy.device).detach()
acts = th.as_tensor(batch["acts"], device=self.policy.device).detach()
training_metrics = self.loss_calculator(self.policy, obs, acts)
obs_tensor: Union[th.Tensor, Dict[str, th.Tensor]]
# unwraps the observation if it's a dictobs and converts arrays to tensors
obs_tensor = types.map_maybe_dict(
lambda x: util.safe_to_tensor(x, device=self.policy.device),
types.maybe_unwrap_dictobs(batch["obs"]),
)
acts = util.safe_to_tensor(batch["acts"], device=self.policy.device)
training_metrics = self.loss_calculator(self.policy, obs_tensor, acts)

# Renormalise the loss to be averaged over the whole
# batch size instead of the minibatch size.
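The __call__ changes above convert possibly-dict observations to tensors before handing them to policy.evaluate_actions; the objective itself is unchanged. As a reminder of what BehaviorCloningLossCalculator computes, here is a plain-PyTorch sketch of the same negative-log-likelihood loss with entropy and L2 terms — the weights and structure mirror the class but are illustrative, not imitation's exact code:

import torch as th

def bc_loss_sketch(policy, obs, acts, ent_weight=1e-3, l2_weight=0.0):
    # Log-likelihood of expert actions under the current policy.
    _, log_prob, entropy = policy.evaluate_actions(obs, acts)
    neglogp = -log_prob.mean()
    # Entropy bonus discourages overly deterministic, brittle clones.
    ent_term = entropy.mean() if entropy is not None else th.tensor(0.0)
    # Optional weight-decay-style regularisation over policy parameters.
    l2_norm = sum(th.sum(th.square(w)) for w in policy.parameters())
    return neglogp - ent_weight * ent_term + l2_weight * l2_norm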