Merge branch 'master' into upgrade_pytype

AdamGleave authored Oct 8, 2023
2 parents e9e0fc6 + aca4c07 commit 648a8a3
Showing 40 changed files with 945 additions and 234 deletions.
17 changes: 14 additions & 3 deletions .circleci/config.yml
@@ -138,7 +138,7 @@ commands:
# Download and cache dependencies
- restore_cache:
keys:
- v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}
- v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}

- run:
name: install python and binary dependencies
@@ -168,21 +168,29 @@ commands:
- save_cache:
paths:
- .\venv
key: v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}
key: v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}

- run:
name: install imitation
command: |
.\venv\Scripts\activate
pip install --upgrade --force-reinstall --no-deps .
shell: powershell.exe

- run:
name: print installed packages
command: |
.\venv\Scripts\activate
pip freeze --all
shell: powershell.exe

- run:
name: enable long path
command: |
New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" `
-Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force
shell: powershell.exe

restore-pytest-cache:
description: "Restore .pytest_cache from CircleCI cache."
steps:
@@ -350,7 +358,10 @@ jobs:
name: run tests
command: |
source ~/venv/bin/activate
TESTFILES=$(circleci tests glob tests/**/test*.py | circleci tests split --split-by=timings)
ALL_TESTFILES=$(circleci tests glob "tests/**/test*.py")
echo "All testfiles: ${ALL_TESTFILES}"
TESTFILES=$(circleci tests glob "tests/**/test*.py" | circleci tests split --split-by=timings)
echo "This shard testing: ${TESTFILES}"
pytest -n auto --junitxml=/tmp/test-reports/junit.xml -vv $TESTFILES
- save-pytest-cache
3 changes: 3 additions & 0 deletions docs/conf.py
@@ -55,6 +55,9 @@
nb_execution_timeout = 120
nb_merge_streams = True
nb_output_stderr = "remove"
nb_execution_raise_on_error = True
nb_execution_show_tb = True

# Enable LaTeX macros in markdown cells
myst_enable_extensions = [
"amsmath",
2 changes: 0 additions & 2 deletions docs/requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion docs/tutorials/3_train_gail.ipynb
@@ -126,7 +126,7 @@
")\n",
"\n",
"# train the learner and evaluate again\n",
"gail_trainer.train(800_000)\n",
"gail_trainer.train(200_000)\n",
"env.seed(SEED)\n",
"learner_rewards_after_training, _ = evaluate_policy(\n",
" learner, env, 100, return_episode_rewards=True\n",
26 changes: 13 additions & 13 deletions docs/tutorials/4_train_airl.ipynb
@@ -23,8 +23,8 @@
"metadata": {},
"outputs": [],
"source": [
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"import numpy as np\n",
"import gymnasium as gym\n",
"from imitation.policies.serialize import load_policy\n",
"from imitation.util.util import make_vec_env\n",
"from imitation.data.wrappers import RolloutInfoWrapper\n",
@@ -34,11 +34,11 @@
"FAST = True\n",
"\n",
"if FAST:\n",
" N_RL_TRAIN_STEPS = 800_000\n",
" N_RL_TRAIN_STEPS = 300_000\n",
"else:\n",
" N_RL_TRAIN_STEPS = 2_000_000\n",
"\n",
"env = make_vec_env(\n",
"venv = make_vec_env(\n",
" \"seals/CartPole-v0\",\n",
" rng=np.random.default_rng(SEED),\n",
" n_envs=8,\n",
@@ -50,7 +50,7 @@
" \"ppo-huggingface\",\n",
" organization=\"HumanCompatibleAI\",\n",
" env_name=\"seals-CartPole-v0\",\n",
" venv=env,\n",
" venv=venv,\n",
")"
]
},
@@ -71,7 +71,7 @@
"\n",
"rollouts = rollout.rollout(\n",
" expert,\n",
" env,\n",
" venv,\n",
" rollout.make_sample_until(min_timesteps=None, min_episodes=60),\n",
" rng=np.random.default_rng(SEED),\n",
")"
@@ -101,7 +101,7 @@
"\n",
"\n",
"learner = PPO(\n",
" env=env,\n",
" env=venv,\n",
" policy=MlpPolicy,\n",
" batch_size=64,\n",
" ent_coef=0.0,\n",
@@ -113,28 +113,28 @@
" seed=SEED,\n",
")\n",
"reward_net = BasicShapedRewardNet(\n",
" observation_space=env.observation_space,\n",
" action_space=env.action_space,\n",
" observation_space=venv.observation_space,\n",
" action_space=venv.action_space,\n",
" normalize_input_layer=RunningNorm,\n",
")\n",
"airl_trainer = AIRL(\n",
" demonstrations=rollouts,\n",
" demo_batch_size=2048,\n",
" gen_replay_buffer_capacity=512,\n",
" n_disc_updates_per_round=16,\n",
" venv=env,\n",
" venv=venv,\n",
" gen_algo=learner,\n",
" reward_net=reward_net,\n",
")\n",
"\n",
"env.reset(seed=SEED)\n",
"venv.seed(SEED)\n",
"learner_rewards_before_training, _ = evaluate_policy(\n",
" learner, env, 100, return_episode_rewards=True\n",
" learner, venv, 100, return_episode_rewards=True\n",
")\n",
"airl_trainer.train(N_RL_TRAIN_STEPS)\n",
"env.seed(SEED)\n",
"venv.seed(SEED)\n",
"learner_rewards_after_training, _ = evaluate_policy(\n",
" learner, env, 100, return_episode_rewards=True\n",
" learner, venv, 100, return_episode_rewards=True\n",
")"
]
},
6 changes: 4 additions & 2 deletions docs/tutorials/5a_train_preference_comparisons_with_cnn.ipynb
@@ -29,7 +29,7 @@
"source": [
"import torch as th\n",
"import gymnasium as gym\n",
"from gym.wrappers import TimeLimit\n",
"from gymnasium.wrappers import TimeLimit\n",
"import numpy as np\n",
"\n",
"from seals.util import AutoResetWrapper\n",
@@ -64,7 +64,9 @@
"\n",
"# For real training, you will want a vectorized environment with 8 environments in parallel.\n",
"# This can be done by passing in n_envs=8 as an argument to make_vec_env.\n",
"venv = make_vec_env(constant_length_asteroids, env_kwargs={\"num_steps\": 100})\n",
"# The seed needs to be set to 1 for reproducibility and also to avoid win32\n",
"# np.random.randint high bound error.\n",
"venv = make_vec_env(constant_length_asteroids, env_kwargs={\"num_steps\": 100}, seed=1)\n",
"venv = VecFrameStack(venv, n_stack=4)\n",
"\n",
"reward_net = CnnRewardNet(\n",
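The comment in the hunk above recommends n_envs=8 for real training runs and pins seed=1 for reproducibility. A rough sketch of what that fuller call could look like — assuming make_vec_env here is stable-baselines3's helper (its env_kwargs/seed signature matches the call in the diff) and using a simplified stand-in for the notebook's constant_length_asteroids factory:

import gymnasium as gym
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack

def constant_length_asteroids(num_steps: int) -> gym.Env:
    # Simplified stand-in for the notebook's factory; the real cell also
    # applies seals.util.AutoResetWrapper before the time limit.
    env = gym.make("AsteroidsNoFrameskip-v4")
    return gym.wrappers.TimeLimit(env, max_episode_steps=num_steps)

venv = make_vec_env(
    constant_length_asteroids,
    env_kwargs={"num_steps": 100},
    n_envs=8,  # parallel environments for a real training run
    seed=1,    # fixed seed: reproducible rollouts, avoids the win32 randint issue noted above
)
venv = VecFrameStack(venv, n_stack=4)  # stack frames so the CNN reward net can see motion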
9 changes: 4 additions & 5 deletions docs/tutorials/8a_train_sqil_sac.ipynb
@@ -7,14 +7,14 @@
"[download this notebook here](https://github.com/HumanCompatibleAI/imitation/blob/master/docs/tutorials/8a_train_sqil_sac.ipynb)\n",
"# Train an Agent using Soft Q Imitation Learning with SAC\n",
"\n",
"In the previous tutorial, we used Soft Q Imitation Learning ([SQIL](https://arxiv.org/abs/1905.11108)) on top of the DQN base algorithm. In fact, SQIL can be combined with any off-policy algorithm from `stable_baselines3`. Here, we train a HalfCheetah agent using SQIL + SAC."
"In the previous tutorial, we used Soft Q Imitation Learning ([SQIL](https://arxiv.org/abs/1905.11108)) on top of the DQN base algorithm. In fact, SQIL can be combined with any off-policy algorithm from `stable_baselines3`. Here, we train a Pendulum agent using SQIL + SAC."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, we need some expert trajectories in our environment (`seals/HalfCheetah-v0`).\n",
"First, we need some expert trajectories in our environment (`Pendulum-v1`).\n",
"Note that you can use other environments, but the action space must be continuous."
]
},
@@ -28,7 +28,7 @@
"from imitation.data import huggingface_utils\n",
"\n",
"# Download some expert trajectories from the HuggingFace Datasets Hub.\n",
"dataset = datasets.load_dataset(\"HumanCompatibleAI/ppo-seals-HalfCheetah-v0\")\n",
"dataset = datasets.load_dataset(\"HumanCompatibleAI/ppo-Pendulum-v1\")\n",
"\n",
"# Convert the dataset to a format usable by the imitation library.\n",
"expert_trajectories = huggingface_utils.TrajectoryDatasetSequence(dataset[\"train\"])"
@@ -75,12 +75,11 @@
"from imitation.util.util import make_vec_env\n",
"import numpy as np\n",
"from stable_baselines3 import sac\n",
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"\n",
"SEED = 42\n",
"\n",
"venv = make_vec_env(\n",
" \"seals/HalfCheetah-v0\",\n",
" \"Pendulum-v1\",\n",
" rng=np.random.default_rng(seed=SEED),\n",
")\n",
"\n",
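The markdown cell above notes that SQIL can wrap any off-policy algorithm from stable_baselines3. For orientation, a minimal sketch of how the rest of this tutorial plugs SAC into imitation's SQIL trainer — the keyword names (rl_algo_class, rl_kwargs) and the train() call are assumptions carried over from the DQN-based tutorial, not verbatim notebook code:

from imitation.algorithms import sqil
from stable_baselines3 import sac

sqil_trainer = sqil.SQIL(
    venv=venv,                       # the Pendulum-v1 vec env built above
    demonstrations=expert_trajectories,
    policy="MlpPolicy",
    rl_algo_class=sac.SAC,           # any SB3 off-policy algorithm can be dropped in here
    rl_kwargs=dict(seed=SEED),
)
sqil_trainer.train(total_timesteps=1_000)  # tiny budget for the docs; increase for real training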
2 changes: 0 additions & 2 deletions readthedocs.yml
@@ -12,8 +12,6 @@ build:

python:
install:
# TODO(GH#707): remove docs/requirements.txt once Gym upgraded
- requirements: docs/requirements.txt
- method: pip
path: .
extra_requirements:
4 changes: 4 additions & 0 deletions setup.cfg
@@ -36,6 +36,10 @@ filterwarnings =
markers =
expensive: mark a test as expensive (deselect with '-m "not expensive"')

# Terminate the test just before CircleCI's 10-minute timeout so we see the test failure
# instead of a timeout.
timeout = 590

[coverage:run]
source = imitation
include=
5 changes: 3 additions & 2 deletions setup.py
@@ -48,6 +48,7 @@
"pytest~=7.1.2",
"pytest-cov~=3.0.0",
"pytest-notebook==0.8.0",
"pytest-timeout~=2.1.0",
"pytest-xdist~=2.5.0",
"scipy~=1.9.0",
"wandb==0.12.21",
@@ -187,7 +188,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
# encode only known incompatibilities here. This prevents nasty dependency issues
# for our users.
install_requires=[
"gymnasium[classic-control]~=0.28.1",
"gymnasium[classic-control]~=0.29",
"matplotlib",
"numpy>=1.15",
"torch>=1.4.0",
@@ -219,7 +220,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
"docs": DOCS_REQUIRE,
"parallel": PARALLEL_REQUIRE,
"mujoco": [
"gymnasium[classic-control,mujoco]~=0.28.1",
"gymnasium[classic-control,mujoco]~=0.29",
],
"atari": ATARI_REQUIRE,
},
40 changes: 33 additions & 7 deletions src/imitation/algorithms/bc.py
@@ -9,6 +9,7 @@
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
Mapping,
@@ -22,7 +23,7 @@
import numpy as np
import torch as th
import tqdm
from stable_baselines3.common import policies, utils, vec_env
from stable_baselines3.common import policies, torch_layers, utils, vec_env

from imitation.algorithms import base as algo_base
from imitation.data import rollout, types
@@ -99,7 +100,12 @@ class BehaviorCloningLossCalculator:
def __call__(
self,
policy: policies.ActorCriticPolicy,
obs: Union[th.Tensor, np.ndarray],
obs: Union[
types.AnyTensor,
types.DictObs,
Dict[str, np.ndarray],
Dict[str, th.Tensor],
],
acts: Union[th.Tensor, np.ndarray],
) -> BCTrainingMetrics:
"""Calculate the supervised learning loss used to train the behavioral clone.
@@ -113,9 +119,18 @@ def __call__(
A BCTrainingMetrics object with the loss and all the components it
consists of.
"""
obs = util.safe_to_tensor(obs)
tensor_obs = types.map_maybe_dict(
util.safe_to_tensor,
types.maybe_unwrap_dictobs(obs),
)
acts = util.safe_to_tensor(acts)
_, log_prob, entropy = policy.evaluate_actions(obs, acts)

# policy.evaluate_actions's type signatures are incorrect.
# See https://github.com/DLR-RM/stable-baselines3/issues/1679
(_, log_prob, entropy) = policy.evaluate_actions(
tensor_obs, # type: ignore[arg-type]
acts,
)
prob_true_act = th.exp(log_prob).mean()
log_prob = log_prob.mean()
entropy = entropy.mean() if entropy is not None else None
@@ -324,12 +339,18 @@ def __init__(
self.rng = rng

if policy is None:
extractor = (
torch_layers.CombinedExtractor
if isinstance(observation_space, gym.spaces.Dict)
else torch_layers.FlattenExtractor
)
policy = policy_base.FeedForward32Policy(
observation_space=observation_space,
action_space=action_space,
# Set lr_schedule to max value to force error if policy.optimizer
# is used by mistake (should use self.optimizer instead).
lr_schedule=lambda _: th.finfo(th.float32).max,
features_extractor_class=extractor,
)
self._policy = policy.to(utils.get_device(device))
# TODO(adam): make policy mandatory and delete observation/action space params?
@@ -464,9 +485,14 @@ def process_batch():
minibatch_size,
num_samples_so_far,
), batch in batches_with_stats:
obs = th.as_tensor(batch["obs"], device=self.policy.device).detach()
acts = th.as_tensor(batch["acts"], device=self.policy.device).detach()
training_metrics = self.loss_calculator(self.policy, obs, acts)
obs_tensor: Union[th.Tensor, Dict[str, th.Tensor]]
# unwraps the observation if it's a dictobs and converts arrays to tensors
obs_tensor = types.map_maybe_dict(
lambda x: util.safe_to_tensor(x, device=self.policy.device),
types.maybe_unwrap_dictobs(batch["obs"]),
)
acts = util.safe_to_tensor(batch["acts"], device=self.policy.device)
training_metrics = self.loss_calculator(self.policy, obs_tensor, acts)

# Renormalise the loss to be averaged over the whole
# batch size instead of the minibatch size.
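The __call__ changes above convert possibly-dict observations to tensors before handing them to policy.evaluate_actions; the objective itself is unchanged. As a reminder of what BehaviorCloningLossCalculator computes, here is a plain-PyTorch sketch of the same negative-log-likelihood loss with entropy and L2 terms — the weights and structure mirror the class but are illustrative, not imitation's exact code:

import torch as th

def bc_loss_sketch(policy, obs, acts, ent_weight=1e-3, l2_weight=0.0):
    # Log-likelihood of expert actions under the current policy.
    _, log_prob, entropy = policy.evaluate_actions(obs, acts)
    neglogp = -log_prob.mean()
    # Entropy bonus discourages overly deterministic, brittle clones.
    ent_term = entropy.mean() if entropy is not None else th.tensor(0.0)
    # Optional weight-decay-style regularisation over policy parameters.
    l2_norm = sum(th.sum(th.square(w)) for w in policy.parameters())
    return neglogp - ent_weight * ent_term + l2_weight * l2_norm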