
Commit

Merge branch 'master' into benchmarking-docs
taufeeque9 committed Oct 16, 2023
2 parents 8ccbdb0 + 20366b0 commit ef3ac84
Showing 147 changed files with 6,207 additions and 1,271 deletions.
21 changes: 16 additions & 5 deletions .circleci/config.yml
@@ -65,7 +65,7 @@ commands:
# Download and cache dependencies
- restore_cache:
keys:
- v7linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}
- v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}

- run:
name: install dependencies
@@ -75,7 +75,7 @@ commands:
- save_cache:
paths:
- /venv
key: v7linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}
key: v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}

- run:
name: install imitation
@@ -138,7 +138,7 @@ commands:
# Download and cache dependencies
- restore_cache:
keys:
- v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}
- v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}

- run:
name: install python and binary dependencies
@@ -168,21 +168,29 @@ commands:
- save_cache:
paths:
- .\venv
key: v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}
key: v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }}

- run:
name: install imitation
command: |
.\venv\Scripts\activate
pip install --upgrade --force-reinstall --no-deps .
shell: powershell.exe

- run:
name: print installed packages
command: |
.\venv\Scripts\activate
pip freeze --all
shell: powershell.exe

- run:
name: enable long path
command: |
New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" `
-Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force
shell: powershell.exe

restore-pytest-cache:
description: "Restore .pytest_cache from CircleCI cache."
steps:
@@ -350,7 +358,10 @@ jobs:
name: run tests
command: |
source ~/venv/bin/activate
TESTFILES=$(circleci tests glob tests/**/test*.py | circleci tests split --split-by=timings)
ALL_TESTFILES=$(circleci tests glob "tests/**/test*.py")
echo "All testfiles: ${ALL_TESTFILES}"
TESTFILES=$(circleci tests glob "tests/**/test*.py" | circleci tests split --split-by=timings)
echo "This shard testing: ${TESTFILES}"
pytest -n auto --junitxml=/tmp/test-reports/junit.xml -vv $TESTFILES
- save-pytest-cache
14 changes: 7 additions & 7 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@
repos:
# Linting
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v4.4.0
hooks:
- id: check-ast
- id: trailing-whitespace
@@ -12,7 +12,7 @@ repos:
- id: check-toml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 22.6.0
rev: 23.9.1
hooks:
- id: black
- id: black-jupyter
@@ -22,7 +22,7 @@ repos:
- id: isort
# Python static analysis
- repo: https://github.com/pycqa/flake8
rev: '5.0.4'
rev: '6.1.0'
hooks:
- id: flake8
additional_dependencies:
@@ -34,7 +34,7 @@ repos:
- flake8-docstrings~=1.6.0
# Shell static analysis
- repo: https://github.com/koalaman/shellcheck-precommit
rev: v0.8.0
rev: v0.9.0
hooks:
- id: shellcheck
# precommit invokes shellcheck once per file. shellcheck complains if file
@@ -43,12 +43,12 @@ repos:
args: ["-e", "SC1091"]
# Misc
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.2.4
hooks:
- id: codespell
args: ["--skip=*.pyc,tests/testdata/*,*.ipynb,*.csv","--ignore-words-list=reacher,ith,iff"]
- repo: https://github.com/syntaqx/git-hooks
rev: v0.0.17
rev: v0.0.18
hooks:
- id: circleci-config-validate
# Hooks that run in local environment (not isolated venv) as they need
@@ -78,7 +78,7 @@ repos:
name: pytype
language: system
types: [python]
entry: "bash -c 'pytype -j ${NUM_CPUS:-auto}'"
entry: "bash -c 'pytype --keep-going -j ${NUM_CPUS:-auto}'"
require_serial: true
verbose: true
- id: docs
5 changes: 3 additions & 2 deletions README.md
@@ -17,6 +17,7 @@ Currently, we have implementations of the algorithms below. 'Discrete' and 'Cont
| [Adversarial Inverse Reinforcement Learning](https://arxiv.org/abs/1710.11248) | [`algorithms.airl`](https://imitation.readthedocs.io/en/latest/algorithms/airl.html) |||
| [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476) | [`algorithms.gail`](https://imitation.readthedocs.io/en/latest/algorithms/gail.html) |||
| [Deep RL from Human Preferences](https://arxiv.org/abs/1706.03741) | [`algorithms.preference_comparisons`](https://imitation.readthedocs.io/en/latest/algorithms/preference_comparisons.html) |||
| [Soft Q Imitation Learning](https://arxiv.org/abs/1905.11108) | [`algorithms.sqil`](https://imitation.readthedocs.io/en/latest/algorithms/sqil.html) |||


You can find [the documentation here](https://imitation.readthedocs.io/en/latest/).
@@ -74,10 +75,10 @@ From [examples/quickstart.sh:](examples/quickstart.sh)
python -m imitation.scripts.train_rl with pendulum environment.fast policy_evaluation.fast rl.fast fast logging.log_dir=quickstart/rl/

# Train GAIL from demonstrations. Tensorboard logs saved in output/ (default log directory).
python -m imitation.scripts.train_adversarial gail with pendulum environment.fast demonstrations.fast policy_evaluation.fast rl.fast fast demonstrations.rollout_path=quickstart/rl/rollouts/final.npz
python -m imitation.scripts.train_adversarial gail with pendulum environment.fast demonstrations.fast policy_evaluation.fast rl.fast fast demonstrations.path=quickstart/rl/rollouts/final.npz demonstrations.source=local

# Train AIRL from demonstrations. Tensorboard logs saved in output/ (default log directory).
python -m imitation.scripts.train_adversarial airl with pendulum environment.fast demonstrations.fast policy_evaluation.fast rl.fast fast demonstrations.rollout_path=quickstart/rl/rollouts/final.npz
python -m imitation.scripts.train_adversarial airl with pendulum environment.fast demonstrations.fast policy_evaluation.fast rl.fast fast demonstrations.path=quickstart/rl/rollouts/final.npz demonstrations.source=local
```

Tips:
34 changes: 27 additions & 7 deletions benchmarking/README.md
@@ -1,19 +1,17 @@
# Benchmarking imitation

This directory contains Sacred configuration files for benchmarking imitation's algorithms. For v0.3.2, these correspond to the hyperparameters used in the paper [imitation: Clean Imitation Learning Implementations](https://www.rocamonde.com/publication/gleave-imitation-2022/).
The `src/imitation/scripts/config/tuned_hps` directory provides the tuned hyperparameter configs for benchmarking imitation. For v0.4.0, these correspond to the hyperparameters used in the paper [imitation: Clean Imitation Learning Implementations](https://arxiv.org/abs/2211.11972).

Configuration files can be loaded either from the CLI or from the Python API. The examples below assume that your current working directory is the root of the `imitation` repository.
Configuration files can be loaded either from the CLI or from the Python API.

## Single benchmark

To run a single benchmark from the command line:

```bash
python -m imitation.scripts.<train_script> <algo> \
--name=<name> with benchmarking/<config_name>.json
python -m imitation.scripts.<train_script> <algo> with <algo>_<env>
```

`train_script` can be either 1) `train_imitation` with `algo` as `bc` or `dagger` or 2) `train_adversarial` with `algo` as `gail` or `airl`.
`train_script` can be either 1) `train_imitation` with `algo` as `bc` or `dagger`, or 2) `train_adversarial` with `algo` as `gail` or `airl`. The `env` can be one of `seals_ant`, `seals_half_cheetah`, `seals_hopper`, `seals_swimmer`, or `seals_walker`. Hyperparameters for other environments have not been tuned yet; you may get reasonable performance by reusing hyperparameters tuned for a similar environment, or you can tune them yourself with the `tuning` script.
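
For example, a representative invocation, assuming the named config follows the `<algo>_<env>` pattern described above:

```bash
# GAIL with the tuned hyperparameters for seals/HalfCheetah
# (config name assumed from the <algo>_<env> convention).
python -m imitation.scripts.train_adversarial gail with gail_seals_half_cheetah
```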

To view the results:

@@ -28,7 +26,8 @@ To run a single benchmark from Python add the config to your Sacred experiment `

```python
...
ex.add_config('benchmarking/<config_name>.json')
from imitation.scripts.<train_script> import <train_ex>
<train_ex>.run(command_name="<algo>", named_configs=["<algo>_<env>"])
```
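
For instance, a minimal sketch of this template, assuming the Sacred experiment object exported by `imitation.scripts.train_adversarial` is named `train_adversarial_ex` and that the tuned named config follows the `<algo>_<env>` pattern (neither name is confirmed by this diff):

```python
# Assumed experiment object name; adjust if your version exports a different one.
from imitation.scripts.train_adversarial import train_adversarial_ex

run = train_adversarial_ex.run(
    command_name="gail",               # the adversarial algorithm to train
    named_configs=["gail_seals_ant"],  # tuned hyperparameters for seals/Ant (assumed name)
)
print(run.result)
```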

## Entire benchmark suite
@@ -97,3 +96,24 @@ To compute a p-value to test whether the differences from the paper are statisti
```bash
python -m imitation.scripts.compare_to_baseline results.csv
```
# Tuning Hyperparameters

The hyperparameters of any algorithm in imitation can be tuned using `src/imitation/scripts/tuning.py`.
The benchmarking hyperparameter configs were generated by tuning the hyperparameters over the search space defined in `scripts/config/tuning.py`.

The tuning script proceeds in two phases:
1. Tune the hyperparameters using the search space provided.
2. Re-evaluate the best hyperparameter config from the first phase (selected by maximum mean return) on a separate set of seeds, and report the mean and standard deviation of these trials.

To use it with the default search space:
```bash
python -m imitation.scripts.tuning with <algo> 'parallel_run_config.base_named_configs=["<env>"]'
```

In this command:
- `<algo>` provides the default search space and settings for the specific algorithm, as defined in `scripts/config/tuning.py`
- `<env>` sets the environment to tune the algorithm in. These are defined in the algorithm-specific `scripts/config/train_[adversarial|imitation|preference_comparisons|rl].py` files. For the already-tuned environments, use the `<algo>_<env>` named configs here (see the example below).
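
For instance, a hypothetical concrete call tuning GAIL on the already-tuned HalfCheetah config:

```bash
# Tune GAIL with its default search space; evaluate on the tuned
# seals/HalfCheetah named config (name assumed from the <algo>_<env> convention).
python -m imitation.scripts.tuning with gail 'parallel_run_config.base_named_configs=["gail_seals_half_cheetah"]'
```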

See the documentation of `scripts/tuning.py` and `scripts/parallel.py` for many other arguments that can be
provided through the command line to change the tuning behavior.
4 changes: 2 additions & 2 deletions benchmarking/util.py
@@ -71,15 +71,15 @@ def clean_config_file(file: pathlib.Path, write_path: pathlib.Path, /) -> None:
# remove key 'agent_path'
config.pop("agent_path")
config.pop("seed")
config.get("demonstrations", {}).pop("rollout_path")
config.get("demonstrations", {}).pop("path")
config.get("expert", {}).get("loader_kwargs", {}).pop("path", None)
env_name = config.pop("environment").pop("gym_id")
config["environment"] = {"gym_id": env_name}
config.pop("show_config", None)

remove_empty_dicts(config)
# files are of the format
# /path/to/file/example_<algo>_<env>_best_hp_eval/<other_info>/sacred/1/config.json
# /path/to/file/<algo>_<env>_best_hp_eval/<other_info>/sacred/1/config.json
# we want to write to /<write_path>/<algo>_<env>.json
with open(write_path / f"{file.parents[3].name}.json", "w") as f:
json.dump(config, f, indent=4)
4 changes: 0 additions & 4 deletions ci/build_and_activate_venv.ps1
@@ -9,8 +9,4 @@ If ($venv -eq $null) {

virtualenv -p python3.8 $venv
& $venv\Scripts\activate
# Note: We need to install these versions of setuptools and wheel to allow installing gym==0.21.0 on Windows.
# See https://github.com/freqtrade/freqtrade/issues/8376
# TODO(GH#707): remove pin once upgraded Gym
python -m pip install --upgrade pip wheel==0.38.4 setuptools==65.5.1
pip install ".[docs,parallel,test]"
5 changes: 3 additions & 2 deletions ci/build_and_activate_venv.sh
@@ -20,8 +20,9 @@ fi
virtualenv -p ${python_version} ${venv}
# shellcheck disable=SC1090,SC1091
source ${venv}/bin/activate
# Note: We need to install setuptools==66.1.1 to allow installing gym==0.21.0.
python -m pip install --upgrade pip setuptools==66.1.1

# Update pip to the latest version.
pip install --upgrade pip

# If platform is linux, install pytorch CPU version.
# This will prevent installing the CUDA version in the pip install ".[docs,parallel,test]" command.
1 change: 0 additions & 1 deletion ci/clean_notebooks.py
@@ -63,7 +63,6 @@ def clean_notebook(file: pathlib.Path, check_only=False) -> None:
print(f"Checking {file}")

for cell in nb.cells:

# Remove empty cells
if cell["cell_type"] == "code" and not cell["source"]:
if check_only:
6 changes: 0 additions & 6 deletions docs/_templates/autosummary/module.rst
@@ -22,12 +22,6 @@
# import all functions from module since examples don't import them
from {{ fullname }} import *

.. doctest::

# empty test needed in case the module has no example usage.
# otherwise, testsetup throws an error
pass

.. autosummary::
{% for item in functions %}
{{ item }}
81 changes: 52 additions & 29 deletions docs/algorithms/airl.rst
@@ -10,9 +10,9 @@ that is more generalizable to changes in environment dynamics.

The expert policy must be stochastic.

Notes
-----
- AIRL paper: `Learning Robust Rewards with Adversarial Inverse Reinforcement Learning <https://arxiv.org/abs/1710.11248>`_

.. note::
AIRL paper: `Learning Robust Rewards with Adversarial Inverse Reinforcement Learning <https://arxiv.org/abs/1710.11248>`_

Example
=======
@@ -23,56 +23,79 @@ Detailed example notebook: :doc:`../tutorials/4_train_airl`
:skipif: skip_doctests

import numpy as np
import gym
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.ppo import MlpPolicy

from imitation.algorithms.adversarial.airl import AIRL
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env

rng = np.random.default_rng(0)

env = gym.make("seals/CartPole-v0")
expert = PPO(policy=MlpPolicy, env=env)
expert.learn(1000)
SEED = 42

env = make_vec_env(
"seals:seals/CartPole-v0",
rng=np.random.default_rng(SEED),
n_envs=8,
post_wrappers=[lambda env, _: RolloutInfoWrapper(env)], # to compute rollouts
)
expert = load_policy(
"ppo-huggingface",
organization="HumanCompatibleAI",
env_name="seals-CartPole-v0",
venv=env,
)
rollouts = rollout.rollout(
expert,
make_vec_env(
"seals/CartPole-v0",
rng=rng,
n_envs=5,
post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],
),
rollout.make_sample_until(min_timesteps=None, min_episodes=60),
rng=rng,
env,
rollout.make_sample_until(min_episodes=60),
rng=np.random.default_rng(SEED),
)

venv = make_vec_env("seals/CartPole-v0", rng=rng, n_envs=8)
learner = PPO(env=venv, policy=MlpPolicy)
learner = PPO(
env=env,
policy=MlpPolicy,
batch_size=64,
ent_coef=0.0,
learning_rate=0.0005,
gamma=0.95,
clip_range=0.1,
vf_coef=0.1,
n_epochs=5,
seed=SEED,
)
reward_net = BasicShapedRewardNet(
venv.observation_space,
venv.action_space,
observation_space=env.observation_space,
action_space=env.action_space,
normalize_input_layer=RunningNorm,
)
airl_trainer = AIRL(
demonstrations=rollouts,
demo_batch_size=1024,
gen_replay_buffer_capacity=2048,
n_disc_updates_per_round=4,
venv=venv,
demo_batch_size=2048,
gen_replay_buffer_capacity=512,
n_disc_updates_per_round=16,
venv=env,
gen_algo=learner,
reward_net=reward_net,
)
airl_trainer.train(20000)
rewards, _ = evaluate_policy(learner, venv, 100, return_episode_rewards=True)
print("Rewards:", rewards)

env.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(
learner, env, 100, return_episode_rewards=True,
)
airl_trainer.train(20000) # Train for 2_000_000 steps to match expert.
env.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
learner, env, 100, return_episode_rewards=True,
)

print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

.. testoutput::
:hide: