
Commit

Add scripts to generate benchmark summary information and compute the probability of improvement.
ernestum committed Oct 16, 2023
1 parent baa1134 commit 000975e
Showing 3 changed files with 398 additions and 0 deletions.
251 changes: 251 additions & 0 deletions benchmarking/compute_probability_of_improvement.py
@@ -0,0 +1,251 @@
"""Compute the probability that one algorithm improved over another."""
import argparse
import pathlib
import sys
import warnings
from typing import Dict, List, Optional

import numpy as np
from rliable import library as rly
from rliable import metrics

from imitation.util.sacred_file_parsing import SacredRun, group_runs_by_algo_and_env


def sample_matrix_from_runs_by_env(
runs_by_env: Dict[str, List[SacredRun]],
envs: Optional[List[str]] = None,
) -> np.ndarray:
"""Samples a matrix of scores from the runs for each environment.
Note: when the number of samples for each environment is not equal, the samples
will be truncated to the minimum sample count.
Args:
runs_by_env: A dictionary mapping environment names to lists of runs.
envs: The environments to sample from. If None, all environments are used.
Returns:
A matrix of scores of shape (n_samples, n_envs).
"""
if envs is None:
envs = list(runs_by_env.keys())

sample_counts_by_env = {env: len(runs_by_env[env]) for env in envs}

min_sample_count = min(sample_counts_by_env.values())
if not all(
sample_counts_by_env[env] == sample_counts_by_env[envs[0]] for env in envs
):
warnings.warn(
f"The runs for the environments have different sample counts "
f"{sample_counts_by_env}. "
f"This is not supported by the probability of improvement. Therefore, "
f"samples will be truncated to the minimum sample count of"
f" {min_sample_count}",
)

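    # rliable expects score matrices of shape (n_runs, n_tasks); the comprehension
    # below builds (n_envs, n_samples), hence the final transpose.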
return np.asarray(
[
[
run["result"]["imit_stats"]["monitor_return_mean"]
for run in runs_by_env[env][:min_sample_count]
]
for env in envs
],
).T


def compute_probability_of_improvement(
runs_by_env: Dict[str, List[SacredRun]],
baseline_runs_by_env: Dict[str, List[SacredRun]],
reps: int,
):
"""Computes the probability of improvement of the runs over the baseline runs.
Args:
runs_by_env: A dictionary mapping environment names to lists of runs.
baseline_runs_by_env: A dictionary mapping environment names to lists of runs.
reps: The number of bootstrap repetitions to use to compute the confidence
interval.
Returns:
A tuple of:
- probability of improvement
- confidence interval
- number of samples per env
- number of baseline samples per env
"""
envs = runs_by_env.keys()
baseline_envs = baseline_runs_by_env.keys()
comparison_envs = sorted(set(envs).intersection(set(baseline_envs)))

run_scores = sample_matrix_from_runs_by_env(runs_by_env, comparison_envs)
baseline_run_scores = sample_matrix_from_runs_by_env(
baseline_runs_by_env,
comparison_envs,
)
samples_per_env = run_scores.shape[0]
baseline_samples_per_env = baseline_run_scores.shape[0]

    # rliable's probability_of_improvement(scores_x, scores_y) estimates P(X > Y),
    # so pass the new runs' scores first and the baseline scores second.
    probabs, error_intervals = rly.get_interval_estimates(
        {"new_vs_baseline": (run_scores, baseline_run_scores)},
        metrics.probability_of_improvement,
        reps=reps,
    )
    probability_of_improvement = probabs["new_vs_baseline"]
    confidence_interval = np.squeeze(error_intervals["new_vs_baseline"])

return (
probability_of_improvement,
confidence_interval,
samples_per_env,
baseline_samples_per_env,
)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("runs_dir", type=pathlib.Path)
parser.add_argument("baseline_runs_dir", nargs="?", default=None, type=pathlib.Path)
parser.add_argument("--baseline-algo", type=str)
parser.add_argument("--algo", type=str)
parser.add_argument("--bootstrap-reps", type=int, default=2000)

args = parser.parse_args()

if args.baseline_runs_dir is None:
args.baseline_runs_dir = args.runs_dir

runs_by_algo_and_env = group_runs_by_algo_and_env(
args.runs_dir,
only_completed_runs=True,
)
baseline_runs_by_algo_and_env = group_runs_by_algo_and_env(
args.baseline_runs_dir,
only_completed_runs=True,
)

algos = sorted(runs_by_algo_and_env.keys())
baseline_algos = sorted(baseline_runs_by_algo_and_env.keys())

try:
if len(algos) == 0:
raise ValueError(f"The run directory [{args.runs_dir}] contains no runs.")

if len(baseline_algos) == 0:
raise ValueError(
f"The baseline run directory [{args.baseline_runs_dir}] "
f"contains no runs.",
)

if "algo" not in args is None:
if len(algos) == 1:
args.algo = algos[0]
else:
raise ValueError(
f"The run directory [{args.runs_dir}] contains runs for the "
f"algorithms [{', '.join(algos)}]. Please use the --algo option "
f" to specify which algorithms runs to compare.",
)

if args.baseline_algo is None:
if len(baseline_algos) == 1:
args.baseline_algo = baseline_algos[0]
elif args.algo in baseline_algos:
args.baseline_algo = args.algo
else:
raise ValueError(
f"The baseline run directory [{args.baseline_runs_dir}] contains "
f"runs for the algorithms [{', '.join(baseline_algos)}]. "
f"Please use the --baseline-algo option specify which one to "
f"compare to.",
)

if args.algo not in algos:
raise ValueError(
f"The run directory [{args.runs_dir}] contains runs for the algorithms"
f" [{', '.join(algos)}]. You specified [{args.algo}], for which no"
f" runs can be found in the run directory",
)

if args.baseline_algo not in baseline_algos:
raise ValueError(
f"The baseline run directory [{args.baseline_runs_dir}] contains runs "
f"for the algorithms [{', '.join(baseline_algos)}]. "
f"You specified [{args.baseline_algo}], for which no runs can be found"
f" in the baseline run directory",
)

if (args.algo == args.baseline_algo) and (
args.runs_dir == args.baseline_runs_dir
):
warnings.warn(
"You are comparing two equal sets of runs. "
"This is probably not what you want.",
)

envs = runs_by_algo_and_env[args.algo].keys()
baseline_envs = baseline_runs_by_algo_and_env[args.baseline_algo].keys()

comparison_envs = set(envs).intersection(set(baseline_envs))

if len(comparison_envs) == 0:
raise ValueError(
f"The baseline runs are for the environments "
f"[{', '.join(baseline_envs)}], while the runs are for the "
f"environments [{', '.join(envs)}]. "
f"There is no overlap in the environments of the two run sets, so no "
f"comparison can be made",
)

ignoring_some_envs = len(comparison_envs) < len(envs)
ignoring_some_baseline_envs = len(comparison_envs) < len(baseline_envs)
if ignoring_some_envs or ignoring_some_baseline_envs:
warnings.warn(
f"The baseline runs are for the environments "
f"[{', '.join(baseline_envs)}], "
f"while the runs are for the environments [{', '.join(envs)}]. "
f"The comparison will only be made for the environments "
f"[{', '.join(comparison_envs)}].",
)

except ValueError as e:
print(e)
sys.exit(1)

(
probability_of_improvement,
error_interval,
n_samples,
n_baseline_samples,
) = compute_probability_of_improvement(
runs_by_env=runs_by_algo_and_env[args.algo],
baseline_runs_by_env=baseline_runs_by_algo_and_env[args.baseline_algo],
reps=args.bootstrap_reps,
)

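    # When the algorithm names match, append the run directories so the two sets of
    # runs can still be told apart in the output below.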
show_path = args.algo == args.baseline_algo
algo_str = f"{args.algo} ({args.runs_dir})" if show_path else args.algo
baseline_algo_str = (
f"{args.baseline_algo} ({args.baseline_runs_dir})"
if show_path
else args.baseline_algo
)

print(
f"Comparison based on {n_samples} samples per environment for {algo_str} and"
f" {n_baseline_samples} samples per environment for {baseline_algo_str}.",
)
print(f"Samples taken in {', '.join(comparison_envs)}")
print()
print(f"Probability of improvement of {algo_str} over {baseline_algo_str}:")
print(
f"{probability_of_improvement:.3f} "
f"({error_interval[0]:.3f}, {error_interval[1]:.3f}, "
f"reps={args.bootstrap_reps})",
)


if __name__ == "__main__":
main()
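A typical invocation compares two run directories from the command line, for example (directory and algorithm names are placeholders): python benchmarking/compute_probability_of_improvement.py new_runs/ baseline_runs/ --algo bc --baseline-algo bc --bootstrap-reps 2000

Internally, compute_probability_of_improvement reduces to a single rliable call on two score matrices of shape (n_samples, n_envs). A minimal, self-contained sketch of that call with synthetic scores (the array values and shapes are illustrative assumptions, not taken from the commit):

import numpy as np
from rliable import library as rly
from rliable import metrics

# Synthetic scores: 5 runs x 3 environments per algorithm (illustrative only).
rng = np.random.default_rng(0)
baseline_scores = rng.normal(loc=100.0, scale=10.0, size=(5, 3))
new_scores = rng.normal(loc=110.0, scale=10.0, size=(5, 3))

# Estimate P(new > baseline) and a bootstrapped confidence interval, mirroring
# compute_probability_of_improvement above.
probs, intervals = rly.get_interval_estimates(
    {"new_vs_baseline": (new_scores, baseline_scores)},
    metrics.probability_of_improvement,
    reps=2000,
)
print(probs["new_vs_baseline"], np.squeeze(intervals["new_vs_baseline"]))

probability_of_improvement is a Mann-Whitney-U-based estimate of how often a run of the first algorithm outscores a run of the second, averaged over environments, and get_interval_estimates bootstraps a confidence interval around it using reps resamples.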
77 changes: 77 additions & 0 deletions benchmarking/sacred_output_to_markdown_summary.py
@@ -0,0 +1,77 @@
"""Generate a markdown summary of the results of a benchmarking run."""
import pathlib
import sys
from collections import Counter

from imitation.util.sacred_file_parsing import (
find_sacred_runs,
group_runs_by_algo_and_env,
)


def print_markdown_summary(path: pathlib.Path):
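    """Print a markdown summary of the benchmark runs found under the given path.

    Args:
        path: Path to a directory containing sacred run outputs.

    Raises:
        NotADirectoryError: If the given path does not exist.
    """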
if not path.exists():
raise NotADirectoryError(f"Path {path} does not exist.")

print("# Benchmark Summary")
runs_by_algo_and_env = group_runs_by_algo_and_env(path)
algos = sorted(runs_by_algo_and_env.keys())

print("## Run status" "")
print("Status | Count")
print("--- | ---")
status_counts = Counter((run["status"] for _, run in find_sacred_runs(path)))
statuses = sorted(list(status_counts))
for status in statuses:
print(f"{status} | {status_counts[status]}")
print()

print("## Detailed Run Status")
print(f"Algorithm | Environment | {' | '.join(sorted(list(status_counts)))}")
print("--- | --- " + " | --- " * len(statuses))
for algo in algos:
envs = sorted(runs_by_algo_and_env[algo].keys())
for env in envs:
status_counts = Counter(
(run["status"] for run in runs_by_algo_and_env[algo][env]),
)
print(
f"{algo} | {env} | "
f"{' | '.join([str(status_counts[status]) for status in statuses])}",
)
print()
print("## Raw Scores")
print()
for algo in algos:
print(f"### {algo.upper()}")
print("Environment | Scores | Expert Scores")
print("--- | --- | ---")
envs = sorted(runs_by_algo_and_env[algo].keys())
for env in envs:
completed_runs = [
run
for run in runs_by_algo_and_env[algo][env]
if run["status"] == "COMPLETED"
]
algo_scores = [
run["result"]["imit_stats"]["monitor_return_mean"]
for run in completed_runs
]
expert_scores = [
run["result"]["expert_stats"]["monitor_return_mean"]
for run in completed_runs
]
print(
f"{env} | "
f"{', '.join([f'{score:.2f}' for score in algo_scores])} | "
f"{', '.join([f'{score:.2f}' for score in expert_scores])}",
)
print()


if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <path to sacred run folder>")
sys.exit(1)

print_markdown_summary(pathlib.Path(sys.argv[1]))
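
As the usage message above indicates, the script takes a single path to a folder of sacred run outputs and prints the markdown summary to stdout, so a typical invocation (the path is a placeholder) is: python benchmarking/sacred_output_to_markdown_summary.py /path/to/sacred/runs > benchmark_summary.md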
