Add mean/std/IQM and confidence intervals to the markdown summary script.
ernestum committed Oct 17, 2023
1 parent c53888c commit 057ce8b
Showing 1 changed file with 124 additions and 44 deletions.
benchmarking/sacred_output_to_markdown_summary.py
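The gist of what the commit adds: each run's return is rescaled against a random-agent baseline and the expert's return, and rliable's bootstrap interval estimates are used to report the aggregate mean and IQM of the normalized scores with confidence intervals. The sketch below reproduces that computation in isolation; the per-run returns, environment count, and baseline/expert values are made-up illustrative numbers, not part of the commit, and only the rliable calls and `reps=1000` mirror the diff.

# Minimal sketch (synthetic data) of the normalization and rliable aggregation
# added by this commit; not the benchmarking script itself.
import numpy as np
from rliable import library as rly
from rliable import metrics

# Hypothetical per-run returns for two environments, five runs each.
scores = np.array([[310.0, 295.0, 330.0, 305.0, 320.0],
                   [55.0, 60.0, 52.0, 58.0, 61.0]])
random_scores = np.array([20.0, 5.0])    # random-agent baseline per environment
expert_scores = np.array([350.0, 70.0])  # expert return per environment

# (score - random_score) / (expert_score - random_score), as in the summary.
normalized = (scores - random_scores[:, None]) / (
    expert_scores[:, None] - random_scores[:, None]
)

# rliable expects arrays of shape (num_runs, num_tasks), hence the transpose.
aggregate, cis = rly.get_interval_estimates(
    {"normalized_score": normalized.T},
    lambda x: np.array([metrics.aggregate_mean(x), metrics.aggregate_iqm(x)]),
    reps=1000,
)
print(aggregate["normalized_score"])  # point estimates: [mean, IQM]
print(cis["normalized_score"])        # bootstrap confidence-interval bounds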
@@ -1,77 +1,157 @@
"""Generate a markdown summary of the results of a benchmarking run."""
import argparse
import pathlib
import sys
from collections import Counter
from functools import lru_cache
from typing import Generator, Sequence, cast

import datasets
import numpy as np
from huggingface_sb3 import EnvironmentName
from rliable import library as rly
from rliable import metrics

from imitation.data import rollout, types
from imitation.data.huggingface_utils import TrajectoryDatasetSequence
from imitation.util.sacred_file_parsing import (
find_sacred_runs,
group_runs_by_algo_and_env,
)


def print_markdown_summary(path: pathlib.Path):
@lru_cache(maxsize=None)
def get_random_agent_score(env: str):
stats = rollout.rollout_stats(
cast(
Sequence[types.TrajectoryWithRew],
TrajectoryDatasetSequence(
datasets.load_dataset(
f"HumanCompatibleAI/random-{EnvironmentName(env)}",
)["train"],
),
),
)
return stats["monitor_return_mean"]


def print_markdown_summary(path: pathlib.Path) -> Generator[str, None, None]:
if not path.exists():
raise NotADirectoryError(f"Path {path} does not exist.")

print("# Benchmark Summary")
yield "# Benchmark Summary"
yield ""
yield (
f"This is a summary of the sacred runs in `{path}` generated by "
f"`sacred_output_to_markdown_summary.py`."
)

runs_by_algo_and_env = group_runs_by_algo_and_env(path)
algos = sorted(runs_by_algo_and_env.keys())

print("## Run status" "")
print("Status | Count")
print("--- | ---")
status_counts = Counter((run["status"] for _, run in find_sacred_runs(path)))
statuses = sorted(list(status_counts))
for status in statuses:
print(f"{status} | {status_counts[status]}")
print()
# Note: we only print the status section if there are multiple statuses
if not (len(statuses) == 1 and statuses[0] == "COMPLETED"):
yield "## Run status" ""
yield "Status | Count"
yield "--- | ---"
for status in statuses:
yield f"{status} | {status_counts[status]}"
yield ""

yield "## Detailed Run Status"
yield f"Algorithm | Environment | {' | '.join(statuses)}"
yield "--- | --- " + " | --- " * len(statuses)
for algo in algos:
envs = sorted(runs_by_algo_and_env[algo].keys())
for env in envs:
status_counts = Counter(
(run["status"] for run in runs_by_algo_and_env[algo][env]),
)
yield (
f"{algo} | {env} | "
f"{' | '.join([str(status_counts[status]) for status in statuses])}"
)

yield "## Scores"
yield ""
yield (
"The scores are normalized based on the performance of a random agent as the"
" baseline and the expert as the maximum possible score as explained "
"[in this blog post](https://araffin.github.io/post/rliable/):"
)
yield "> `(score - random_score) / (expert_score - random_score)`"
yield ""
yield (
"Aggregate scores and confidence intervals are computed using the "
"[rliable library](https://agarwl.github.io/rliable/)."
)

print("## Detailed Run Status")
print(f"Algorithm | Environment | {' | '.join(sorted(list(status_counts)))}")
print("--- | --- " + " | --- " * len(statuses))
for algo in algos:
envs = sorted(runs_by_algo_and_env[algo].keys())
for env in envs:
status_counts = Counter(
(run["status"] for run in runs_by_algo_and_env[algo][env]),
)
print(
f"{algo} | {env} | "
f"{' | '.join([str(status_counts[status]) for status in statuses])}",
)
print()
print("## Raw Scores")
print()
for algo in algos:
print(f"### {algo.upper()}")
print("Environment | Scores | Expert Scores")
print("--- | --- | ---")
yield f"### {algo.upper()}"
yield "Environment | Score (mean/std)| Normalized Score (mean/std) | N"
yield " --- | --- | --- | --- "
envs = sorted(runs_by_algo_and_env[algo].keys())
accumulated_normalized_scores = []
for env in envs:
completed_runs = [
run
for run in runs_by_algo_and_env[algo][env]
if run["status"] == "COMPLETED"
]
algo_scores = [
scores = [
run["result"]["imit_stats"]["monitor_return_mean"]
for run in completed_runs
for run in runs_by_algo_and_env[algo][env]
]
expert_scores = [
run["result"]["expert_stats"]["monitor_return_mean"]
for run in completed_runs
for run in runs_by_algo_and_env[algo][env]
]
print(
random_score = get_random_agent_score(env)
normalized_score = [
(score - random_score) / (expert_score - random_score)
for score, expert_score in zip(scores, expert_scores)
]
accumulated_normalized_scores.append(normalized_score)

yield (
f"{env} | "
f"{', '.join([f'{score:.2f}' for score in algo_scores])} | "
f"{', '.join([f'{score:.2f}' for score in expert_scores])}",
f"{np.mean(scores):.3f} / {np.std(scores):.3f} | "
f"{np.mean(normalized_score):.3f} / {np.std(normalized_score):.3f} | "
f"{len(scores)}"
)
print()

aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(
{"normalized_score": np.asarray(accumulated_normalized_scores).T},
lambda x: np.array([metrics.aggregate_mean(x), metrics.aggregate_iqm(x)]),
reps=1000,
)
yield ""
yield "#### Aggregate Normalized scores"

yield "Metric | Value | 95% CI"
yield " --- | --- | --- "
yield (
f"Mean | "
f"{aggregate_scores['normalized_score'][0]:.3f} | "
f"[{aggregate_score_cis['normalized_score'][0][0]:.3f}, "
f"{aggregate_score_cis['normalized_score'][0][1]:.3f}]"
)
yield (
f"IQM | "
f"{aggregate_scores['normalized_score'][1]:.3f} | "
f"[{aggregate_score_cis['normalized_score'][1][0]:.3f}, "
f"{aggregate_score_cis['normalized_score'][1][1]:.3f}]"
)
yield ""


if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <path to sacred run folder>")
sys.exit(1)
parser = argparse.ArgumentParser(
description="Generate a markdown summary of the results of a benchmarking run.",
)
parser.add_argument("path", type=pathlib.Path)
parser.add_argument("--output", type=pathlib.Path, default="summary.md")

args = parser.parse_args()

print_markdown_summary(pathlib.Path(sys.argv[1]))
with open(args.output, "w") as fh:
for line in print_markdown_summary(pathlib.Path(args.path)):
fh.write(line)
fh.write("\n")
fh.flush()

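After this change the summary is produced by a generator and written through an argparse-based entry point, so the module can also be driven programmatically. A rough usage sketch, assuming the script is imported from the benchmarking/ directory; the sacred output directory "output/sacred" is a placeholder, not from the commit:

# Hypothetical usage sketch; "output/sacred" is a placeholder path.
import pathlib

from sacred_output_to_markdown_summary import print_markdown_summary

with open("summary.md", "w") as fh:
    for line in print_markdown_summary(pathlib.Path("output/sacred")):
        fh.write(line + "\n")

From the command line, the equivalent is passing the sacred run directory as the positional path argument, with --output optionally choosing where summary.md is written.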