Commit
chore: refactor searcher operations out of master side searchers (#10024)
Refactors searchers in master to no longer issue operation workloads to trials. Functionally, this only affects ASHA search algorithms, which are now preemption-based instead of promotion-based.

- Removes all `SearcherOperation` APIs.
- Removes the non-stopping variant of adaptive ASHA and deprecates the `stop_once` config option.
- Deprecates and removes the `max_length` requirement for all experiments.
- Introduces `time_metric` and `max_time` as new searcher configuration options for ASHA searchers.
- Validation metrics reported by trials are now forwarded to the searcher (only affects ASHA searchers).
- Refactors Preview HP Search to estimate training lengths based on ASHA rungs.
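As a rough sketch of what the new searcher configuration looks like (field names are taken from the experiment-config docs updated in this commit; all values are illustrative, not recommendations):

```yaml
searcher:
  name: async_halving
  metric: validation_loss        # validation metric to optimize
  smaller_is_better: true
  time_metric: epochs            # validation metric tracking trial progress
  max_time: 100                  # trials are stopped early based on progress toward this value
  max_trials: 64
  num_rungs: 3
  divisor: 4
  max_concurrent_trials: 16
```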
azhou-determined committed Oct 25, 2024
1 parent 99f03c2 commit 17e942a
Showing 74 changed files with 6,205 additions and 8,374 deletions.
1 change: 1 addition & 0 deletions .circleci/real_config.yml
@@ -2603,6 +2603,7 @@ jobs:
       - run: pip install mypy pytest coverage
       - install-codecov
       - setup-paths
+      - run: make -C harness install
       - run: COVERAGE_FILE=$PWD/test-unit-harness-tf2-pycov make -C harness test-tf2
       - run: coverage xml -i --data-file=./test-unit-harness-tf2-pycov
       - run: codecov -v -t $CODECOV_TOKEN -F harness
78 changes: 70 additions & 8 deletions docs/reference/experiment-config-reference.rst
@@ -956,6 +956,76 @@ the model architecture of this experiment.
Optional. Like ``source_trial_id``, but specifies an arbitrary checkpoint from which to initialize
weights. At most one of ``source_trial_id`` or ``source_checkpoint_uuid`` should be set.

.. _experiment-configuration-searcher-asha:

Asynchronous Halving (ASHA)
===========================

The ``async_halving`` search performs a version of the asynchronous successive halving algorithm
(`ASHA <https://arxiv.org/pdf/1810.05934.pdf>`_) that stops trials early if there is enough evidence
to terminate training. Once trials are stopped, they will not be resumed.

``metric``
----------

Required. The name of the validation metric used to evaluate the performance of a hyperparameter
configuration.

``time_metric``
---------------

Required. The name of the validation metric used to evaluate the progress of a given trial.

``max_time``
------------

Required. The maximum value that ``time_metric`` should take when a trial finishes training. Early
stopping is decided based on how far the ``time_metric`` has progressed towards this ``max_time``
value.

``max_trials``
--------------

Required. The number of trials, i.e., hyperparameter configurations, to evaluate.

``num_rungs``
-------------

Required. The number of rounds of successive halving to perform.

``smaller_is_better``
---------------------

Optional. Whether to minimize or maximize the metric defined above. The default value is ``true``
(minimize).

``divisor``
-----------

Optional. Determines the fraction of trials kept at each rung (the top ``1/divisor`` are promoted) and the factor by which training length grows between rungs. The default is ``4``; only advanced users should consider changing this value.
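To make the rung geometry concrete, the snippet below computes per-rung training budgets under the standard ASHA scheme (each rung trains ``divisor`` times longer than the rung below it, with the top rung reaching ``max_time``). This is an illustrative sketch, not the exact implementation, and the helper name is hypothetical:

```python
def rung_budgets(max_time: float, num_rungs: int, divisor: float) -> list:
    """Illustrative ASHA rung geometry: rung i trains for
    max_time / divisor**(num_rungs - 1 - i) units of time_metric."""
    return [max_time / divisor ** (num_rungs - 1 - i) for i in range(num_rungs)]

# With divisor=4 and num_rungs=3, rungs train for 1/16, 1/4, and all of max_time.
print(rung_budgets(1600, 3, 4))  # -> [100.0, 400.0, 1600.0]
```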

``max_concurrent_trials``
-------------------------

Optional. The maximum number of trials that can be worked on simultaneously. The default value is
``16``; reasonable values are chosen automatically based on ``max_trials`` and the number of rungs in
the brackets. This is akin to controlling the degree of parallelism of the experiment. If this value
is less than the number of brackets produced by the adaptive algorithm, it will be rounded up.

``source_trial_id``
-------------------

Optional. If specified, the weights of *every* trial in the search will be initialized to the most
recent checkpoint of the given trial ID. This will fail if the source trial's model architecture is
inconsistent with the model architecture of any of the trials in this experiment.

``source_checkpoint_uuid``
--------------------------

Optional. Like ``source_trial_id``, but specifies an arbitrary checkpoint from which to initialize
weights. At most one of ``source_trial_id`` or ``source_checkpoint_uuid`` should be set.

.. _experiment-configuration-searcher-adaptive:

Adaptive ASHA
=============

@@ -1007,14 +1077,6 @@

end of the spectrum, ``conservative`` mode performs significantly less downsampling and as a
consequence does not explore as many configurations given the same budget. We recommend using either
``aggressive`` or ``standard`` mode.

``stop_once``
-------------

Optional. If ``stop_once`` is set to ``true``, we will use a variant of ASHA that will not resume
trials once stopped. This variant defaults to continuing training and will only stop trials if there
is enough evidence to terminate training. We recommend using this version of ASHA when training a
trial for the max length as fast as possible is important or when fault tolerance is too expensive.

``divisor``
-----------

86 changes: 36 additions & 50 deletions harness/determined/cli/cli.py
@@ -47,66 +47,52 @@
     version,
     workspace,
 )
-from determined.common import api, util, yaml
+from determined.common import api, util
 from determined.common.api import bindings, certs
 
 
+def _render_search_summary(resp: bindings.v1PreviewHPSearchResponse) -> str:
+    output = [
+        termcolor.colored("Using search configuration:", "green"),
+    ]
+
+    # For mypy
+    assert resp.summary and resp.summary.config and resp.summary.trials
+    # Exclude empty configs from rendering.
+    searcher_config = {k: v for k, v in resp.summary.config.items() if v is not None}
+
+    config_str = render.format_object_as_yaml(searcher_config)
+    output.append(config_str)
+    headers = ["Trials", "Training Time"]
+    trial_summaries = []
+    for trial_summary in resp.summary.trials:
+        num_trials = trial_summary.count
+        trial_unit = trial_summary.unit
+        if trial_unit.maxLength:
+            summary = "train to completion"
+        else:
+            summary = f"train for {trial_unit.value} {trial_unit.name}"
+        trial_summaries.append([num_trials, summary])
+
+    output.append(tabulate.tabulate(trial_summaries, headers, tablefmt="presto"))
+    return "\n".join(output)
+
+
 def preview_search(args: argparse.Namespace) -> None:
     sess = cli.setup_session(args)
     experiment_config = util.safe_load_yaml_with_exceptions(args.config_file)
     args.config_file.close()
 
     if "searcher" not in experiment_config:
-        print("Experiment configuration must have 'searcher' section")
-        sys.exit(1)
-    r = sess.post("searcher/preview", json=experiment_config)
-    j = r.json()
+        raise errors.CliError("Missing 'searcher' config section in experiment config.")
 
-    def to_full_name(kind: str) -> str:
-        try:
-            # The unitless searcher case, for masters newer than 0.17.6.
-            length = int(kind)
-            return f"train for {length}"
-        except ValueError:
-            pass
-        if kind[-1] == "R":
-            return "train {} records".format(kind[:-1])
-        if kind[-1] == "B":
-            return "train {} batch(es)".format(kind[:-1])
-        if kind[-1] == "E":
-            return "train {} epoch(s)".format(kind[:-1])
-        if kind == "V":
-            return "validation"
-        raise ValueError("unexpected kind: {}".format(kind))
-
-    def render_sequence(sequence: List[str]) -> str:
-        if not sequence:
-            return "N/A"
-        instructions = []
-        current = sequence[0]
-        count = 0
-        for k in sequence:
-            if k != current:
-                instructions.append("{} x {}".format(count, to_full_name(current)))
-                current = k
-                count = 1
-            else:
-                count += 1
-        instructions.append("{} x {}".format(count, to_full_name(current)))
-        return ", ".join(instructions)
-
-    headers = ["Trials", "Breakdown"]
-    values = [
-        (count, render_sequence(operations.split())) for operations, count in j["results"].items()
-    ]
-
-    print(termcolor.colored("Using search configuration:", "green"))
-    yml = yaml.YAML()
-    yml.indent(mapping=2, sequence=4, offset=2)
-    yml.dump(experiment_config["searcher"], sys.stdout)
-    print()
-    print("This search will create a total of {} trial(s).".format(sum(j["results"].values())))
-    print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=False)
+    resp = bindings.post_PreviewHPSearch(
+        session=sess,
+        body=bindings.v1PreviewHPSearchRequest(
+            config=experiment_config,
+        ),
+    )
+    print(_render_search_summary(resp=resp))
 
 
 args_description = [
