* ✨ Add a km.random_name resolver to enable auto-generated run names (#…

…426) * ✨ Add a km.random_name resolver to enable auto-generated names in configuration (#426) * add template example, add tests, idempotency still fails * add tests, idempotency still fails * add doc, ignore idempotency test, remove unused argument * add syntax with resolver in mlflow.yml * fix typo in doc * add changelog * add nested kedy agin in mlflow.yml * fix typo in changelog
Galileo-Galilei · Feb 9, 2024 · c98d2a3 · c98d2a3
1 parent e886799
commit c98d2a3
Show file tree

Hide file tree

Showing 6 changed files with 117 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,8 @@
 ### Fixed
 
 -   :bug: Add support for dataset factories in ``KedroPipelineModel`` ([#516, sebastiandro](https://github.com/Galileo-Galilei/kedro-mlflow/pull/516))
+-   :sparkles: Add a ``km.random_name`` resolver which enables using auto-generated names for kedro runs instead of the pipeline name in the ``mlflow.yml`` configuration file ([#426](https://github.com/Galileo-Galilei/kedro-mlflow/issues/426))
+
 
 ## [0.12.0] - 2023-12-19
 

diff --git a/docs/source/04_experimentation_tracking/01_configuration.md b/docs/source/04_experimentation_tracking/01_configuration.md
@@ -137,6 +137,17 @@ tracking:
     nested: True  # # if `nested` is False, you won't be able to launch sub-runs inside your nodes
 ```
 
+````{tip}
+If you want to generate a random name for each run (like mlflow's default), you can use the built-in ``km.random_name`` resolver:
+
+```yaml
+tracking:
+  run:
+    name: ${km.random_name:} # don't forget the trailing ":" at the end !
+```
+````
+
+
 - If you want to continue to log in an existing mlflow run, write its id in the `id` key.
 - If you want to enable the creation of sub runs inside your nodes (for instance, for model comparison or hyperparameter tuning), set the `nested` key to `True`
 

diff --git a/kedro_mlflow/config/resolvers.py b/kedro_mlflow/config/resolvers.py
@@ -0,0 +1,6 @@
+from mlflow.utils.name_utils import _generate_random_name
+
+
+def resolve_random_name() -> str:
+    """Return a randomly generated name, delegating to mlflow's name generator.
+
+    This function is registered as the ``km.random_name`` OmegaConf resolver.
+    """
+    # The resolver itself takes no argument, but it must still be referenced
+    # with a trailing colon in the configuration, i.e. "${km.random_name:}",
+    # see: https://github.com/omry/omegaconf/issues/1060
+    return _generate_random_name()
diff --git a/kedro_mlflow/framework/hooks/mlflow_hook.py b/kedro_mlflow/framework/hooks/mlflow_hook.py
@@ -16,9 +16,11 @@
 from mlflow.models import infer_signature
 from mlflow.tracking import MlflowClient
 from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH
+from omegaconf import OmegaConf
 from pydantic import __version__ as pydantic_version
 
 from kedro_mlflow.config.kedro_mlflow_config import KedroMlflowConfig
+from kedro_mlflow.config.resolvers import resolve_random_name
 from kedro_mlflow.framework.hooks.utils import (
     _assert_mlflow_enabled,
     _flatten_dict,
@@ -60,6 +62,12 @@ def after_context_created(
             context: The context that was created.
         """
 
+        LOGGER.info(r"Registering new custom resolver: 'km.random_name'")
+        if not OmegaConf.has_resolver("km.random_name"):
+            OmegaConf.register_new_resolver(
+                "km.random_name", resolve_random_name, use_cache=True
+            )
+
         try:
             if "mlflow" not in context.config_loader.config_patterns.keys():
                 context.config_loader.config_patterns.update(

diff --git a/kedro_mlflow/template/project/mlflow.yml b/kedro_mlflow/template/project/mlflow.yml
@@ -39,9 +39,8 @@ tracking:
 
   run:
     id: null # if `id` is None, a new run will be created
-    name: null # if `name` is None, pipeline name will be used for the run name
+    name: null # if `name` is None, pipeline name will be used for the run name. You can use "${km.random_name:}" to generate a random name (mlflow's default)
     nested: True  # if `nested` is False, you won't be able to launch sub-runs inside your nodes
-
   params:
     dict_params:
       flatten: False  # if True, parameter which are dictionary will be splitted in multiple parameters when logged in mlflow, one for each key.

diff --git a/tests/config/test_resolvers.py b/tests/config/test_resolvers.py
@@ -0,0 +1,89 @@
+import re
+
+import pytest
+import yaml
+from kedro.framework.session import KedroSession
+from kedro.framework.startup import bootstrap_project
+from mlflow.utils.name_utils import (
+    _GENERATOR_NOUNS,
+    _GENERATOR_PREDICATES,
+)
+from omegaconf import OmegaConf
+
+from kedro_mlflow.config.resolvers import resolve_random_name
+
+
+def _write_yaml(filepath, config):
+    yaml_str = yaml.dump(config)
+    filepath.write_text(yaml_str)
+
+
+def _is_mlflow_name(name: str) -> bool:
+    splitted_name = name.split("-")
+    flag1 = len(splitted_name) == 3  # noqa: PLR2004
+    flag2 = splitted_name[0] in _GENERATOR_PREDICATES
+    flag3 = splitted_name[1] in _GENERATOR_NOUNS
+    flag4 = re.search(pattern=r"^\d+$", string=splitted_name[2])
+    return all({flag1, flag2, flag3, flag4})
+
+
+@pytest.fixture
+def kedro_project_with_random_name(kedro_project):
+    # kedro_project is a pytest.fixture in conftest
+    dict_config = dict(
+        server=dict(
+            mlflow_tracking_uri="mlruns",
+            mlflow_registry_uri=None,
+            credentials=None,
+            request_header_provider=dict(type=None, pass_context=False, init_kwargs={}),
+        ),
+        tracking=dict(
+            disable_tracking=dict(pipelines=["my_disabled_pipeline"]),
+            experiment=dict(name="fake_package", restore_if_deleted=True),
+            run=dict(id="123456789", name="${km.random_name:}", nested=True),
+            params=dict(
+                dict_params=dict(
+                    flatten=True,
+                    recursive=False,
+                    sep="-",
+                ),
+                long_params_strategy="truncate",
+            ),
+        ),
+        ui=dict(port="5151", host="localhost"),
+    )
+
+    _write_yaml(kedro_project / "conf" / "local" / "mlflow.yml", dict_config)
+    expected = dict_config.copy()
+    expected["server"]["mlflow_tracking_uri"] = (kedro_project / "mlruns").as_uri()
+    return kedro_project
+
+
+def test_resolve_random_name_is_valid_mlflow_name():
+    random_name = resolve_random_name()
+    assert _is_mlflow_name(random_name)
+
+
+def test_resolve_random_name_is_registered(kedro_project_with_random_name):
+    bootstrap_project(kedro_project_with_random_name)
+    with KedroSession.create(project_path=kedro_project_with_random_name) as session:
+        session.load_context()
+        assert OmegaConf.has_resolver("km.random_name")
+
+
+def test_resolve_random_name_is_called_in_project(kedro_project_with_random_name):
+    bootstrap_project(kedro_project_with_random_name)
+    with KedroSession.create(project_path=kedro_project_with_random_name) as session:
+        context = session.load_context()
+        assert _is_mlflow_name(context.mlflow.tracking.run.name)
+
+
+@pytest.mark.skip(reason="kedro 0.19.2 does not take use_cache into account")
+def test_resolve_random_name_is_idempotent(kedro_project_with_random_name):
+    bootstrap_project(kedro_project_with_random_name)
+    with KedroSession.create(project_path=kedro_project_with_random_name) as session:
+        context = session.load_context()
+        assert (
+            context.config_loader["mlflow"]["tracking"]["run"]["name"]
+            == context.config_loader["mlflow"]["tracking"]["run"]["name"]
+        )  # when called twice, should be different is no use_cache because the resolver is random