From 6646df446f0c9d6371c92087220c73a188f179c1 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:42:30 +0000 Subject: [PATCH] Fix stale logic in mgr status command --- src/ai/backend/manager/cli/__main__.py | 9 ++++++++- src/ai/backend/manager/cli/context.py | 21 ++++++++++++++++++++- src/ai/backend/manager/config.py | 4 ++-- src/ai/backend/manager/server.py | 2 +- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/ai/backend/manager/cli/__main__.py b/src/ai/backend/manager/cli/__main__.py index 9acaa8a8916..07da586dedf 100644 --- a/src/ai/backend/manager/cli/__main__.py +++ b/src/ai/backend/manager/cli/__main__.py @@ -343,12 +343,19 @@ async def inspect_node_status(cli_ctx: CLIContext) -> None: headers = ["ENDPOINT", "NODE ID", "IS LEADER", "RAFT TERM", "RAFT APPLIED INDEX"] if raft_configs is not None: + raft_cluster_configs = cli_ctx.raft_cluster_config + assert raft_cluster_configs is not None + + other_peers = [{**peer, "myself": False} for peer in raft_cluster_configs["peers"]["other"]] + my_peers = [{**peer, "myself": True} for peer in raft_cluster_configs["peers"]["myself"]] + all_peers = sorted([*other_peers, *my_peers], key=lambda x: x["node-id"]) + initial_peers = Peers({ int(peer_config["node-id"]): Peer( addr=f"{peer_config['host']}:{peer_config['port']}", role=InitialRole.from_str(peer_config["role"]), ) - for peer_config in raft_configs["peers"] + for peer_config in all_peers }) peers: dict[str, Any] | None = None diff --git a/src/ai/backend/manager/cli/context.py b/src/ai/backend/manager/cli/context.py index 6544a2c1865..b249ad58385 100644 --- a/src/ai/backend/manager/cli/context.py +++ b/src/ai/backend/manager/cli/context.py @@ -22,18 +22,20 @@ from ai.backend.common.logging import AbstractLogger, LocalLogger from ai.backend.common.types import LogSeverity, RedisConnectionInfo -from ..config import LocalConfig, SharedConfig +from ..config import LocalConfig, SharedConfig, load_raft_cluster_config from ..config import load as load_config class CLIContext: _local_config: LocalConfig | None + _raft_cluster_config: LocalConfig | None _logger: AbstractLogger def __init__(self, config_path: Path, log_level: LogSeverity) -> None: self.config_path = config_path self.log_level = log_level self._local_config = None + self._raft_cluster_config = None @property def local_config(self) -> LocalConfig: @@ -50,6 +52,23 @@ def local_config(self) -> LocalConfig: raise click.Abort() return self._local_config + @property + def raft_cluster_config(self) -> LocalConfig | None: + # Lazy-load the configuration only when requested. + try: + if self._raft_cluster_config is None: + self._raft_cluster_config = load_raft_cluster_config( + self.config_path, self.log_level + ) + except ConfigurationError as e: + print( + "ConfigurationError: Could not read or validate the manager raft cluster config:", + file=sys.stderr, + ) + print(pformat(e.invalid_data), file=sys.stderr) + raise click.Abort() + return self._raft_cluster_config + def __enter__(self) -> Self: # The "start-server" command is injected by ai.backend.cli from the entrypoint # and it has its own multi-process-aware logging initialization. diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index 44bed248c21..10dfe13157d 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -615,8 +615,8 @@ def load( def load_raft_cluster_config( - debug_enabled: bool = False, raft_cluster_config_path: Optional[Path] = None, + log_level: LogSeverity = LogSeverity.INFO, ) -> Optional[LocalConfig]: try: raw_cfg, _ = config.read_from_file(raft_cluster_config_path, "raft-cluster-config") @@ -625,7 +625,7 @@ def load_raft_cluster_config( try: cfg = config.check(raw_cfg, manager_raft_cluster_config_iv) - if debug_enabled: + if log_level == LogSeverity.DEBUG: print("== Raft cluster configuration ==", file=sys.stderr) print(pformat(cfg), file=sys.stderr) except config.ConfigurationError as e: diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 887defd5532..c642fac1df6 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -1086,7 +1086,7 @@ def main( Start the manager service as a foreground process. """ cfg = load_config(config_path, LogSeverity.DEBUG if debug else log_level) - raft_cluster_cfg = load_raft_cluster_config(debug, raft_cluster_config_path) + raft_cluster_cfg = load_raft_cluster_config(raft_cluster_config_path, log_level) if ctx.invoked_subcommand is None: cfg["manager"]["pid-file"].write_text(str(os.getpid()))