Skip to content

Commit

Permalink
Fix stale logic in mgr status command
Browse files Browse the repository at this point in the history
  • Loading branch information
jopemachine committed Apr 1, 2024
1 parent 936753d commit 6646df4
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 5 deletions.
9 changes: 8 additions & 1 deletion src/ai/backend/manager/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,19 @@ async def inspect_node_status(cli_ctx: CLIContext) -> None:
headers = ["ENDPOINT", "NODE ID", "IS LEADER", "RAFT TERM", "RAFT APPLIED INDEX"]

if raft_configs is not None:
raft_cluster_configs = cli_ctx.raft_cluster_config
assert raft_cluster_configs is not None

other_peers = [{**peer, "myself": False} for peer in raft_cluster_configs["peers"]["other"]]
my_peers = [{**peer, "myself": True} for peer in raft_cluster_configs["peers"]["myself"]]
all_peers = sorted([*other_peers, *my_peers], key=lambda x: x["node-id"])

initial_peers = Peers({
int(peer_config["node-id"]): Peer(
addr=f"{peer_config['host']}:{peer_config['port']}",
role=InitialRole.from_str(peer_config["role"]),
)
for peer_config in raft_configs["peers"]
for peer_config in all_peers
})

peers: dict[str, Any] | None = None
Expand Down
21 changes: 20 additions & 1 deletion src/ai/backend/manager/cli/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,20 @@
from ai.backend.common.logging import AbstractLogger, LocalLogger
from ai.backend.common.types import LogSeverity, RedisConnectionInfo

from ..config import LocalConfig, SharedConfig
from ..config import LocalConfig, SharedConfig, load_raft_cluster_config
from ..config import load as load_config


class CLIContext:
_local_config: LocalConfig | None
_raft_cluster_config: LocalConfig | None
_logger: AbstractLogger

def __init__(self, config_path: Path, log_level: LogSeverity) -> None:
self.config_path = config_path
self.log_level = log_level
self._local_config = None
self._raft_cluster_config = None

@property
def local_config(self) -> LocalConfig:
Expand All @@ -50,6 +52,23 @@ def local_config(self) -> LocalConfig:
raise click.Abort()
return self._local_config

@property
def raft_cluster_config(self) -> LocalConfig | None:
# Lazy-load the configuration only when requested.
try:
if self._raft_cluster_config is None:
self._raft_cluster_config = load_raft_cluster_config(
self.config_path, self.log_level
)
except ConfigurationError as e:
print(
"ConfigurationError: Could not read or validate the manager raft cluster config:",
file=sys.stderr,
)
print(pformat(e.invalid_data), file=sys.stderr)
raise click.Abort()
return self._raft_cluster_config

def __enter__(self) -> Self:
# The "start-server" command is injected by ai.backend.cli from the entrypoint
# and it has its own multi-process-aware logging initialization.
Expand Down
4 changes: 2 additions & 2 deletions src/ai/backend/manager/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,8 +615,8 @@ def load(


def load_raft_cluster_config(
debug_enabled: bool = False,
raft_cluster_config_path: Optional[Path] = None,
log_level: LogSeverity = LogSeverity.INFO,
) -> Optional[LocalConfig]:
try:
raw_cfg, _ = config.read_from_file(raft_cluster_config_path, "raft-cluster-config")
Expand All @@ -625,7 +625,7 @@ def load_raft_cluster_config(

try:
cfg = config.check(raw_cfg, manager_raft_cluster_config_iv)
if debug_enabled:
if log_level == LogSeverity.DEBUG:
print("== Raft cluster configuration ==", file=sys.stderr)
print(pformat(cfg), file=sys.stderr)
except config.ConfigurationError as e:
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/manager/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,7 @@ def main(
Start the manager service as a foreground process.
"""
cfg = load_config(config_path, LogSeverity.DEBUG if debug else log_level)
raft_cluster_cfg = load_raft_cluster_config(debug, raft_cluster_config_path)
raft_cluster_cfg = load_raft_cluster_config(raft_cluster_config_path, log_level)

if ctx.invoked_subcommand is None:
cfg["manager"]["pid-file"].write_text(str(os.getpid()))
Expand Down

0 comments on commit 6646df4

Please sign in to comment.