diff --git a/src/ai/backend/agent/cli.py b/src/ai/backend/agent/cli.py
index 20ddbc85ff..d413edf3e6 100644
--- a/src/ai/backend/agent/cli.py
+++ b/src/ai/backend/agent/cli.py
@@ -1,7 +1,106 @@
+import asyncio
+from pathlib import Path
+
 import click
+from tabulate import tabulate
+
+from ai.backend.agent.config import get_agent_cfg
+from ai.backend.cli.types import CliContextInfo
+from ai.backend.common.types import LogSeverity
 
 
 @click.group()
 def main():
     """The root entrypoint for unified CLI of agent"""
     pass
+
+
+async def inspect_agent_status(cli_ctx: CliContextInfo, agent_pid: int) -> None:
+    """
+    Print the ``ps -f`` output for the given agent PID as a table.
+
+    Raises RuntimeError if ``ps`` writes to stderr or prints nothing to stdout.
+    """
+    argv = ["ps", "-p", str(agent_pid), "-f"]
+    command = " ".join(argv)
+    # Run ps directly instead of through a shell, and pipe stderr as well:
+    # when stderr is not piped, communicate() always returns None for it.
+    process = await asyncio.create_subprocess_exec(
+        *argv,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    stdout, stderr = await process.communicate()
+    if stderr:
+        raise RuntimeError(f"Failed to execute the command: {command}")
+
+    lines = stdout.decode().splitlines()
+    if not lines:
+        raise RuntimeError(f"No output from the command: {command}")
+
+    # Combine all text following UID, PID, PPID, C, STIME, TTY, TIME into CMD
+    process_list = [
+        columns[:7] + [" ".join(columns[7:])]
+        for columns in (line.split() for line in lines[1:])
+    ]
+    print(tabulate(process_list, headers=lines[0].split(), tablefmt="pretty"))
+
+
+@main.command()
+@click.pass_obj
+@click.option(
+    "-f",
+    "--config-path",
+    "--config",
+    type=click.Path(
+        file_okay=True,
+        dir_okay=False,
+        exists=True,
+        path_type=Path,
+    ),
+    default=None,
+    help="The config file path. (default: ./agent.toml and /etc/backend.ai/agent.toml)",
+)
+@click.option(
+    "--debug",
+    is_flag=True,
+    help="Set the logging level to DEBUG",
+)
+@click.option(
+    "-s",
+    "--systemctl",
+    is_flag=True,
+    help="Include the systemctl status command result in the output",
+)
+@click.option(
+    "--log-level",
+    type=click.Choice([*LogSeverity], case_sensitive=False),
+    default=LogSeverity.INFO,
+    help="Set the logging verbosity level",
+)
+def status(
+    cli_ctx: CliContextInfo,
+    config_path: Path,
+    log_level: LogSeverity,
+    debug: bool = False,
+    systemctl: bool = False,
+) -> None:
+    """
+    Collect and print each agent process's status.
+    """
+    # NOTE: the ``systemctl`` flag is accepted but not acted upon here yet.
+    # get_agent_cfg() already returns the validated config; no second check needed.
+    cfg = get_agent_cfg(config_path, log_level, debug)
+    pid_filepath = cfg["agent"]["pid-file"]
+
+    try:
+        with open(pid_filepath, "r") as file:
+            agent_pid = int(file.read())
+    except FileNotFoundError as e:
+        raise click.ClickException(f"The PID file does not exist: {pid_filepath}") from e
+    except ValueError as e:
+        raise click.ClickException(
+            f"The PID file does not contain a valid PID: {pid_filepath}"
+        ) from e
+
+    asyncio.run(inspect_agent_status(cli_ctx, agent_pid))
diff --git a/src/ai/backend/agent/config.py b/src/ai/backend/agent/config.py
index 6f2526b143..faabac6102 100644
--- a/src/ai/backend/agent/config.py
+++ b/src/ai/backend/agent/config.py
@@ -1,9 +1,15 @@
 import os
+import sys
+from pathlib import Path
+from pprint import pformat, pprint
+from typing import Any
 
+import click
 import trafaret as t
 
 from ai.backend.common import config
 from ai.backend.common import validators as tx
+from ai.backend.common.types import LogSeverity
 
 from .affinity_map import AffinityPolicy
 from .stats import StatModes
@@ -138,3 +144,73 @@
     t.Key("kernel-uid", optional=True): t.ToInt,
     t.Key("kernel-gid", optional=True): t.ToInt,
 }).allow_extra("*")
+
+
+def get_agent_cfg(
+    config_path: Path, log_level: LogSeverity, debug: bool = False
+) -> dict[str, Any]:
+    """
+    Read, override, and validate the agent's local configuration.
+
+    The raw config is read from ``config_path``, overridden with the legacy
+    environment variables and the given logging options, and then validated.
+    Read/validation errors are printed to stderr and raised as click.Abort.
+    The second value returned by ``config.read_from_file()`` is kept under ``"_src"``.
+    """
+    # Determine where to read configuration.
+    try:
+        raw_cfg, cfg_src_path = config.read_from_file(config_path, "agent")
+    except config.ConfigurationError as e:
+        print(
+            "ConfigurationError: Could not read or validate the agent local config:",
+            file=sys.stderr,
+        )
+        print(pformat(e.invalid_data), file=sys.stderr)
+        raise click.Abort()
+
+    # Override the read config with environment variables (for legacy).
+    config.override_with_env(raw_cfg, ("etcd", "namespace"), "BACKEND_NAMESPACE")
+    config.override_with_env(raw_cfg, ("etcd", "addr"), "BACKEND_ETCD_ADDR")
+    config.override_with_env(raw_cfg, ("etcd", "user"), "BACKEND_ETCD_USER")
+    config.override_with_env(raw_cfg, ("etcd", "password"), "BACKEND_ETCD_PASSWORD")
+    config.override_with_env(
+        raw_cfg, ("agent", "rpc-listen-addr", "host"), "BACKEND_AGENT_HOST_OVERRIDE"
+    )
+    config.override_with_env(raw_cfg, ("agent", "rpc-listen-addr", "port"), "BACKEND_AGENT_PORT")
+    config.override_with_env(raw_cfg, ("agent", "pid-file"), "BACKEND_PID_FILE")
+    config.override_with_env(raw_cfg, ("container", "port-range"), "BACKEND_CONTAINER_PORT_RANGE")
+    config.override_with_env(raw_cfg, ("container", "bind-host"), "BACKEND_BIND_HOST_OVERRIDE")
+    config.override_with_env(raw_cfg, ("container", "sandbox-type"), "BACKEND_SANDBOX_TYPE")
+    config.override_with_env(raw_cfg, ("container", "scratch-root"), "BACKEND_SCRATCH_ROOT")
+
+    if debug:
+        log_level = LogSeverity.DEBUG
+    config.override_key(raw_cfg, ("debug", "enabled"), log_level == LogSeverity.DEBUG)
+    config.override_key(raw_cfg, ("logging", "level"), log_level)
+    config.override_key(raw_cfg, ("logging", "pkg-ns", "ai.backend"), log_level)
+
+    # Validate and fill configurations
+    # (allow_extra will make configs to be forward-compatible)
+    try:
+        cfg = config.check(raw_cfg, agent_local_config_iv)
+
+        if cfg["agent"]["backend"] == AgentBackend.KUBERNETES:
+            if cfg["container"]["scratch-type"] == "k8s-nfs" and (
+                cfg["container"]["scratch-nfs-address"] is None
+                or cfg["container"]["scratch-nfs-options"] is None
+            ):
+                raise ValueError(
+                    "scratch-nfs-address and scratch-nfs-options are required for k8s-nfs"
+                )
+        if cfg["agent"]["backend"] == AgentBackend.DOCKER:
+            config.check(raw_cfg, docker_extra_config_iv)
+        if "debug" in cfg and cfg["debug"]["enabled"]:
+            print("== Agent configuration ==")
+            pprint(cfg)
+        cfg["_src"] = cfg_src_path
+    except config.ConfigurationError as e:
+        print("ConfigurationError: Validation of agent local config has failed:", file=sys.stderr)
+        print(pformat(e.invalid_data), file=sys.stderr)
+        raise click.Abort()
+
+    return cfg
diff --git a/src/ai/backend/agent/server.py b/src/ai/backend/agent/server.py
index 74f1a81037..52df54f30b 100644
--- a/src/ai/backend/agent/server.py
+++ b/src/ai/backend/agent/server.py
@@ -15,7 +15,6 @@
 from ipaddress import _BaseAddress as BaseIPAddress
 from ipaddress import ip_network
 from pathlib import Path
-from pprint import pformat, pprint
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -77,11 +76,11 @@
     agent_etcd_config_iv,
     agent_local_config_iv,
     container_etcd_config_iv,
-    docker_extra_config_iv,
+    get_agent_cfg,
 )
 from .exception import ResourceError
 from .monitor import AgentErrorPluginContext, AgentStatsPluginContext
-from .types import AgentBackend, LifecycleEvent, VolumeInfo
+from .types import LifecycleEvent, VolumeInfo
 from .utils import get_arch_name, get_subnet_ip
 
 if TYPE_CHECKING:
@@ -970,60 +969,7 @@ def main(
     debug: bool = False,
 ) -> int:
     """Start the agent service as a foreground process."""
-    # Determine where to read configuration.
-    try:
-        raw_cfg, cfg_src_path = config.read_from_file(config_path, "agent")
-    except config.ConfigurationError as e:
-        print(
-            "ConfigurationError: Could not read or validate the storage-proxy local config:",
-            file=sys.stderr,
-        )
-        print(pformat(e.invalid_data), file=sys.stderr)
-        raise click.Abort()
-
-    # Override the read config with environment variables (for legacy).
-    config.override_with_env(raw_cfg, ("etcd", "namespace"), "BACKEND_NAMESPACE")
-    config.override_with_env(raw_cfg, ("etcd", "addr"), "BACKEND_ETCD_ADDR")
-    config.override_with_env(raw_cfg, ("etcd", "user"), "BACKEND_ETCD_USER")
-    config.override_with_env(raw_cfg, ("etcd", "password"), "BACKEND_ETCD_PASSWORD")
-    config.override_with_env(
-        raw_cfg, ("agent", "rpc-listen-addr", "host"), "BACKEND_AGENT_HOST_OVERRIDE"
-    )
-    config.override_with_env(raw_cfg, ("agent", "rpc-listen-addr", "port"), "BACKEND_AGENT_PORT")
-    config.override_with_env(raw_cfg, ("agent", "pid-file"), "BACKEND_PID_FILE")
-    config.override_with_env(raw_cfg, ("container", "port-range"), "BACKEND_CONTAINER_PORT_RANGE")
-    config.override_with_env(raw_cfg, ("container", "bind-host"), "BACKEND_BIND_HOST_OVERRIDE")
-    config.override_with_env(raw_cfg, ("container", "sandbox-type"), "BACKEND_SANDBOX_TYPE")
-    config.override_with_env(raw_cfg, ("container", "scratch-root"), "BACKEND_SCRATCH_ROOT")
-
-    if debug:
-        log_level = LogSeverity.DEBUG
-    config.override_key(raw_cfg, ("debug", "enabled"), log_level == LogSeverity.DEBUG)
-    config.override_key(raw_cfg, ("logging", "level"), log_level)
-    config.override_key(raw_cfg, ("logging", "pkg-ns", "ai.backend"), log_level)
-
-    # Validate and fill configurations
-    # (allow_extra will make configs to be forward-copmatible)
-    try:
-        cfg = config.check(raw_cfg, agent_local_config_iv)
-        if cfg["agent"]["backend"] == AgentBackend.KUBERNETES:
-            if cfg["container"]["scratch-type"] == "k8s-nfs" and (
-                cfg["container"]["scratch-nfs-address"] is None
-                or cfg["container"]["scratch-nfs-options"] is None
-            ):
-                raise ValueError(
-                    "scratch-nfs-address and scratch-nfs-options are required for k8s-nfs"
-                )
-        if cfg["agent"]["backend"] == AgentBackend.DOCKER:
-            config.check(raw_cfg, docker_extra_config_iv)
-        if "debug" in cfg and cfg["debug"]["enabled"]:
-            print("== Agent configuration ==")
-            pprint(cfg)
-        cfg["_src"] = cfg_src_path
-    except config.ConfigurationError as e:
-        print("ConfigurationError: Validation of agent local config has failed:", file=sys.stderr)
-        print(pformat(e.invalid_data), file=sys.stderr)
-        raise click.Abort()
+    cfg = get_agent_cfg(config_path, log_level, debug)
 
     # FIXME: Remove this after ARM64 support lands on Jail
     current_arch = get_arch_name()