Skip to content

Commit

Permalink
feat: Implement agent status check CLI command
Browse files Browse the repository at this point in the history
  • Loading branch information
jopemachine committed Jun 24, 2024
1 parent 7848551 commit a5ee02a
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 57 deletions.
82 changes: 82 additions & 0 deletions src/ai/backend/agent/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,89 @@
import asyncio
import pathlib
from pathlib import Path

import click
from tabulate import tabulate

from ai.backend.agent.config import get_agent_cfg
from ai.backend.agent.server import agent_local_config_iv
from ai.backend.cli.types import CliContextInfo
from ai.backend.common import config
from ai.backend.common.types import LogSeverity


@click.group()
def main() -> None:
    """The root entrypoint for unified CLI of agent"""
    # Intentionally empty: this group only serves as the parent that the
    # subcommands below attach to via ``@main.command()``.


async def inspect_agent_status(cli_ctx: CliContextInfo, agent_pid: int) -> None:
    """
    Print a table describing the process identified by ``agent_pid``.

    Runs ``ps -p <agent_pid> -f``, splits its output into columns and
    re-renders it with ``tabulate``.

    :param cli_ctx: The CLI context object (not used by this function).
    :param agent_pid: The process ID to look up.
    :raises RuntimeError: If ``ps`` writes anything to stderr or prints
        nothing at all to stdout.
    """
    # Pass an argv list instead of a shell string: no shell is needed and
    # there are no quoting pitfalls.
    args = ("ps", "-p", str(agent_pid), "-f")
    command = " ".join(args)
    process = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        # stderr must be piped explicitly; otherwise communicate() returns
        # None for it and the error check below could never trigger.
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await process.communicate()
    if stderr:
        raise RuntimeError(f"Failed to execute the command: {command}")

    lines = stdout.decode().splitlines()
    if not lines:
        # Without even a header line there is nothing to tabulate.
        raise RuntimeError(f"Failed to execute the command: {command}")

    header, *rows = lines
    process_list = []
    for row in rows:
        columns = row.split()
        # Combine all text following UID, PID, PPID, C, STIME, TTY, TIME into CMD
        process_list.append(columns[:7] + [" ".join(columns[7:])])

    print(tabulate(process_list, headers=header.split(), tablefmt="pretty"))


@main.command()
@click.pass_obj
@click.option(
    "-f",
    "--config-path",
    "--config",
    type=click.Path(
        file_okay=True,
        dir_okay=False,
        exists=True,
        path_type=pathlib.Path,
    ),
    default=None,
    help="The config file path. (default: ./agent.toml and /etc/backend.ai/agent.toml)",
)
@click.option(
    "--debug",
    is_flag=True,
    help="Set the logging level to DEBUG",
)
@click.option(
    "-s",
    "--systemctl",
    is_flag=True,
    help="Include the systemctl status command result in the output",
)
@click.option(
    "--log-level",
    type=click.Choice([*LogSeverity], case_sensitive=False),
    default=LogSeverity.INFO,
    help="Set the logging verbosity level",
)
def status(
    cli_ctx: CliContextInfo,
    config_path: Path,
    log_level: LogSeverity,
    debug: bool = False,
    systemctl: bool = False,
) -> None:
    """
    Collect and print each agent process's status.
    """
    # Flow: load the agent config, read the PID from the configured
    # "pid-file", then print the `ps` information for that PID.
    #
    # NOTE(review): the ``systemctl`` flag is accepted but not consumed
    # anywhere in this function yet.
    cfg = config.check(get_agent_cfg(config_path, log_level, debug), agent_local_config_iv)
    pid_filepath = cfg["agent"]["pid-file"]

    # Report an unreadable or malformed PID file as a regular CLI error
    # instead of letting a raw traceback escape to the user.
    try:
        with open(pid_filepath, "r") as file:
            agent_pid = int(file.read())
    except OSError as e:
        raise click.ClickException(
            f"Could not read the agent PID file '{pid_filepath}': {e}"
        ) from e
    except ValueError as e:
        raise click.ClickException(
            f"The agent PID file '{pid_filepath}' does not contain a valid PID."
        ) from e

    asyncio.run(inspect_agent_status(cli_ctx, agent_pid))
67 changes: 67 additions & 0 deletions src/ai/backend/agent/config.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import os
import sys
from pathlib import Path
from pprint import pformat, pprint

import click
import trafaret as t

from ai.backend.common import config
from ai.backend.common import validators as tx
from ai.backend.common.types import LogSeverity

from .affinity_map import AffinityPolicy
from .stats import StatModes
Expand Down Expand Up @@ -138,3 +143,65 @@
t.Key("kernel-uid", optional=True): t.ToInt,
t.Key("kernel-gid", optional=True): t.ToInt,
}).allow_extra("*")


def get_agent_cfg(
    config_path: Path, log_level: LogSeverity, debug: bool = False
) -> dict[str, t.Any]:
    """
    Read, override and validate the agent's local configuration.

    The raw config is read with ``config.read_from_file(config_path, "agent")``,
    selected keys are overridden from ``BACKEND_*`` environment variables,
    the debug/logging keys are overridden from the arguments, and the result
    is validated against ``agent_local_config_iv``. The source path returned
    by the reader is stored under the ``"_src"`` key of the returned config.

    :param config_path: Path passed through to ``config.read_from_file``.
    :param log_level: Logging level written into the config; replaced by
        ``LogSeverity.DEBUG`` when ``debug`` is true.
    :param debug: Force the DEBUG log level.
    :returns: The validated configuration mapping.
    :raises click.Abort: If reading or validating the config raises
        ``config.ConfigurationError`` (details are printed to stderr first).
    :raises ValueError: If the Kubernetes backend uses the ``k8s-nfs`` scratch
        type without both ``scratch-nfs-address`` and ``scratch-nfs-options``.
    """
    # Determine where to read configuration.
    try:
        raw_cfg, cfg_src_path = config.read_from_file(config_path, "agent")
    except config.ConfigurationError as e:
        # This reads the *agent* config; the message previously named the
        # storage-proxy by mistake.
        print(
            "ConfigurationError: Could not read or validate the agent local config:",
            file=sys.stderr,
        )
        print(pformat(e.invalid_data), file=sys.stderr)
        raise click.Abort()

    # Override the read config with environment variables (for legacy).
    config.override_with_env(raw_cfg, ("etcd", "namespace"), "BACKEND_NAMESPACE")
    config.override_with_env(raw_cfg, ("etcd", "addr"), "BACKEND_ETCD_ADDR")
    config.override_with_env(raw_cfg, ("etcd", "user"), "BACKEND_ETCD_USER")
    config.override_with_env(raw_cfg, ("etcd", "password"), "BACKEND_ETCD_PASSWORD")
    config.override_with_env(
        raw_cfg, ("agent", "rpc-listen-addr", "host"), "BACKEND_AGENT_HOST_OVERRIDE"
    )
    config.override_with_env(raw_cfg, ("agent", "rpc-listen-addr", "port"), "BACKEND_AGENT_PORT")
    config.override_with_env(raw_cfg, ("agent", "pid-file"), "BACKEND_PID_FILE")
    config.override_with_env(raw_cfg, ("container", "port-range"), "BACKEND_CONTAINER_PORT_RANGE")
    config.override_with_env(raw_cfg, ("container", "bind-host"), "BACKEND_BIND_HOST_OVERRIDE")
    config.override_with_env(raw_cfg, ("container", "sandbox-type"), "BACKEND_SANDBOX_TYPE")
    config.override_with_env(raw_cfg, ("container", "scratch-root"), "BACKEND_SCRATCH_ROOT")

    # The --debug flag takes precedence over an explicitly given log level.
    if debug:
        log_level = LogSeverity.DEBUG
    config.override_key(raw_cfg, ("debug", "enabled"), log_level == LogSeverity.DEBUG)
    config.override_key(raw_cfg, ("logging", "level"), log_level)
    config.override_key(raw_cfg, ("logging", "pkg-ns", "ai.backend"), log_level)

    # Validate and fill configurations
    # (allow_extra will make configs to be forward-compatible)
    try:
        cfg = config.check(raw_cfg, agent_local_config_iv)

        if cfg["agent"]["backend"] == AgentBackend.KUBERNETES:
            if cfg["container"]["scratch-type"] == "k8s-nfs" and (
                cfg["container"]["scratch-nfs-address"] is None
                or cfg["container"]["scratch-nfs-options"] is None
            ):
                raise ValueError(
                    "scratch-nfs-address and scratch-nfs-options are required for k8s-nfs"
                )
        if cfg["agent"]["backend"] == AgentBackend.DOCKER:
            # Only validates the Docker-specific keys; the result is discarded.
            config.check(raw_cfg, docker_extra_config_iv)
        if "debug" in cfg and cfg["debug"]["enabled"]:
            print("== Agent configuration ==")
            pprint(cfg)
        # Remember which file the configuration was loaded from.
        cfg["_src"] = cfg_src_path
    except config.ConfigurationError as e:
        print("ConfigurationError: Validation of agent local config has failed:", file=sys.stderr)
        print(pformat(e.invalid_data), file=sys.stderr)
        raise click.Abort()

    return cfg
60 changes: 3 additions & 57 deletions src/ai/backend/agent/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from ipaddress import _BaseAddress as BaseIPAddress
from ipaddress import ip_network
from pathlib import Path
from pprint import pformat, pprint
from typing import (
TYPE_CHECKING,
Any,
Expand Down Expand Up @@ -77,11 +76,11 @@
agent_etcd_config_iv,
agent_local_config_iv,
container_etcd_config_iv,
docker_extra_config_iv,
get_agent_cfg,
)
from .exception import ResourceError
from .monitor import AgentErrorPluginContext, AgentStatsPluginContext
from .types import AgentBackend, LifecycleEvent, VolumeInfo
from .types import LifecycleEvent, VolumeInfo
from .utils import get_arch_name, get_subnet_ip

if TYPE_CHECKING:
Expand Down Expand Up @@ -970,60 +969,7 @@ def main(
debug: bool = False,
) -> int:
"""Start the agent service as a foreground process."""
# Determine where to read configuration.
try:
raw_cfg, cfg_src_path = config.read_from_file(config_path, "agent")
except config.ConfigurationError as e:
print(
"ConfigurationError: Could not read or validate the storage-proxy local config:",
file=sys.stderr,
)
print(pformat(e.invalid_data), file=sys.stderr)
raise click.Abort()

# Override the read config with environment variables (for legacy).
config.override_with_env(raw_cfg, ("etcd", "namespace"), "BACKEND_NAMESPACE")
config.override_with_env(raw_cfg, ("etcd", "addr"), "BACKEND_ETCD_ADDR")
config.override_with_env(raw_cfg, ("etcd", "user"), "BACKEND_ETCD_USER")
config.override_with_env(raw_cfg, ("etcd", "password"), "BACKEND_ETCD_PASSWORD")
config.override_with_env(
raw_cfg, ("agent", "rpc-listen-addr", "host"), "BACKEND_AGENT_HOST_OVERRIDE"
)
config.override_with_env(raw_cfg, ("agent", "rpc-listen-addr", "port"), "BACKEND_AGENT_PORT")
config.override_with_env(raw_cfg, ("agent", "pid-file"), "BACKEND_PID_FILE")
config.override_with_env(raw_cfg, ("container", "port-range"), "BACKEND_CONTAINER_PORT_RANGE")
config.override_with_env(raw_cfg, ("container", "bind-host"), "BACKEND_BIND_HOST_OVERRIDE")
config.override_with_env(raw_cfg, ("container", "sandbox-type"), "BACKEND_SANDBOX_TYPE")
config.override_with_env(raw_cfg, ("container", "scratch-root"), "BACKEND_SCRATCH_ROOT")

if debug:
log_level = LogSeverity.DEBUG
config.override_key(raw_cfg, ("debug", "enabled"), log_level == LogSeverity.DEBUG)
config.override_key(raw_cfg, ("logging", "level"), log_level)
config.override_key(raw_cfg, ("logging", "pkg-ns", "ai.backend"), log_level)

# Validate and fill configurations
# (allow_extra will make configs to be forward-copmatible)
try:
cfg = config.check(raw_cfg, agent_local_config_iv)
if cfg["agent"]["backend"] == AgentBackend.KUBERNETES:
if cfg["container"]["scratch-type"] == "k8s-nfs" and (
cfg["container"]["scratch-nfs-address"] is None
or cfg["container"]["scratch-nfs-options"] is None
):
raise ValueError(
"scratch-nfs-address and scratch-nfs-options are required for k8s-nfs"
)
if cfg["agent"]["backend"] == AgentBackend.DOCKER:
config.check(raw_cfg, docker_extra_config_iv)
if "debug" in cfg and cfg["debug"]["enabled"]:
print("== Agent configuration ==")
pprint(cfg)
cfg["_src"] = cfg_src_path
except config.ConfigurationError as e:
print("ConfigurationError: Validation of agent local config has failed:", file=sys.stderr)
print(pformat(e.invalid_data), file=sys.stderr)
raise click.Abort()
cfg = config.check(get_agent_cfg(config_path, log_level, debug), agent_local_config_iv)

# FIXME: Remove this after ARM64 support lands on Jail
current_arch = get_arch_name()
Expand Down

0 comments on commit a5ee02a

Please sign in to comment.