From 67e8dad72427e580f72dd1d945516328497b98f1 Mon Sep 17 00:00:00 2001 From: octodog Date: Wed, 14 Aug 2024 14:19:54 +0900 Subject: [PATCH 1/9] fix:let vfolder GET requests contain data in params rather than body (#2706) (#2714) Co-authored-by: pilmo kim <68311908+why-arong@users.noreply.github.com> Co-authored-by: Kyujin Cho --- changes/2706.fix.md | 1 + src/ai/backend/client/func/vfolder.py | 24 ++++++++++-------------- tests/client/test_vfolder.py | 11 ++++++++--- 3 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 changes/2706.fix.md diff --git a/changes/2706.fix.md b/changes/2706.fix.md new file mode 100644 index 0000000000..9db5a3d8c5 --- /dev/null +++ b/changes/2706.fix.md @@ -0,0 +1 @@ +Fix `list_files`, `get_fstab_contents`, `get_performance_metric` and `shared_vfolder_info` Python SDK function not working with `ValidationError` exception printed diff --git a/src/ai/backend/client/func/vfolder.py b/src/ai/backend/client/func/vfolder.py index ca8d959eb4..730dc8882a 100644 --- a/src/ai/backend/client/func/vfolder.py +++ b/src/ai/backend/client/func/vfolder.py @@ -547,10 +547,7 @@ async def delete_files(self, files: Sequence[Union[str, Path]], recursive: bool @api_function async def list_files(self, path: Union[str, Path] = "."): - rqst = Request("GET", "/folders/{}/files".format(self.name)) - rqst.set_json({ - "path": path, - }) + rqst = Request("GET", "/folders/{}/files".format(self.name), params={"path": str(path)}) async with rqst.fetch() as resp: return await resp.json() @@ -590,20 +587,20 @@ async def delete_invitation(cls, inv_id: str): @api_function @classmethod async def get_fstab_contents(cls, agent_id=None): - rqst = Request("GET", "/folders/_/fstab") - rqst.set_json({ - "agent_id": agent_id, - }) + rqst = Request( + "GET", + "/folders/_/fstab", + params={ + "agent_id": agent_id, + }, + ) async with rqst.fetch() as resp: return await resp.json() @api_function @classmethod async def get_performance_metric(cls, folder_host: str): - rqst = Request("GET", "/folders/_/perf-metric") - rqst.set_json({ - "folder_host": folder_host, - }) + rqst = Request("GET", "/folders/_/perf-metric", params={"folder_host": folder_host}) async with rqst.fetch() as resp: return await resp.json() @@ -702,8 +699,7 @@ async def list_shared_vfolders(cls): @api_function @classmethod async def shared_vfolder_info(cls, vfolder_id: str): - rqst = Request("GET", "folders/_/shared") - rqst.set_json({"vfolder_id": vfolder_id}) + rqst = Request("GET", "folders/_/shared", params={"vfolder_id": vfolder_id}) async with rqst.fetch() as resp: return await resp.json() diff --git a/tests/client/test_vfolder.py b/tests/client/test_vfolder.py index 5324bd8f0f..3e6bfdc76e 100644 --- a/tests/client/test_vfolder.py +++ b/tests/client/test_vfolder.py @@ -1,18 +1,21 @@ +from typing import Mapping, Union from unittest import mock import pytest from aioresponses import aioresponses -from ai.backend.client.config import API_VERSION +from ai.backend.client.config import API_VERSION, APIConfig from ai.backend.client.session import Session from ai.backend.testutils.mock import AsyncMock -def build_url(config, path: str): +def build_url(config: APIConfig, path: str, params: Mapping[str, Union[str, int]] = None): base_url = config.endpoint.path.rstrip("/") query_path = path.lstrip("/") if len(path) > 0 else "" path = "{0}/{1}".format(base_url, query_path) canonical_url = config.endpoint.with_path(path) + if params: + canonical_url = canonical_url.with_query(params) return canonical_url @@ -138,7 +141,9 @@ def test_vfolder_list_files(): "folder_path": "/mnt/local/1f6bd27fde1248cabfb50306ea83fc0a", } m.get( - build_url(session.config, "/folders/{}/files".format(vfolder_name)), + build_url( + session.config, "/folders/{}/files".format(vfolder_name), params={"path": "."} + ), status=200, payload=payload, ) From 2aef6ec37f34232065db699e997a0657c8b5179e Mon Sep 17 00:00:00 2001 From: octodog Date: Tue, 20 Aug 2024 17:57:00 +0900 Subject: [PATCH 2/9] fix: Handle fs error when deleting vfolder (#2741) (#2744) Co-authored-by: Sanghun Lee --- changes/2741.fix.md | 1 + src/ai/backend/storage/api/manager.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 changes/2741.fix.md diff --git a/changes/2741.fix.md b/changes/2741.fix.md new file mode 100644 index 0000000000..0ad0b273ee --- /dev/null +++ b/changes/2741.fix.md @@ -0,0 +1 @@ +Handle OS Error when deleting vfolders. diff --git a/src/ai/backend/storage/api/manager.py b/src/ai/backend/storage/api/manager.py index aa36fd05a5..41bced9a6c 100644 --- a/src/ai/backend/storage/api/manager.py +++ b/src/ai/backend/storage/api/manager.py @@ -381,7 +381,8 @@ class Params(TypedDict): await log_manager_api_entry(log, "delete_vfolder", params) ctx: RootContext = request.app["ctx"] async with ctx.get_volume(params["volume"]) as volume: - await volume.delete_vfolder(params["vfid"]) + with handle_fs_errors(volume, params["vfid"]): + await volume.delete_vfolder(params["vfid"]) return web.Response(status=204) From c81ff70510294434d7c0de9f44ddb6bd582ac870 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 26 Aug 2024 19:28:34 +0900 Subject: [PATCH 3/9] fix: Mishandling of undefined value in the `ModifyImage` mutation (#2770) --- changes/2028.fix.md | 1 + src/ai/backend/manager/models/image.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 changes/2028.fix.md diff --git a/changes/2028.fix.md b/changes/2028.fix.md new file mode 100644 index 0000000000..0b84457b1e --- /dev/null +++ b/changes/2028.fix.md @@ -0,0 +1 @@ +Fix handling of undefined values in the ModifyImage GraphQL mutation. diff --git a/src/ai/backend/manager/models/image.py b/src/ai/backend/manager/models/image.py index 172721c67f..28acbf124a 100644 --- a/src/ai/backend/manager/models/image.py +++ b/src/ai/backend/manager/models/image.py @@ -995,7 +995,7 @@ async def mutate( ) set_if_set(props, data, "labels", clean_func=lambda v: {pair.key: pair.value for pair in v}) - if props.resource_limits is not None: + if props.resource_limits is not Undefined: resources_data = {} for limit_option in props.resource_limits: limit_data = {} From 03b6e098a7ef2945be96de7c93d6a6cce8929006 Mon Sep 17 00:00:00 2001 From: octodog Date: Tue, 27 Aug 2024 01:17:32 +0900 Subject: [PATCH 4/9] fix: Make the config file be treated as multiline by re (#2771) (#2772) Co-authored-by: Sion Kang Co-authored-by: Joongi Kim --- changes/2771.fix.md | 1 + src/ai/backend/install/context.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 changes/2771.fix.md diff --git a/changes/2771.fix.md b/changes/2771.fix.md new file mode 100644 index 0000000000..77dbc29cd6 --- /dev/null +++ b/changes/2771.fix.md @@ -0,0 +1 @@ +Make the regex patterns to update configuration files working with multiline texts correctly in the TUI installer diff --git a/src/ai/backend/install/context.py b/src/ai/backend/install/context.py index cb9144f8ed..19cabd1d11 100644 --- a/src/ai/backend/install/context.py +++ b/src/ai/backend/install/context.py @@ -341,7 +341,7 @@ async def configure_manager(self) -> None: self.sed_in_place_multi( toml_path, [ - (re.compile("^num-proc = .*"), "num-proc = 1"), + (re.compile("^num-proc = .*", flags=re.M), "num-proc = 1"), ("port = 8120", f"port = {halfstack.etcd_addr[0].face.port}"), ("port = 8100", f"port = {halfstack.postgres_addr.face.port}"), ( @@ -349,7 +349,7 @@ async def configure_manager(self) -> None: f"port = {self.install_info.service_config.manager_addr.bind.port}", ), ( - re.compile("^(# )?ipc-base-path =.*"), + re.compile("^(# )?ipc-base-path =.*", flags=re.M), f'ipc-base-path = "{self.install_info.service_config.manager_ipc_base_path}"', ), ], @@ -424,15 +424,15 @@ async def configure_agent(self) -> None: ("port = 6001", f"port = {service.agent_rpc_addr.bind.port}"), ("port = 6009", f"port = {service.agent_watcher_addr.bind.port}"), ( - re.compile("^(# )?ipc-base-path = .*"), + re.compile("^(# )?ipc-base-path = .*", flags=re.M), f'ipc-base-path = "{service.agent_ipc_base_path}"', ), ( - re.compile("^(# )?var-base-path = .*"), + re.compile("^(# )?var-base-path = .*", flags=re.M), f'var-base-path = "{service.agent_var_base_path}"', ), ( - re.compile("(# )?mount_path = .*"), + re.compile("(# )?mount_path = .*", flags=re.M), f'"{self.install_info.base_path / service.vfolder_relpath}"', ), ], @@ -445,7 +445,7 @@ async def configure_agent(self) -> None: # "cuda.devices = 0" as the agent capacity, but it will still run. self.sed_in_place( toml_path, - re.compile("^(# )?allow-compute-plugins = .*"), + re.compile("^(# )?allow-compute-plugins = .*", flags=re.M), 'allow-compute-plugins = ["ai.backend.accelerator.cuda_open"]', ) # TODO: let the installer enable the CUDA plugin only when it verifies CUDA availability or From 77b86ccbde4c1682d2c355dc9b663506af4c5ccd Mon Sep 17 00:00:00 2001 From: octodog Date: Thu, 29 Aug 2024 18:48:05 +0900 Subject: [PATCH 5/9] fix: Handle container port mismatch when creating kernel (#2786) (#2787) Co-authored-by: Sanghun Lee --- changes/2786.fix.md | 1 + src/ai/backend/agent/docker/agent.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 changes/2786.fix.md diff --git a/changes/2786.fix.md b/changes/2786.fix.md new file mode 100644 index 0000000000..11c7c98764 --- /dev/null +++ b/changes/2786.fix.md @@ -0,0 +1 @@ +Handle container port mismatch when creating kernel. diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index 7d616e0311..c45b38d801 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -705,7 +705,9 @@ async def start_container( advertised_kernel_host = self.local_config["container"].get("advertised-host") repl_ports = [2000, 2001] if len(service_ports) + len(repl_ports) > len(self.port_pool): - raise RuntimeError("Container ports are not sufficiently available.") + raise RuntimeError( + f"Container ports are not sufficiently available. (remaining ports: {self.port_pool})" + ) exposed_ports = repl_ports host_ports = [self.port_pool.pop() for _ in repl_ports] for sport in service_ports: @@ -942,6 +944,11 @@ async def start_container( container_id=cid, message="Container port not found" ) host_port = int(ports[0]["HostPort"]) + if host_port != host_ports[idx]: + raise ContainerCreationError( + container_id=cid, + message=f"Port mapping mismatch. {host_port = }, {host_ports[idx] = }", + ) assert host_port == host_ports[idx] if port == 2000: # intrinsic repl_in_port = host_port From cb37801fd19c56223f64380f824fdf3892ce8e80 Mon Sep 17 00:00:00 2001 From: octodog Date: Thu, 29 Aug 2024 18:55:54 +0900 Subject: [PATCH 6/9] chore: change vastdata API token span (#2785) (#2790) Co-authored-by: Sanghun Lee --- src/ai/backend/storage/vast/vastdata_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ai/backend/storage/vast/vastdata_client.py b/src/ai/backend/storage/vast/vastdata_client.py index 71628d96a6..ea54a45386 100644 --- a/src/ai/backend/storage/vast/vastdata_client.py +++ b/src/ai/backend/storage/vast/vastdata_client.py @@ -28,8 +28,8 @@ log = BraceStyleAdapter(logging.getLogger(__spec__.name)) # type: ignore[name-defined] -DEFAULT_ACCESS_TOKEN_SPAN: Final = timedelta(hours=1) -DEFAULT_REFRESH_TOKEN_SPAN: Final = timedelta(hours=24) +DEFAULT_ACCESS_TOKEN_SPAN: Final = timedelta(minutes=1) +DEFAULT_REFRESH_TOKEN_SPAN: Final = timedelta(minutes=10) VASTQuotaID = NewType("VASTQuotaID", str) From 8f237e5be93138e14e73dcaca11e743c214b955f Mon Sep 17 00:00:00 2001 From: octodog Date: Sat, 31 Aug 2024 13:56:36 +0900 Subject: [PATCH 7/9] fix: Correct `msgpack` deserialization of `ResourceSlot` (#2754) (#2757) Co-authored-by: Gyubong Lee Co-authored-by: Joongi Kim --- changes/2754.fix.md | 1 + src/ai/backend/common/msgpack.py | 7 ++++++- tests/common/test_msgpack.py | 19 ++++++++++++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 changes/2754.fix.md diff --git a/changes/2754.fix.md b/changes/2754.fix.md new file mode 100644 index 0000000000..e7d6d77916 --- /dev/null +++ b/changes/2754.fix.md @@ -0,0 +1 @@ +Correct `msgpack` deserialization of `ResourceSlot`. \ No newline at end of file diff --git a/src/ai/backend/common/msgpack.py b/src/ai/backend/common/msgpack.py index d8a2043ba3..9a435b8f4a 100644 --- a/src/ai/backend/common/msgpack.py +++ b/src/ai/backend/common/msgpack.py @@ -14,7 +14,7 @@ import msgpack as _msgpack import temporenc -from .types import BinarySize +from .types import BinarySize, ResourceSlot __all__ = ("packb", "unpackb") @@ -27,6 +27,7 @@ class ExtTypes(enum.IntEnum): POSIX_PATH = 4 PURE_POSIX_PATH = 5 ENUM = 6 + RESOURCE_SLOT = 8 BACKENDAI_BINARY_SIZE = 16 @@ -46,6 +47,8 @@ def _default(obj: object) -> Any: return _msgpack.ExtType(ExtTypes.POSIX_PATH, os.fsencode(obj)) case PurePosixPath(): return _msgpack.ExtType(ExtTypes.PURE_POSIX_PATH, os.fsencode(obj)) + case ResourceSlot(): + return _msgpack.ExtType(ExtTypes.RESOURCE_SLOT, pickle.dumps(obj, protocol=5)) case enum.Enum(): return _msgpack.ExtType(ExtTypes.ENUM, pickle.dumps(obj, protocol=5)) raise TypeError(f"Unknown type: {obj!r} ({type(obj)})") @@ -65,6 +68,8 @@ def _ext_hook(code: int, data: bytes) -> Any: return PurePosixPath(os.fsdecode(data)) case ExtTypes.ENUM: return pickle.loads(data) + case ExtTypes.RESOURCE_SLOT: + return pickle.loads(data) case ExtTypes.BACKENDAI_BINARY_SIZE: return pickle.loads(data) return _msgpack.ExtType(code, data) diff --git a/tests/common/test_msgpack.py b/tests/common/test_msgpack.py index 3b4b811296..74ae3829c4 100644 --- a/tests/common/test_msgpack.py +++ b/tests/common/test_msgpack.py @@ -6,7 +6,7 @@ from dateutil.tz import gettz, tzutc from ai.backend.common import msgpack -from ai.backend.common.types import BinarySize, SlotTypes +from ai.backend.common.types import BinarySize, ResourceSlot, SlotTypes def test_msgpack_with_unicode(): @@ -125,3 +125,20 @@ def test_msgpack_posixpath(): unpacked = msgpack.unpackb(packed) assert isinstance(unpacked["path"], PosixPath) assert unpacked["path"] == path + + +def test_msgpack_resource_slot(): + resource_slot = ResourceSlot({"cpu": 1, "mem": 1024}) + packed = msgpack.packb(resource_slot) + unpacked = msgpack.unpackb(packed) + assert unpacked == resource_slot + + resource_slot = ResourceSlot({"cpu": 2, "mem": Decimal(1024**5)}) + packed = msgpack.packb(resource_slot) + unpacked = msgpack.unpackb(packed) + assert unpacked == resource_slot + + resource_slot = ResourceSlot({"cpu": 3, "mem": "1125899906842624"}) + packed = msgpack.packb(resource_slot) + unpacked = msgpack.unpackb(packed) + assert unpacked == resource_slot From d2510a1b5e7094c9e3cb78e8c23079ae376f4705 Mon Sep 17 00:00:00 2001 From: octodog Date: Mon, 2 Sep 2024 16:29:58 +0900 Subject: [PATCH 8/9] feat: Add `scaling-group-type` in agent.toml (#2796) (#2799) Co-authored-by: Joongi Kim --- changes/2796.feature.md | 1 + configs/agent/sample.toml | 8 +++++++- src/ai/backend/agent/config.py | 7 ++++++- src/ai/backend/agent/stats.py | 2 +- src/ai/backend/common/types.py | 8 +++++++- 5 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 changes/2796.feature.md diff --git a/changes/2796.feature.md b/changes/2796.feature.md new file mode 100644 index 0000000000..a5be486d36 --- /dev/null +++ b/changes/2796.feature.md @@ -0,0 +1 @@ +Add an explicit configuration `scaling-group-type` to `agent.toml` so that the agent could distinguish whether itself belongs to an SFTP resource group or not diff --git a/configs/agent/sample.toml b/configs/agent/sample.toml index 8efdd3ba01..2f39c2cd29 100644 --- a/configs/agent/sample.toml +++ b/configs/agent/sample.toml @@ -48,10 +48,16 @@ agent-sock-port = 6007 # This affects the per-node configuration scope. # id = "i-something-special" -# Set the scaling group of this agent. +# Set the scaling group (aka resource group) of this agent. # This affects the per-sgroup configuration scope. scaling-group = "default" +# Set the type of scaling group (aka resource group) of this agent. +# - "compute": The agent hosts computing workloads, facing the internal cluster nodes. +# - "storage": The agent hosts storage-access containers, directly facing public/user-side netweorks. +# [default: "compute"] +# scaling-group-type = "compute" + # Set the volume mount path for the agent node. # mount-path = "/vfroot" diff --git a/src/ai/backend/agent/config.py b/src/ai/backend/agent/config.py index 7cbad69576..6dc0958d4f 100644 --- a/src/ai/backend/agent/config.py +++ b/src/ai/backend/agent/config.py @@ -4,6 +4,7 @@ from ai.backend.common import config from ai.backend.common import validators as tx +from ai.backend.common.types import ResourceGroupType from .affinity_map import AffinityPolicy from .stats import StatModes @@ -38,6 +39,9 @@ t.Key("region", default=None): t.Null | t.String, t.Key("instance-type", default=None): t.Null | t.String, t.Key("scaling-group", default="default"): t.String, + t.Key("scaling-group-type", default=ResourceGroupType.COMPUTE): t.Enum( + *(e.value for e in ResourceGroupType) + ), t.Key("pid-file", default=os.devnull): tx.Path( type="file", allow_nonexisting=True, allow_devnull=True ), @@ -65,7 +69,8 @@ t.Key("bind-host", default=""): t.String(allow_blank=True), t.Key("advertised-host", default=None): t.Null | t.String(), t.Key("port-range", default=(30000, 31000)): tx.PortRange, - t.Key("stats-type", default="docker"): t.Null | t.Enum(*[e.value for e in StatModes]), + t.Key("stats-type", default=StatModes.DOCKER): t.Null + | t.Enum(*(e.value for e in StatModes)), t.Key("sandbox-type", default="docker"): t.Enum("docker", "jail"), t.Key("jail-args", default=[]): t.List(t.String), t.Key("scratch-type"): t.Enum("hostdir", "hostfile", "memory", "k8s-nfs"), diff --git a/src/ai/backend/agent/stats.py b/src/ai/backend/agent/stats.py index 9a1f940c59..60eeeefe7f 100644 --- a/src/ai/backend/agent/stats.py +++ b/src/ai/backend/agent/stats.py @@ -67,7 +67,7 @@ def check_cgroup_available(): return not is_containerized() and sys.platform.startswith("linux") -class StatModes(enum.Enum): +class StatModes(enum.StrEnum): CGROUP = "cgroup" DOCKER = "docker" diff --git a/src/ai/backend/common/types.py b/src/ai/backend/common/types.py index 2344ec58a5..7c468af0e1 100644 --- a/src/ai/backend/common/types.py +++ b/src/ai/backend/common/types.py @@ -65,6 +65,7 @@ "SlotName", "IntrinsicSlotNames", "ResourceSlot", + "ResourceGroupType", "ReadableCIDR", "HardwareMetadata", "ModelServiceStatus", @@ -287,7 +288,12 @@ class SessionResult(str, enum.Enum): FAILURE = "failure" -class ClusterMode(str, enum.Enum): +class ResourceGroupType(enum.StrEnum): + COMPUTE = enum.auto() + STORAGE = enum.auto() + + +class ClusterMode(enum.StrEnum): SINGLE_NODE = "single-node" MULTI_NODE = "multi-node" From aa8b69de3ee9bb4c70e0cebe8d2f6ec6509a4084 Mon Sep 17 00:00:00 2001 From: octodog Date: Mon, 2 Sep 2024 17:05:35 +0900 Subject: [PATCH 9/9] fix: Explicitly specify protected service ports not to expose publicly (#2797) (#2802) Co-authored-by: Joongi Kim --- changes/2797.fix.md | 1 + src/ai/backend/agent/agent.py | 16 +++++++++ src/ai/backend/agent/docker/agent.py | 41 ++++++++++++++++++++---- src/ai/backend/agent/docker/kernel.py | 2 +- src/ai/backend/agent/dummy/agent.py | 12 +++++++ src/ai/backend/agent/kubernetes/agent.py | 14 +++++++- 6 files changed, 78 insertions(+), 8 deletions(-) create mode 100644 changes/2797.fix.md diff --git a/changes/2797.fix.md b/changes/2797.fix.md new file mode 100644 index 0000000000..e92a956d34 --- /dev/null +++ b/changes/2797.fix.md @@ -0,0 +1 @@ +Explicitly set the protected service ports depending on the resource group type and the service types diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index eb4721e062..a7cc323227 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -247,6 +247,22 @@ def update_user_bootstrap_script(self, script: str) -> None: """ self.kernel_config["bootstrap_script"] = script + @property + @abstractmethod + def repl_ports(self) -> Sequence[int]: + """ + Return the list of intrinsic REPL ports to exclude from public mapping. + """ + raise NotImplementedError + + @property + @abstractmethod + def protected_services(self) -> Sequence[str]: + """ + Return the list of protected (intrinsic) service names to exclude from public mapping. + """ + raise NotImplementedError + @abstractmethod async def apply_network(self, cluster_info: ClusterInfo) -> None: """ diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index c45b38d801..47ff819081 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -40,6 +40,7 @@ from aiodocker.exceptions import DockerError from aiomonitor.task import preserve_termination_log from async_timeout import timeout +from typing_extensions import override from ai.backend.common.cgroup import get_cgroup_mount_point from ai.backend.common.docker import MAX_KERNELSPEC, MIN_KERNELSPEC, ImageRef @@ -61,6 +62,7 @@ KernelId, MountPermission, MountTypes, + ResourceGroupType, ResourceSlot, Sentinel, ServicePort, @@ -687,6 +689,21 @@ def _populate_ssh_config(): ) return kernel_obj + @property + @override + def repl_ports(self) -> Sequence[int]: + return (2000, 2001) + + @property + @override + def protected_services(self) -> Sequence[str]: + rgtype: ResourceGroupType = self.local_config["agent"]["scaling-group-type"] + match rgtype: + case ResourceGroupType.COMPUTE: + return () + case ResourceGroupType.STORAGE: + return ("ttyd",) + async def start_container( self, kernel_obj: AbstractKernel, @@ -703,13 +720,13 @@ async def start_container( # PHASE 4: Run! container_bind_host = self.local_config["container"]["bind-host"] advertised_kernel_host = self.local_config["container"].get("advertised-host") - repl_ports = [2000, 2001] - if len(service_ports) + len(repl_ports) > len(self.port_pool): + if len(service_ports) + len(self.repl_ports) > len(self.port_pool): raise RuntimeError( f"Container ports are not sufficiently available. (remaining ports: {self.port_pool})" ) - exposed_ports = repl_ports - host_ports = [self.port_pool.pop() for _ in repl_ports] + exposed_ports = [*self.repl_ports] + host_ports = [self.port_pool.pop() for _ in self.repl_ports] + host_ips = [] for sport in service_ports: exposed_ports.extend(sport["container_ports"]) if ( @@ -725,6 +742,18 @@ async def start_container( else: hport = self.port_pool.pop() host_ports.append(hport) + protected_service_ports: set[int] = set() + for sport in service_ports: + if sport["name"] in self.protected_services: + protected_service_ports.update(sport["container_ports"]) + for eport in exposed_ports: + if eport in self.repl_ports: # always protected + host_ips.append("127.0.0.1") + elif eport in protected_service_ports: # check if protected by resource group type + host_ips.append("127.0.0.1") + else: + host_ips.append(str(container_bind_host)) + assert len(host_ips) == len(host_ports) == len(exposed_ports) container_log_size = self.local_config["agent"]["container-logs"]["max-length"] container_log_file_count = 5 @@ -752,8 +781,8 @@ async def start_container( "HostConfig": { "Init": True, "PortBindings": { - f"{eport}/tcp": [{"HostPort": str(hport), "HostIp": str(container_bind_host)}] - for eport, hport in zip(exposed_ports, host_ports) + f"{eport}/tcp": [{"HostPort": str(hport), "HostIp": hip}] + for eport, hport, hip in zip(exposed_ports, host_ports, host_ips) }, "PublishAllPorts": False, # we manage port mapping manually! "CapAdd": [ diff --git a/src/ai/backend/agent/docker/kernel.py b/src/ai/backend/agent/docker/kernel.py index 3cf12bce99..6e428cc2a4 100644 --- a/src/ai/backend/agent/docker/kernel.py +++ b/src/ai/backend/agent/docker/kernel.py @@ -82,7 +82,7 @@ async def create_code_runner( self.kernel_id, self.session_id, event_producer, - kernel_host=self.data["kernel_host"], + kernel_host="127.0.0.1", # repl ports are always bound to 127.0.0.1 repl_in_port=self.data["repl_in_port"], repl_out_port=self.data["repl_out_port"], exec_timeout=0, diff --git a/src/ai/backend/agent/dummy/agent.py b/src/ai/backend/agent/dummy/agent.py index 48a04e2467..6a3255b718 100644 --- a/src/ai/backend/agent/dummy/agent.py +++ b/src/ai/backend/agent/dummy/agent.py @@ -12,6 +12,8 @@ Tuple, ) +from typing_extensions import override + from ai.backend.common.config import read_from_file from ai.backend.common.docker import ImageRef from ai.backend.common.events import EventProducer @@ -108,6 +110,16 @@ async def prepare_scratch(self) -> None: async def get_intrinsic_mounts(self) -> Sequence[Mount]: return [] + @property + @override + def repl_ports(self) -> Sequence[int]: + return (2000, 2001) + + @property + @override + def protected_services(self) -> Sequence[str]: + return () + async def apply_network(self, cluster_info: ClusterInfo) -> None: return diff --git a/src/ai/backend/agent/kubernetes/agent.py b/src/ai/backend/agent/kubernetes/agent.py index 1f609f5dcc..a2e361b6f2 100644 --- a/src/ai/backend/agent/kubernetes/agent.py +++ b/src/ai/backend/agent/kubernetes/agent.py @@ -30,6 +30,7 @@ import pkg_resources from kubernetes_asyncio import client as kube_client from kubernetes_asyncio import config as kube_config +from typing_extensions import override from ai.backend.common.asyncio import current_loop from ai.backend.common.docker import ImageRef @@ -303,6 +304,17 @@ async def get_intrinsic_mounts(self) -> Sequence[Mount]: return mounts + @property + @override + def repl_ports(self) -> Sequence[int]: + return (2000, 2001) + + @property + @override + def protected_services(self) -> Sequence[str]: + # NOTE: Currently K8s does not support binding container ports to 127.0.0.1 when using NodePort. + return () + async def apply_network(self, cluster_info: ClusterInfo) -> None: pass @@ -655,7 +667,7 @@ async def start_container( await kube_config.load_kube_config() core_api = kube_client.CoreV1Api() apps_api = kube_client.AppsV1Api() - exposed_ports = [2000, 2001] + exposed_ports = [*self.repl_ports] for sport in service_ports: exposed_ports.extend(sport["container_ports"])