diff --git a/changes/2972.fix.md b/changes/2972.fix.md new file mode 100644 index 0000000000..7444f5e7c8 --- /dev/null +++ b/changes/2972.fix.md @@ -0,0 +1 @@ +Handle errors when converting the `shmem` string value into a `BinarySize` diff --git a/src/ai/backend/manager/defs.py b/src/ai/backend/manager/defs.py index 7c93275b18..0dc2421579 100644 --- a/src/ai/backend/manager/defs.py +++ b/src/ai/backend/manager/defs.py @@ -88,3 +88,5 @@ class LockID(enum.IntEnum): DEFAULT_KEYPAIR_RESOURCE_POLICY_NAME: Final = "default" DEFAULT_KEYPAIR_RATE_LIMIT: Final = 10000 + +DEFAULT_SHARED_MEMORY_SIZE: Final[str] = "64m" diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index e1bb4679f0..e93904ea0e 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -132,7 +132,7 @@ TooManySessionsMatched, ) from .config import LocalConfig, SharedConfig -from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, INTRINSIC_SLOTS +from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, DEFAULT_SHARED_MEMORY_SIZE, INTRINSIC_SLOTS from .exceptions import MultiAgentError, convert_to_status_data from .models import ( AGENT_RESOURCE_OCCUPYING_KERNEL_STATUSES, @@ -1128,10 +1128,20 @@ async def enqueue_session( # We need to subtract the amount of shared memory from the memory limit of # a container, since tmpfs including /dev/shm uses host-side kernel memory # and cgroup's memory limit does not apply. 
- shmem = resource_opts.get("shmem", None) - if shmem is None: - shmem = labels.get("ai.backend.resource.preferred.shmem", "64m") - shmem = BinarySize.from_str(shmem) + raw_shmem: Optional[str] = resource_opts.get("shmem") + if raw_shmem is None: + raw_shmem = labels.get("ai.backend.resource.preferred.shmem") + if not raw_shmem: + # raw_shmem is None or empty string ("") + raw_shmem = DEFAULT_SHARED_MEMORY_SIZE + try: + shmem = BinarySize.from_str(raw_shmem) + except ValueError: + log.warning( + f"Failed to convert raw `shmem({raw_shmem})` " + f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})." + ) + shmem = BinarySize.from_str(DEFAULT_SHARED_MEMORY_SIZE) resource_opts["shmem"] = shmem image_min_slots = copy.deepcopy(image_min_slots) image_min_slots["mem"] += shmem