From 45cde2053bd9b0d1c47d411ae6d93aad1b199ca8 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Sat, 3 Aug 2024 00:18:36 +0900 Subject: [PATCH] fix: Robust type check when idle checker fetches utilization data (#2601) --- changes/2601.fix.md | 1 + src/ai/backend/manager/idle.py | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) create mode 100644 changes/2601.fix.md diff --git a/changes/2601.fix.md b/changes/2601.fix.md new file mode 100644 index 0000000000..ee8b87c38c --- /dev/null +++ b/changes/2601.fix.md @@ -0,0 +1 @@ +Robust type check when idle checker fetches utilization data. diff --git a/src/ai/backend/manager/idle.py b/src/ai/backend/manager/idle.py index 648d30793e..89f952b995 100644 --- a/src/ai/backend/manager/idle.py +++ b/src/ai/backend/manager/idle.py @@ -1038,12 +1038,21 @@ async def get_current_utilization( try: utilizations = {k: 0.0 for k in self.resource_thresholds.keys()} live_stat = {} + divider = len(kernel_ids) if kernel_ids else 1 for kernel_id in kernel_ids: - raw_live_stat = await redis_helper.execute( - self._redis_stat, - lambda r: r.get(str(kernel_id)), + raw_live_stat = cast( + bytes | None, + await redis_helper.execute( + self._redis_stat, + lambda r: r.get(str(kernel_id)), + ), ) - live_stat = msgpack.unpackb(raw_live_stat) + if raw_live_stat is None: + log.warning( + f"Utilization data not found or failed to fetch utilization data, abort idle check (k:{kernel_id})" + ) + return None + live_stat = cast(dict[str, Any], msgpack.unpackb(raw_live_stat)) kernel_utils = { k: float(nmget(live_stat, f"{k}.pct", 0.0)) for k in self.resource_thresholds.keys() @@ -1052,9 +1061,7 @@ async def get_current_utilization( utilizations = { k: utilizations[k] + kernel_utils[k] for k in self.resource_thresholds.keys() } - utilizations = { - k: utilizations[k] / len(kernel_ids) for k in self.resource_thresholds.keys() - } + utilizations = {k: utilizations[k] / divider for k in self.resource_thresholds.keys()} # NOTE: Manual calculation of mem utilization. # mem.capacity does not report total amount of memory allocated to @@ -1065,7 +1072,8 @@ async def get_current_utilization( utilizations["mem"] = mem_current / mem_slots * 100 if mem_slots > 0 else 0 return utilizations except Exception as e: - log.warning("Unable to collect utilization for idleness check", exc_info=e) + _msg = f"Unable to collect utilization for idleness check (kernels:{kernel_ids})" + log.warning(_msg, exc_info=e) return None async def get_checker_result(