Skip to content

Commit

Permalink
fix: Robust type check when idle checker fetches utilization data (#2601)
Browse files Browse the repository at this point in the history
  • Loading branch information
fregataa authored Aug 2, 2024
1 parent bc1ac79 commit 45cde20
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 8 deletions.
1 change: 1 addition & 0 deletions changes/2601.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Robust type check when idle checker fetches utilization data.
24 changes: 16 additions & 8 deletions src/ai/backend/manager/idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -1038,12 +1038,21 @@ async def get_current_utilization(
try:
utilizations = {k: 0.0 for k in self.resource_thresholds.keys()}
live_stat = {}
divider = len(kernel_ids) if kernel_ids else 1
for kernel_id in kernel_ids:
raw_live_stat = await redis_helper.execute(
self._redis_stat,
lambda r: r.get(str(kernel_id)),
raw_live_stat = cast(
bytes | None,
await redis_helper.execute(
self._redis_stat,
lambda r: r.get(str(kernel_id)),
),
)
live_stat = msgpack.unpackb(raw_live_stat)
if raw_live_stat is None:
log.warning(
f"Utilization data not found or failed to fetch utilization data, abort idle check (k:{kernel_id})"
)
return None
live_stat = cast(dict[str, Any], msgpack.unpackb(raw_live_stat))
kernel_utils = {
k: float(nmget(live_stat, f"{k}.pct", 0.0))
for k in self.resource_thresholds.keys()
Expand All @@ -1052,9 +1061,7 @@ async def get_current_utilization(
utilizations = {
k: utilizations[k] + kernel_utils[k] for k in self.resource_thresholds.keys()
}
utilizations = {
k: utilizations[k] / len(kernel_ids) for k in self.resource_thresholds.keys()
}
utilizations = {k: utilizations[k] / divider for k in self.resource_thresholds.keys()}

# NOTE: Manual calculation of mem utilization.
# mem.capacity does not report total amount of memory allocated to
Expand All @@ -1065,7 +1072,8 @@ async def get_current_utilization(
utilizations["mem"] = mem_current / mem_slots * 100 if mem_slots > 0 else 0
return utilizations
except Exception as e:
log.warning("Unable to collect utilization for idleness check", exc_info=e)
_msg = f"Unable to collect utilization for idleness check (kernels:{kernel_ids})"
log.warning(_msg, exc_info=e)
return None

async def get_checker_result(
Expand Down

0 comments on commit 45cde20

Please sign in to comment.