Skip to content

Commit

Permalink
Change health-check failure logs to report time elapsed since the first failure
Browse files Browse the repository at this point in the history
  • Loading branch information
spal1 committed Dec 12, 2024
1 parent a4c08ba commit 8c16297
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
20 changes: 15 additions & 5 deletions truss/templates/server/model_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import time
import weakref
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from enum import Enum
from functools import cached_property
from multiprocessing import Lock
Expand Down Expand Up @@ -291,6 +292,7 @@ class ModelWrapper:
_predict_semaphore: Semaphore
_poll_for_environment_updates_task: Optional[asyncio.Task]
_environment: Optional[dict]
_first_health_check_failure: Optional[datetime]

class Status(Enum):
NOT_READY = 0
Expand Down Expand Up @@ -320,7 +322,7 @@ def __init__(self, config: Dict, tracer: sdk_trace.Tracer):
)
self._poll_for_environment_updates_task = None
self._environment = None
self._is_ready_failures = 0
self._first_health_check_failure = None

@property
def _model(self) -> Any:
Expand Down Expand Up @@ -556,10 +558,18 @@ async def is_ready(self) -> Optional[bool]:
exc_info=errors.filter_traceback(self._model_file_name),
)
if not is_ready:
self._is_ready_failures += 1
self._logger.warning(
f"Model is not ready. Consecutive failures: {self._is_ready_failures}"
)
if self._first_health_check_failure is None:
self._first_health_check_failure = datetime.now(timezone.utc)
self._logger.warning("Model is not ready. Health checks failing.")
else:
seconds_since_first_failure = round(
(
datetime.now(timezone.utc) - self._first_health_check_failure
).total_seconds()
)
self._logger.warning(
f"Model is not ready. Health checks failing for {seconds_since_first_failure} seconds."
)
elif is_ready:
self._is_ready_failures = 0
return is_ready
Expand Down
15 changes: 9 additions & 6 deletions truss/tests/test_model_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,7 @@ def predict(self, model_input):
local_port=8090, detach=True, wait_for_server_ready=False
)

# Sleep a few seconds to get the server some time to wake up
# Sleep a few seconds to give the server some time to wake up
time.sleep(10)

truss_server_addr = "http://localhost:8090"
Expand All @@ -994,7 +994,7 @@ def predict(self, model_input):
assert (
"Exception while checking if model is ready: not ready" in container.logs()
)
assert "Model is not ready. Consecutive failures: 1" in container.logs()
assert "Model is not ready. Health checks failing." in container.logs()

model = """
class Model:
Expand All @@ -1009,18 +1009,21 @@ def predict(self, model_input):
local_port=8090, detach=True, wait_for_server_ready=False
)

# Sleep a few seconds to get the server some time to wake up
# Sleep a few seconds to give the server some time to wake up
time.sleep(10)

truss_server_addr = "http://localhost:8090"

ready = requests.get(f"{truss_server_addr}/v1/models/model")
assert ready.status_code == 503
assert "Model is not ready. Consecutive failures: 1" in container.logs()

assert "Model is not ready. Health checks failing." in container.logs()
time.sleep(5)
ready = requests.get(f"{truss_server_addr}/v1/models/model")
assert ready.status_code == 503
assert "Model is not ready. Consecutive failures: 2" in container.logs()
assert (
"Model is not ready. Health checks failing for 5 seconds."
in container.logs()
)

model = """
class Model:
Expand Down

0 comments on commit 8c16297

Please sign in to comment.