Skip to content

Commit e6334ce

Browse files
Remove stack trace when model load takes long (#1674)
* Remove stack trace when model load takes long
* Fix server tests and return 503 instead of raising exception
* Don't retry health checks to control server, but retry other endpoints
* Add comment clarifying the different response for health checks
* Refactor method to check if request is a health check
* Clarify model-not-ready message and comments
* Update truss rc
1 parent 31fc8eb commit e6334ce

File tree

4 files changed

+29
-11
lines changed

4 files changed

+29
-11
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "truss"
3-
version = "0.9.96rc003"
3+
version = "0.9.96rc018"
44
description = "A seamless bridge from model development to model delivery"
55
license = "MIT"
66
readme = "README.md"

truss/templates/control/control/endpoints.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,17 @@
55
import httpx
66
from fastapi import APIRouter, WebSocket
77
from fastapi.responses import JSONResponse, StreamingResponse
8-
from helpers.errors import ModelLoadFailed, ModelNotReady
98
from httpx_ws import aconnect_ws
109
from starlette.requests import ClientDisconnect, Request
1110
from starlette.responses import Response
1211
from tenacity import RetryCallState, Retrying, retry_if_exception_type, wait_fixed
1312
from wsproto.events import BytesMessage, TextMessage
1413

14+
from truss.templates.control.control.helpers.errors import (
15+
ModelLoadFailed,
16+
ModelNotReady,
17+
)
18+
1519
INFERENCE_SERVER_START_WAIT_SECS = 60
1620
BASE_RETRY_EXCEPTIONS = (
1721
retry_if_exception_type(httpx.ConnectError)
@@ -65,7 +69,16 @@ async def proxy_http(request: Request):
6569
resp = await client.send(inf_serv_req, stream=True)
6670

6771
if await _is_model_not_ready(resp):
68-
raise ModelNotReady("Model has started running, but not ready yet.")
72+
# If this is a health check request, don't raise an error so that a stack
73+
# trace isn't logged upon deploying a model with a long load time.
74+
if _is_health_check(path):
75+
return JSONResponse(
76+
"The server is live, but the model has not completed loading.",
77+
status_code=503,
78+
)
79+
raise ModelNotReady(
80+
"The server is live, but the model has not completed loading."
81+
)
6982
except (httpx.RemoteProtocolError, httpx.ConnectError) as exp:
7083
# This check is a bit expensive so we don't do it before every request, we
7184
# do it only if request fails with connection error. If the inference server
@@ -99,7 +112,7 @@ def inference_retries(
99112
retry=retry_condition,
100113
stop=_custom_stop_strategy,
101114
wait=wait_fixed(1),
102-
reraise=False,
115+
reraise=True,
103116
):
104117
yield attempt
105118

@@ -216,6 +229,13 @@ def _reroute_if_health_check(path: str) -> str:
216229
return path
217230

218231

232+
def _is_health_check(path: str) -> bool:
233+
"""
234+
Checks if the request path is for the health check endpoint.
235+
"""
236+
return path == "/v1/models/model/loaded"
237+
238+
219239
def _custom_stop_strategy(retry_state: RetryCallState) -> bool:
220240
# Stop after 10 attempts for ModelNotReady
221241
if retry_state.outcome is not None and isinstance(

truss/templates/control/control/helpers/errors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class InadmissiblePatch(PatchApplicatonError):
3737

3838

3939
class ModelNotReady(Error):
40-
"""Model has started running, but not ready yet."""
40+
"""The server is live, but the model has not completed loading."""
4141

4242
pass
4343

truss/tests/templates/control/control/test_server.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
import httpx
1010
import pytest
11-
from tenacity import RetryError
1211

1312
from truss.truss_handle.patch.custom_types import PatchRequest
1413

@@ -246,11 +245,10 @@ async def mock_send(*args, **kwargs):
246245

247246
app.state.proxy_client.send = AsyncMock(side_effect=mock_send)
248247

249-
with pytest.raises(RetryError):
250-
await client.get("/v1/models/model")
248+
await client.get("/v1/models/model")
251249

252-
# Health check was retried 10 times
253-
assert app.state.proxy_client.send.call_count == 10
250+
# Health check did not retry
251+
assert app.state.proxy_client.send.call_count == 1
254252

255253

256254
@pytest.mark.anyio
@@ -277,7 +275,7 @@ async def test_retries(client, app):
277275

278276
with (
279277
patch("endpoints.INFERENCE_SERVER_START_WAIT_SECS", new=4),
280-
pytest.raises(RetryError),
278+
pytest.raises(httpx.RemoteProtocolError),
281279
):
282280
await client.get("/v1/models/model")
283281

0 commit comments

Comments (0)