Commit f2a7eda
fix(proxy_server.py): Fix "Circular reference detected" error when max_parallel_requests = 0 (#9671)

* fix(proxy_server.py): remove non-functional parent backoff/retry on /chat/completion (causes circular reference error)
* fix(http_parsing_utils.py): safely return parsed body - don't allow mutation of cached request body by client functions (root cause fix for circular reference error)
* Revert "fix: Anthropic prompt caching on GCP Vertex AI (#9605)" (#9670). This reverts commit a867324.
* add type hints for AnthropicMessagesResponse
* define types for response from AnthropicMessagesResponse
* fix response typing
* allow using litellm.messages.acreate and litellm.messages.create
* fix anthropic_messages implementation
* add clear type hints to litellm.messages.create functions
* fix anthropic_messages
* working anthropic API tests
* fixes - anthropic messages interface
* use new anthropic interface
* fix code quality check
* docs anthropic messages endpoint
* add namespace_packages = True to mypy
* fix mypy lint errors
* docs anthropic messages interface
* test: fix unit test
* test(test_http_parsing_utils.py): update tests

Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>

1 parent 136f1d6 · commit f2a7eda
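For context, "Circular reference detected" is the ValueError that Python's json module raises when asked to serialize a dict that (indirectly) contains itself. A minimal standalone sketch of the failure mode this commit fixes, not LiteLLM's actual code (the read_body helper and its cache are illustrative only):

import json

_cache: dict = {}

def read_body(request_id: str) -> dict:
    # Hypothetical cache that hands out the cached dict itself, so every
    # caller shares one mutable object (roughly the pre-fix behavior).
    return _cache.setdefault(request_id, {"model": "gpt-4"})

body = read_body("req-1")
# A caller attaches metadata that points back at the body itself.
body["proxy_server_request"] = {"body": body}

try:
    # A later read returns the same polluted object; serializing it fails.
    json.dumps(read_body("req-1"))
except ValueError as e:
    print(e)  # ValueError: Circular reference detected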

File tree: 4 files changed, +55 -16 lines

litellm/proxy/common_request_processing.py

Lines changed: 1 addition & 0 deletions

@@ -123,6 +123,7 @@ async def base_process_llm_request(
         """
         Common request processing logic for both chat completions and responses API endpoints
         """
+
         verbose_proxy_logger.debug(
             "Request received by LiteLLM:\n{}".format(json.dumps(self.data, indent=4)),
         )

litellm/proxy/common_utils/http_parsing_utils.py

Lines changed: 8 additions & 3 deletions

@@ -81,8 +81,13 @@ async def _read_request_body(request: Optional[Request]) -> Dict:
 def _safe_get_request_parsed_body(request: Optional[Request]) -> Optional[dict]:
     if request is None:
         return None
-    if hasattr(request, "scope") and "parsed_body" in request.scope:
-        return request.scope["parsed_body"]
+    if (
+        hasattr(request, "scope")
+        and "parsed_body" in request.scope
+        and isinstance(request.scope["parsed_body"], tuple)
+    ):
+        accepted_keys, parsed_body = request.scope["parsed_body"]
+        return {key: parsed_body[key] for key in accepted_keys}
     return None


@@ -93,7 +98,7 @@ def _safe_set_request_parsed_body(
     try:
         if request is None:
             return
-        request.scope["parsed_body"] = parsed_body
+        request.scope["parsed_body"] = (tuple(parsed_body.keys()), parsed_body)
     except Exception as e:
         verbose_proxy_logger.debug(
             "Unexpected error setting request parsed body - {}".format(e)

litellm/proxy/proxy_server.py

Lines changed: 0 additions & 9 deletions

@@ -3308,15 +3308,6 @@ async def model_list(
     tags=["chat/completions"],
     responses={200: {"description": "Successful response"}, **ERROR_RESPONSES},
 )  # azure compatible endpoint
-@backoff.on_exception(
-    backoff.expo,
-    Exception,  # base exception to catch for the backoff
-    max_tries=global_max_parallel_request_retries,  # maximum number of retries
-    max_time=global_max_parallel_request_retry_timeout,  # maximum total time to retry for
-    on_backoff=on_backoff,  # specifying the function to call on backoff
-    giveup=giveup,
-    logger=verbose_proxy_logger,
-)
 async def chat_completion(  # noqa: PLR0915
     request: Request,
     fastapi_response: Response,
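The deleted decorator retried the whole endpoint on any exception; per the commit message it was non-functional and implicated in the circular reference error, presumably because each retry re-entered the handler and re-read the cached (already mutated) request body. A minimal sketch of how backoff.on_exception re-invokes a decorated function (the handler below is illustrative, not the proxy's):

import backoff

calls = {"count": 0}

@backoff.on_exception(backoff.expo, ValueError, max_tries=3)
def handler() -> str:
    # Every retry re-enters the function from the top, so any shared
    # state a previous attempt mutated is observed again.
    calls["count"] += 1
    if calls["count"] < 3:
        raise ValueError("transient failure")
    return "ok"

assert handler() == "ok"
assert calls["count"] == 3  # two failures, then one success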

tests/litellm/proxy/common_utils/test_http_parsing_utils.py

Lines changed: 46 additions & 4 deletions

@@ -39,7 +39,7 @@ async def test_request_body_caching():
     result1 = await _read_request_body(mock_request)
     assert result1 == test_data
     assert "parsed_body" in mock_request.scope
-    assert mock_request.scope["parsed_body"] == test_data
+    assert mock_request.scope["parsed_body"] == (("key",), {"key": "value"})

     # Verify the body was read once
     mock_request.body.assert_called_once()
@@ -49,7 +49,7 @@ async def test_request_body_caching():

     # Second call should use the cached body
     result2 = await _read_request_body(mock_request)
-    assert result2 == test_data
+    assert result2 == {"key": "value"}

     # Verify the body was not read again
     mock_request.body.assert_not_called()
@@ -75,7 +75,10 @@ async def test_form_data_parsing():
     # Verify the form data was correctly parsed
     assert result == test_data
     assert "parsed_body" in mock_request.scope
-    assert mock_request.scope["parsed_body"] == test_data
+    assert mock_request.scope["parsed_body"] == (
+        ("name", "message"),
+        {"name": "test_user", "message": "hello world"},
+    )

     # Verify form() was called
     mock_request.form.assert_called_once()
@@ -101,7 +104,46 @@ async def test_empty_request_body():
     # Verify an empty dict is returned
     assert result == {}
     assert "parsed_body" in mock_request.scope
-    assert mock_request.scope["parsed_body"] == {}
+    assert mock_request.scope["parsed_body"] == ((), {})

     # Verify the body was read
     mock_request.body.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_circular_reference_handling():
+    """
+    Test that the cached request body isn't modified when the returned result is modified.
+    Demonstrates the mutable dictionary reference issue.
+    """
+    # Create a mock request with initial data
+    mock_request = MagicMock()
+    initial_body = {
+        "model": "gpt-4",
+        "messages": [{"role": "user", "content": "Hello"}],
+    }
+
+    mock_request.body = AsyncMock(return_value=orjson.dumps(initial_body))
+    mock_request.headers = {"content-type": "application/json"}
+    mock_request.scope = {}
+
+    # First parse
+    result = await _read_request_body(mock_request)
+
+    # Verify initial parse
+    assert result["model"] == "gpt-4"
+    assert result["messages"] == [{"role": "user", "content": "Hello"}]
+
+    # Modify the result by adding proxy_server_request
+    result["proxy_server_request"] = {
+        "url": "http://0.0.0.0:4000/v1/chat/completions",
+        "method": "POST",
+        "headers": {"content-type": "application/json"},
+        "body": result,  # Creates circular reference
+    }
+
+    # Second parse using the same request - reads through the cache
+    result2 = await _read_request_body(mock_request)
+    assert (
+        "proxy_server_request" not in result2
+    )  # Passes now that the cached body can no longer be polluted
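Assuming a standard pytest setup (the async tests above rely on pytest-asyncio), the new regression test can be run in isolation with:

pytest tests/litellm/proxy/common_utils/test_http_parsing_utils.py::test_circular_reference_handling -q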
