From ee5a81d5b7495024ed60947c75e2ea708946a730 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 13:43:44 -0700
Subject: [PATCH 1/6] fix(anthropic/chat/transformation.py): Don't set tool
 choice on response_format conversion when thinking is enabled

Not allowed by Anthropic

Fixes https://github.com/BerriAI/litellm/issues/8901
---
 litellm/llms/anthropic/chat/transformation.py | 11 +++++++++--
 .../test_anthropic_completion.py              | 19 +++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index 1a77c453f407..b74a3d6fb712 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -300,6 +300,10 @@ def map_openai_params(
         model: str,
         drop_params: bool,
     ) -> dict:
+
+        is_thinking_enabled = False
+        if "thinking" in non_default_params:
+            is_thinking_enabled = True
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
@@ -349,14 +353,17 @@ def map_openai_params(
             - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective.
                 """
 
-                _tool_choice = {"name": RESPONSE_FORMAT_TOOL_NAME, "type": "tool"}
+                if not is_thinking_enabled:
+                    _tool_choice = {"name": RESPONSE_FORMAT_TOOL_NAME, "type": "tool"}
+                    optional_params["tool_choice"] = _tool_choice
+
                 _tool = self._create_json_tool_call_for_response_format(
                     json_schema=json_schema,
                 )
                 optional_params = self._add_tools_to_optional_params(
                     optional_params=optional_params, tools=[_tool]
                 )
-                optional_params["tool_choice"] = _tool_choice
+
                 optional_params["json_mode"] = True
             if param == "user":
                 optional_params["metadata"] = {"user_id": value}
diff --git a/tests/llm_translation/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py
index a83d1d69e9f9..58c68b47c79c 100644
--- a/tests/llm_translation/test_anthropic_completion.py
+++ b/tests/llm_translation/test_anthropic_completion.py
@@ -1110,3 +1110,22 @@ def test_anthropic_thinking_in_assistant_message(model):
     response = litellm.completion(**params)
 
     assert response is not None
+
+
+def test_completion_thinking_with_response_format():
+    from pydantic import BaseModel
+
+    class RFormat(BaseModel):
+        question: str
+        answer: str
+
+    messages = [{"role": "user", "content": "Generate 5 question + answer pairs"}]
+    response = completion(
+        model="claude-3-7-sonnet-20250219",
+        messages=messages,
+        response_format=RFormat,
+        thinking={"type": "enabled", "budget_tokens": 16000},
+        max_tokens=16500,
+    )
+
+    print(response)

From 7f606abfa46ce541eda096be58f724b01cc9371d Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 13:48:15 -0700
Subject: [PATCH 2/6] refactor: move test to base anthropic chat tests

ensures consistent behaviour across vertex/anthropic/bedrock
---
 tests/llm_translation/base_llm_unit_tests.py | 24 ++++++++++++++++++
 .../test_anthropic_completion.py             | 25 +++++--------------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py
index 82a1ef40fb62..b40d3d2fe22f 100644
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -1008,6 +1008,11 @@ def get_base_completion_call_args(self) -> dict:
         """Must return the base completion call args"""
         pass
 
+    @abstractmethod
+    def get_base_completion_call_args_with_thinking(self) -> dict:
+        """Must return the base completion call args with thinking enabled"""
+        pass
+
     @property
     def completion_function(self):
         return litellm.completion
@@ -1066,3 +1071,22 @@ def test_anthropic_response_format_streaming_vs_non_streaming(self):
             json.loads(built_response.choices[0].message.content).keys()
             == json.loads(non_stream_response.choices[0].message.content).keys()
         ), f"Got={json.loads(built_response.choices[0].message.content)}, Expected={json.loads(non_stream_response.choices[0].message.content)}"
+
+    def test_completion_thinking_with_response_format(self):
+        from pydantic import BaseModel
+
+        class RFormat(BaseModel):
+            question: str
+            answer: str
+
+        base_completion_call_args = self.get_base_completion_call_args_with_thinking()
+
+        messages = [{"role": "user", "content": "Generate 5 question + answer pairs"}]
+        response = self.completion_function(
+            **base_completion_call_args,
+            messages=messages,
+            response_format=RFormat,
+            max_tokens=16500,
+        )
+
+        print(response)
diff --git a/tests/llm_translation/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py
index 58c68b47c79c..3f4c0b63f018 100644
--- a/tests/llm_translation/test_anthropic_completion.py
+++ b/tests/llm_translation/test_anthropic_completion.py
@@ -467,6 +467,12 @@ class TestAnthropicCompletion(BaseLLMChatTest, BaseAnthropicChatTest):
     def get_base_completion_call_args(self) -> dict:
         return {"model": "anthropic/claude-3-5-sonnet-20240620"}
 
+    def get_base_completion_call_args_with_thinking(self) -> dict:
+        return {
+            "model": "anthropic/claude-3-7-sonnet-latest",
+            "thinking": {"type": "enabled", "budget_tokens": 16000},
+        }
+
     def test_tool_call_no_arguments(self, tool_call_no_arguments):
         """Test that tool calls with no arguments is translated correctly.
         Relevant issue: https://github.com/BerriAI/litellm/issues/6833"""
         from litellm.litellm_core_utils.prompt_templates.factory import (
@@ -1110,22 +1116,3 @@ def test_anthropic_thinking_in_assistant_message(model):
     response = litellm.completion(**params)
 
     assert response is not None
-
-
-def test_completion_thinking_with_response_format():
-    from pydantic import BaseModel
-
-    class RFormat(BaseModel):
-        question: str
-        answer: str
-
-    messages = [{"role": "user", "content": "Generate 5 question + answer pairs"}]
-    response = completion(
-        model="claude-3-7-sonnet-20250219",
-        messages=messages,
-        response_format=RFormat,
-        thinking={"type": "enabled", "budget_tokens": 16000},
-        max_tokens=16500,
-    )
-
-    print(response)

From c3a268fbb4c8b506c4031a5732bf9ee997e0bbff Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 13:54:24 -0700
Subject: [PATCH 3/6] fix(anthropic/chat/transformation.py): if a thinking
 token budget is specified and max tokens is not, ensure the max tokens sent
 to Anthropic is higher than the thinking token budget
---
 litellm/llms/anthropic/chat/transformation.py | 8 ++++++++
 tests/llm_translation/base_llm_unit_tests.py  | 1 -
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index b74a3d6fb712..701ea6bedb80 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -369,6 +369,14 @@ def map_openai_params(
                 optional_params["metadata"] = {"user_id": value}
             if param == "thinking":
                 optional_params["thinking"] = value
+
+        ## handle thinking tokens
+        if is_thinking_enabled and "max_tokens" not in optional_params:
+            thinking_token_budget = cast(dict, optional_params["thinking"]).get(
+                "budget_tokens", None
+            )
+            if thinking_token_budget is not None:
+                optional_params["max_tokens"] = thinking_token_budget + self.max_tokens
         return optional_params
 
     def _create_json_tool_call_for_response_format(
diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py
index b40d3d2fe22f..eb5851219781 100644
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -1086,7 +1086,6 @@ class RFormat(BaseModel):
             **base_completion_call_args,
             messages=messages,
             response_format=RFormat,
-            max_tokens=16500,
         )
 
         print(response)

From 8d13a81bfb90d586c0db9ab8f649463c2a56f189 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 14:03:43 -0700
Subject: [PATCH 4/6] feat(converse_transformation.py): correctly handle
 thinking + response format on Bedrock Converse

Fixes https://github.com/BerriAI/litellm/issues/8901
---
 litellm/constants.py                          |  1 +
 litellm/llms/anthropic/chat/transformation.py | 15 ++++++--------
 litellm/llms/base_llm/chat/transformation.py  | 20 ++++++++++++++++++-
 .../bedrock/chat/converse_transformation.py   | 11 ++++++++--
 .../test_bedrock_completion.py                | 15 +++++++++++++-
 5 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/litellm/constants.py b/litellm/constants.py
index da66f897c920..de0a7e366d49 100644
--- a/litellm/constants.py
+++ b/litellm/constants.py
@@ -7,6 +7,7 @@
 DEFAULT_FAILURE_THRESHOLD_PERCENT = (
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
+DEFAULT_MAX_TOKENS = 4096
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index 701ea6bedb80..6c223ca9cdff 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -301,9 +301,9 @@ def map_openai_params(
         drop_params: bool,
     ) -> dict:
 
-        is_thinking_enabled = False
-        if "thinking" in non_default_params:
-            is_thinking_enabled = True
+        is_thinking_enabled = self.is_thinking_enabled(
+            non_default_params=non_default_params
+        )
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
@@ -371,12 +371,9 @@ def map_openai_params(
                 optional_params["thinking"] = value
 
         ## handle thinking tokens
-        if is_thinking_enabled and "max_tokens" not in optional_params:
-            thinking_token_budget = cast(dict, optional_params["thinking"]).get(
-                "budget_tokens", None
-            )
-            if thinking_token_budget is not None:
-                optional_params["max_tokens"] = thinking_token_budget + self.max_tokens
+        self.update_optional_params_with_thinking_tokens(
+            optional_params=optional_params
+        )
         return optional_params
 
     def _create_json_tool_call_for_response_format(
diff --git a/litellm/llms/base_llm/chat/transformation.py b/litellm/llms/base_llm/chat/transformation.py
index 1b5a6bc58e31..488640ae2c2a 100644
--- a/litellm/llms/base_llm/chat/transformation.py
+++ b/litellm/llms/base_llm/chat/transformation.py
@@ -13,12 +13,13 @@
     Optional,
     Type,
     Union,
+    cast,
 )
 
 import httpx
 from pydantic import BaseModel
 
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import DEFAULT_MAX_TOKENS, RESPONSE_FORMAT_TOOL_NAME
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.types.llms.openai import (
     AllMessageValues,
@@ -102,6 +103,23 @@ def get_json_schema_from_pydantic_object(
     ) -> Optional[dict]:
         return type_to_response_format_param(response_format=response_format)
 
+    def is_thinking_enabled(self, non_default_params: dict) -> bool:
+        return non_default_params.get("thinking", {}).get("type", None) == "enabled"
+
+    def update_optional_params_with_thinking_tokens(self, optional_params: dict):
+        """
+        Handles the scenario where max tokens is not specified. For Anthropic models (Anthropic API/Bedrock/Vertex AI), this requires max tokens to be set and to be greater than the thinking token budget.
+ """ + is_thinking_enabled = self.is_thinking_enabled(optional_params) + if is_thinking_enabled and "max_tokens" not in optional_params: + thinking_token_budget = cast(dict, optional_params["thinking"]).get( + "budget_tokens", None + ) + if thinking_token_budget is not None: + optional_params["max_tokens"] = ( + thinking_token_budget + DEFAULT_MAX_TOKENS + ) + def should_fake_stream( self, model: Optional[str], diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py index bb874cfe38e9..429c578cab30 100644 --- a/litellm/llms/bedrock/chat/converse_transformation.py +++ b/litellm/llms/bedrock/chat/converse_transformation.py @@ -210,6 +210,7 @@ def map_openai_params( drop_params: bool, messages: Optional[List[AllMessageValues]] = None, ) -> dict: + is_thinking_enabled = self.is_thinking_enabled(non_default_params) for param, value in non_default_params.items(): if param == "response_format" and isinstance(value, dict): @@ -247,8 +248,11 @@ def map_openai_params( optional_params = self._add_tools_to_optional_params( optional_params=optional_params, tools=[_tool] ) - if litellm.utils.supports_tool_choice( - model=model, custom_llm_provider=self.custom_llm_provider + if ( + litellm.utils.supports_tool_choice( + model=model, custom_llm_provider=self.custom_llm_provider + ) + and not is_thinking_enabled ): optional_params["tool_choice"] = ToolChoiceValuesBlock( tool=SpecificToolChoiceBlock( @@ -284,6 +288,9 @@ def map_openai_params( optional_params["tool_choice"] = _tool_choice_value if param == "thinking": optional_params["thinking"] = value + self.update_optional_params_with_thinking_tokens( + optional_params=optional_params + ) return optional_params @overload diff --git a/tests/llm_translation/test_bedrock_completion.py b/tests/llm_translation/test_bedrock_completion.py index 1694033a098a..917993a98c09 100644 --- a/tests/llm_translation/test_bedrock_completion.py +++ b/tests/llm_translation/test_bedrock_completion.py @@ -35,7 +35,7 @@ from litellm.llms.bedrock.chat import BedrockLLM from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.litellm_core_utils.prompt_templates.factory import _bedrock_tools_pt -from base_llm_unit_tests import BaseLLMChatTest +from base_llm_unit_tests import BaseLLMChatTest, BaseAnthropicChatTest from base_rerank_unit_tests import BaseLLMRerankTest from base_embedding_unit_tests import BaseLLMEmbeddingTest @@ -2191,6 +2191,19 @@ def test_completion_cost(self): assert cost > 0 +class TestBedrockConverseAnthropicUnitTests(BaseAnthropicChatTest): + def get_base_completion_call_args(self) -> dict: + return { + "model": "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0", + } + + def get_base_completion_call_args_with_thinking(self) -> dict: + return { + "model": "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", + "thinking": {"type": "enabled", "budget_tokens": 16000}, + } + + class TestBedrockConverseChatNormal(BaseLLMChatTest): def get_base_completion_call_args(self) -> dict: os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" From c0d20e6db8081c1a8dea983703bc6fe413063803 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 27 Mar 2025 22:10:46 -0700 Subject: [PATCH 5/6] fix(converse_transformation.py): correctly handle adding max tokens --- litellm/llms/anthropic/chat/transformation.py | 9 +++++---- litellm/llms/base_llm/chat/transformation.py | 14 ++++++++++---- .../llms/bedrock/chat/converse_transformation.py | 7 ++++--- 3 files changed, 19 
diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index 6c223ca9cdff..dcbc6775dcc3 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -304,6 +304,11 @@ def map_openai_params(
         is_thinking_enabled = self.is_thinking_enabled(
             non_default_params=non_default_params
         )
+
+        ## handle thinking tokens
+        self.update_optional_params_with_thinking_tokens(
+            non_default_params=non_default_params, optional_params=optional_params
+        )
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
@@ -370,10 +375,6 @@ def map_openai_params(
             if param == "thinking":
                 optional_params["thinking"] = value
 
-        ## handle thinking tokens
-        self.update_optional_params_with_thinking_tokens(
-            optional_params=optional_params
-        )
         return optional_params
 
     def _create_json_tool_call_for_response_format(
diff --git a/litellm/llms/base_llm/chat/transformation.py b/litellm/llms/base_llm/chat/transformation.py
index 488640ae2c2a..45ea06b9e468 100644
--- a/litellm/llms/base_llm/chat/transformation.py
+++ b/litellm/llms/base_llm/chat/transformation.py
@@ -106,13 +106,19 @@ def get_json_schema_from_pydantic_object(
     def is_thinking_enabled(self, non_default_params: dict) -> bool:
         return non_default_params.get("thinking", {}).get("type", None) == "enabled"
 
-    def update_optional_params_with_thinking_tokens(self, optional_params: dict):
+    def update_optional_params_with_thinking_tokens(
+        self, non_default_params: dict, optional_params: dict
+    ):
         """
         Handles the scenario where max tokens is not specified. For Anthropic models (Anthropic API/Bedrock/Vertex AI), this requires max tokens to be set and to be greater than the thinking token budget.
+
+        Checks 'non_default_params' for 'thinking' and 'max_tokens'
+
+        if 'thinking' is enabled and 'max_tokens' is not specified, set 'max_tokens' to the thinking token budget + DEFAULT_MAX_TOKENS
         """
-        is_thinking_enabled = self.is_thinking_enabled(optional_params)
-        if is_thinking_enabled and "max_tokens" not in optional_params:
-            thinking_token_budget = cast(dict, optional_params["thinking"]).get(
+        is_thinking_enabled = self.is_thinking_enabled(non_default_params)
+        if is_thinking_enabled and "max_tokens" not in non_default_params:
+            thinking_token_budget = cast(dict, non_default_params["thinking"]).get(
                 "budget_tokens", None
             )
             if thinking_token_budget is not None:
diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py
index 429c578cab30..b63b76a8ee55 100644
--- a/litellm/llms/bedrock/chat/converse_transformation.py
+++ b/litellm/llms/bedrock/chat/converse_transformation.py
@@ -211,6 +211,9 @@ def map_openai_params(
         messages: Optional[List[AllMessageValues]] = None,
     ) -> dict:
         is_thinking_enabled = self.is_thinking_enabled(non_default_params)
+        self.update_optional_params_with_thinking_tokens(
+            non_default_params=non_default_params, optional_params=optional_params
+        )
 
         for param, value in non_default_params.items():
             if param == "response_format" and isinstance(value, dict):
@@ -288,9 +291,7 @@ def map_openai_params(
                 optional_params["tool_choice"] = _tool_choice_value
             if param == "thinking":
                 optional_params["thinking"] = value
-                self.update_optional_params_with_thinking_tokens(
-                    optional_params=optional_params
-                )
+
         return optional_params
 
     @overload

From 9f07f10979efca748daab454605bca2f90e70b53 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 22:44:09 -0700
Subject: [PATCH 6/6] test: handle service unavailable error
---
 tests/llm_translation/test_cohere.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/llm_translation/test_cohere.py b/tests/llm_translation/test_cohere.py
index f34098477d27..254a42593b92 100644
--- a/tests/llm_translation/test_cohere.py
+++ b/tests/llm_translation/test_cohere.py
@@ -55,6 +55,8 @@ async def test_chat_completion_cohere_citations(stream):
             assert citations_chunk
         else:
             assert response.citations is not None
+    except litellm.ServiceUnavailableError:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
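
Usage sketch (illustrative, not part of the patches above): end-to-end, this series makes a call like the following work. It is adapted from the test added in PATCH 1; the model name, prompt, and thinking budget mirror the tests, and a valid ANTHROPIC_API_KEY in the environment is assumed. With thinking enabled, response_format is still converted into a JSON tool, but tool_choice is no longer forced (Anthropic rejects forced tool use when thinking is on), and max_tokens is deliberately omitted: per update_optional_params_with_thinking_tokens it is derived as budget_tokens + DEFAULT_MAX_TOKENS (16000 + 4096 here).

    from pydantic import BaseModel

    import litellm


    class QAPair(BaseModel):
        question: str
        answer: str


    # response_format becomes a JSON tool without a forced tool_choice;
    # max_tokens is filled in from the thinking budget automatically.
    response = litellm.completion(
        model="anthropic/claude-3-7-sonnet-latest",
        messages=[{"role": "user", "content": "Generate 5 question + answer pairs"}],
        response_format=QAPair,
        thinking={"type": "enabled", "budget_tokens": 16000},
    )
    print(response.choices[0].message.content)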