From ee5a81d5b7495024ed60947c75e2ea708946a730 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 13:43:44 -0700
Subject: [PATCH 1/6] fix(anthropic/chat/transformation.py): Don't set tool
 choice on response_format conversion when thinking is enabled

Not allowed by Anthropic

Fixes https://github.com/BerriAI/litellm/issues/8901
---
 litellm/llms/anthropic/chat/transformation.py | 11 +++++++++--
 .../test_anthropic_completion.py              | 19 +++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index 1a77c453f407..b74a3d6fb712 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -300,6 +300,10 @@ def map_openai_params(
         model: str,
         drop_params: bool,
     ) -> dict:
+
+        is_thinking_enabled = False
+        if "thinking" in non_default_params:
+            is_thinking_enabled = True
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
@@ -349,14 +353,17 @@ def map_openai_params(
             - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective.
                 """
 
-                _tool_choice = {"name": RESPONSE_FORMAT_TOOL_NAME, "type": "tool"}
+                if not is_thinking_enabled:
+                    _tool_choice = {"name": RESPONSE_FORMAT_TOOL_NAME, "type": "tool"}
+                    optional_params["tool_choice"] = _tool_choice
+
                 _tool = self._create_json_tool_call_for_response_format(
                     json_schema=json_schema,
                 )
                 optional_params = self._add_tools_to_optional_params(
                     optional_params=optional_params, tools=[_tool]
                 )
-                optional_params["tool_choice"] = _tool_choice
+
                 optional_params["json_mode"] = True
             if param == "user":
                 optional_params["metadata"] = {"user_id": value}
diff --git a/tests/llm_translation/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py
index a83d1d69e9f9..58c68b47c79c 100644
--- a/tests/llm_translation/test_anthropic_completion.py
+++ b/tests/llm_translation/test_anthropic_completion.py
@@ -1110,3 +1110,22 @@ def test_anthropic_thinking_in_assistant_message(model):
     response = litellm.completion(**params)
 
     assert response is not None
+
+
+def test_completion_thinking_with_response_format():
+    from pydantic import BaseModel
+
+    class RFormat(BaseModel):
+        question: str
+        answer: str
+
+    messages = [{"role": "user", "content": "Generate 5 question + answer pairs"}]
+    response = completion(
+        model="claude-3-7-sonnet-20250219",
+        messages=messages,
+        response_format=RFormat,
+        thinking={"type": "enabled", "budget_tokens": 16000},
+        max_tokens=16500,
+    )
+
+    print(response)

From 7f606abfa46ce541eda096be58f724b01cc9371d Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 13:48:15 -0700
Subject: [PATCH 2/6] refactor: move test to base anthropic chat tests

ensures consistent behaviour across vertex/anthropic/bedrock
---
 tests/llm_translation/base_llm_unit_tests.py | 24 ++++++++++++++++++
 .../test_anthropic_completion.py             | 25 +++++--------------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py
index 82a1ef40fb62..b40d3d2fe22f 100644
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -1008,6 +1008,11 @@ def get_base_completion_call_args(self) -> dict:
         """Must return the base completion call args"""
         pass
 
+    @abstractmethod
+    def get_base_completion_call_args_with_thinking(self) -> dict:
+        """Must return the base completion call args with thinking enabled"""
+        pass
+
     @property
     def completion_function(self):
         return litellm.completion
@@ -1066,3 +1071,22 @@ def test_anthropic_response_format_streaming_vs_non_streaming(self):
             json.loads(built_response.choices[0].message.content).keys()
             == json.loads(non_stream_response.choices[0].message.content).keys()
         ), f"Got={json.loads(built_response.choices[0].message.content)}, Expected={json.loads(non_stream_response.choices[0].message.content)}"
+
+    def test_completion_thinking_with_response_format(self):
+        from pydantic import BaseModel
+
+        class RFormat(BaseModel):
+            question: str
+            answer: str
+
+        base_completion_call_args = self.get_base_completion_call_args_with_thinking()
+
+        messages = [{"role": "user", "content": "Generate 5 question + answer pairs"}]
+        response = self.completion_function(
+            **base_completion_call_args,
+            messages=messages,
+            response_format=RFormat,
+            max_tokens=16500,
+        )
+
+        print(response)
diff --git a/tests/llm_translation/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py
index 58c68b47c79c..3f4c0b63f018 100644
--- a/tests/llm_translation/test_anthropic_completion.py
+++ b/tests/llm_translation/test_anthropic_completion.py
@@ -467,6 +467,12 @@ class TestAnthropicCompletion(BaseLLMChatTest, BaseAnthropicChatTest):
     def get_base_completion_call_args(self) -> dict:
         return {"model": "anthropic/claude-3-5-sonnet-20240620"}
 
+    def get_base_completion_call_args_with_thinking(self) -> dict:
+        return {
+            "model": "anthropic/claude-3-7-sonnet-latest",
+            "thinking": {"type": "enabled", "budget_tokens": 16000},
+        }
+
     def test_tool_call_no_arguments(self, tool_call_no_arguments):
         """Test that tool calls with no arguments is translated correctly.
         Relevant issue: https://github.com/BerriAI/litellm/issues/6833"""
         from litellm.litellm_core_utils.prompt_templates.factory import (
@@ -1110,22 +1116,3 @@ def test_anthropic_thinking_in_assistant_message(model):
     response = litellm.completion(**params)
 
     assert response is not None
-
-
-def test_completion_thinking_with_response_format():
-    from pydantic import BaseModel
-
-    class RFormat(BaseModel):
-        question: str
-        answer: str
-
-    messages = [{"role": "user", "content": "Generate 5 question + answer pairs"}]
-    response = completion(
-        model="claude-3-7-sonnet-20250219",
-        messages=messages,
-        response_format=RFormat,
-        thinking={"type": "enabled", "budget_tokens": 16000},
-        max_tokens=16500,
-    )
-
-    print(response)

From c3a268fbb4c8b506c4031a5732bf9ee997e0bbff Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 13:54:24 -0700
Subject: [PATCH 3/6] fix(anthropic/chat/transformation.py): if a thinking
 token budget is specified and max tokens is not, ensure the max tokens sent
 to Anthropic is higher than the thinking token budget
---
 litellm/llms/anthropic/chat/transformation.py | 8 ++++++++
 tests/llm_translation/base_llm_unit_tests.py  | 1 -
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index b74a3d6fb712..701ea6bedb80 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -369,6 +369,14 @@ def map_openai_params(
                 optional_params["metadata"] = {"user_id": value}
             if param == "thinking":
                 optional_params["thinking"] = value
+
+        ## handle thinking tokens
+        if is_thinking_enabled and "max_tokens" not in optional_params:
+            thinking_token_budget = cast(dict, optional_params["thinking"]).get(
+                "budget_tokens", None
+            )
+            if thinking_token_budget is not None:
+                optional_params["max_tokens"] = thinking_token_budget + self.max_tokens
         return optional_params
 
     def _create_json_tool_call_for_response_format(
diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py
index b40d3d2fe22f..eb5851219781 100644
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -1086,7 +1086,6 @@ class RFormat(BaseModel):
             **base_completion_call_args,
             messages=messages,
             response_format=RFormat,
-            max_tokens=16500,
         )
 
         print(response)

From 8d13a81bfb90d586c0db9ab8f649463c2a56f189 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 14:03:43 -0700
Subject: [PATCH 4/6] feat(converse_transformation.py): correctly handle
 thinking + response format on Bedrock Converse

Fixes https://github.com/BerriAI/litellm/issues/8901
---
 litellm/constants.py                          |  1 +
 litellm/llms/anthropic/chat/transformation.py | 15 ++++++--------
 litellm/llms/base_llm/chat/transformation.py  | 20 ++++++++++++++++++-
 .../bedrock/chat/converse_transformation.py   | 11 ++++++++--
 .../test_bedrock_completion.py                | 15 +++++++++++++-
 5 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/litellm/constants.py b/litellm/constants.py
index da66f897c920..de0a7e366d49 100644
--- a/litellm/constants.py
+++ b/litellm/constants.py
@@ -7,6 +7,7 @@
 DEFAULT_FAILURE_THRESHOLD_PERCENT = (
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
+DEFAULT_MAX_TOKENS = 4096
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index 701ea6bedb80..6c223ca9cdff 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -301,9 +301,9 @@ def map_openai_params(
         drop_params: bool,
     ) -> dict:
 
-        is_thinking_enabled = False
-        if "thinking" in non_default_params:
-            is_thinking_enabled = True
+        is_thinking_enabled = self.is_thinking_enabled(
+            non_default_params=non_default_params
+        )
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
@@ -371,12 +371,9 @@ def map_openai_params(
                 optional_params["thinking"] = value
 
         ## handle thinking tokens
-        if is_thinking_enabled and "max_tokens" not in optional_params:
-            thinking_token_budget = cast(dict, optional_params["thinking"]).get(
-                "budget_tokens", None
-            )
-            if thinking_token_budget is not None:
-                optional_params["max_tokens"] = thinking_token_budget + self.max_tokens
+        self.update_optional_params_with_thinking_tokens(
+            optional_params=optional_params
+        )
         return optional_params
 
     def _create_json_tool_call_for_response_format(
diff --git a/litellm/llms/base_llm/chat/transformation.py b/litellm/llms/base_llm/chat/transformation.py
index 1b5a6bc58e31..488640ae2c2a 100644
--- a/litellm/llms/base_llm/chat/transformation.py
+++ b/litellm/llms/base_llm/chat/transformation.py
@@ -13,12 +13,13 @@
     Optional,
     Type,
     Union,
+    cast,
 )
 
 import httpx
 from pydantic import BaseModel
 
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import DEFAULT_MAX_TOKENS, RESPONSE_FORMAT_TOOL_NAME
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.types.llms.openai import (
     AllMessageValues,
@@ -102,6 +103,23 @@ def get_json_schema_from_pydantic_object(
     ) -> Optional[dict]:
         return type_to_response_format_param(response_format=response_format)
 
+    def is_thinking_enabled(self, non_default_params: dict) -> bool:
+        return non_default_params.get("thinking", {}).get("type", None) == "enabled"
+
+    def update_optional_params_with_thinking_tokens(self, optional_params: dict):
+        """
+        Handles the scenario where max tokens is not specified. For Anthropic models (Anthropic API/Bedrock/Vertex AI), this requires max tokens to be set and to be greater than the thinking token budget.
+ """ + is_thinking_enabled = self.is_thinking_enabled(optional_params) + if is_thinking_enabled and "max_tokens" not in optional_params: + thinking_token_budget = cast(dict, optional_params["thinking"]).get( + "budget_tokens", None + ) + if thinking_token_budget is not None: + optional_params["max_tokens"] = ( + thinking_token_budget + DEFAULT_MAX_TOKENS + ) + def should_fake_stream( self, model: Optional[str], diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py index bb874cfe38e9..429c578cab30 100644 --- a/litellm/llms/bedrock/chat/converse_transformation.py +++ b/litellm/llms/bedrock/chat/converse_transformation.py @@ -210,6 +210,7 @@ def map_openai_params( drop_params: bool, messages: Optional[List[AllMessageValues]] = None, ) -> dict: + is_thinking_enabled = self.is_thinking_enabled(non_default_params) for param, value in non_default_params.items(): if param == "response_format" and isinstance(value, dict): @@ -247,8 +248,11 @@ def map_openai_params( optional_params = self._add_tools_to_optional_params( optional_params=optional_params, tools=[_tool] ) - if litellm.utils.supports_tool_choice( - model=model, custom_llm_provider=self.custom_llm_provider + if ( + litellm.utils.supports_tool_choice( + model=model, custom_llm_provider=self.custom_llm_provider + ) + and not is_thinking_enabled ): optional_params["tool_choice"] = ToolChoiceValuesBlock( tool=SpecificToolChoiceBlock( @@ -284,6 +288,9 @@ def map_openai_params( optional_params["tool_choice"] = _tool_choice_value if param == "thinking": optional_params["thinking"] = value + self.update_optional_params_with_thinking_tokens( + optional_params=optional_params + ) return optional_params @overload diff --git a/tests/llm_translation/test_bedrock_completion.py b/tests/llm_translation/test_bedrock_completion.py index 1694033a098a..917993a98c09 100644 --- a/tests/llm_translation/test_bedrock_completion.py +++ b/tests/llm_translation/test_bedrock_completion.py @@ -35,7 +35,7 @@ from litellm.llms.bedrock.chat import BedrockLLM from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.litellm_core_utils.prompt_templates.factory import _bedrock_tools_pt -from base_llm_unit_tests import BaseLLMChatTest +from base_llm_unit_tests import BaseLLMChatTest, BaseAnthropicChatTest from base_rerank_unit_tests import BaseLLMRerankTest from base_embedding_unit_tests import BaseLLMEmbeddingTest @@ -2191,6 +2191,19 @@ def test_completion_cost(self): assert cost > 0 +class TestBedrockConverseAnthropicUnitTests(BaseAnthropicChatTest): + def get_base_completion_call_args(self) -> dict: + return { + "model": "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0", + } + + def get_base_completion_call_args_with_thinking(self) -> dict: + return { + "model": "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", + "thinking": {"type": "enabled", "budget_tokens": 16000}, + } + + class TestBedrockConverseChatNormal(BaseLLMChatTest): def get_base_completion_call_args(self) -> dict: os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" From c0d20e6db8081c1a8dea983703bc6fe413063803 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 27 Mar 2025 22:10:46 -0700 Subject: [PATCH 5/6] fix(converse_transformation.py): correctly handle adding max tokens --- litellm/llms/anthropic/chat/transformation.py | 9 +++++---- litellm/llms/base_llm/chat/transformation.py | 14 ++++++++++---- .../llms/bedrock/chat/converse_transformation.py | 7 ++++--- 3 files changed, 19 
diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index 6c223ca9cdff..dcbc6775dcc3 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -304,6 +304,11 @@ def map_openai_params(
         is_thinking_enabled = self.is_thinking_enabled(
             non_default_params=non_default_params
         )
+
+        ## handle thinking tokens
+        self.update_optional_params_with_thinking_tokens(
+            non_default_params=non_default_params, optional_params=optional_params
+        )
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
@@ -370,10 +375,6 @@ def map_openai_params(
             if param == "thinking":
                 optional_params["thinking"] = value
 
-        ## handle thinking tokens
-        self.update_optional_params_with_thinking_tokens(
-            optional_params=optional_params
-        )
         return optional_params
 
     def _create_json_tool_call_for_response_format(
diff --git a/litellm/llms/base_llm/chat/transformation.py b/litellm/llms/base_llm/chat/transformation.py
index 488640ae2c2a..45ea06b9e468 100644
--- a/litellm/llms/base_llm/chat/transformation.py
+++ b/litellm/llms/base_llm/chat/transformation.py
@@ -106,13 +106,19 @@ def get_json_schema_from_pydantic_object(
     def is_thinking_enabled(self, non_default_params: dict) -> bool:
         return non_default_params.get("thinking", {}).get("type", None) == "enabled"
 
-    def update_optional_params_with_thinking_tokens(self, optional_params: dict):
+    def update_optional_params_with_thinking_tokens(
+        self, non_default_params: dict, optional_params: dict
+    ):
         """
         Handles the scenario where max tokens is not specified. For Anthropic models (Anthropic API/Bedrock/Vertex AI), this requires max tokens to be set and to be greater than the thinking token budget.
+
+        Checks 'non_default_params' for 'thinking' and 'max_tokens'
+
+        if 'thinking' is enabled and 'max_tokens' is not specified, set 'max_tokens' to the thinking token budget + DEFAULT_MAX_TOKENS
         """
-        is_thinking_enabled = self.is_thinking_enabled(optional_params)
-        if is_thinking_enabled and "max_tokens" not in optional_params:
-            thinking_token_budget = cast(dict, optional_params["thinking"]).get(
+        is_thinking_enabled = self.is_thinking_enabled(non_default_params)
+        if is_thinking_enabled and "max_tokens" not in non_default_params:
+            thinking_token_budget = cast(dict, non_default_params["thinking"]).get(
                 "budget_tokens", None
             )
             if thinking_token_budget is not None:
diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py
index 429c578cab30..b63b76a8ee55 100644
--- a/litellm/llms/bedrock/chat/converse_transformation.py
+++ b/litellm/llms/bedrock/chat/converse_transformation.py
@@ -211,6 +211,9 @@ def map_openai_params(
         messages: Optional[List[AllMessageValues]] = None,
     ) -> dict:
         is_thinking_enabled = self.is_thinking_enabled(non_default_params)
+        self.update_optional_params_with_thinking_tokens(
+            non_default_params=non_default_params, optional_params=optional_params
+        )
 
         for param, value in non_default_params.items():
             if param == "response_format" and isinstance(value, dict):
@@ -288,9 +291,7 @@ def map_openai_params(
                 optional_params["tool_choice"] = _tool_choice_value
             if param == "thinking":
                 optional_params["thinking"] = value
-                self.update_optional_params_with_thinking_tokens(
-                    optional_params=optional_params
-                )
+
         return optional_params
 
     @overload

From 9f07f10979efca748daab454605bca2f90e70b53 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 27 Mar 2025 22:44:09 -0700
Subject: [PATCH 6/6] test: handle service unavailable error
---
 tests/llm_translation/test_cohere.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/llm_translation/test_cohere.py b/tests/llm_translation/test_cohere.py
index f34098477d27..254a42593b92 100644
--- a/tests/llm_translation/test_cohere.py
+++ b/tests/llm_translation/test_cohere.py
@@ -55,6 +55,8 @@ async def test_chat_completion_cohere_citations(stream):
             assert citations_chunk
         else:
             assert response.citations is not None
+    except litellm.ServiceUnavailableError:
+        pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
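
Usage sketch (illustrative, not part of the patches above): end-to-end, this series makes a call like the following work. It is adapted from the test added in PATCH 1; the model name, prompt, and thinking budget mirror the tests, and a valid ANTHROPIC_API_KEY in the environment is assumed. With thinking enabled, response_format is still converted into a JSON tool, but tool_choice is no longer forced (Anthropic rejects forced tool use when thinking is on), and max_tokens is deliberately omitted: per update_optional_params_with_thinking_tokens it is derived as budget_tokens + DEFAULT_MAX_TOKENS (16000 + 4096 here).

    from pydantic import BaseModel

    import litellm


    class QAPair(BaseModel):
        question: str
        answer: str


    # response_format becomes a JSON tool without a forced tool_choice;
    # max_tokens is filled in from the thinking budget automatically.
    response = litellm.completion(
        model="anthropic/claude-3-7-sonnet-latest",
        messages=[{"role": "user", "content": "Generate 5 question + answer pairs"}],
        response_format=QAPair,
        thinking={"type": "enabled", "budget_tokens": 16000},
    )
    print(response.choices[0].message.content)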