From 190f6caa682e1869cc5de7514090e4cc9b5b52f2 Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Wed, 21 May 2025 22:41:23 +0200 Subject: [PATCH 01/26] [Bugfix] Consistent ascii handling in tool parsers (#17704) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sebastian Schönnenbeck Signed-off-by: Alexandru Badea --- .../tool_parsers/granite_20b_fc_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/granite_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/internlm2_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/jamba_tool_parser.py | 16 ++++++++++------ .../openai/tool_parsers/llama_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/phi4mini_tool_parser.py | 9 +++++---- .../openai/tool_parsers/pythonic_tool_parser.py | 9 ++++++--- 7 files changed, 53 insertions(+), 29 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 76da63c5800..61aa3b1092d 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -80,7 +80,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -166,7 +167,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -218,7 +220,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -226,7 +229,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 91afc88ef3d..52c78e8d9f7 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -67,7 +67,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -151,7 +152,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -197,7 +199,8 @@ def 
extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -205,7 +208,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( prev_args_json, cur_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 57d7c77c64f..59ac36cd23b 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -133,7 +133,8 @@ def extract_tool_calls_streaming( delta = None # first time to get parameters elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) arguments_delta = cur_arguments_json[:cur_arguments_json. index(delta_text) + @@ -148,8 +149,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta # both prev and cur parameters, send the increase parameters elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) argument_diff = extract_intermediate_diff( cur_args_json, prev_args_json) @@ -190,7 +193,8 @@ def extract_tool_calls( action_dict = json.loads(action) name, parameters = action_dict['name'], json.dumps( action_dict.get('parameters', action_dict.get('arguments', - {}))) + {})), + ensure_ascii=False) if not tools or name not in [t.function.name for t in tools]: ExtractedToolCallInformation(tools_called=False, diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 8df106bf271..50fed9baf8f 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -96,8 +96,9 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]))) - for function_call in raw_function_calls + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), + )) for function_call in raw_function_calls ] content = model_output[:model_output. 
@@ -187,7 +188,7 @@ def extract_tool_calls_streaming( diff: Union[str, None] = current_tool_call.get("arguments") if diff: - diff = json.dumps(diff).replace( + diff = json.dumps(diff, ensure_ascii=False).replace( self.streamed_args_for_tool[self.current_tool_id], "") delta = DeltaMessage(tool_calls=[ @@ -248,7 +249,8 @@ def extract_tool_calls_streaming( "mid-arguments") delta = None elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) logger.debug("finding %s in %s", new_text, cur_arguments_json) @@ -267,8 +269,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) logger.debug("Searching for diff between \n%s\n%s", cur_args_json, prev_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 5c181616aa0..9dbd7efdc44 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -88,7 +88,8 @@ def extract_tool_calls( # function call args are JSON but as a string arguments=json.dumps(raw_function_call["arguments"] \ if "arguments" in raw_function_call \ - else raw_function_call["parameters"]))) + else raw_function_call["parameters"], + ensure_ascii=False))) for raw_function_call in function_call_arr ] @@ -174,7 +175,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -226,7 +228,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -234,7 +237,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 668776a832e..084f7acb5d8 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -79,10 +79,11 @@ def extract_tool_calls( name=raw_function_call["name"], # function call args are JSON but as a string arguments=json.dumps( - raw_function_call["arguments"] if "arguments" in - raw_function_call else - raw_function_call["parameters"]))) - for raw_function_call in function_call_arr + raw_function_call["arguments"] + if "arguments" in raw_function_call else + raw_function_call["parameters"], + ensure_ascii=False), + )) for raw_function_call in function_call_arr ] # get any content before the tool call diff --git 
a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 9f141d6b334..e795eb3fa8c 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -200,9 +200,12 @@ def _handle_single_tool(call: ast.Call) -> ToolCall: arguments = {} for keyword in call.keywords: arguments[keyword.arg] = _get_parameter_value(keyword.value) - return ToolCall(type="function", - function=FunctionCall(name=function_name, - arguments=json.dumps(arguments))) + return ToolCall( + type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments, + ensure_ascii=False)), + ) def _make_valid_python(text: str) -> Union[tuple[str, str], None]: From 328af25234ae30068305669cbb968e3b8a936194 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Wed, 9 Jul 2025 16:30:22 +0300 Subject: [PATCH 02/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" Signed-off-by: Alexandru Badea --- vllm/executor/ray_distributed_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 9b0b98731e0..04b48d53021 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -528,12 +528,12 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: ray.get(parallel_worker_tasks) def _check_ray_cgraph_installation(self): - import pkg_resources + import importlib.metadata from packaging import version required_version = version.parse("2.43.0") current_version = version.parse( - pkg_resources.get_distribution("ray").version) + importlib.metadata.version("ray")) if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") From d820b7abf38528b43cd7031647e6d44579c42360 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 08:28:52 +0300 Subject: [PATCH 03/26] revert changes for pre-commit check Signed-off-by: Alexandru Badea --- vllm/executor/ray_distributed_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 04b48d53021..9b0b98731e0 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -528,12 +528,12 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: ray.get(parallel_worker_tasks) def _check_ray_cgraph_installation(self): - import importlib.metadata + import pkg_resources from packaging import version required_version = version.parse("2.43.0") current_version = version.parse( - importlib.metadata.version("ray")) + pkg_resources.get_distribution("ray").version) if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") From dfb943ad5270411285de210b38afcaafa3bb270d Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 08:48:12 +0300 Subject: [PATCH 04/26] add types-setuptools to the lint requirements Signed-off-by: Alexandru Badea --- requirements/lint.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/lint.txt b/requirements/lint.txt index 62446f94048..81f0cfc76a6 100644 --- a/requirements/lint.txt +++ 
b/requirements/lint.txt @@ -1,2 +1,4 @@ # formatting pre-commit==4.0.1 + +types-setuptools \ No newline at end of file From 4b86ed6d801e54286b6fde633397d29ff2f70a48 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 09:03:44 +0300 Subject: [PATCH 05/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (1) Signed-off-by: Alexandru Badea --- pyproject.toml | 7 +++++++ requirements/lint.txt | 2 -- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b5f1039b44d..e91f8c0fac2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,13 @@ Slack="http://slack.vllm.ai/" [project.scripts] vllm = "vllm.entrypoints.cli.main:main" +[project.optional-dependencies] +dev = [ + "types-setuptools", # Required by MyPy for pkg_resources module + # Other development-specific tools might go here too, + # e.g., "mypy", "ruff", "pre-commit" if not otherwise managed. +] + [tool.setuptools_scm] # no extra settings needed, presence enables setuptools-scm diff --git a/requirements/lint.txt b/requirements/lint.txt index 81f0cfc76a6..62446f94048 100644 --- a/requirements/lint.txt +++ b/requirements/lint.txt @@ -1,4 +1,2 @@ # formatting pre-commit==4.0.1 - -types-setuptools \ No newline at end of file From 84bc0dba78616c82aebfe7aee72dfee3dcf1e480 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 09:16:29 +0300 Subject: [PATCH 06/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (2) Signed-off-by: Alexandru Badea --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e91f8c0fac2..22728ad669d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Information Analysis", ] requires-python = ">=3.9,<3.13" -dynamic = [ "version", "dependencies", "optional-dependencies"] +dynamic = [ "version", "dependencies"] [project.urls] Homepage="https://github.com/vllm-project/vllm" From db0c2d678ef822d63e7d8b90f3e933071ac65307 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 09:32:59 +0300 Subject: [PATCH 07/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (3) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 6ab63a40277..60583608ba3 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,12 +9,16 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + - uses: actions/checkout@v4 # Use the latest stable version tag + - uses: actions/setup-python@v5 # Use the latest stable version tag with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + - name: Install vLLM Build Dependencies + run: pip install --requirement requirements/build.txt + - name: Install vLLM Project and Dev 
Dependencies + run: pip install -e ".[dev]" + - uses: pre-commit/action@v3.0.1 # Use the latest stable version tag with: - extra_args: --all-files --hook-stage manual + extra_args: --all-files --hook-stage manual \ No newline at end of file From 03b99f57d9e170ae2120a2833330e591d30da385 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 10:28:41 +0300 Subject: [PATCH 08/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (4) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 60583608ba3..8bf7d0027d5 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -15,8 +15,10 @@ jobs: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" + - name: Set VLLM_TARGET_DEVICE to CPU for pre-commit checks + run: echo "VLLM_TARGET_DEVICE=cpu" >> $GITHUB_ENV - name: Install vLLM Build Dependencies - run: pip install --requirement requirements/build.txt + run: pip install -r requirements/build.txt - name: Install vLLM Project and Dev Dependencies run: pip install -e ".[dev]" - uses: pre-commit/action@v3.0.1 # Use the latest stable version tag From 3b528fe13eda52e01be63e5e20ec44d74d14487e Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 10:40:05 +0300 Subject: [PATCH 09/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (5) Signed-off-by: Alexandru Badea --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a1867960e59..26b1437317a 100755 --- a/setup.py +++ b/setup.py @@ -585,8 +585,9 @@ def get_vllm_version() -> str: elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): - if envs.VLLM_TARGET_DEVICE == "cpu": - version += f"{sep}cpu" + # For CPU builds, we don't append a suffix to the version. + # The standard PyPI `torch` package is CPU-only by default. 
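Stepping back from the version-suffix hunk above: the mypy error quoted in every subject line of this series is triggered by importing pkg_resources, which ships without type stubs. PATCH 02 already demonstrated the stub-free fix before it was reverted; as a minimal standalone sketch (assuming only that the ray distribution is installed), the two lookup styles compare like this:

    # Legacy style: needs the types-setuptools stub package for mypy to pass.
    import pkg_resources
    from packaging import version

    ray_version = version.parse(pkg_resources.get_distribution("ray").version)

    # Stdlib replacement (Python 3.8+): typed out of the box, no stubs needed.
    import importlib.metadata

    ray_version = version.parse(importlib.metadata.version("ray"))

    if ray_version < version.parse("2.43.0"):
        raise ValueError(f"Ray 2.43.0 is required, but found {ray_version}")
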
+ pass # Do not append +cpu to the version string elif _is_xpu(): version += f"{sep}xpu" else: From 31602110d06d19c87f788d3c94abc2073dda2d9c Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 10:48:45 +0300 Subject: [PATCH 10/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (5) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8bf7d0027d5..8114e5c3de4 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,18 +9,28 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 # Use the latest stable version tag - - uses: actions/setup-python@v5 # Use the latest stable version tag + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - name: Set VLLM_TARGET_DEVICE to CPU for pre-commit checks run: echo "VLLM_TARGET_DEVICE=cpu" >> $GITHUB_ENV - - name: Install vLLM Build Dependencies - run: pip install -r requirements/build.txt + - name: Install PyTorch for CPU + # Explicitly install the CPU version of PyTorch from their stable index. + # This ensures pip finds the correct wheel without the problematic '+cpu' tag. + # Check PyTorch's official website for the exact command for torch==2.6.0 CPU + # As of my last update, it would typically be: + run: pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu + - name: Install vLLM Build Dependencies (excluding torch, as it's already installed) + # We need to filter out torch from build.txt for this step + # A simple way is to use grep or sed, or if build.txt is small, just list others. + # Given build.txt has `torch==2.6.0`, we'll exclude it here. + run: pip install $(grep -v 'torch==' requirements/build.txt | tr '\n' ' ') - name: Install vLLM Project and Dev Dependencies run: pip install -e ".[dev]" - - uses: pre-commit/action@v3.0.1 # Use the latest stable version tag + + - uses: pre-commit/action@v3.0.1 with: extra_args: --all-files --hook-stage manual \ No newline at end of file From b1632888dacd53c0dbfd7277c8394cc3acdb1bae Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 10:53:52 +0300 Subject: [PATCH 11/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (6) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8114e5c3de4..14f0722df12 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -18,16 +18,10 @@ jobs: - name: Set VLLM_TARGET_DEVICE to CPU for pre-commit checks run: echo "VLLM_TARGET_DEVICE=cpu" >> $GITHUB_ENV - name: Install PyTorch for CPU - # Explicitly install the CPU version of PyTorch from their stable index. - # This ensures pip finds the correct wheel without the problematic '+cpu' tag. 
- # Check PyTorch's official website for the exact command for torch==2.6.0 CPU - # As of my last update, it would typically be: run: pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu - - name: Install vLLM Build Dependencies (excluding torch, as it's already installed) - # We need to filter out torch from build.txt for this step - # A simple way is to use grep or sed, or if build.txt is small, just list others. - # Given build.txt has `torch==2.6.0`, we'll exclude it here. - run: pip install $(grep -v 'torch==' requirements/build.txt | tr '\n' ' ') + - name: Install vLLM Build Dependencies (excluding torch and comments) + # This command filters out lines with 'torch==' AND lines starting with '#' + run: pip install $(grep -v 'torch==' requirements/build.txt | grep -v '^#' | tr '\n' ' ') - name: Install vLLM Project and Dev Dependencies run: pip install -e ".[dev]" From 459cb6dbe9f20a4af2d642a0bd94df98f0daf6cd Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 11:06:39 +0300 Subject: [PATCH 12/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (7) Signed-off-by: Alexandru Badea a Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 14f0722df12..052c44e4f15 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,22 +9,25 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v4 # Keep this as v4 + - uses: actions/setup-python@v5 # Keep this as v5 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - - name: Set VLLM_TARGET_DEVICE to CPU for pre-commit checks - run: echo "VLLM_TARGET_DEVICE=cpu" >> $GITHUB_ENV + + - name: Set VLLM_TARGET_DEVICE and VLLM_CMAKE_ARGS for CPU build + run: | + echo "VLLM_TARGET_DEVICE=cpu" >> $GITHUB_ENV + # Crucial: Tell CMake to explicitly disable CUDA usage. + # This should override PyTorch/Caffe2's internal checks for CUDA libraries. 
+ echo "VLLM_CMAKE_ARGS=-DUSE_CUDA=OFF" >> $GITHUB_ENV - name: Install PyTorch for CPU run: pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu - name: Install vLLM Build Dependencies (excluding torch and comments) - # This command filters out lines with 'torch==' AND lines starting with '#' run: pip install $(grep -v 'torch==' requirements/build.txt | grep -v '^#' | tr '\n' ' ') - name: Install vLLM Project and Dev Dependencies run: pip install -e ".[dev]" - - - uses: pre-commit/action@v3.0.1 + - uses: pre-commit/action@v3.0.1 # Keep this as v3.0.1 with: extra_args: --all-files --hook-stage manual \ No newline at end of file From b2f6d015dad7b4733dde90006e52c79fc61a32b0 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 11:23:13 +0300 Subject: [PATCH 13/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (8) Signed-off-by: Alexandru Badea a Signed-off-by: Alexandru Badea --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 26b1437317a..4cf28610627 100755 --- a/setup.py +++ b/setup.py @@ -238,6 +238,9 @@ def target_name(s: str) -> str: *[f"--target={name}" for name in targets], ] + if VLLM_TARGET_DEVICE == "cpu": + build_args.append("-DUSE_CUDA=OFF") + subprocess.check_call(["cmake", *build_args], cwd=self.build_temp) # Install the libraries From b6ae15563c2a2488e664550521cc785eaedb3830 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 11:34:20 +0300 Subject: [PATCH 14/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (9) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 1 - setup.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 052c44e4f15..4889ed3a874 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -21,7 +21,6 @@ jobs: echo "VLLM_TARGET_DEVICE=cpu" >> $GITHUB_ENV # Crucial: Tell CMake to explicitly disable CUDA usage. # This should override PyTorch/Caffe2's internal checks for CUDA libraries. 
- echo "VLLM_CMAKE_ARGS=-DUSE_CUDA=OFF" >> $GITHUB_ENV - name: Install PyTorch for CPU run: pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu - name: Install vLLM Build Dependencies (excluding torch and comments) diff --git a/setup.py b/setup.py index 4cf28610627..a2411d349fd 100755 --- a/setup.py +++ b/setup.py @@ -149,6 +149,9 @@ def configure(self, ext: CMakeExtension) -> None: '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), ] + if VLLM_TARGET_DEVICE == "cpu": + cmake_args.append("-DUSE_CUDA=OFF") + verbose = envs.VERBOSE if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] @@ -238,9 +241,6 @@ def target_name(s: str) -> str: *[f"--target={name}" for name in targets], ] - if VLLM_TARGET_DEVICE == "cpu": - build_args.append("-DUSE_CUDA=OFF") - subprocess.check_call(["cmake", *build_args], cwd=self.build_temp) # Install the libraries From 665ab7f0a24df8bf41096829237114e577153408 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 11:44:44 +0300 Subject: [PATCH 15/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (10) Signed-off-by: Alexandru Badea --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index a2411d349fd..770d1f8d5e4 100755 --- a/setup.py +++ b/setup.py @@ -151,6 +151,9 @@ def configure(self, ext: CMakeExtension) -> None: if VLLM_TARGET_DEVICE == "cpu": cmake_args.append("-DUSE_CUDA=OFF") + cmake_args.append("-DBUILD_CUDA_LIBS=OFF") + cmake_args.append("-DUSE_CUDNN=OFF") + cmake_args.append("-DTORCH_CUDA_ARCH_LIST=NoCUDA") verbose = envs.VERBOSE if verbose: From 5d272749921d7b7db35685d19e417253459842df Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 11:57:55 +0300 Subject: [PATCH 16/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (11) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 21 +-- CMakeLists.txt | 243 +++++++++++++++++++++---------- 2 files changed, 178 insertions(+), 86 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 4889ed3a874..6e1c15f0464 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,24 +9,25 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 # Keep this as v4 - - uses: actions/setup-python@v5 # Keep this as v5 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - - - name: Set VLLM_TARGET_DEVICE and VLLM_CMAKE_ARGS for CPU build + + - name: Install PyTorch for CPU and NumPy run: | - echo "VLLM_TARGET_DEVICE=cpu" >> $GITHUB_ENV - # Crucial: Tell CMake to explicitly disable CUDA usage. - # This should override PyTorch/Caffe2's internal checks for CUDA libraries. 
- - name: Install PyTorch for CPU - run: pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu + pip install numpy + pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu + - name: Install vLLM Build Dependencies (excluding torch and comments) run: pip install $(grep -v 'torch==' requirements/build.txt | grep -v '^#' | tr '\n' ' ') + - name: Install vLLM Project and Dev Dependencies + run: pip install -e ".[dev]" - - uses: pre-commit/action@v3.0.1 # Keep this as v3.0.1 + + - uses: pre-commit/action@v3.0.1 with: extra_args: --all-files --hook-stage manual \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 3314f05fd2a..77f75258ad1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) set(ignoreMe "${VLLM_PYTHON_PATH}") # -# Supported python versions. These versions will be searched in order, the -# first match will be selected. These should be kept in sync with setup.py. +# Supported python versions. These versions will be searched in order, the +# first match will be selected. These should be kept in sync with setup.py. # set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") @@ -43,7 +43,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # rather than an error. # # Note: the CUDA torch version is derived from pyproject.toml and various -# requirements.txt files and should be kept consistent. The ROCm torch +# requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") @@ -66,10 +66,51 @@ endif() # append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + + +# Move the core logic for CPU-only builds much earlier. +# If building for CPU, we will handle Torch finding differently and then exit. +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.") + + # Explicitly set CUDA_FOUND and HIP_FOUND to FALSE for CPU builds. + # This prevents 'find_package(Torch)' from potentially setting them based on + # system-wide detections or the Torch config's internal checks for CUDA components. + set(CUDA_FOUND FALSE CACHE INTERNAL "Force CUDA to be not found for CPU build" FORCE) + set(HIP_FOUND FALSE CACHE INTERNAL "Force HIP to be not found for CPU build" FORCE) + + # Set VLLM_GPU_LANG to an empty string or a placeholder + set(VLLM_GPU_LANG "" CACHE INTERNAL "No GPU language for CPU build" FORCE) + + # Now, find Torch, but with explicit instructions to not look for CUDA/HIP + # This might still trigger Caffe2 warnings, but it's the correct way to try. + # If the Caffe2 error persists here, it means the PyTorch CPU wheel's CMake + # config is truly stubborn. + find_package(Torch REQUIRED COMPONENTS Python) # Only require Python components + + # Ensure the 'nvcc' command is NOT searched for or checked for CPU builds. + # This whole 'find_program(NVCC_EXECUTABLE nvcc)' block is now unnecessary for CPU. + # We will simply not define NVCC_EXECUTABLE or let it be 'NOTFOUND'. + + # Include the cpu_extension.cmake directly and then return/exit this CMakeList. + # This ensures no further GPU-specific code is processed. + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) + + # After configuring for CPU, we don't need to process any GPU-specific logic. + # This `return()` effectively stops CMake processing this file for GPU-related sections. 
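The CMake block above is a guard clause: detect the CPU target first, do the minimal CPU-only setup, and return() before any GPU probing runs. The same control flow, sketched in Python with hypothetical helper names (configure_cpu_extension and configure_gpu_extensions are illustrations, not vLLM functions):

    import os

    def configure_cpu_extension() -> None:
        print("include the cmake/cpu_extension.cmake equivalent")

    def configure_gpu_extensions(target: str) -> None:
        print(f"probe the {target} toolchain (nvcc/hipcc), arch lists, kernels")

    def configure_build() -> None:
        # Decide the target up front; exit before any GPU detection can fire.
        target = os.environ.get("VLLM_TARGET_DEVICE", "cuda")
        if target == "cpu":
            configure_cpu_extension()
            return  # mirrors the CMake return() early-exit
        configure_gpu_extensions(target)

    configure_build()
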
+ return() +endif() + + + +# The following blocks will ONLY be processed if VLLM_TARGET_DEVICE is NOT "cpu" +# (i.e., it's "cuda" or "rocm" or potentially some other future GPU target) + # Ensure the 'nvcc' command is in the PATH +# This block is now outside the "if cpu" condition, so it only runs for GPU builds. find_program(NVCC_EXECUTABLE nvcc) if (CUDA_FOUND AND NOT NVCC_EXECUTABLE) - message(FATAL_ERROR "nvcc not found") + message(FATAL_ERROR "nvcc not found") endif() # @@ -78,13 +119,18 @@ endif() # so there is no need to do this explicitly with check_language/enable_language, # etc. # +# This find_package(Torch REQUIRED) call only happens for GPU builds now. find_package(Torch REQUIRED) # # Forward the non-CUDA device extensions to external CMake scripts. -# +# This block should now logically only hit for HIP if VLLM_TARGET_DEVICE is not "cpu" +# and also not "cuda". But your structure handles it well. if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND NOT VLLM_TARGET_DEVICE STREQUAL "rocm") + # This 'if cpu' block within here is now redundant because the main 'if (VLLM_TARGET_DEVICE STREQUAL "cpu")' + # at the top handles it. However, keeping it doesn't harm, just means this outer if + # will never be true in practice for a "cpu" target given the early return. if (VLLM_TARGET_DEVICE STREQUAL "cpu") include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) else() @@ -119,6 +165,9 @@ elseif(HIP_FOUND) "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() + # This FATAL_ERROR will now only trigger if VLLM_TARGET_DEVICE is something + # that requires a GPU (like "cuda" or "rocm") but neither CUDA nor HIP are found. + # It will NOT trigger for "cpu" anymore. message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() @@ -204,11 +253,12 @@ endif() set(VLLM_CUMEM_EXT_SRC "csrc/cumem_allocator.cpp") -set_gencode_flags_for_srcs( - SRCS "${VLLM_CUMEM_EXT_SRC}" - CUDA_ARCHS "${CUDA_ARCHS}") - +# --- MODIFICATION: Make cumem_allocator conditional --- if(VLLM_GPU_LANG STREQUAL "CUDA") + set_gencode_flags_for_srcs( + SRCS "${VLLM_CUMEM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + message(STATUS "Enabling cumem allocator extension.") # link against cuda driver library list(APPEND CUMEM_LIBS CUDA::cuda_driver) @@ -222,6 +272,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") WITH_SOABI) endif() + # # _C extension # @@ -246,6 +297,7 @@ set(VLLM_EXT_SRC "csrc/custom_all_reduce.cu" "csrc/torch_bindings.cpp") +# --- MODIFICATION: Wrap all CUDA-specific source additions and FetchContent --- if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") @@ -303,13 +355,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS - "csrc/quantization/fp8/fp8_marlin.cu" - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" - "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" - "csrc/quantization/gptq_marlin/gptq_marlin.cu" - "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" - "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") + "csrc/quantization/fp8/fp8_marlin.cu" + "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" + "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" + "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" + "csrc/quantization/gptq_marlin/gptq_marlin.cu" + "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" + 
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" CUDA_ARCHS "${MARLIN_ARCHS}") @@ -317,15 +369,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() # Only build AllSpark kernels if we are building for at least some compatible archs. cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") if (ALLSPARK_ARCHS) set(ALLSPARK_SRCS - "csrc/quantization/gptq_allspark/allspark_repack.cu" - "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") + "csrc/quantization/gptq_allspark/allspark_repack.cu" + "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") set_gencode_flags_for_srcs( SRCS "${ALLSPARK_SRCS}" CUDA_ARCHS "${ALLSPARK_ARCHS}") @@ -333,7 +385,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") else() message(STATUS "Not building AllSpark kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() @@ -343,11 +395,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -359,12 +411,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running FP8 quantized models on " - "Hopper.") + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") else() message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -387,12 +439,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or " - "later if you intend on running FP8 quantized models on " - "Blackwell.") + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") else() message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -414,10 +466,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (SCALED_MM_3X_ARCHS) 
message(STATUS "Not building scaled_mm_c2x as all archs are already built" - " for and covered by scaled_mm_c3x") + " for and covered by scaled_mm_c3x") else() message(STATUS "Not building scaled_mm_c2x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -438,11 +490,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " - "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " - "if you intend on running FP8 sparse quantized models on Hopper.") + "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " + "if you intend on running FP8 sparse quantized models on Hopper.") else() message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -492,7 +544,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" - "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -502,11 +554,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " - "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " - "if you intend on running FP8 quantized MoE models on Hopper.") + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper.") else() message(STATUS "Not building grouped_mm_c3x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -571,35 +623,57 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running w4a16 quantized models on " - "Hopper.") + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running w4a16 quantized models on " + "Hopper.") else() message(STATUS "Not building Machete kernels as no compatible archs " - "found in CUDA target architectures") + "found in CUDA target architectures") endif() endif() -# if CUDA endif -endif() +endif() # End of if(VLLM_GPU_LANG STREQUAL "CUDA") for _C extension sources + message(STATUS "Enabling C extension.") -define_gpu_extension_target( - _C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_EXT_SRC} - COMPILE_FLAGS ${VLLM_GPU_FLAGS} - ARCHITECTURES ${VLLM_GPU_ARCHES} - INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} - INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} - USE_SABI 3 - WITH_SOABI) +# --- MODIFICATION: Make _C extension target conditional --- +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + # For CPU, define a C++ extension with no GPU-specific sources or flags + # You might need to adjust VLLM_EXT_SRC for CPU-only files here if any exist + # For now, we'll assume torch_bindings.cpp is the main one + set(VLLM_EXT_SRC "csrc/torch_bindings.cpp") # 
Only C++ sources + define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE CXX # Use CXX language for CPU + SOURCES ${VLLM_EXT_SRC} + # No GPU specific flags or architectures for CPU + USE_SABI 3 + WITH_SOABI) +else() + # Original logic for GPU targets + define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + USE_SABI 3 + WITH_SOABI) +endif() + # If CUTLASS is compiled on NVCC >= 12.5, it by default uses # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. -target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +# --- MODIFICATION: Make this conditional for CUDA --- +if(VLLM_GPU_LANG STREQUAL "CUDA") + target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +endif() + # # _moe_C extension @@ -610,15 +684,14 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") +# --- MODIFICATION: Wrap all MoE specific CUDA source additions and build --- if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") -endif() -set_gencode_flags_for_srcs( - SRCS "${VLLM_MOE_EXT_SRC}" - CUDA_ARCHS "${CUDA_ARCHS}") + set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") -if(VLLM_GPU_LANG STREQUAL "CUDA") set(VLLM_MOE_WNA16_SRC "csrc/moe/moe_wna16.cu") @@ -677,20 +750,37 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() -endif() +endif() # End of if(VLLM_GPU_LANG STREQUAL "CUDA") for moe + message(STATUS "Enabling moe extension.") -define_gpu_extension_target( - _moe_C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_MOE_EXT_SRC} - COMPILE_FLAGS ${VLLM_GPU_FLAGS} - ARCHITECTURES ${VLLM_GPU_ARCHES} - USE_SABI 3 - WITH_SOABI) +# --- MODIFICATION: Make _moe_C extension target conditional --- +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + # For CPU, define a C++ extension for MoE with only C++ sources + set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp") # Only C++ sources + define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE CXX # Use CXX language for CPU + SOURCES ${VLLM_MOE_EXT_SRC} + USE_SABI 3 + WITH_SOABI) +else() + # Original logic for GPU targets + define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) +endif() + + if(VLLM_GPU_LANG STREQUAL "HIP") # @@ -713,6 +803,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") endif() # For CUDA we also build and ship some external projects. 
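Both conditional targets above follow one recipe: a C++-only source list compiled as CXX for CPU, widened with .cu kernels and compiled as CUDA otherwise. The selection logic, condensed into a Python sketch (source file names are taken from the diff; the tuple shape is illustrative):

    def moe_extension_config(target_device: str) -> tuple[str, list[str]]:
        # CPU builds keep only the C++ bindings; GPU builds add the kernels.
        if target_device == "cpu":
            return "CXX", ["csrc/moe/torch_bindings.cpp"]
        return "CUDA", [
            "csrc/moe/torch_bindings.cpp",
            "csrc/moe/moe_align_sum_kernels.cu",
            "csrc/moe/topk_softmax_kernels.cu",
            "csrc/moe/moe_wna16.cu",
        ]

    print(moe_extension_config("cpu"))
    # ('CXX', ['csrc/moe/torch_bindings.cpp'])
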
+# --- MODIFICATION: Make external projects conditional --- if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/vllm_flash_attn.cmake) From 6d51d900879521c2c0cd815dc214032e2a9d2c2f Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 12:11:58 +0300 Subject: [PATCH 17/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (12) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 5 +++-- setup.py | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 6e1c15f0464..a111491be11 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -15,7 +15,7 @@ jobs: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - + - name: Install PyTorch for CPU and NumPy run: | pip install numpy @@ -25,7 +25,8 @@ jobs: run: pip install $(grep -v 'torch==' requirements/build.txt | grep -v '^#' | tr '\n' ' ') - name: Install vLLM Project and Dev Dependencies - + env: + VLLM_BUILD_TARGET: cpu run: pip install -e ".[dev]" - uses: pre-commit/action@v3.0.1 diff --git a/setup.py b/setup.py index 770d1f8d5e4..12d78cbdd6c 100755 --- a/setup.py +++ b/setup.py @@ -529,7 +529,10 @@ def get_nvcc_cuda_version() -> Version: Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - assert CUDA_HOME is not None, "CUDA_HOME is not set" + if VLLM_TARGET_DEVICE == "cpu": + return Version("0.0") + + assert CUDA_HOME is not None, "CUDA_HOME is not set for a CUDA/HIP build target." 
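For orientation, the setup.py hunk here makes get_nvcc_cuda_version short-circuit before probing nvcc at all on CPU targets, with Version("0.0") as the sentinel. A self-contained sketch of the resulting control flow (function and argument names simplified from the original):

    import subprocess
    from typing import Optional

    from packaging.version import Version, parse

    def nvcc_cuda_version(cuda_home: Optional[str], target: str) -> Version:
        if target == "cpu":
            return Version("0.0")  # sentinel: no CUDA toolchain expected
        assert cuda_home is not None, "CUDA_HOME is not set"
        out = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
                                      universal_newlines=True)
        # nvcc -V ends with e.g. "Cuda compilation tools, release 12.4, V12.4.131"
        words = out.split()
        return parse(words[words.index("release") + 1].rstrip(","))
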
nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() From 8b226e5d569dd433f7603ba32836c253d4cb1a9d Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 15:16:34 +0300 Subject: [PATCH 18/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (13) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index a111491be11..11e7da9926e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -26,7 +26,7 @@ jobs: - name: Install vLLM Project and Dev Dependencies env: - VLLM_BUILD_TARGET: cpu + VLLM_TARGET_DEVICE: cpu run: pip install -e ".[dev]" - uses: pre-commit/action@v3.0.1 From b1682bb3e02c4d569e0583da6fd2dc97d5bcfec6 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 15:27:09 +0300 Subject: [PATCH 19/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (14) Signed-off-by: Alexandru Badea --- .github/workflows/pre-commit.yml | 5 ++++- CMakeLists.txt | 38 +++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 11e7da9926e..b8b33dd915f 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -19,7 +19,7 @@ jobs: - name: Install PyTorch for CPU and NumPy run: | pip install numpy - pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.6.0+cpu torchvision==0.21.0+cpu torchaudio==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu - name: Install vLLM Build Dependencies (excluding torch and comments) run: pip install $(grep -v 'torch==' requirements/build.txt | grep -v '^#' | tr '\n' ' ') @@ -27,6 +27,9 @@ jobs: - name: Install vLLM Project and Dev Dependencies env: VLLM_TARGET_DEVICE: cpu + USE_CUDA: "OFF" + CUDA_VISIBLE_DEVICES: "" + FORCE_CUDA: "0" run: pip install -e ".[dev]" - uses: pre-commit/action@v3.0.1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 77f75258ad1..29aea51ea8b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,16 +126,34 @@ find_package(Torch REQUIRED) # Forward the non-CUDA device extensions to external CMake scripts. # This block should now logically only hit for HIP if VLLM_TARGET_DEVICE is not "cpu" # and also not "cuda". But your structure handles it well. -if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND - NOT VLLM_TARGET_DEVICE STREQUAL "rocm") - # This 'if cpu' block within here is now redundant because the main 'if (VLLM_TARGET_DEVICE STREQUAL "cpu")' - # at the top handles it. However, keeping it doesn't harm, just means this outer if - # will never be true in practice for a "cpu" target given the early return. - if (VLLM_TARGET_DEVICE STREQUAL "cpu") - include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) - else() - return() - endif() +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.") + + # Explicitly set CUDA_FOUND and HIP_FOUND to FALSE for CPU builds. + # This prevents 'find_package(Torch)' from potentially setting them based on + # system-wide detections or the Torch config's internal checks for CUDA components. 
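One cheap sanity check for the environment pins this diff introduces (USE_CUDA, CUDA_VISIBLE_DEVICES, FORCE_CUDA) is to ask the installed torch wheel what it sees before the build starts; a sketch assuming the CPU-only torch 2.6.0 wheel installed by the workflow step above:

    import os

    # Mirror the workflow's environment before torch is imported.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    os.environ["FORCE_CUDA"] = "0"

    import torch

    # A +cpu wheel reports no CUDA runtime at all.
    assert torch.version.cuda is None, "expected a CPU-only torch wheel"
    assert not torch.cuda.is_available(), "CUDA devices unexpectedly visible"
    print(f"torch {torch.__version__}: CPU-only environment confirmed")
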
+ set(CUDA_FOUND FALSE CACHE INTERNAL "Force CUDA to be not found for CPU build" FORCE) + set(HIP_FOUND FALSE CACHE INTERNAL "Force HIP to be not found for CPU build" FORCE) + + # Set VLLM_GPU_LANG to an empty string or a placeholder + set(VLLM_GPU_LANG "" CACHE INTERNAL "No GPU language for CPU build" FORCE) + + # Set environment variables to disable CUDA detection in PyTorch + set(ENV{USE_CUDA} OFF) + set(ENV{CUDA_VISIBLE_DEVICES} "") + + # Try to find Torch with CUDA explicitly disabled + set(CAFFE2_USE_CUDA OFF CACHE BOOL "Disable CUDA for CPU build" FORCE) + + # Now, find Torch, but with explicit instructions to not look for CUDA/HIP + find_package(Torch REQUIRED) + + # Include the cpu_extension.cmake directly and then return/exit this CMakeList. + # This ensures no further GPU-specific code is processed. + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) + + # After configuring for CPU, we don't need to process any GPU-specific logic. + # This `return()` effectively stops CMake processing this file for GPU-related sections. return() endif() From a45affc15fbbe1fc9c8c81cbf8cd1209798d50ea Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 16:12:44 +0300 Subject: [PATCH 20/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (15) Signed-off-by: Alexandru Badea --- CMakeLists.txt | 81 ++++++++------------------------------------------ 1 file changed, 13 insertions(+), 68 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29aea51ea8b..ff1acd30119 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,47 +61,27 @@ else() " before running cmake configure.") endif() -# -# Update cmake's `CMAKE_PREFIX_PATH` with torch location. -# -append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") - - - -# Move the core logic for CPU-only builds much earlier. -# If building for CPU, we will handle Torch finding differently and then exit. if (VLLM_TARGET_DEVICE STREQUAL "cpu") message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.") - - # Explicitly set CUDA_FOUND and HIP_FOUND to FALSE for CPU builds. - # This prevents 'find_package(Torch)' from potentially setting them based on - # system-wide detections or the Torch config's internal checks for CUDA components. + + # For CPU builds, we need to find Torch but without triggering CUDA detection + # Set these variables to prevent CUDA detection set(CUDA_FOUND FALSE CACHE INTERNAL "Force CUDA to be not found for CPU build" FORCE) set(HIP_FOUND FALSE CACHE INTERNAL "Force HIP to be not found for CPU build" FORCE) - - # Set VLLM_GPU_LANG to an empty string or a placeholder - set(VLLM_GPU_LANG "" CACHE INTERNAL "No GPU language for CPU build" FORCE) - - # Now, find Torch, but with explicit instructions to not look for CUDA/HIP - # This might still trigger Caffe2 warnings, but it's the correct way to try. - # If the Caffe2 error persists here, it means the PyTorch CPU wheel's CMake - # config is truly stubborn. - find_package(Torch REQUIRED COMPONENTS Python) # Only require Python components - - # Ensure the 'nvcc' command is NOT searched for or checked for CPU builds. - # This whole 'find_program(NVCC_EXECUTABLE nvcc)' block is now unnecessary for CPU. - # We will simply not define NVCC_EXECUTABLE or let it be 'NOTFOUND'. - - # Include the cpu_extension.cmake directly and then return/exit this CMakeList. - # This ensures no further GPU-specific code is processed. 
+ set(CAFFE2_USE_CUDA OFF CACHE BOOL "Disable CUDA for CPU build" FORCE) + + # Find Torch with minimal requirements for CPU + find_package(Torch REQUIRED) + + # Include the CPU extension cmake and return early include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) - - # After configuring for CPU, we don't need to process any GPU-specific logic. - # This `return()` effectively stops CMake processing this file for GPU-related sections. return() endif() - +# +# Update cmake's `CMAKE_PREFIX_PATH` with torch location. +# +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # The following blocks will ONLY be processed if VLLM_TARGET_DEVICE is NOT "cpu" # (i.e., it's "cuda" or "rocm" or potentially some other future GPU target) @@ -122,41 +102,6 @@ endif() # This find_package(Torch REQUIRED) call only happens for GPU builds now. find_package(Torch REQUIRED) -# -# Forward the non-CUDA device extensions to external CMake scripts. -# This block should now logically only hit for HIP if VLLM_TARGET_DEVICE is not "cpu" -# and also not "cuda". But your structure handles it well. -if (VLLM_TARGET_DEVICE STREQUAL "cpu") - message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.") - - # Explicitly set CUDA_FOUND and HIP_FOUND to FALSE for CPU builds. - # This prevents 'find_package(Torch)' from potentially setting them based on - # system-wide detections or the Torch config's internal checks for CUDA components. - set(CUDA_FOUND FALSE CACHE INTERNAL "Force CUDA to be not found for CPU build" FORCE) - set(HIP_FOUND FALSE CACHE INTERNAL "Force HIP to be not found for CPU build" FORCE) - - # Set VLLM_GPU_LANG to an empty string or a placeholder - set(VLLM_GPU_LANG "" CACHE INTERNAL "No GPU language for CPU build" FORCE) - - # Set environment variables to disable CUDA detection in PyTorch - set(ENV{USE_CUDA} OFF) - set(ENV{CUDA_VISIBLE_DEVICES} "") - - # Try to find Torch with CUDA explicitly disabled - set(CAFFE2_USE_CUDA OFF CACHE BOOL "Disable CUDA for CPU build" FORCE) - - # Now, find Torch, but with explicit instructions to not look for CUDA/HIP - find_package(Torch REQUIRED) - - # Include the cpu_extension.cmake directly and then return/exit this CMakeList. - # This ensures no further GPU-specific code is processed. - include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) - - # After configuring for CPU, we don't need to process any GPU-specific logic. - # This `return()` effectively stops CMake processing this file for GPU-related sections. - return() -endif() - # # Set up GPU language and check the torch version and warn if it isn't # what is expected. From 427b93fd44807c0724035f7c1049d9222dfc1e43 Mon Sep 17 00:00:00 2001 From: Alexandru Badea Date: Thu, 10 Jul 2025 16:18:17 +0300 Subject: [PATCH 21/26] fix for pre-commit failed job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (16) Signed-off-by: Alexandru Badea --- CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff1acd30119..bc69a61d7f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,11 @@ else() " before running cmake configure.") endif() +# +# Update cmake's `CMAKE_PREFIX_PATH` with torch location. +# +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + if (VLLM_TARGET_DEVICE STREQUAL "cpu") message(STATUS "VLLM: Building for CPU. 
From e36236c79c410a9a86ff4649845aa5fff67912a1 Mon Sep 17 00:00:00 2001
From: Alexandru Badea
Date: Thu, 10 Jul 2025 16:22:58 +0300
Subject: [PATCH 22/26] Fix for failed pre-commit job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (17)

Signed-off-by: Alexandru Badea
---
 CMakeLists.txt | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc69a61d7f3..647c0d915dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,14 +69,8 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
 if (VLLM_TARGET_DEVICE STREQUAL "cpu")
     message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.")
 
-  # For CPU builds, we need to find Torch but without triggering CUDA detection
-  # Set these variables to prevent CUDA detection
-  set(CUDA_FOUND FALSE CACHE INTERNAL "Force CUDA to be not found for CPU build" FORCE)
-  set(HIP_FOUND FALSE CACHE INTERNAL "Force HIP to be not found for CPU build" FORCE)
-  set(CAFFE2_USE_CUDA OFF CACHE BOOL "Disable CUDA for CPU build" FORCE)
-
-  # Find Torch with minimal requirements for CPU
-  find_package(Torch REQUIRED)
+  # For CPU builds, we don't need to find Torch through CMake
+  # The Python extension will link against torch through the Python environment
 
   # Include the CPU extension cmake and return early
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)

From 076966dda615a92809bf3c20ba9d41235039251c Mon Sep 17 00:00:00 2001
From: Alexandru Badea
Date: Thu, 10 Jul 2025 16:30:41 +0300
Subject: [PATCH 23/26] Fix for failed pre-commit job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (18)

Signed-off-by: Alexandru Badea
---
 .github/workflows/pre-commit.yml | 5 +++++
 CMakeLists.txt                   | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index b8b33dd915f..bfe549d3899 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,6 +16,11 @@ jobs:
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
 
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y libnuma-dev
+
     - name: Install PyTorch for CPU and NumPy
       run: |
         pip install numpy

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 647c0d915dd..900a3d7f089 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,8 +69,9 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
 if (VLLM_TARGET_DEVICE STREQUAL "cpu")
     message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.")
 
-  # For CPU builds, we don't need to find Torch through CMake
-  # The Python extension will link against torch through the Python environment
+  # For CPU builds, we still need to find Torch to get headers, but skip CUDA detection
+  set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH})
+  find_package(Torch REQUIRED)
 
   # Include the CPU extension cmake and return early
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
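Note: the libnuma-dev step added to the workflow in PATCH 23/26 suggests the CPU extension build depends on NUMA headers being present on the runner. If that assumption holds, the dependency could also be surfaced at configure time rather than at link time, with a guard along these lines (a sketch, not part of the series):

    # Fail configuration early, with an actionable message, if libnuma is missing.
    find_library(NUMA_LIBRARY numa)
    find_path(NUMA_INCLUDE_DIR numa.h)
    if(NOT NUMA_LIBRARY OR NOT NUMA_INCLUDE_DIR)
      message(FATAL_ERROR "libnuma not found; on Debian/Ubuntu install it with "
                          "'sudo apt-get install -y libnuma-dev'.")
    endif()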
From 9bf532063c4ac04733554fec3d8baea785fefde4 Mon Sep 17 00:00:00 2001
From: Alexandru Badea
Date: Thu, 10 Jul 2025 16:35:42 +0300
Subject: [PATCH 24/26] Fix for failed pre-commit job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (19)

Signed-off-by: Alexandru Badea
---
 CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 900a3d7f089..2360d19d4f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,9 +69,8 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
 if (VLLM_TARGET_DEVICE STREQUAL "cpu")
     message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.")
 
-  # For CPU builds, we still need to find Torch to get headers, but skip CUDA detection
-  set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH})
-  find_package(Torch REQUIRED)
+  # For CPU builds, we don't need to find Torch through CMake
+  # The CPU extension will handle torch headers through the Python environment
 
   # Include the CPU extension cmake and return early
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)

From 62dd0e29b8e534e8b15f2c0aed8a3a7b9bcd640d Mon Sep 17 00:00:00 2001
From: Alexandru Badea
Date: Thu, 10 Jul 2025 16:40:18 +0300
Subject: [PATCH 25/26] Fix for failed pre-commit job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (20)

Signed-off-by: Alexandru Badea
---
 CMakeLists.txt | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2360d19d4f7..391e1ab2b4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,8 +69,19 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
 if (VLLM_TARGET_DEVICE STREQUAL "cpu")
     message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.")
 
-  # For CPU builds, we don't need to find Torch through CMake
-  # The CPU extension will handle torch headers through the Python environment
+  # For CPU builds, we need to get torch include directories without full CMake config
+  # Use Python to get torch include paths
+  execute_process(
+    COMMAND ${Python_EXECUTABLE} -c "import torch; print(';'.join(torch.utils.cpp_extension.include_paths()), end='')"
+    OUTPUT_VARIABLE TORCH_INCLUDE_DIRS
+    RESULT_VARIABLE TORCH_INCLUDE_RESULT
+  )
+
+  if(NOT TORCH_INCLUDE_RESULT EQUAL 0)
+    message(FATAL_ERROR "Failed to get torch include directories")
+  endif()
+
+  message(STATUS "Torch include directories: ${TORCH_INCLUDE_DIRS}")
 
   # Include the CPU extension cmake and return early
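Note: PATCH 25/26 fills TORCH_INCLUDE_DIRS from torch.utils.cpp_extension.include_paths(), i.e. the directories a Torch C++ extension must compile against. Downstream, a target would consume the semicolon-separated list roughly like this (a sketch; the target name and source file are hypothetical placeholders, and cpu_extension.cmake presumably does something equivalent):

    add_library(_C MODULE csrc/cpu_ops.cpp)          # hypothetical source file
    target_include_directories(_C PRIVATE ${TORCH_INCLUDE_DIRS})
    target_compile_features(_C PRIVATE cxx_std_17)   # Torch headers need C++17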
From ce22b7dad3328f249fa8c23c02d3e4d31e81df2a Mon Sep 17 00:00:00 2001
From: Alexandru Badea
Date: Thu, 10 Jul 2025 16:44:24 +0300
Subject: [PATCH 26/26] Fix for failed pre-commit job - Error: vllm/executor/ray_distributed_executor.py:531: error: Library stubs not installed for "pkg_resources" (21)

Signed-off-by: Alexandru Badea
---
 CMakeLists.txt | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 391e1ab2b4c..85af9eef87b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,18 +69,22 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
 if (VLLM_TARGET_DEVICE STREQUAL "cpu")
     message(STATUS "VLLM: Building for CPU. Skipping all CUDA/HIP related configuration.")
 
-  # For CPU builds, we need to get torch include directories without full CMake config
-  # Use Python to get torch include paths
+  # For CPU builds, we need to get torch include directories without cpp_extension
+  # Get torch installation path and construct include paths manually
   execute_process(
-    COMMAND ${Python_EXECUTABLE} -c "import torch; print(';'.join(torch.utils.cpp_extension.include_paths()), end='')"
-    OUTPUT_VARIABLE TORCH_INCLUDE_DIRS
-    RESULT_VARIABLE TORCH_INCLUDE_RESULT
+    COMMAND ${Python_EXECUTABLE} -c "import torch; import os; print(os.path.dirname(torch.__file__), end='')"
+    OUTPUT_VARIABLE TORCH_INSTALL_PATH
+    RESULT_VARIABLE TORCH_PATH_RESULT
   )
 
-  if(NOT TORCH_INCLUDE_RESULT EQUAL 0)
-    message(FATAL_ERROR "Failed to get torch include directories")
+  if(NOT TORCH_PATH_RESULT EQUAL 0)
+    message(FATAL_ERROR "Failed to get torch installation path")
   endif()
 
+  # Construct torch include directories manually
+  set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PATH}/include;${TORCH_INSTALL_PATH}/include/torch/csrc/api/include")
+
+  message(STATUS "Torch installation path: ${TORCH_INSTALL_PATH}")
   message(STATUS "Torch include directories: ${TORCH_INCLUDE_DIRS}")
 
   # Include the CPU extension cmake and return early
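Note: the diff of PATCH 26/26 is truncated above. Its visible direction is to stop importing torch.utils.cpp_extension (whose import chain can pull in setuptools/pkg_resources, the very stubs mypy complained about in the subject lines) and instead derive the header layout from torch.__file__. Assuming the cut-off hunk ends the way every earlier revision did, with include() and return(), the consolidated CPU branch would look roughly like this sketch:

    execute_process(
      COMMAND "${Python_EXECUTABLE}" -c
              "import os, torch; print(os.path.dirname(torch.__file__), end='')"
      OUTPUT_VARIABLE TORCH_INSTALL_PATH
      RESULT_VARIABLE TORCH_PATH_RESULT
    )
    if(NOT TORCH_PATH_RESULT EQUAL 0)
      message(FATAL_ERROR "Failed to get torch installation path")
    endif()

    # Standard layout of a torch wheel; checked here as a defensive assumption
    # that is not part of the patch itself.
    if(NOT EXISTS "${TORCH_INSTALL_PATH}/include/torch")
      message(FATAL_ERROR "Unexpected torch layout at ${TORCH_INSTALL_PATH}")
    endif()
    set(TORCH_INCLUDE_DIRS
        "${TORCH_INSTALL_PATH}/include;${TORCH_INSTALL_PATH}/include/torch/csrc/api/include")

    include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
    return()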