diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 6ab63a40277..bfe549d3899 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,12 +9,34 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libnuma-dev + + - name: Install PyTorch for CPU and NumPy + run: | + pip install numpy + pip install torch==2.6.0+cpu torchvision==0.21.0+cpu torchaudio==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu + + - name: Install vLLM Build Dependencies (excluding torch and comments) + run: pip install $(grep -v 'torch==' requirements/build.txt | grep -v '^#' | tr '\n' ' ') + + - name: Install vLLM Project and Dev Dependencies + env: + VLLM_TARGET_DEVICE: cpu + USE_CUDA: "OFF" + CUDA_VISIBLE_DEVICES: "" + FORCE_CUDA: "0" + run: pip install -e ".[dev]" + + - uses: pre-commit/action@v3.0.1 with: - extra_args: --all-files --hook-stage manual + extra_args: --all-files --hook-stage manual \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 3314f05fd2a..85af9eef87b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) set(ignoreMe "${VLLM_PYTHON_PATH}") # -# Supported python versions. These versions will be searched in order, the -# first match will be selected. These should be kept in sync with setup.py. +# Supported python versions. These versions will be searched in order, the +# first match will be selected. These should be kept in sync with setup.py. # set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") @@ -43,7 +43,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # rather than an error. # # Note: the CUDA torch version is derived from pyproject.toml and various -# requirements.txt files and should be kept consistent. The ROCm torch +# requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") @@ -66,10 +66,40 @@ endif() # append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + message(STATUS "VLLM: Building for CPU. 
Skipping all CUDA/HIP related configuration.") + + # For CPU builds, we need to get torch include directories without cpp_extension + # Get torch installation path and construct include paths manually + execute_process( + COMMAND ${Python_EXECUTABLE} -c "import torch; import os; print(os.path.dirname(torch.__file__), end='')" + OUTPUT_VARIABLE TORCH_INSTALL_PATH + RESULT_VARIABLE TORCH_PATH_RESULT + ) + + if(NOT TORCH_PATH_RESULT EQUAL 0) + message(FATAL_ERROR "Failed to get torch installation path") + endif() + + # Construct torch include directories manually + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PATH}/include;${TORCH_INSTALL_PATH}/include/torch/csrc/api/include") + + message(STATUS "Torch installation path: ${TORCH_INSTALL_PATH}") + message(STATUS "Torch include directories: ${TORCH_INCLUDE_DIRS}") + + # Include the CPU extension cmake and return early + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) + return() +endif() + +# The following blocks will ONLY be processed if VLLM_TARGET_DEVICE is NOT "cpu" +# (i.e., it's "cuda" or "rocm" or potentially some other future GPU target) + # Ensure the 'nvcc' command is in the PATH +# This block is now outside the "if cpu" condition, so it only runs for GPU builds. find_program(NVCC_EXECUTABLE nvcc) if (CUDA_FOUND AND NOT NVCC_EXECUTABLE) - message(FATAL_ERROR "nvcc not found") + message(FATAL_ERROR "nvcc not found") endif() # @@ -78,21 +108,9 @@ endif() # so there is no need to do this explicitly with check_language/enable_language, # etc. # +# This find_package(Torch REQUIRED) call only happens for GPU builds now. find_package(Torch REQUIRED) -# -# Forward the non-CUDA device extensions to external CMake scripts. -# -if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND - NOT VLLM_TARGET_DEVICE STREQUAL "rocm") - if (VLLM_TARGET_DEVICE STREQUAL "cpu") - include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) - else() - return() - endif() - return() -endif() - # # Set up GPU language and check the torch version and warn if it isn't # what is expected. @@ -119,6 +137,9 @@ elseif(HIP_FOUND) "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() + # This FATAL_ERROR will now only trigger if VLLM_TARGET_DEVICE is something + # that requires a GPU (like "cuda" or "rocm") but neither CUDA nor HIP are found. + # It will NOT trigger for "cpu" anymore. 
message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() @@ -204,11 +225,12 @@ endif() set(VLLM_CUMEM_EXT_SRC "csrc/cumem_allocator.cpp") -set_gencode_flags_for_srcs( - SRCS "${VLLM_CUMEM_EXT_SRC}" - CUDA_ARCHS "${CUDA_ARCHS}") - +# --- MODIFICATION: Make cumem_allocator conditional --- if(VLLM_GPU_LANG STREQUAL "CUDA") + set_gencode_flags_for_srcs( + SRCS "${VLLM_CUMEM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + message(STATUS "Enabling cumem allocator extension.") # link against cuda driver library list(APPEND CUMEM_LIBS CUDA::cuda_driver) @@ -222,6 +244,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") WITH_SOABI) endif() + # # _C extension # @@ -246,6 +269,7 @@ set(VLLM_EXT_SRC "csrc/custom_all_reduce.cu" "csrc/torch_bindings.cpp") +# --- MODIFICATION: Wrap all CUDA-specific source additions and FetchContent --- if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") @@ -303,13 +327,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS - "csrc/quantization/fp8/fp8_marlin.cu" - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" - "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" - "csrc/quantization/gptq_marlin/gptq_marlin.cu" - "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" - "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") + "csrc/quantization/fp8/fp8_marlin.cu" + "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" + "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" + "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" + "csrc/quantization/gptq_marlin/gptq_marlin.cu" + "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" + "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" CUDA_ARCHS "${MARLIN_ARCHS}") @@ -317,15 +341,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() # Only build AllSpark kernels if we are building for at least some compatible archs. 
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") if (ALLSPARK_ARCHS) set(ALLSPARK_SRCS - "csrc/quantization/gptq_allspark/allspark_repack.cu" - "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") + "csrc/quantization/gptq_allspark/allspark_repack.cu" + "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") set_gencode_flags_for_srcs( SRCS "${ALLSPARK_SRCS}" CUDA_ARCHS "${ALLSPARK_ARCHS}") @@ -333,7 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") else() message(STATUS "Not building AllSpark kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() @@ -343,11 +367,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -359,12 +383,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running FP8 quantized models on " - "Hopper.") + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") else() message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -387,12 +411,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or " - "later if you intend on running FP8 quantized models on " - "Blackwell.") + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") else() message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -414,10 +438,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (SCALED_MM_3X_ARCHS) message(STATUS "Not building scaled_mm_c2x as all archs are already built" - " for and covered by scaled_mm_c3x") + " for and covered by scaled_mm_c3x") else() message(STATUS "Not building scaled_mm_c2x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -438,11 +462,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x 
kernels as CUDA Compiler version is " - "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " - "if you intend on running FP8 sparse quantized models on Hopper.") + "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " + "if you intend on running FP8 sparse quantized models on Hopper.") else() message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -492,7 +516,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" - "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -502,11 +526,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " - "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " - "if you intend on running FP8 quantized MoE models on Hopper.") + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper.") else() message(STATUS "Not building grouped_mm_c3x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -571,35 +595,57 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running w4a16 quantized models on " - "Hopper.") + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running w4a16 quantized models on " + "Hopper.") else() message(STATUS "Not building Machete kernels as no compatible archs " - "found in CUDA target architectures") + "found in CUDA target architectures") endif() endif() -# if CUDA endif -endif() +endif() # End of if(VLLM_GPU_LANG STREQUAL "CUDA") for _C extension sources + message(STATUS "Enabling C extension.") -define_gpu_extension_target( - _C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_EXT_SRC} - COMPILE_FLAGS ${VLLM_GPU_FLAGS} - ARCHITECTURES ${VLLM_GPU_ARCHES} - INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} - INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} - USE_SABI 3 - WITH_SOABI) +# --- MODIFICATION: Make _C extension target conditional --- +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + # For CPU, define a C++ extension with no GPU-specific sources or flags + # You might need to adjust VLLM_EXT_SRC for CPU-only files here if any exist + # For now, we'll assume torch_bindings.cpp is the main one + set(VLLM_EXT_SRC "csrc/torch_bindings.cpp") # Only C++ sources + define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE CXX # Use CXX language for CPU + SOURCES ${VLLM_EXT_SRC} + # No GPU specific flags or architectures for CPU + USE_SABI 3 + WITH_SOABI) +else() + # Original logic for GPU targets + define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES 
${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + USE_SABI 3 + WITH_SOABI) +endif() + # If CUTLASS is compiled on NVCC >= 12.5, it by default uses # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. -target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +# --- MODIFICATION: Make this conditional for CUDA --- +if(VLLM_GPU_LANG STREQUAL "CUDA") + target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +endif() + # # _moe_C extension @@ -610,15 +656,14 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") +# --- MODIFICATION: Wrap all MoE specific CUDA source additions and build --- if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") -endif() -set_gencode_flags_for_srcs( - SRCS "${VLLM_MOE_EXT_SRC}" - CUDA_ARCHS "${CUDA_ARCHS}") + set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") -if(VLLM_GPU_LANG STREQUAL "CUDA") set(VLLM_MOE_WNA16_SRC "csrc/moe/moe_wna16.cu") @@ -677,20 +722,37 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() -endif() +endif() # End of if(VLLM_GPU_LANG STREQUAL "CUDA") for moe + message(STATUS "Enabling moe extension.") -define_gpu_extension_target( - _moe_C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_MOE_EXT_SRC} - COMPILE_FLAGS ${VLLM_GPU_FLAGS} - ARCHITECTURES ${VLLM_GPU_ARCHES} - USE_SABI 3 - WITH_SOABI) +# --- MODIFICATION: Make _moe_C extension target conditional --- +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + # For CPU, define a C++ extension for MoE with only C++ sources + set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp") # Only C++ sources + define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE CXX # Use CXX language for CPU + SOURCES ${VLLM_MOE_EXT_SRC} + USE_SABI 3 + WITH_SOABI) +else() + # Original logic for GPU targets + define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) +endif() + + if(VLLM_GPU_LANG STREQUAL "HIP") # @@ -713,6 +775,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") endif() # For CUDA we also build and ship some external projects. 
+# --- MODIFICATION: Make external projects conditional --- if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/vllm_flash_attn.cmake) diff --git a/pyproject.toml b/pyproject.toml index b5f1039b44d..22728ad669d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Information Analysis", ] requires-python = ">=3.9,<3.13" -dynamic = [ "version", "dependencies", "optional-dependencies"] +dynamic = [ "version", "dependencies"] [project.urls] Homepage="https://github.com/vllm-project/vllm" @@ -41,6 +41,13 @@ Slack="http://slack.vllm.ai/" [project.scripts] vllm = "vllm.entrypoints.cli.main:main" +[project.optional-dependencies] +dev = [ + "types-setuptools", # Required by MyPy for pkg_resources module + # Other development-specific tools might go here too, + # e.g., "mypy", "ruff", "pre-commit" if not otherwise managed. +] + [tool.setuptools_scm] # no extra settings needed, presence enables setuptools-scm diff --git a/setup.py b/setup.py index a1867960e59..12d78cbdd6c 100755 --- a/setup.py +++ b/setup.py @@ -149,6 +149,12 @@ def configure(self, ext: CMakeExtension) -> None: '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), ] + if VLLM_TARGET_DEVICE == "cpu": + cmake_args.append("-DUSE_CUDA=OFF") + cmake_args.append("-DBUILD_CUDA_LIBS=OFF") + cmake_args.append("-DUSE_CUDNN=OFF") + cmake_args.append("-DTORCH_CUDA_ARCH_LIST=NoCUDA") + verbose = envs.VERBOSE if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] @@ -523,7 +529,10 @@ def get_nvcc_cuda_version() -> Version: Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - assert CUDA_HOME is not None, "CUDA_HOME is not set" + if VLLM_TARGET_DEVICE == "cpu": + return Version("0.0") + + assert CUDA_HOME is not None, "CUDA_HOME is not set for a CUDA/HIP build target." nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() @@ -585,8 +594,9 @@ def get_vllm_version() -> str: elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): - if envs.VLLM_TARGET_DEVICE == "cpu": - version += f"{sep}cpu" + # For CPU builds, we don't append a suffix to the version. + # The standard PyPI `torch` package is CPU-only by default. 
+ pass # Do not append +cpu to the version string elif _is_xpu(): version += f"{sep}xpu" else: diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 76da63c5800..61aa3b1092d 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -80,7 +80,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -166,7 +167,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -218,7 +220,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -226,7 +229,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 91afc88ef3d..52c78e8d9f7 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -67,7 +67,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -151,7 +152,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -197,7 +199,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -205,7 +208,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( prev_args_json, cur_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 57d7c77c64f..59ac36cd23b 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -133,7 +133,8 @@ def extract_tool_calls_streaming( delta = None # first time to get parameters elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) arguments_delta = cur_arguments_json[:cur_arguments_json. index(delta_text) + @@ -148,8 +149,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta # both prev and cur parameters, send the increase parameters elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) argument_diff = extract_intermediate_diff( cur_args_json, prev_args_json) @@ -190,7 +193,8 @@ def extract_tool_calls( action_dict = json.loads(action) name, parameters = action_dict['name'], json.dumps( action_dict.get('parameters', action_dict.get('arguments', - {}))) + {})), + ensure_ascii=False) if not tools or name not in [t.function.name for t in tools]: ExtractedToolCallInformation(tools_called=False, diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 8df106bf271..50fed9baf8f 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -96,8 +96,9 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]))) - for function_call in raw_function_calls + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), + )) for function_call in raw_function_calls ] content = model_output[:model_output. 
@@ -187,7 +188,7 @@ def extract_tool_calls_streaming( diff: Union[str, None] = current_tool_call.get("arguments") if diff: - diff = json.dumps(diff).replace( + diff = json.dumps(diff, ensure_ascii=False).replace( self.streamed_args_for_tool[self.current_tool_id], "") delta = DeltaMessage(tool_calls=[ @@ -248,7 +249,8 @@ def extract_tool_calls_streaming( "mid-arguments") delta = None elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) logger.debug("finding %s in %s", new_text, cur_arguments_json) @@ -267,8 +269,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) logger.debug("Searching for diff between \n%s\n%s", cur_args_json, prev_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 5c181616aa0..9dbd7efdc44 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -88,7 +88,8 @@ def extract_tool_calls( # function call args are JSON but as a string arguments=json.dumps(raw_function_call["arguments"] \ if "arguments" in raw_function_call \ - else raw_function_call["parameters"]))) + else raw_function_call["parameters"], + ensure_ascii=False))) for raw_function_call in function_call_arr ] @@ -174,7 +175,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -226,7 +228,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -234,7 +237,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 668776a832e..084f7acb5d8 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -79,10 +79,11 @@ def extract_tool_calls( name=raw_function_call["name"], # function call args are JSON but as a string arguments=json.dumps( - raw_function_call["arguments"] if "arguments" in - raw_function_call else - raw_function_call["parameters"]))) - for raw_function_call in function_call_arr + raw_function_call["arguments"] + if "arguments" in raw_function_call else + raw_function_call["parameters"], + ensure_ascii=False), + )) for raw_function_call in function_call_arr ] # get any content before the tool call diff --git 
a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 9f141d6b334..e795eb3fa8c 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -200,9 +200,12 @@ def _handle_single_tool(call: ast.Call) -> ToolCall: arguments = {} for keyword in call.keywords: arguments[keyword.arg] = _get_parameter_value(keyword.value) - return ToolCall(type="function", - function=FunctionCall(name=function_name, - arguments=json.dumps(arguments))) + return ToolCall( + type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments, + ensure_ascii=False)), + ) def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
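
A note on the CPU-only CMake path above: the execute_process call asks the Python interpreter where torch is installed and then assembles the include directories by hand, because the patch explicitly avoids torch.utils.cpp_extension for CPU builds. Below is a minimal Python sketch of the same lookup, assuming a CPU-only torch wheel is installed in the active environment; it is an illustration, not code from the patch.

import os

import torch

# Mirror of the CPU branch in CMakeLists.txt: locate the installed torch
# package and build the two include paths that make up TORCH_INCLUDE_DIRS.
torch_root = os.path.dirname(torch.__file__)
torch_include_dirs = [
    os.path.join(torch_root, "include"),
    os.path.join(torch_root, "include", "torch", "csrc", "api", "include"),
]

# CMake consumes a semicolon-separated list; printing with no trailing
# newline mirrors the `end=''` in the embedded one-liner, so no stray
# newline leaks into the CMake variable.
print(";".join(torch_include_dirs), end="")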
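
The recurring change across the tool parsers is adding ensure_ascii=False to every json.dumps call that serializes tool-call arguments. With the default ensure_ascii=True, non-ASCII characters are escaped as \uXXXX, which both mangles the argument text returned to clients and changes the string lengths the streaming code relies on when it slices out cur_args_json[sent:] or compares prefixes. A small standalone illustration (plain standard library, not vLLM code):

import json

# Example tool-call arguments containing non-ASCII text.
arguments = {"city": "東京", "unit": "celsius"}

escaped = json.dumps(arguments)                       # default: ensure_ascii=True
readable = json.dumps(arguments, ensure_ascii=False)  # what the patch switches to

print(escaped)   # {"city": "\u6771\u4eac", "unit": "celsius"}
print(readable)  # {"city": "東京", "unit": "celsius"}

# The streaming parsers remember how many characters of the argument JSON
# have already been sent and emit only the tail (cur_args_json[sent:]).
# Mixing escaped and unescaped dumps would make those offsets and the
# common-prefix comparisons disagree.
print(len(escaped), len(readable))  # 43 33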