diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 6ab63a40277..bfe549d3899 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,12 +9,34 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libnuma-dev + + - name: Install PyTorch for CPU and NumPy + run: | + pip install numpy + pip install torch==2.6.0+cpu torchvision==0.21.0+cpu torchaudio==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu + + - name: Install vLLM Build Dependencies (excluding torch and comments) + run: pip install $(grep -v 'torch==' requirements/build.txt | grep -v '^#' | tr '\n' ' ') + + - name: Install vLLM Project and Dev Dependencies + env: + VLLM_TARGET_DEVICE: cpu + USE_CUDA: "OFF" + CUDA_VISIBLE_DEVICES: "" + FORCE_CUDA: "0" + run: pip install -e ".[dev]" + + - uses: pre-commit/action@v3.0.1 with: - extra_args: --all-files --hook-stage manual + extra_args: --all-files --hook-stage manual \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 3314f05fd2a..85af9eef87b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) set(ignoreMe "${VLLM_PYTHON_PATH}") # -# Supported python versions. These versions will be searched in order, the -# first match will be selected. These should be kept in sync with setup.py. +# Supported python versions. These versions will be searched in order, the +# first match will be selected. These should be kept in sync with setup.py. # set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") @@ -43,7 +43,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # rather than an error. # # Note: the CUDA torch version is derived from pyproject.toml and various -# requirements.txt files and should be kept consistent. The ROCm torch +# requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") @@ -66,10 +66,40 @@ endif() # append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + message(STATUS "VLLM: Building for CPU. 
Skipping all CUDA/HIP related configuration.") + + # For CPU builds, we need to get torch include directories without cpp_extension + # Get torch installation path and construct include paths manually + execute_process( + COMMAND ${Python_EXECUTABLE} -c "import torch; import os; print(os.path.dirname(torch.__file__), end='')" + OUTPUT_VARIABLE TORCH_INSTALL_PATH + RESULT_VARIABLE TORCH_PATH_RESULT + ) + + if(NOT TORCH_PATH_RESULT EQUAL 0) + message(FATAL_ERROR "Failed to get torch installation path") + endif() + + # Construct torch include directories manually + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PATH}/include;${TORCH_INSTALL_PATH}/include/torch/csrc/api/include") + + message(STATUS "Torch installation path: ${TORCH_INSTALL_PATH}") + message(STATUS "Torch include directories: ${TORCH_INCLUDE_DIRS}") + + # Include the CPU extension cmake and return early + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) + return() +endif() + +# The following blocks will ONLY be processed if VLLM_TARGET_DEVICE is NOT "cpu" +# (i.e., it's "cuda" or "rocm" or potentially some other future GPU target) + # Ensure the 'nvcc' command is in the PATH +# This block is now outside the "if cpu" condition, so it only runs for GPU builds. find_program(NVCC_EXECUTABLE nvcc) if (CUDA_FOUND AND NOT NVCC_EXECUTABLE) - message(FATAL_ERROR "nvcc not found") + message(FATAL_ERROR "nvcc not found") endif() # @@ -78,21 +108,9 @@ endif() # so there is no need to do this explicitly with check_language/enable_language, # etc. # +# This find_package(Torch REQUIRED) call only happens for GPU builds now. find_package(Torch REQUIRED) -# -# Forward the non-CUDA device extensions to external CMake scripts. -# -if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND - NOT VLLM_TARGET_DEVICE STREQUAL "rocm") - if (VLLM_TARGET_DEVICE STREQUAL "cpu") - include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) - else() - return() - endif() - return() -endif() - # # Set up GPU language and check the torch version and warn if it isn't # what is expected. @@ -119,6 +137,9 @@ elseif(HIP_FOUND) "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() + # This FATAL_ERROR will now only trigger if VLLM_TARGET_DEVICE is something + # that requires a GPU (like "cuda" or "rocm") but neither CUDA nor HIP are found. + # It will NOT trigger for "cpu" anymore. 
message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() @@ -204,11 +225,12 @@ endif() set(VLLM_CUMEM_EXT_SRC "csrc/cumem_allocator.cpp") -set_gencode_flags_for_srcs( - SRCS "${VLLM_CUMEM_EXT_SRC}" - CUDA_ARCHS "${CUDA_ARCHS}") - +# --- MODIFICATION: Make cumem_allocator conditional --- if(VLLM_GPU_LANG STREQUAL "CUDA") + set_gencode_flags_for_srcs( + SRCS "${VLLM_CUMEM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + message(STATUS "Enabling cumem allocator extension.") # link against cuda driver library list(APPEND CUMEM_LIBS CUDA::cuda_driver) @@ -222,6 +244,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") WITH_SOABI) endif() + # # _C extension # @@ -246,6 +269,7 @@ set(VLLM_EXT_SRC "csrc/custom_all_reduce.cu" "csrc/torch_bindings.cpp") +# --- MODIFICATION: Wrap all CUDA-specific source additions and FetchContent --- if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") @@ -303,13 +327,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS - "csrc/quantization/fp8/fp8_marlin.cu" - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" - "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" - "csrc/quantization/gptq_marlin/gptq_marlin.cu" - "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" - "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") + "csrc/quantization/fp8/fp8_marlin.cu" + "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" + "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" + "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" + "csrc/quantization/gptq_marlin/gptq_marlin.cu" + "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" + "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" CUDA_ARCHS "${MARLIN_ARCHS}") @@ -317,15 +341,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() # Only build AllSpark kernels if we are building for at least some compatible archs. 
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") if (ALLSPARK_ARCHS) set(ALLSPARK_SRCS - "csrc/quantization/gptq_allspark/allspark_repack.cu" - "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") + "csrc/quantization/gptq_allspark/allspark_repack.cu" + "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") set_gencode_flags_for_srcs( SRCS "${ALLSPARK_SRCS}" CUDA_ARCHS "${ALLSPARK_ARCHS}") @@ -333,7 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") else() message(STATUS "Not building AllSpark kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() @@ -343,11 +367,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -359,12 +383,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running FP8 quantized models on " - "Hopper.") + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") else() message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -387,12 +411,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or " - "later if you intend on running FP8 quantized models on " - "Blackwell.") + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") else() message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -414,10 +438,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (SCALED_MM_3X_ARCHS) message(STATUS "Not building scaled_mm_c2x as all archs are already built" - " for and covered by scaled_mm_c3x") + " for and covered by scaled_mm_c3x") else() message(STATUS "Not building scaled_mm_c2x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -438,11 +462,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x 
kernels as CUDA Compiler version is " - "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " - "if you intend on running FP8 sparse quantized models on Hopper.") + "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " + "if you intend on running FP8 sparse quantized models on Hopper.") else() message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -492,7 +516,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" - "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -502,11 +526,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " - "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " - "if you intend on running FP8 quantized MoE models on Hopper.") + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper.") else() message(STATUS "Not building grouped_mm_c3x as no compatible archs found " - "in CUDA target architectures") + "in CUDA target architectures") endif() endif() @@ -571,35 +595,57 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running w4a16 quantized models on " - "Hopper.") + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running w4a16 quantized models on " + "Hopper.") else() message(STATUS "Not building Machete kernels as no compatible archs " - "found in CUDA target architectures") + "found in CUDA target architectures") endif() endif() -# if CUDA endif -endif() +endif() # End of if(VLLM_GPU_LANG STREQUAL "CUDA") for _C extension sources + message(STATUS "Enabling C extension.") -define_gpu_extension_target( - _C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_EXT_SRC} - COMPILE_FLAGS ${VLLM_GPU_FLAGS} - ARCHITECTURES ${VLLM_GPU_ARCHES} - INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} - INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} - USE_SABI 3 - WITH_SOABI) +# --- MODIFICATION: Make _C extension target conditional --- +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + # For CPU, define a C++ extension with no GPU-specific sources or flags + # You might need to adjust VLLM_EXT_SRC for CPU-only files here if any exist + # For now, we'll assume torch_bindings.cpp is the main one + set(VLLM_EXT_SRC "csrc/torch_bindings.cpp") # Only C++ sources + define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE CXX # Use CXX language for CPU + SOURCES ${VLLM_EXT_SRC} + # No GPU specific flags or architectures for CPU + USE_SABI 3 + WITH_SOABI) +else() + # Original logic for GPU targets + define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES 
${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + USE_SABI 3 + WITH_SOABI) +endif() + # If CUTLASS is compiled on NVCC >= 12.5, it by default uses # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. -target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +# --- MODIFICATION: Make this conditional for CUDA --- +if(VLLM_GPU_LANG STREQUAL "CUDA") + target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) +endif() + # # _moe_C extension @@ -610,15 +656,14 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") +# --- MODIFICATION: Wrap all MoE specific CUDA source additions and build --- if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") -endif() -set_gencode_flags_for_srcs( - SRCS "${VLLM_MOE_EXT_SRC}" - CUDA_ARCHS "${CUDA_ARCHS}") + set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") -if(VLLM_GPU_LANG STREQUAL "CUDA") set(VLLM_MOE_WNA16_SRC "csrc/moe/moe_wna16.cu") @@ -677,20 +722,37 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - " in CUDA target architectures") + " in CUDA target architectures") endif() -endif() +endif() # End of if(VLLM_GPU_LANG STREQUAL "CUDA") for moe + message(STATUS "Enabling moe extension.") -define_gpu_extension_target( - _moe_C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_MOE_EXT_SRC} - COMPILE_FLAGS ${VLLM_GPU_FLAGS} - ARCHITECTURES ${VLLM_GPU_ARCHES} - USE_SABI 3 - WITH_SOABI) +# --- MODIFICATION: Make _moe_C extension target conditional --- +if (VLLM_TARGET_DEVICE STREQUAL "cpu") + # For CPU, define a C++ extension for MoE with only C++ sources + set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp") # Only C++ sources + define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE CXX # Use CXX language for CPU + SOURCES ${VLLM_MOE_EXT_SRC} + USE_SABI 3 + WITH_SOABI) +else() + # Original logic for GPU targets + define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) +endif() + + if(VLLM_GPU_LANG STREQUAL "HIP") # @@ -713,6 +775,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") endif() # For CUDA we also build and ship some external projects. 
+# --- MODIFICATION: Make external projects conditional --- if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/vllm_flash_attn.cmake) diff --git a/pyproject.toml b/pyproject.toml index b5f1039b44d..22728ad669d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Information Analysis", ] requires-python = ">=3.9,<3.13" -dynamic = [ "version", "dependencies", "optional-dependencies"] +dynamic = [ "version", "dependencies"] [project.urls] Homepage="https://github.com/vllm-project/vllm" @@ -41,6 +41,13 @@ Slack="http://slack.vllm.ai/" [project.scripts] vllm = "vllm.entrypoints.cli.main:main" +[project.optional-dependencies] +dev = [ + "types-setuptools", # Required by MyPy for pkg_resources module + # Other development-specific tools might go here too, + # e.g., "mypy", "ruff", "pre-commit" if not otherwise managed. +] + [tool.setuptools_scm] # no extra settings needed, presence enables setuptools-scm diff --git a/setup.py b/setup.py index a1867960e59..12d78cbdd6c 100755 --- a/setup.py +++ b/setup.py @@ -149,6 +149,12 @@ def configure(self, ext: CMakeExtension) -> None: '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), ] + if VLLM_TARGET_DEVICE == "cpu": + cmake_args.append("-DUSE_CUDA=OFF") + cmake_args.append("-DBUILD_CUDA_LIBS=OFF") + cmake_args.append("-DUSE_CUDNN=OFF") + cmake_args.append("-DTORCH_CUDA_ARCH_LIST=NoCUDA") + verbose = envs.VERBOSE if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] @@ -523,7 +529,10 @@ def get_nvcc_cuda_version() -> Version: Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - assert CUDA_HOME is not None, "CUDA_HOME is not set" + if VLLM_TARGET_DEVICE == "cpu": + return Version("0.0") + + assert CUDA_HOME is not None, "CUDA_HOME is not set for a CUDA/HIP build target." nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() @@ -585,8 +594,9 @@ def get_vllm_version() -> str: elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): - if envs.VLLM_TARGET_DEVICE == "cpu": - version += f"{sep}cpu" + # For CPU builds, we don't append a suffix to the version. + # The standard PyPI `torch` package is CPU-only by default. 
+ pass # Do not append +cpu to the version string elif _is_xpu(): version += f"{sep}xpu" else: diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 76da63c5800..61aa3b1092d 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -80,7 +80,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -166,7 +167,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -218,7 +220,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -226,7 +229,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 91afc88ef3d..52c78e8d9f7 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -67,7 +67,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -151,7 +152,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -197,7 +199,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -205,7 +208,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( prev_args_json, cur_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 57d7c77c64f..59ac36cd23b 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -133,7 +133,8 @@ def extract_tool_calls_streaming( delta = None # first time to get parameters elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) arguments_delta = cur_arguments_json[:cur_arguments_json. index(delta_text) + @@ -148,8 +149,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta # both prev and cur parameters, send the increase parameters elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) argument_diff = extract_intermediate_diff( cur_args_json, prev_args_json) @@ -190,7 +193,8 @@ def extract_tool_calls( action_dict = json.loads(action) name, parameters = action_dict['name'], json.dumps( action_dict.get('parameters', action_dict.get('arguments', - {}))) + {})), + ensure_ascii=False) if not tools or name not in [t.function.name for t in tools]: ExtractedToolCallInformation(tools_called=False, diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 8df106bf271..50fed9baf8f 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -96,8 +96,9 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]))) - for function_call in raw_function_calls + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), + )) for function_call in raw_function_calls ] content = model_output[:model_output. 
@@ -187,7 +188,7 @@ def extract_tool_calls_streaming( diff: Union[str, None] = current_tool_call.get("arguments") if diff: - diff = json.dumps(diff).replace( + diff = json.dumps(diff, ensure_ascii=False).replace( self.streamed_args_for_tool[self.current_tool_id], "") delta = DeltaMessage(tool_calls=[ @@ -248,7 +249,8 @@ def extract_tool_calls_streaming( "mid-arguments") delta = None elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) logger.debug("finding %s in %s", new_text, cur_arguments_json) @@ -267,8 +269,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) logger.debug("Searching for diff between \n%s\n%s", cur_args_json, prev_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 5c181616aa0..9dbd7efdc44 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -88,7 +88,8 @@ def extract_tool_calls( # function call args are JSON but as a string arguments=json.dumps(raw_function_call["arguments"] \ if "arguments" in raw_function_call \ - else raw_function_call["parameters"]))) + else raw_function_call["parameters"], + ensure_ascii=False))) for raw_function_call in function_call_arr ] @@ -174,7 +175,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -226,7 +228,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -234,7 +237,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 668776a832e..084f7acb5d8 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -79,10 +79,11 @@ def extract_tool_calls( name=raw_function_call["name"], # function call args are JSON but as a string arguments=json.dumps( - raw_function_call["arguments"] if "arguments" in - raw_function_call else - raw_function_call["parameters"]))) - for raw_function_call in function_call_arr + raw_function_call["arguments"] + if "arguments" in raw_function_call else + raw_function_call["parameters"], + ensure_ascii=False), + )) for raw_function_call in function_call_arr ] # get any content before the tool call diff --git 
a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 9f141d6b334..e795eb3fa8c 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -200,9 +200,12 @@ def _handle_single_tool(call: ast.Call) -> ToolCall: arguments = {} for keyword in call.keywords: arguments[keyword.arg] = _get_parameter_value(keyword.value) - return ToolCall(type="function", - function=FunctionCall(name=function_name, - arguments=json.dumps(arguments))) + return ToolCall( + type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments, + ensure_ascii=False)), + ) def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
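
A note on the CPU-only CMake path above: the execute_process call asks the Python interpreter where torch is installed and then assembles the include directories by hand, because the patch explicitly avoids torch.utils.cpp_extension for CPU builds. Below is a minimal Python sketch of the same lookup, assuming a CPU-only torch wheel is installed in the active environment; it is an illustration, not code from the patch.

import os

import torch

# Mirror of the CPU branch in CMakeLists.txt: locate the installed torch
# package and build the two include paths that make up TORCH_INCLUDE_DIRS.
torch_root = os.path.dirname(torch.__file__)
torch_include_dirs = [
    os.path.join(torch_root, "include"),
    os.path.join(torch_root, "include", "torch", "csrc", "api", "include"),
]

# CMake consumes a semicolon-separated list; printing with no trailing
# newline mirrors the `end=''` in the embedded one-liner, so no stray
# newline leaks into the CMake variable.
print(";".join(torch_include_dirs), end="")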
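
The recurring change across the tool parsers is adding ensure_ascii=False to every json.dumps call that serializes tool-call arguments. With the default ensure_ascii=True, non-ASCII characters are escaped as \uXXXX, which both mangles the argument text returned to clients and changes the string lengths the streaming code relies on when it slices out cur_args_json[sent:] or compares prefixes. A small standalone illustration (plain standard library, not vLLM code):

import json

# Example tool-call arguments containing non-ASCII text.
arguments = {"city": "東京", "unit": "celsius"}

escaped = json.dumps(arguments)                       # default: ensure_ascii=True
readable = json.dumps(arguments, ensure_ascii=False)  # what the patch switches to

print(escaped)   # {"city": "\u6771\u4eac", "unit": "celsius"}
print(readable)  # {"city": "東京", "unit": "celsius"}

# The streaming parsers remember how many characters of the argument JSON
# have already been sent and emit only the tail (cur_args_json[sent:]).
# Mixing escaped and unescaped dumps would make those offsets and the
# common-prefix comparisons disagree.
print(len(escaped), len(readable))  # 43 33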