
Commit e202dd2

[V0 deprecation] Remove V0 CPU/XPU/TPU backends (vllm-project#20412)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
1 parent 43813e6 commit e202dd2

File tree

20 files changed (+46, -5034 lines changed)

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 4 additions & 4 deletions
@@ -66,10 +66,10 @@ function cpu_tests() {
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    VLLM_USE_V1=0 pytest -s -v \
-      tests/quantization/test_ipex_quant.py"
+  # docker exec cpu-test-"$NUMA_NODE" bash -c "
+  #   set -e
+  #   VLLM_USE_V1=0 pytest -s -v \
+  #     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
   docker exec cpu-test-"$NUMA_NODE" bash -c "

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 0 additions & 2 deletions
@@ -26,7 +26,5 @@ docker run \
   --name "${container_name}" \
   "${image_name}" \
   sh -c '
-    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
     VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   '

examples/online_serving/chart-helm/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ image:
   # -- Image tag
   tag: "latest"
   # -- Container launch command
-  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
+  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
 
 # -- Container port
 containerPort: 8000

tests/kernels/attention/test_attention_selector.py

Lines changed: 10 additions & 7 deletions
@@ -36,7 +36,8 @@ def clear_cache():
 DEVICE_MLA_BLOCK_SIZES = {
     "cuda": [16, 64],  # CUDA supports both standard and extended block sizes
     "hip": [16, 1],  # HIP requires special handling for block_size=1
-    "cpu": [16]  # CPU uses fixed block size from test cases
+    # "cpu": [16]  # CPU uses fixed block size from test cases
+    "cpu": []  # FIXME(woosuk): Temporarily disable CPU tests
 }
 
@@ -81,14 +82,14 @@ def test_env(
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
 
         if device == "cpu":
+            if not use_v1:
+                pytest.skip("CPU backend only supports V1")
+
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, torch.float16,
                                            block_size, False)
-            if use_v1:
-                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
-            else:
-                assert backend.get_name() == "TORCH_SDPA"
+            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
 
         elif device == "hip":
             with patch("vllm.attention.selector.current_platform",
 
@@ -193,12 +194,14 @@ def test_fp32_fallback(
         m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
 
         if device == "cpu":
+            if not use_v1:
+                pytest.skip("CPU backend only supports V1")
+
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float32, torch.float32,
                                            16, False)
-            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
-                    if use_v1 else "TORCH_SDPA")
+            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
 
         elif device == "cuda":
             with patch("vllm.attention.selector.current_platform",
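
For reference, the CPU branch of these tests now reduces to roughly the following self-contained sketch (assuming the imports the test file already uses, CpuPlatform from vllm.platforms.cpu and get_attn_backend from vllm.attention.selector; the test name is illustrative and not part of the diff):

import pytest
import torch
from unittest.mock import patch

from vllm.attention.selector import get_attn_backend
from vllm.platforms.cpu import CpuPlatform


@pytest.mark.parametrize("use_v1", [True, False])
def test_cpu_attn_backend_is_v1_only(use_v1, monkeypatch):
    # V0 was removed for CPU, so the V0 case is skipped rather than asserted.
    if not use_v1:
        pytest.skip("CPU backend only supports V1")
    monkeypatch.setenv("VLLM_USE_V1", "1")
    with patch("vllm.attention.selector.current_platform", CpuPlatform()):
        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
    # The only remaining CPU attention backend is the V1 Torch SDPA backend.
    assert backend.get_name() == "TORCH_SDPA_VLLM_V1"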

vllm/attention/backends/cpu_mla.py

Lines changed: 0 additions & 307 deletions
This file was deleted.
