
Commit 7721ef1

[CI/Build][CPU] Fix CPU CI and remove all CPU V0 files (#20560)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
1 parent 8369b7c commit 7721ef1

File tree

9 files changed: +785 additions, −839 deletions

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 12 additions & 12 deletions
@@ -48,10 +48,16 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: disable until supports V1
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+
     pytest -v -s tests/models/language/pooling -m cpu_model
     pytest -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,21 +68,15 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

+  # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
   #   VLLM_USE_V1=0 pytest -s -v \
   #   tests/quantization/test_ipex_quant.py"

-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-    tests/basic_correctness/test_chunked_prefill.py"
-
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
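The commands above select tests with pytest's `-m cpu_model` marker expression and exclude whole files with `--ignore`. As a point of reference only, the sketch below shows how such a custom marker is typically registered so that `-m cpu_model` selection works without unknown-marker warnings; vLLM's actual registration may live in pyproject.toml or pytest.ini rather than this hypothetical conftest.py.

# conftest.py -- illustrative sketch, not vLLM's actual configuration
def pytest_configure(config):
    # Once the marker is registered, `pytest -m cpu_model` collects only the
    # tests (or parametrize cases) carrying @pytest.mark.cpu_model, while
    # `--ignore=<path>` still drops whole files such as test_bart.py.
    config.addinivalue_line(
        "markers",
        "cpu_model: tests expected to pass on CPU-only hosts",
    )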

tests/basic_correctness/test_chunked_prefill.py

Lines changed: 0 additions & 58 deletions
@@ -294,61 +294,3 @@ def test_with_prefix_caching(
         name_0="w/o prefix caching",
         name_1="with prefix caching",
     )
-
-
-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
-@pytest.mark.parametrize("enforce_eager", [False])
-@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
-@pytest.mark.cpu_model
-@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
-def test_models_cpu(
-    hf_runner: HfRunner,
-    vllm_runner: VllmRunner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
-    enforce_eager: bool,
-    attention_backend: str,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    test_models(
-        hf_runner,
-        vllm_runner,
-        example_prompts,
-        model,
-        dtype,
-        max_tokens,
-        chunked_prefill_token_size,
-        enforce_eager,
-        1,
-        attention_backend,
-        monkeypatch,
-    )
-
-
-@pytest.mark.parametrize("max_tokens", [16])
-@pytest.mark.parametrize("enforce_eager", [False])
-@pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
-@pytest.mark.cpu_model
-@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
-def test_with_prefix_caching_cpu(
-    vllm_runner: VllmRunner,
-    max_tokens: int,
-    enforce_eager: bool,
-    chunk_size: int,
-    dtype: str,
-) -> None:
-    test_with_prefix_caching(
-        vllm_runner,
-        max_tokens,
-        enforce_eager,
-        chunk_size,
-        1,
-        dtype,
-    )

tests/models/language/generation/test_common.py

Lines changed: 6 additions & 2 deletions
@@ -39,7 +39,7 @@
     [
         pytest.param(
             "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
         ),
         pytest.param(
             "openai-community/gpt2",  # gpt2
@@ -87,7 +87,11 @@
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
         pytest.param(
             "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
+        ),
+        pytest.param(
+            "Qwen/Qwen1.5-MoE-A2.7B-Chat",
+            marks=[pytest.mark.cpu_model],
         )
     ])
 @pytest.mark.parametrize("max_tokens", [32])

tests/models/language/pooling/test_embedding.py

Lines changed: 11 additions & 12 deletions
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 from typing import Optional

 import pytest
@@ -29,20 +28,24 @@ def v1(run_with_both_engines):
     # [Decoder-only]
     pytest.param("BAAI/bge-multilingual-gemma2",
                  marks=[pytest.mark.core_model]),
-    pytest.param("intfloat/e5-mistral-7b-instruct",
-                 marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+    pytest.param(
+        "intfloat/e5-mistral-7b-instruct",
+        # CPU v1 doesn't support sliding window
+        marks=[pytest.mark.core_model]),
     # the qwen models interfere with each other (see PR
     # https://github.com/vllm-project/vllm/pull/18720).
     # To avoid this problem, for now we skip v0 since it will be
     # deprecated anyway.
     pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                  marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
-    pytest.param("BAAI/bge-base-en-v1.5",
-                 marks=[
-                     pytest.mark.core_model, pytest.mark.cpu_model,
-                     pytest.mark.skip_v1
-                 ]),
+    pytest.param(
+        "BAAI/bge-base-en-v1.5",
+        marks=[
+            # CPU only supports V1
+            pytest.mark.core_model,
+            pytest.mark.skip_v1
+        ]),
     pytest.param("sentence-transformers/all-MiniLM-L12-v2",
                  marks=[pytest.mark.skip_v1]),
     pytest.param("intfloat/multilingual-e5-small",
@@ -61,10 +64,6 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
-    if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
-    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
-        pytest.skip("CPU V1 doesn't support sliding window")
-
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend

tests/models/language/pooling/test_reward.py

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
 import pytest
 import torch
 import torch.nn.functional as F
@@ -84,6 +86,9 @@ def test_prm_models(
     dtype: str,
     monkeypatch,
 ) -> None:
+    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+        pytest.skip("CPU only supports V1")
+
     if current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend

tests/quantization/test_compressed_tensors.py

Lines changed: 2 additions & 1 deletion
@@ -45,7 +45,8 @@ def use_v0_only(monkeypatch):
     """
     This module relies on V0 internals, so set VLLM_USE_V1=0.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    if not current_platform.is_cpu():
+        monkeypatch.setenv('VLLM_USE_V1', '0')


 @pytest.mark.parametrize(
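The fixture now pins V0 only on non-CPU platforms, since the CPU backend is V1-only after this commit. A generic sketch of the pattern follows; the is_cpu_platform helper is a stand-in for current_platform.is_cpu(), and the surrounding names are assumptions rather than vLLM's exact code.

# Generic sketch of a platform-conditional autouse fixture (assumed names).
import pytest


def is_cpu_platform() -> bool:
    # Stand-in for vllm.platforms.current_platform.is_cpu().
    return False


@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Force the V0 engine for every test in the module, except on CPU."""
    if not is_cpu_platform():
        # monkeypatch.setenv is undone automatically after each test, so the
        # environment variable does not leak into other test modules.
        monkeypatch.setenv("VLLM_USE_V1", "0")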
