Skip to content

Commit 6f5df79

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 166d0ef + 4f605a6 commit 6f5df79

File tree

229 files changed

+9758
-6586
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

229 files changed

+9758
-6586
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
2+
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.335
8+
- name: "exact_match,flexible-extract"
9+
value: 0.323
10+
limit: 1319
11+
num_fewshot: 5
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
2+
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.54
8+
- name: "exact_match,flexible-extract"
9+
value: 0.59
10+
limit: 1319
11+
num_fewshot: 5
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
2+
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.47
8+
- name: "exact_match,flexible-extract"
9+
value: 0.64
10+
limit: 1319
11+
num_fewshot: 5

.buildkite/lm-eval-harness/configs/models-large.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
33
Mixtral-8x7B-Instruct-v0.1.yaml
44
Qwen2-57B-A14-Instruct.yaml
55
DeepSeek-V2-Lite-Chat.yaml
6+
Meta-Llama-3-8B-QQQ.yaml
Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
1-
Meta-Llama-3-8B-Instruct.yaml
2-
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
1+
Qwen2.5-1.5B-Instruct.yaml
32
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
43
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
54
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
6-
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
5+
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
76
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
8-
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
9-
Qwen2-1.5B-Instruct-FP8W8.yaml
10-
Meta-Llama-3-8B-QQQ.yaml
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
from pathlib import Path
3+
4+
import pytest
5+
6+
7+
def pytest_addoption(parser):
    """Register the lm-eval harness command-line options with pytest.

    Adds ``--config-list-file`` (path to a text file listing model config
    YAMLs, one per line) and ``--tp-size`` (tensor parallel size, default
    ``"1"``) so the test session can be parametrized from the CLI.
    """
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)")
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation")
16+
17+
18+
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    """Resolve --config-list-file relative to the config directory.

    NOTE(review): relies on a ``config_dir`` fixture defined elsewhere
    (not visible in this file) — confirm it exists in the test package.
    """
    return config_dir / pytestconfig.getoption("--config-list-file")
22+
23+
24+
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Expose the --tp-size CLI option (a string, default "1") as a fixture."""
    return pytestconfig.getoption("--tp-size")
27+
28+
29+
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from the --config-list-file option.

    Reads the list file (one YAML filename per line, resolved relative to
    the list file's own directory) and parametrizes any test that requests
    the ``config_filename`` fixture with one ``Path`` per entry.

    Blank lines and comment lines are skipped.  The original check used
    ``line.startswith("#")`` on the raw line, so a comment with leading
    whitespace slipped through and became a bogus config path; comparing
    against the stripped line fixes that (and avoids stripping twice).
    """
    if "config_filename" not in metafunc.fixturenames:
        return
    rel_path = metafunc.config.getoption("--config-list-file")
    config_list_file = Path(rel_path).resolve()
    config_dir = config_list_file.parent
    with open(config_list_file, encoding="utf-8") as f:
        configs = [
            config_dir / stripped for line in f
            if (stripped := line.strip()) and not stripped.startswith("#")
        ]
    metafunc.parametrize("config_filename", configs)

.buildkite/lm-eval-harness/run-tests.sh

Lines changed: 0 additions & 59 deletions
This file was deleted.

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 11 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,67 +3,48 @@
33
LM eval harness on model to compare vs HF baseline computed offline.
44
Configs are found in configs/$MODEL.yaml
55
6-
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
7-
* export LM_EVAL_TP_SIZE=4
8-
* pytest -s test_lm_eval_correctness.py
6+
pytest -s -v test_lm_eval_correctness.py \
7+
--config-list-file=configs/models-small.txt \
8+
--tp-size=1
99
"""
1010

11-
import os
12-
from pathlib import Path
13-
1411
import lm_eval
15-
import numpy
16-
import pytest
12+
import numpy as np
1713
import yaml
1814

1915
RTOL = 0.08
20-
TEST_DATA_FILE = os.environ.get(
21-
"LM_EVAL_TEST_DATA_FILE",
22-
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
23-
24-
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
2516

2617

def launch_lm_eval(eval_config, tp_size):
    """Run lm-eval on the vLLM backend and return the raw results dict.

    *eval_config* supplies the model name, tasks, few-shot count and limit
    (parsed from a YAML config); *tp_size* is forwarded as the vLLM
    ``tensor_parallel_size``.  ``trust_remote_code`` defaults to False when
    absent from the config.
    """
    remote_code_ok = eval_config.get('trust_remote_code', False)
    arg_parts = [
        f"pretrained={eval_config['model_name']}",
        f"tensor_parallel_size={tp_size}",
        "enforce_eager=true",
        "add_bos_token=true",
        f"trust_remote_code={remote_code_ok}",
    ]
    return lm_eval.simple_evaluate(
        model="vllm",
        model_args=",".join(arg_parts),
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto")
4433

4534

def test_lm_eval_correctness_param(config_filename, tp_size):
    """Check vLLM lm-eval scores against the ground truth in the config.

    Loads the YAML at *config_filename*, runs the evaluation at *tp_size*,
    prints every metric (even on failure, for debugging), and only asserts
    at the end so all mismatches are visible in the log.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)

    success = True
    for task in eval_config["tasks"]:
        task_scores = results["results"][task["name"]]
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = task_scores[metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            if not np.isclose(ground_truth, measured_value, rtol=RTOL):
                success = False
    assert success
Lines changed: 75 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22

3-
set -xue
3+
set -xu
44

55
# Build the docker image.
66
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
@@ -24,31 +24,80 @@ docker run --privileged --net host --shm-size=16G -it \
2424
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
2525
&& echo HARDWARE \
2626
&& tpu-info \
27-
&& echo TEST_0 \
28-
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
29-
&& echo TEST_1 \
30-
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
31-
&& echo TEST_2 \
32-
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
33-
&& echo TEST_3 \
34-
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
35-
&& echo TEST_4 \
36-
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
37-
&& echo TEST_5 \
38-
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
39-
&& echo TEST_6 \
40-
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
41-
&& echo TEST_7 \
42-
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
43-
&& echo TEST_8 \
44-
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
45-
&& echo TEST_9 \
46-
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
47-
&& echo TEST_10 \
48-
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
49-
&& echo TEST_11 \
50-
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
51-
27+
&& { \
28+
echo TEST_0: Running test_perf.py; \
29+
pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
30+
echo TEST_0_EXIT_CODE: \$?; \
31+
} & \
32+
&& { \
33+
echo TEST_1: Running test_compilation.py; \
34+
pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
35+
echo TEST_1_EXIT_CODE: \$?; \
36+
} & \
37+
{ \
38+
echo TEST_2: Running test_basic.py; \
39+
pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
40+
echo TEST_2_EXIT_CODE: \$?; \
41+
} & \
42+
{ \
43+
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
44+
pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
45+
echo TEST_3_EXIT_CODE: \$?; \
46+
} & \
47+
{ \
48+
echo TEST_4: Running test_quantization_accuracy.py; \
49+
pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
50+
echo TEST_4_EXIT_CODE: \$?; \
51+
} & \
52+
{ \
53+
echo TEST_5: Running examples/offline_inference/tpu.py; \
54+
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
55+
echo TEST_5_EXIT_CODE: \$?; \
56+
} & \
57+
{ \
58+
echo TEST_6: Running test_tpu_model_runner.py; \
59+
pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
60+
echo TEST_6_EXIT_CODE: \$?; \
61+
} & \
62+
&& { \
63+
echo TEST_7: Running test_sampler.py; \
64+
pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
65+
echo TEST_7_EXIT_CODE: \$?; \
66+
} & \
67+
&& { \
68+
echo TEST_8: Running test_topk_topp_sampler.py; \
69+
pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
70+
echo TEST_8_EXIT_CODE: \$?; \
71+
} & \
72+
&& { \
73+
echo TEST_9: Running test_multimodal.py; \
74+
pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
75+
echo TEST_9_EXIT_CODE: \$?; \
76+
} & \
77+
&& { \
78+
echo TEST_10: Running test_pallas.py; \
79+
pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
80+
echo TEST_10_EXIT_CODE: \$?; \
81+
} & \
82+
&& { \
83+
echo TEST_11: Running test_struct_output_generate.py; \
84+
pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
85+
echo TEST_11_EXIT_CODE: \$?; \
86+
} & \
87+
&& { \
88+
echo TEST_12: Running test_moe_pallas.py; \
89+
pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
90+
echo TEST_12_EXIT_CODE: \$?; \
91+
} & \
92+
# Disable the TPU LoRA tests until the feature is activated
93+
# && { \
94+
# echo TEST_13: Running test_moe_pallas.py; \
95+
# pytest -s -v /workspace/vllm/tests/tpu/lora/; \
96+
# echo TEST_13_EXIT_CODE: \$?; \
97+
# } & \
98+
wait \
99+
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
100+
"
52101

53102
# TODO: This test fails because it uses RANDOM_SEED sampling
54103
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ steps:
419419
- vllm/model_executor/layers/quantization
420420
commands:
421421
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
422-
- bash ./run-tests.sh -c configs/models-small.txt -t 1
422+
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
423423

424424
- label: OpenAI API correctness
425425
source_file_dependencies:
@@ -725,4 +725,4 @@ steps:
725725
- vllm/model_executor/layers/quantization
726726
commands:
727727
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
728-
- bash ./run-tests.sh -c configs/models-large.txt -t 4
728+
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

0 commit comments

Comments
 (0)