 import gc
 import multiprocessing
+import sys
 from multiprocessing import Queue

 import lm_eval
 import pytest
 import torch

 # Pre-trained model paths on Hugging Face.
-MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
-# Math reasoning benchmark (Grade School Math 8K).
-TASK = "gsm8k"
+MODEL_NAME = ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct"]
+# Benchmark configuration mapping each model to its evaluation task:
+# - Text model: GSM8K (grade-school math reasoning)
+# - Vision-language model: MMMU Art & Design validation split (multimodal understanding)
+TASK = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "gsm8k",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "mmmu_val_art_and_design",
+}
 # Per-model metric/filter key used to read the score from the results dict.
-FILTER = "exact_match,strict-match"
+FILTER = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "exact_match,strict-match",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "acc,none",
+}
 # Tolerance for the accuracy check (applied below as an absolute ±0.03 band).
 RTOL = 0.03
 # Expected baseline accuracy for each model under vLLM.
-EXPECTED_VALUE = 0.316
+EXPECTED_VALUE = {
+    "Qwen/Qwen2.5-0.5B-Instruct": 0.316,
+    "Qwen/Qwen2.5-VL-3B-Instruct": 0.541,
+}
+# Maximum context length for each model.
+MAX_MODEL_LEN = {
+    "Qwen/Qwen2.5-0.5B-Instruct": 4096,
+    "Qwen/Qwen2.5-VL-3B-Instruct": 8192,
+}
+# lm-eval backend for each model: "vllm" (text-only) or "vllm-vlm" (vision-language).
+MODEL_TYPE = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "vllm",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "vllm-vlm",
+}
+# Whether to wrap prompts in a chat-style template.
+APPLY_CHAT_TEMPLATE = {"vllm": False, "vllm-vlm": True}
+# Whether to present few-shot examples as a multi-turn dialogue.
+FEWSHOT_AS_MULTITURN = {"vllm": False, "vllm-vlm": True}

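The six parallel lookup tables above all key on the model name, so a missing entry only surfaces at runtime. A single per-model table would make that coupling explicit; a minimal sketch (the ModelConfig name and its fields are illustrative, not part of this patch):

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class ModelConfig:  # hypothetical consolidation of the dicts above
        task: str
        filter_key: str      # lm-eval "metric,filter" key to read from results
        expected: float      # baseline accuracy
        max_model_len: int
        model_type: str      # "vllm" (text-only) or "vllm-vlm" (vision-language)

    CONFIGS = {
        "Qwen/Qwen2.5-0.5B-Instruct": ModelConfig(
            task="gsm8k", filter_key="exact_match,strict-match",
            expected=0.316, max_model_len=4096, model_type="vllm"),
        "Qwen/Qwen2.5-VL-3B-Instruct": ModelConfig(
            task="mmmu_val_art_and_design", filter_key="acc,none",
            expected=0.541, max_model_len=8192, model_type="vllm-vlm"),
    }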
-def run_test(queue, more_args=None):
-    model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
-    if more_args is not None:
-        model_args = f"{model_args},{more_args}"
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=model_args,
-        tasks=TASK,
-        batch_size="auto",
-    )
-    result = results["results"][TASK][FILTER]
-    print("result:", result)
-    queue.put(result)
-    del results
-    torch.npu.empty_cache()
-    gc.collect()
+def run_test(queue, model, max_model_len, model_type):
+    try:
+        if model_type == "vllm-vlm":
+            # Vision-language runs cap the number of images per prompt.
+            model_args = (f"pretrained={model},max_model_len={max_model_len},"
+                          "dtype=auto,max_images=2")
+        else:
+            model_args = (f"pretrained={model},max_model_len={max_model_len},"
+                          "dtype=auto")
+        results = lm_eval.simple_evaluate(
+            model=model_type,
+            model_args=model_args,
+            tasks=TASK[model],
+            batch_size="auto",
+            apply_chat_template=APPLY_CHAT_TEMPLATE[model_type],
+            fewshot_as_multiturn=FEWSHOT_AS_MULTITURN[model_type],
+        )
+        result = results["results"][TASK[model]][FILTER[model]]
+        print("result:", result)
+        queue.put(result)
+    except Exception as e:
+        # Hand the exception to the parent process so the test can report it.
+        queue.put(e)
+        sys.exit(1)
+    finally:
+        gc.collect()
+        torch.npu.empty_cache()


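Because run_test puts the caught exception itself on the queue, the parent process can tell a failure apart from a score before asserting; otherwise the assert below would compare an Exception against a float and fail with a confusing message. A minimal sketch of such a guard (get_result_or_raise is a hypothetical helper, not part of this patch):

    def get_result_or_raise(queue, timeout=None):
        # Re-raise a child-process failure in the parent so pytest
        # reports the real error instead of a type-mismatch assert.
        result = queue.get(timeout=timeout)
        if isinstance(result, Exception):
            raise result
        return result

test_lm_eval_accuracy could then call get_result_or_raise(result_queue) in place of result_queue.get().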
-def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context():
+@pytest.mark.parametrize("model", MODEL_NAME)
+@pytest.mark.parametrize("VLLM_USE_V1", ["0", "1"])
+def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch, model, VLLM_USE_V1):
+    if model == "Qwen/Qwen2.5-VL-3B-Instruct" and VLLM_USE_V1 == "1":
+        pytest.skip(
+            "Qwen2.5-VL-3B-Instruct is not supported when VLLM_USE_V1=1")
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", VLLM_USE_V1)
         result_queue: Queue[float] = multiprocessing.Queue()
-        p = multiprocessing.Process(target=run_test, args=(result_queue, ))
+        p = multiprocessing.Process(target=run_test,
+                                    args=(result_queue, model,
+                                          MAX_MODEL_LEN[model],
+                                          MODEL_TYPE[model]))
         p.start()
         p.join()
         result = result_queue.get()
-        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
-            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"
+        print(result)
+        assert (EXPECTED_VALUE[model] - RTOL < result < EXPECTED_VALUE[model] + RTOL), \
+            f"Expected: {EXPECTED_VALUE[model]}±{RTOL} | Measured: {result}"
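One caveat on the tolerance: RTOL is applied above as an absolute ±0.03 band around each expected value, which is roughly ±9.5% of the 0.316 GSM8K baseline rather than 3%. If a genuinely relative 3% tolerance is intended, math.isclose from the standard library expresses it directly (a sketch, not part of this patch):

    import math

    # Passes iff |result - expected| <= RTOL * max(|result|, |expected|).
    assert math.isclose(result, EXPECTED_VALUE[model], rel_tol=RTOL), \
        f"Expected: {EXPECTED_VALUE[model]} ±{RTOL:.0%} (relative) | Measured: {result}"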