26
26
import torch
27
27
28
28
# Hugging Face model identifiers of the pre-trained checkpoints under test.
MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
    "deepseek-ai/DeepSeek-V2-Lite",
]

# lm-eval task name: Grade School Math 8K, a math-reasoning benchmark.
TASK = "gsm8k"

# Answer validation requiring format consistency.
# Baseline accuracy measured after vLLM optimization; test scores are
# compared against this reference value.
EXPECTED_VALUE = 0.316
38
39
+
42
40
def run_test (model_name , queue , more_args = None ):
43
41
model_args = f"pretrained={ model_name } ,max_model_len=4096,trust_remote_code=True"
44
42
if more_args is not None :
@@ -61,9 +59,13 @@ def run_test(model_name, queue, more_args=None):
61
59
def test_lm_eval_accuracy(model, monkeypatch: pytest.MonkeyPatch):
    """Evaluate *model* on the benchmark and check its accuracy.

    The evaluation runs in a separate process (so engine/model state is
    fully torn down afterwards); the measured score comes back through a
    queue and must land within RTOL of EXPECTED_VALUE.
    """
    with monkeypatch.context():
        result_queue: Queue[float] = multiprocessing.Queue()
        worker = multiprocessing.Process(target=run_test,
                                         args=(model, result_queue))
        worker.start()
        worker.join()
        result = result_queue.get()
        # Accept any score within the symmetric tolerance band.
        in_band = EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL
        assert in_band, \
            f"Expected: {EXPECTED_VALUE} ±{RTOL} | Measured: {result}"
0 commit comments