@@ -350,13 +350,13 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
         "dtype": "float16",
 
         # Main model
-        "model_name": "meta-llama/Llama-2-7b-chat-hf"
+        "model_name": "vllm-ascend/Llama-2-7b-chat-hf"
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
     {
-        "speculative_model": "yuhuili/EAGLE-llama2-chat-7B",
+        "speculative_model": "vllm-ascend/EAGLE-llama2-chat-7B",
         "num_speculative_tokens": MAX_SPEC_TOKENS,
     },
 ])
@@ -368,21 +368,25 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
 ])
 @pytest.mark.parametrize("batch_size", [1, 5])
 @pytest.mark.parametrize("seed", [1])
-def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+def test_llama2_eagle_e2e_greedy_correctness(monkeypatch: pytest.MonkeyPatch,
+                                             vllm_runner, common_llm_kwargs,
                                              per_test_common_llm_kwargs,
                                              baseline_llm_kwargs,
                                              test_llm_kwargs, batch_size: int,
                                              output_len: int, seed: int):
 
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
+    # TODO: this is the wrong way to use modelscope; fix it.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        run_equality_correctness_test(vllm_runner,
+                                      common_llm_kwargs,
+                                      per_test_common_llm_kwargs,
+                                      baseline_llm_kwargs,
+                                      test_llm_kwargs,
+                                      batch_size,
+                                      output_len,
+                                      seed,
+                                      temperature=0.0)
 
 
 @pytest.mark.skipif(True, reason="Open it when CI could use modelscope")
@@ -399,13 +403,13 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
         "dtype": "float16",
 
         # Main model
-        "model_name": "meta-llama/Meta-Llama-3-8B-Instruct"
+        "model_name": "vllm-ascend/Meta-Llama-3-8B-Instruct"
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
     {
-        "speculative_model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+        "speculative_model": "vllm-ascend/EAGLE-LLaMA3-Instruct-8B",
         "num_speculative_tokens": MAX_SPEC_TOKENS,
     },
 ])
@@ -417,21 +421,25 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 ])
 @pytest.mark.parametrize("batch_size", [1, 5])
 @pytest.mark.parametrize("seed", [1])
-def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+def test_llama3_eagle_e2e_greedy_correctness(monkeypatch: pytest.MonkeyPatch,
+                                             vllm_runner, common_llm_kwargs,
                                              per_test_common_llm_kwargs,
                                              baseline_llm_kwargs,
                                              test_llm_kwargs, batch_size: int,
                                              output_len: int, seed: int):
 
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
+    # TODO: this is the wrong way to use modelscope; fix it.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        run_equality_correctness_test(vllm_runner,
+                                      common_llm_kwargs,
+                                      per_test_common_llm_kwargs,
+                                      baseline_llm_kwargs,
+                                      test_llm_kwargs,
+                                      batch_size,
+                                      output_len,
+                                      seed,
+                                      temperature=0.0)
 
 
 @pytest.mark.parametrize(
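
For reference, here is a minimal sketch (not part of the diff) of the monkeypatch pattern these tests now use: pytest's monkeypatch.context() scopes the VLLM_USE_MODELSCOPE environment variable to the block around the test body and undoes the change on exit, so enabling modelscope in one test cannot leak into others. The test name and assertions below are illustrative stand-ins, not code from this suite.

import os

import pytest


def test_modelscope_env_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Start from a known state: the variable is absent for this test.
    monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
    with monkeypatch.context() as m:
        # Inside the context the variable is set, as in the tests above.
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        assert os.environ["VLLM_USE_MODELSCOPE"] == "True"
    # Leaving the context reverts the setenv automatically.
    assert "VLLM_USE_MODELSCOPE" not in os.environ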