@@ -14,8 +14,9 @@
 
 import pytest
 
-from vllm.config import TaskOption
+from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, create_new_process_for_each_test
@@ -158,7 +159,7 @@ def iter_params(self, model_id: str):
     "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
     "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
     "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
     "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
     "tiiuae/falcon-7b": PPTestSettings.fast(),
     "google/gemma-1.1-2b-it": PPTestSettings.fast(),
@@ -210,9 +211,11 @@ def iter_params(self, model_id: str):
 
 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
     # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
+        load_format="dummy", task="embed"
+    ),
 }
 
 MULTIMODAL_MODELS = {
@@ -248,6 +251,7 @@ def iter_params(self, model_id: str):
     "meta-llama/Llama-3.2-1B-Instruct",
     "ArthurZ/Ilama-3.2-1B",
     "ibm/PowerLM-3b",
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
     # [LANGUAGE EMBEDDING]
     "intfloat/e5-mistral-7b-instruct",
     "BAAI/bge-multilingual-gemma2",
@@ -287,6 +291,11 @@ def _compare_tp(
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
+    hf_config = get_config(model_id, trust_remote_code)
+
+    dtype = "float16"
+    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
+        dtype = "bfloat16"
 
     if load_format == "dummy":
         # Avoid OOM
@@ -316,7 +325,7 @@ def _compare_tp(
     common_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "float16",
+        dtype,
         "--max-model-len",
         "2048",
         "--max-num-seqs",
@@ -338,6 +347,7 @@ def _compare_tp(
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
 
     specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
+    testing_ray_compiled_graph = False
     if distributed_backend == "ray" and (vllm_major_version == "1"
                                          or specific_case):
         # For V1, test Ray Compiled Graph for all the tests
@@ -351,6 +361,7 @@ def _compare_tp(
         # Temporary. Currently when zeromq + SPMD is used, it does not properly
         # terminate because of a Ray Compiled Graph issue.
         common_args.append("--disable-frontend-multiprocessing")
+        testing_ray_compiled_graph = True
     elif distributed_backend == "mp":
         # Both V0/V1 of multiprocessing executor support PP
         pp_env = {
@@ -394,7 +405,6 @@ def _compare_tp(
                              tp_env,
                              method=method)
     except Exception:
-        testing_ray_compiled_graph = pp_env is not None
         if testing_ray_compiled_graph and vllm_major_version == "0":
             # Ray Compiled Graph tests are flaky for V0,
             # so we don't want to fail the test
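Note: the dtype selection added to _compare_tp above reduces to the standalone pattern below (a minimal sketch; pick_test_dtype is a hypothetical helper name, while get_config and _FLOAT16_NOT_SUPPORTED_MODELS are the vLLM imports the diff itself uses):

    from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS
    from vllm.transformers_utils.config import get_config

    def pick_test_dtype(model_id: str, trust_remote_code: bool) -> str:
        # Default to float16 for CI speed and memory savings, but fall
        # back to bfloat16 for architectures that cannot run in float16.
        hf_config = get_config(model_id, trust_remote_code)
        if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
            return "bfloat16"
        return "float16"

This keeps the previous CI default of half precision while letting any model whose model_type is listed in _FLOAT16_NOT_SUPPORTED_MODELS run the pipeline-parallel comparison in bfloat16 instead.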