@@ -14,8 +14,9 @@
 
 import pytest
 
-from vllm.config import TaskOption
+from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, create_new_process_for_each_test
@@ -158,7 +159,7 @@ def iter_params(self, model_id: str):
     "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
     "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
     "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
     "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
     "tiiuae/falcon-7b": PPTestSettings.fast(),
     "google/gemma-1.1-2b-it": PPTestSettings.fast(),
@@ -210,9 +211,11 @@ def iter_params(self, model_id: str):
 
 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
     # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
+        load_format="dummy", task="embed"
+    ),
 }
 
 MULTIMODAL_MODELS = {
@@ -248,6 +251,7 @@ def iter_params(self, model_id: str):
     "meta-llama/Llama-3.2-1B-Instruct",
     "ArthurZ/Ilama-3.2-1B",
     "ibm/PowerLM-3b",
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
     # [LANGUAGE EMBEDDING]
     "intfloat/e5-mistral-7b-instruct",
     "BAAI/bge-multilingual-gemma2",
@@ -287,6 +291,11 @@ def _compare_tp(
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
+    hf_config = get_config(model_id, trust_remote_code)
+
+    dtype = "float16"
+    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
+        dtype = "bfloat16"
 
     if load_format == "dummy":
         # Avoid OOM
@@ -316,7 +325,7 @@ def _compare_tp(
     common_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "float16",
+        dtype,
         "--max-model-len",
         "2048",
         "--max-num-seqs",
@@ -338,6 +347,7 @@ def _compare_tp(
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
 
     specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
+    testing_ray_compiled_graph = False
     if distributed_backend == "ray" and (vllm_major_version == "1"
                                          or specific_case):
         # For V1, test Ray Compiled Graph for all the tests
@@ -351,6 +361,7 @@ def _compare_tp(
         # Temporary. Currently when zeromq + SPMD is used, it does not properly
         # terminate because of a Ray Compiled Graph issue.
         common_args.append("--disable-frontend-multiprocessing")
+        testing_ray_compiled_graph = True
     elif distributed_backend == "mp":
         # Both V0/V1 of multiprocessing executor support PP
         pp_env = {
@@ -394,7 +405,6 @@ def _compare_tp(
                              tp_env,
                              method=method)
     except Exception:
-        testing_ray_compiled_graph = pp_env is not None
         if testing_ray_compiled_graph and vllm_major_version == "0":
             # Ray Compiled Graph tests are flaky for V0,
             # so we don't want to fail the test
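Note: the dtype selection added to _compare_tp above reduces to the standalone pattern below (a minimal sketch; pick_test_dtype is a hypothetical helper name, while get_config and _FLOAT16_NOT_SUPPORTED_MODELS are the vLLM imports the diff itself uses):

    from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS
    from vllm.transformers_utils.config import get_config

    def pick_test_dtype(model_id: str, trust_remote_code: bool) -> str:
        # Default to float16 for CI speed and memory savings, but fall
        # back to bfloat16 for architectures that cannot run in float16.
        hf_config = get_config(model_id, trust_remote_code)
        if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
            return "bfloat16"
        return "float16"

This keeps the previous CI default of half precision while letting any model whose model_type is listed in _FLOAT16_NOT_SUPPORTED_MODELS run the pipeline-parallel comparison in bfloat16 instead.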