
Commit 8fd0814

maxdebayser authored and 22quinn committed
Support embedding models in V1 (vllm-project#16188)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
1 parent e097335 commit 8fd0814


56 files changed, +895 −287 lines

examples/offline_inference/basic/embed.py

Lines changed: 4 additions & 1 deletion
@@ -12,7 +12,10 @@ def parse_args():
     parser = EngineArgs.add_cli_args(parser)
     # Set example specific arguments
     parser.set_defaults(
-        model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True
+        model="intfloat/e5-mistral-7b-instruct",
+        task="embed",
+        enforce_eager=True,
+        max_model_len=1024,
     )
     return parser.parse_args()
 
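For context, the example's new defaults correspond to an offline embedding run roughly like the sketch below. The LLM.embed call and the printed dimensionality check are illustrative additions, not part of this diff.

# Sketch only: mirrors the example's new defaults (model, task, enforce_eager, max_model_len).
from vllm import LLM

llm = LLM(
    model="intfloat/e5-mistral-7b-instruct",
    task="embed",
    enforce_eager=True,
    max_model_len=1024,
)
outputs = llm.embed(["vLLM V1 now supports embedding models."])
print(len(outputs[0].outputs.embedding))  # pooled embedding dimensionality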

examples/offline_inference/vision_language_embedding.py

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
     engine_args = EngineArgs(
         model="TIGER-Lab/VLM2Vec-Full",
         task="embed",
+        max_model_len=4096,
         trust_remote_code=True,
         mm_processor_kwargs={"num_crops": 4},
         limit_mm_per_prompt={"image": 1},

tests/compile/test_basic_correctness.py

Lines changed: 18 additions & 14 deletions
@@ -31,7 +31,7 @@ class TestSetting:
     # basic llama model
     TestSetting(
         model="meta-llama/Llama-3.2-1B-Instruct",
-        model_args=[],
+        model_args=["--max-model-len", "2048"],
         pp_size=2,
         tp_size=2,
         attn_backend="FLASHINFER",
@@ -41,7 +41,7 @@ class TestSetting:
     # llama model with quantization
     TestSetting(
         model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-        model_args=["--quantization", "gptq"],
+        model_args=["--quantization", "gptq", "--max-model-len", "2048"],
         pp_size=1,
         tp_size=1,
         attn_backend="FLASH_ATTN",
@@ -51,7 +51,7 @@ class TestSetting:
     # MoE model
     TestSetting(
         model="ibm/PowerMoE-3b",
-        model_args=[],
+        model_args=["--max-model-len", "2048"],
         pp_size=1,
         tp_size=2,
         attn_backend="FLASH_ATTN",
@@ -61,23 +61,27 @@ class TestSetting:
     # embedding model
     TestSetting(
         model="BAAI/bge-multilingual-gemma2",
-        model_args=["--task", "embed", "--dtype", "bfloat16"],
+        model_args=[
+            "--task", "embed", "--dtype", "bfloat16", "--max-model-len",
+            "2048"
+        ],
         pp_size=1,
         tp_size=1,
         attn_backend="FLASH_ATTN",
         method="encode",
         fullgraph=True,
     ),
-    # encoder-based embedding model (BERT)
-    TestSetting(
-        model="BAAI/bge-base-en-v1.5",
-        model_args=["--task", "embed"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="XFORMERS",
-        method="encode",
-        fullgraph=True,
-    ),
+    # TODO: bert models are not supported in V1 yet
+    # # encoder-based embedding model (BERT)
+    # TestSetting(
+    #     model="BAAI/bge-base-en-v1.5",
+    #     model_args=["--task", "embed"],
+    #     pp_size=1,
+    #     tp_size=1,
+    #     attn_backend="XFORMERS",
+    #     method="encode",
+    #     fullgraph=True,
+    # ),
     # vision language model
     TestSetting(
         model="microsoft/Phi-3.5-vision-instruct",

tests/conftest.py

Lines changed: 3 additions & 0 deletions
@@ -145,13 +145,16 @@ def run_with_both_engines(request, monkeypatch):
     # Automatically runs tests twice, once with V1 and once without
     use_v1 = request.param
     # Tests decorated with `@skip_v1` are only run without v1
+    skip_v0 = request.node.get_closest_marker("skip_v0")
     skip_v1 = request.node.get_closest_marker("skip_v1")
 
     if use_v1:
         if skip_v1:
             pytest.skip("Skipping test on vllm V1")
         monkeypatch.setenv('VLLM_USE_V1', '1')
     else:
+        if skip_v0:
+            pytest.skip("Skipping test on vllm V0")
         monkeypatch.setenv('VLLM_USE_V1', '0')
 
     yield
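
Taken together with the autouse fixtures added in the test files below, the new skip_v0 marker lets a module run every test on both engines while pinning individual tests to one of them. A minimal usage sketch (the test names are hypothetical):

import pytest

@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Runs each test in this module twice: once with VLLM_USE_V1=1, once with VLLM_USE_V1=0.
    pass

@pytest.mark.skip_v0
def test_v1_only_behavior():
    ...  # skipped when the V0 engine is selected

@pytest.mark.skip_v1
def test_v0_only_behavior():
    ...  # skipped when the V1 engine is selected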

tests/entrypoints/llm/test_encode.py

Lines changed: 20 additions & 4 deletions
@@ -8,6 +8,8 @@
 from vllm import LLM, PoolingParams, PoolingRequestOutput
 from vllm.distributed import cleanup_dist_env_and_memory
 
+from ...models.utils import check_embeddings_close
+
 MODEL_NAME = "intfloat/multilingual-e5-small"
 
 PROMPTS = [
@@ -27,6 +29,14 @@
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -46,9 +56,15 @@ def llm():
     cleanup_dist_env_and_memory()
 
 
-def assert_outputs_equal(o1: list[PoolingRequestOutput],
+def assert_outputs_match(o1: list[PoolingRequestOutput],
                          o2: list[PoolingRequestOutput]):
-    assert [o.outputs for o in o1] == [o.outputs for o in o2]
+    check_embeddings_close(
+        embeddings_0_lst=[o.outputs.data for o in o1],
+        embeddings_1_lst=[o.outputs.data for o in o2],
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
 
 
 @pytest.mark.skip_global_cleanup
@@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
 
     v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
                            pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
+    assert_outputs_match(v1_output, v2_output)
 
 
 @pytest.mark.skip_global_cleanup
@@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
         } for p in TOKEN_IDS],
         pooling_params=pooling_params,
     )
-    assert_outputs_equal(v1_output, v2_output)
+    assert_outputs_match(v1_output, v2_output)
 
 
 @pytest.mark.skip_global_cleanup
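
The exact-equality assert is replaced by check_embeddings_close with a 1e-2 tolerance, since the two engines need not produce bit-identical pooled outputs. For intuition, a cosine-similarity check in the same spirit could look like the sketch below; this is an illustration, the real helper lives in tests/models/utils.py and may differ in detail.

import torch
import torch.nn.functional as F

def embeddings_close(emb_a: list[float], emb_b: list[float], tol: float = 1e-2) -> bool:
    # Treat two embeddings as "matching" when their cosine similarity is within tol of 1.
    a = torch.tensor(emb_a, dtype=torch.float32)
    b = torch.tensor(emb_b, dtype=torch.float32)
    sim = F.cosine_similarity(a, b, dim=0)
    return bool(1.0 - sim.item() <= tol)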

tests/entrypoints/openai/test_embedding.py

Lines changed: 8 additions & 0 deletions
@@ -21,6 +21,14 @@
 DTYPE = "bfloat16"
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.fixture(scope="module")
 def server():
     args = [

tests/entrypoints/openai/test_pooling.py

Lines changed: 11 additions & 4 deletions
@@ -7,6 +7,7 @@
 import pytest
 import requests
 
+from tests.models.utils import check_embeddings_close
 from vllm.entrypoints.openai.protocol import PoolingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
             np.frombuffer(base64.b64decode(data.data),
                           dtype="float32").tolist())
 
-    assert responses_float.data[0].data == decoded_responses_base64_data[0]
-    assert responses_float.data[1].data == decoded_responses_base64_data[1]
+    check_embeddings_close(
+        embeddings_0_lst=[d.data for d in responses_float.data],
+        embeddings_1_lst=decoded_responses_base64_data,
+        name_0="float32",
+        name_1="base64")
 
     # Default response is float32 decoded from base64 by OpenAI Client
     default_response = requests.post(
@@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
     default_response.raise_for_status()
     responses_default = PoolingResponse.model_validate(default_response.json())
 
-    assert responses_float.data[0].data == responses_default.data[0].data
-    assert responses_float.data[1].data == responses_default.data[1].data
+    check_embeddings_close(
+        embeddings_0_lst=[d.data for d in responses_default.data],
+        embeddings_1_lst=[d.data for d in responses_default.data],
+        name_0="float32",
+        name_1="base64")
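
For reference, decoding base64-encoded pooling output on the client side follows the same np.frombuffer pattern used in this test. A hedged sketch against a locally running OpenAI-compatible server; the endpoint path, request fields, and model name are assumptions drawn from this test file, not guaranteed API:

import base64

import numpy as np
import requests

resp = requests.post(
    "http://localhost:8000/pooling",  # assumed local server URL
    json={
        "model": "intfloat/multilingual-e5-small",  # placeholder model name
        "input": ["hello world"],
        "encoding_format": "base64",
    },
)
resp.raise_for_status()
for item in resp.json()["data"]:
    vec = np.frombuffer(base64.b64decode(item["data"]), dtype="float32")
    print(vec.shape)  # one float32 vector per input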

tests/entrypoints/openai/test_rerank.py

Lines changed: 8 additions & 0 deletions
@@ -12,6 +12,14 @@
 DTYPE = "bfloat16"
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.fixture(scope="module")
 def server():
     args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

tests/entrypoints/openai/test_score.py

Lines changed: 9 additions & 0 deletions
@@ -11,6 +11,15 @@
 
 from ...utils import RemoteOpenAIServer
 
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 MODELS = [
     {
         "name": "BAAI/bge-reranker-v2-m3",

tests/models/language/pooling/test_classification.py

Lines changed: 9 additions & 1 deletion
@@ -6,6 +6,14 @@
 
 from vllm.platforms import current_platform
 
+# TODO: enable when float32 is supported by V1
+# @pytest.fixture(autouse=True)
+# def v1(run_with_both_engines):
+#     # Simple autouse wrapper to run both engines for each test
+#     # This can be promoted up to conftest.py to run for every
+#     # test in a package
+#     pass
+
 
 @pytest.mark.parametrize(
     "model",
@@ -29,7 +37,7 @@ def test_models(
         # switch to use ROCm CK FA backend
         monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.classify(example_prompts)
 
     with hf_runner(model,
