[Model] Re-add the implicit conversion feature for as_seq_cls_model #20930

Draft · wants to merge 15 commits into base: main
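For context: the "implicit conversion" in the title is vLLM's ability to take a decode-only causal-LM checkpoint and serve it as a sequence-classification model by wrapping the registered model class on the fly. The sketch below only illustrates the idea; TinyCausalLM and this as_seq_cls_model body are hypothetical stand-ins, not vLLM's actual adapter (the real one also remaps weights and pooling configuration).

# Illustrative sketch of the implicit-conversion idea; NOT vLLM's code.
import torch
import torch.nn as nn


class TinyCausalLM(nn.Module):
    """Stand-in for a decode-only language-model backbone."""

    def __init__(self, vocab_size: int = 100, hidden: int = 32):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden)
        self.backbone = nn.GRU(hidden, hidden, batch_first=True)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        hidden_states, _ = self.backbone(self.embed(input_ids))
        return hidden_states  # [batch, seq, hidden]


def as_seq_cls_model(base_cls):
    """Derive a classifier from a causal-LM class: pool the LAST token
    (under causal attention only the final position attends to the whole
    prompt) and project the pooled state to per-class logits."""

    class SeqClsModel(base_cls):

        def __init__(self, num_labels: int = 2, **kwargs):
            super().__init__(**kwargs)
            self.score = nn.Linear(self.embed.embedding_dim, num_labels)

        def classify(self, input_ids: torch.Tensor) -> torch.Tensor:
            pooled = super().forward(input_ids)[:, -1]  # LAST-token pooling
            return self.score(pooled).softmax(dim=-1)

    return SeqClsModel


model = as_seq_cls_model(TinyCausalLM)(num_labels=2)
print(model.classify(torch.randint(0, 100, (2, 8))).shape)  # torch.Size([2, 2])

The distinction between LAST-token pooling (decode-only conversions) and CLS-token pooling (BERT-style encoders) is exactly what the test changes below start asserting.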
9 changes: 7 additions & 2 deletions tests/models/language/pooling/mteb_utils.py
@@ -176,9 +176,12 @@ def mteb_test_embed_models(hf_runner,
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
 
+        model_config = vllm_model.model.llm_engine.model_config
+
         if model_info.architecture:
-            assert (model_info.architecture
-                    in vllm_model.model.llm_engine.model_config.architectures)
+            assert model_info.architecture in model_config.architectures
+            assert (model_config.model_info.default_pooling_type ==
+                    model_info.default_pooling_type)
 
         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
@@ -289,6 +292,8 @@ def mteb_test_rerank_models(hf_runner,
         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
             assert model_config.hf_config.num_labels == 1
+            assert (model_config.model_info.default_pooling_type ==
+                    model_info.default_pooling_type)
 
         vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                           tasks=MTEB_RERANK_TASKS,
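These assertions compare against a new default_pooling_type field on the test-side model-info helpers. A plausible reconstruction of the helper classes the diffs below import (field names inferred from the assertions above; the real definitions in tests/models/utils.py carry additional fields):

# Hypothetical reconstruction of the CLS* test helpers; inferred from
# the assertions above rather than copied from the PR.
from dataclasses import dataclass


@dataclass
class EmbedModelInfo:
    name: str
    architecture: str = ""
    default_pooling_type: str = "LAST"  # assumed decode-only default
    enable_test: bool = False


@dataclass
class CLSEmbedModelInfo(EmbedModelInfo):
    # BERT-style encoders pool the [CLS] token by default.
    default_pooling_type: str = "CLS"


@dataclass
class RerankModelInfo:
    name: str
    architecture: str = ""
    default_pooling_type: str = "LAST"  # assumed
    enable_test: bool = True


@dataclass
class CLSRerankModelInfo(RerankModelInfo):
    default_pooling_type: str = "CLS"

With helpers of this shape, each test file only has to pick the matching info class, and the shared mteb utilities verify that vLLM resolved the expected pooling default.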
105 changes: 53 additions & 52 deletions tests/models/language/pooling/test_baai.py
@@ -2,55 +2,56 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import EmbedModelInfo, RerankModelInfo
+from ...utils import (CLSEmbedModelInfo, CLSRerankModelInfo, EmbedModelInfo,
+                      RerankModelInfo)
 from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    EmbedModelInfo("BAAI/bge-base-en",
-                   architecture="BertModel",
-                   enable_test=True),
-    EmbedModelInfo("BAAI/bge-base-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-en",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-en",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh-noinstruct",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-base-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-base-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-base-en",
+                      architecture="BertModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("BAAI/bge-base-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-en",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-en",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-base-en-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-base-zh-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-en-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-zh-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-en-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-zh-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
     ########## XLMRobertaModel
-    EmbedModelInfo("BAAI/bge-m3",
-                   architecture="XLMRobertaModel",
-                   enable_test=True),
+    CLSEmbedModelInfo("BAAI/bge-m3",
+                      architecture="XLMRobertaModel",
+                      enable_test=True),
     ########## Qwen2Model
     EmbedModelInfo("BAAI/bge-code-v1",
                    architecture="Qwen2Model",
@@ -60,15 +61,15 @@
 
 RERANK_MODELS = [
     ########## XLMRobertaForSequenceClassification
-    RerankModelInfo("BAAI/bge-reranker-base",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=True),
-    RerankModelInfo("BAAI/bge-reranker-large",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=False),
-    RerankModelInfo("BAAI/bge-reranker-v2-m3",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=False)
+    CLSRerankModelInfo("BAAI/bge-reranker-base",
+                       architecture="XLMRobertaForSequenceClassification",
+                       enable_test=True),
+    CLSRerankModelInfo("BAAI/bge-reranker-large",
+                       architecture="XLMRobertaForSequenceClassification",
+                       enable_test=False),
+    CLSRerankModelInfo("BAAI/bge-reranker-v2-m3",
+                       architecture="XLMRobertaForSequenceClassification",
+                       enable_test=False)
 ]


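Only the first entry in each architecture family sets enable_test=True; the remaining checkpoints share the same code path and are skipped by default. To exercise just this file locally, an invocation along these lines should work (illustrative command; adjust to your checkout):

pytest -v tests/models/language/pooling/test_baai.py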
64 changes: 64 additions & 0 deletions (new file)
@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: SIM117
# Ensure decode-only SequenceClassification models keep supporting
# automatic prefix caching
import pytest
import torch
from transformers import AutoModelForSequenceClassification


@pytest.mark.parametrize(
"model",
["jason9693/Qwen2.5-1.5B-apeach"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_decode_only_classify(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:

with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)

with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
hf_outputs = hf_model.classify(example_prompts)

for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)

assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)


@pytest.mark.parametrize(
"model",
["Alibaba-NLP/gte-Qwen2-1.5B-instruct"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_encode_only_classify(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:
with pytest.raises(RuntimeError):
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
vllm_model.classify(example_prompts)
# Is there any way to capture errors in worker processes?
# NotImplementedError: Encoder self-attention and encoder/decoder
# cross-attention are not implemented for FlashAttentionImpl
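Outside the test fixtures, the decode-only case above can be reproduced with vLLM's offline API; a rough sketch (argument and field names reflect recent vLLM releases and may drift, so treat them as indicative):

# Rough offline analogue of test_decode_only_classify; assumes a vLLM
# build where LLM.classify() and these engine arguments exist.
from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach",
          task="classify",
          max_model_len=512,
          enable_prefix_caching=True)  # the behavior under test
(output, ) = llm.classify(["vLLM makes classifier serving easy."])
print(output.outputs.probs)  # per-class probabilities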
7 changes: 4 additions & 3 deletions tests/models/language/pooling/test_cross_encoder.py
@@ -2,11 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
+from ...utils import CLSRerankModelInfo, RerankModelInfo
+from .mteb_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
-    RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
-                    architecture="BertForSequenceClassification"),
+    CLSRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
+                       architecture="BertForSequenceClassification"),
     RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
                     architecture="Qwen3ForSequenceClassification")
 ]
63 changes: 32 additions & 31 deletions tests/models/language/pooling/test_gte.py
@@ -4,47 +4,48 @@
 
 import pytest
 
-from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from ...utils import CLSEmbedModelInfo, EmbedModelInfo
+from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
-    EmbedModelInfo("thenlper/gte-large",
-                   architecture="BertModel",
-                   enable_test=True),
-    EmbedModelInfo("thenlper/gte-base",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-small",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-large-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-base-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-small-zh",
-                   architecture="BertModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-large",
+                      architecture="BertModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("thenlper/gte-base",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-small",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-large-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-base-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-small-zh",
+                      architecture="BertModel",
+                      enable_test=False),
     ########### NewModel
-    EmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
-                   architecture="GteNewModel",
-                   enable_test=True),
-    EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
-                   architecture="GteNewModel",
-                   enable_test=True),
-    EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
-                   architecture="GteNewModel",
-                   enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
+                      architecture="GteNewModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
+                      architecture="GteNewModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
+                      architecture="GteNewModel",
+                      enable_test=True),
     ########### Qwen2ForCausalLM
     EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                    architecture="Qwen2ForCausalLM",
                    enable_test=True),
     ########## ModernBertModel
-    EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
-                   architecture="ModernBertModel",
-                   enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
+                      architecture="ModernBertModel",
+                      enable_test=True),
     ########## Qwen3ForCausalLM
     EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
                    architecture="Qwen3ForCausalLM",
44 changes: 22 additions & 22 deletions tests/models/language/pooling/test_intfloat.py
@@ -2,34 +2,34 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import EmbedModelInfo
+from ...utils import CLSEmbedModelInfo, EmbedModelInfo
 from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
-    EmbedModelInfo("intfloat/e5-small",
-                   architecture="BertModel",
-                   enable_test=True),
-    EmbedModelInfo("intfloat/e5-base",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("intfloat/e5-large",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("intfloat/multilingual-e5-small",
-                   architecture="BertModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("intfloat/e5-small",
+                      architecture="BertModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("intfloat/e5-base",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("intfloat/e5-large",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-small",
+                      architecture="BertModel",
+                      enable_test=False),
     ########## XLMRobertaModel
-    EmbedModelInfo("intfloat/multilingual-e5-base",
-                   architecture="XLMRobertaModel",
-                   enable_test=True),
-    EmbedModelInfo("intfloat/multilingual-e5-large",
-                   architecture="XLMRobertaModel",
-                   enable_test=False),
-    EmbedModelInfo("intfloat/multilingual-e5-large-instruct",
-                   architecture="XLMRobertaModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-base",
+                      architecture="XLMRobertaModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-large",
+                      architecture="XLMRobertaModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-large-instruct",
+                      architecture="XLMRobertaModel",
+                      enable_test=False),
 ]

