[Model] Re-add the implicit conversion feature for as_seq_cls_model #20930

Draft · wants to merge 15 commits into base: main
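For context: the "implicit conversion" in the title is vLLM's ability to take a decode-only causal-LM checkpoint and serve it as a sequence-classification model by wrapping the registered model class on the fly. The sketch below only illustrates the idea; TinyCausalLM and this as_seq_cls_model body are hypothetical stand-ins, not vLLM's actual adapter (the real one also remaps weights and pooling configuration).

# Illustrative sketch of the implicit-conversion idea; NOT vLLM's code.
import torch
import torch.nn as nn


class TinyCausalLM(nn.Module):
    """Stand-in for a decode-only language-model backbone."""

    def __init__(self, vocab_size: int = 100, hidden: int = 32):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden)
        self.backbone = nn.GRU(hidden, hidden, batch_first=True)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        hidden_states, _ = self.backbone(self.embed(input_ids))
        return hidden_states  # [batch, seq, hidden]


def as_seq_cls_model(base_cls):
    """Derive a classifier from a causal-LM class: pool the LAST token
    (under causal attention only the final position attends to the whole
    prompt) and project the pooled state to per-class logits."""

    class SeqClsModel(base_cls):

        def __init__(self, num_labels: int = 2, **kwargs):
            super().__init__(**kwargs)
            self.score = nn.Linear(self.embed.embedding_dim, num_labels)

        def classify(self, input_ids: torch.Tensor) -> torch.Tensor:
            pooled = super().forward(input_ids)[:, -1]  # LAST-token pooling
            return self.score(pooled).softmax(dim=-1)

    return SeqClsModel


model = as_seq_cls_model(TinyCausalLM)(num_labels=2)
print(model.classify(torch.randint(0, 100, (2, 8))).shape)  # torch.Size([2, 2])

The distinction between LAST-token pooling (decode-only conversions) and CLS-token pooling (BERT-style encoders) is exactly what the test changes below start asserting.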
9 changes: 7 additions & 2 deletions tests/models/language/pooling/mteb_utils.py
@@ -176,9 +176,12 @@ def mteb_test_embed_models(hf_runner,
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
 
+        model_config = vllm_model.model.llm_engine.model_config
+
         if model_info.architecture:
-            assert (model_info.architecture
-                    in vllm_model.model.llm_engine.model_config.architectures)
+            assert model_info.architecture in model_config.architectures
+            assert (model_config.model_info.default_pooling_type ==
+                    model_info.default_pooling_type)
 
         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
@@ -289,6 +292,8 @@ def mteb_test_rerank_models(hf_runner,
         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
             assert model_config.hf_config.num_labels == 1
+            assert (model_config.model_info.default_pooling_type ==
+                    model_info.default_pooling_type)
 
         vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                           tasks=MTEB_RERANK_TASKS,
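These assertions compare against a new default_pooling_type field on the test-side model-info helpers. A plausible reconstruction of the helper classes the diffs below import (field names inferred from the assertions above; the real definitions in tests/models/utils.py carry additional fields):

# Hypothetical reconstruction of the CLS* test helpers; inferred from
# the assertions above rather than copied from the PR.
from dataclasses import dataclass


@dataclass
class EmbedModelInfo:
    name: str
    architecture: str = ""
    default_pooling_type: str = "LAST"  # assumed decode-only default
    enable_test: bool = False


@dataclass
class CLSEmbedModelInfo(EmbedModelInfo):
    # BERT-style encoders pool the [CLS] token by default.
    default_pooling_type: str = "CLS"


@dataclass
class RerankModelInfo:
    name: str
    architecture: str = ""
    default_pooling_type: str = "LAST"  # assumed
    enable_test: bool = True


@dataclass
class CLSRerankModelInfo(RerankModelInfo):
    default_pooling_type: str = "CLS"

With helpers of this shape, each test file only has to pick the matching info class, and the shared mteb utilities verify that vLLM resolved the expected pooling default.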
105 changes: 53 additions & 52 deletions tests/models/language/pooling/test_baai.py
@@ -2,55 +2,56 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import EmbedModelInfo, RerankModelInfo
+from ...utils import (CLSEmbedModelInfo, CLSRerankModelInfo, EmbedModelInfo,
+                      RerankModelInfo)
 from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    EmbedModelInfo("BAAI/bge-base-en",
-                   architecture="BertModel",
-                   enable_test=True),
-    EmbedModelInfo("BAAI/bge-base-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-en",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-en",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh-noinstruct",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-base-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-base-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-base-en",
+                      architecture="BertModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("BAAI/bge-base-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-en",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-en",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-base-en-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-base-zh-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-en-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-small-zh-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-en-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("BAAI/bge-large-zh-v1.5",
+                      architecture="BertModel",
+                      enable_test=False),
     ########## XLMRobertaModel
-    EmbedModelInfo("BAAI/bge-m3",
-                   architecture="XLMRobertaModel",
-                   enable_test=True),
+    CLSEmbedModelInfo("BAAI/bge-m3",
+                      architecture="XLMRobertaModel",
+                      enable_test=True),
     ########## Qwen2Model
     EmbedModelInfo("BAAI/bge-code-v1",
                    architecture="Qwen2Model",
@@ -60,15 +61,15 @@
 
 RERANK_MODELS = [
     ########## XLMRobertaForSequenceClassification
-    RerankModelInfo("BAAI/bge-reranker-base",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=True),
-    RerankModelInfo("BAAI/bge-reranker-large",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=False),
-    RerankModelInfo("BAAI/bge-reranker-v2-m3",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=False)
+    CLSRerankModelInfo("BAAI/bge-reranker-base",
+                       architecture="XLMRobertaForSequenceClassification",
+                       enable_test=True),
+    CLSRerankModelInfo("BAAI/bge-reranker-large",
+                       architecture="XLMRobertaForSequenceClassification",
+                       enable_test=False),
+    CLSRerankModelInfo("BAAI/bge-reranker-v2-m3",
+                       architecture="XLMRobertaForSequenceClassification",
+                       enable_test=False)
 ]


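Only the first entry in each architecture family sets enable_test=True; the remaining checkpoints share the same code path and are skipped by default. To exercise just this file locally, an invocation along these lines should work (illustrative command; adjust to your checkout):

pytest -v tests/models/language/pooling/test_baai.py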
64 changes: 64 additions & 0 deletions (new file)
@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: SIM117
# Ensure decode-only SequenceClassification models keep supporting
# automatic prefix caching
import pytest
import torch
from transformers import AutoModelForSequenceClassification


@pytest.mark.parametrize(
"model",
["jason9693/Qwen2.5-1.5B-apeach"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_decode_only_classify(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:

with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)

with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
hf_outputs = hf_model.classify(example_prompts)

for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)

assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)


@pytest.mark.parametrize(
"model",
["Alibaba-NLP/gte-Qwen2-1.5B-instruct"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_encode_only_classify(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:
with pytest.raises(RuntimeError):
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
vllm_model.classify(example_prompts)
# Is there any way to capture errors in worker processes?
# NotImplementedError: Encoder self-attention and encoder/decoder
# cross-attention are not implemented for FlashAttentionImpl
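Outside the test fixtures, the decode-only case above can be reproduced with vLLM's offline API; a rough sketch (argument and field names reflect recent vLLM releases and may drift, so treat them as indicative):

# Rough offline analogue of test_decode_only_classify; assumes a vLLM
# build where LLM.classify() and these engine arguments exist.
from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach",
          task="classify",
          max_model_len=512,
          enable_prefix_caching=True)  # the behavior under test
(output, ) = llm.classify(["vLLM makes classifier serving easy."])
print(output.outputs.probs)  # per-class probabilities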
7 changes: 4 additions & 3 deletions tests/models/language/pooling/test_cross_encoder.py
@@ -2,11 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
+from ...utils import CLSRerankModelInfo, RerankModelInfo
+from .mteb_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
-    RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
-                    architecture="BertForSequenceClassification"),
+    CLSRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
+                       architecture="BertForSequenceClassification"),
     RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
                     architecture="Qwen3ForSequenceClassification")
 ]
63 changes: 32 additions & 31 deletions tests/models/language/pooling/test_gte.py
@@ -4,47 +4,48 @@
 
 import pytest
 
-from .embed_utils import EmbedModelInfo, correctness_test_embed_models
+from ...utils import CLSEmbedModelInfo, EmbedModelInfo
+from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
-    EmbedModelInfo("thenlper/gte-large",
-                   architecture="BertModel",
-                   enable_test=True),
-    EmbedModelInfo("thenlper/gte-base",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-small",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-large-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-base-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("thenlper/gte-small-zh",
-                   architecture="BertModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-large",
+                      architecture="BertModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("thenlper/gte-base",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-small",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-large-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-base-zh",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("thenlper/gte-small-zh",
+                      architecture="BertModel",
+                      enable_test=False),
     ########### NewModel
-    EmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
-                   architecture="GteNewModel",
-                   enable_test=True),
-    EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
-                   architecture="GteNewModel",
-                   enable_test=True),
-    EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
-                   architecture="GteNewModel",
-                   enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
+                      architecture="GteNewModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
+                      architecture="GteNewModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
+                      architecture="GteNewModel",
+                      enable_test=True),
     ########### Qwen2ForCausalLM
     EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                    architecture="Qwen2ForCausalLM",
                    enable_test=True),
     ########## ModernBertModel
-    EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
-                   architecture="ModernBertModel",
-                   enable_test=True),
+    CLSEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
+                      architecture="ModernBertModel",
+                      enable_test=True),
     ########## Qwen3ForCausalLM
     EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
                    architecture="Qwen3ForCausalLM",
44 changes: 22 additions & 22 deletions tests/models/language/pooling/test_intfloat.py
@@ -2,34 +2,34 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import EmbedModelInfo
+from ...utils import CLSEmbedModelInfo, EmbedModelInfo
 from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
-    EmbedModelInfo("intfloat/e5-small",
-                   architecture="BertModel",
-                   enable_test=True),
-    EmbedModelInfo("intfloat/e5-base",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("intfloat/e5-large",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("intfloat/multilingual-e5-small",
-                   architecture="BertModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("intfloat/e5-small",
+                      architecture="BertModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("intfloat/e5-base",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("intfloat/e5-large",
+                      architecture="BertModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-small",
+                      architecture="BertModel",
+                      enable_test=False),
     ########## XLMRobertaModel
-    EmbedModelInfo("intfloat/multilingual-e5-base",
-                   architecture="XLMRobertaModel",
-                   enable_test=True),
-    EmbedModelInfo("intfloat/multilingual-e5-large",
-                   architecture="XLMRobertaModel",
-                   enable_test=False),
-    EmbedModelInfo("intfloat/multilingual-e5-large-instruct",
-                   architecture="XLMRobertaModel",
-                   enable_test=False),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-base",
+                      architecture="XLMRobertaModel",
+                      enable_test=True),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-large",
+                      architecture="XLMRobertaModel",
+                      enable_test=False),
+    CLSEmbedModelInfo("intfloat/multilingual-e5-large-instruct",
+                      architecture="XLMRobertaModel",
+                      enable_test=False),
 ]

