[Model] Re-add the implicit conversion feature for as_seq_cls_model #20930

Draft · wants to merge 15 commits into base: main
7 changes: 5 additions & 2 deletions tests/models/language/pooling/mteb_utils.py
@@ -176,9 +176,12 @@ def mteb_test_embed_models(hf_runner,
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:

model_config = vllm_model.model.llm_engine.model_config

if model_info.architecture:
assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures)
assert model_info.architecture in model_config.architectures
assert (model_config.default_pooling_type ==
model_info.default_pooling_type)

vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
25 changes: 15 additions & 10 deletions tests/models/registry.py
@@ -265,7 +265,6 @@ def check_available_online(
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
@@ -292,7 +291,6 @@ def check_available_online(
# [Text-only]
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True),
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True), # noqa: E501
"GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"), # noqa: E501
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
"GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
trust_remote_code=True),
@@ -311,7 +309,6 @@ def check_available_online(
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
"Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
"Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501
"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501
@@ -327,12 +324,6 @@ def check_available_online(
_CROSS_ENCODER_EXAMPLE_MODELS = {
# [Text-only]
"BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501
"GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma", # noqa: E501
v0_only=True,
hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
"classifier_from_token": ["Yes"], # noqa: E501
"method": "no_post_processing"}), # noqa: E501
"LlamaForSequenceClassification": _HfExamplesInfo("Skywork/Skywork-Reward-V2-Llama-3.2-1B"), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501
@@ -446,6 +437,19 @@ def check_available_online(
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501
}

_AUTOMATIC_CONVERTED_MODELS = {
# Use as_seq_cls_model for automatic conversion
"GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma", # noqa: E501
v0_only=True,
hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
"classifier_from_token": ["Yes"], # noqa: E501
"method": "no_post_processing"}), # noqa: E501
"GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"), # noqa: E501
"LlamaForSequenceClassification": _HfExamplesInfo("Skywork/Skywork-Reward-V2-Llama-3.2-1B"), # noqa: E501
"Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
}

_SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EAGLEModel": _HfExamplesInfo("JackFram/llama-68m",
speculative_model="abhigoyal/vllm-eagle-llama-68m-random"), # noqa: E501
@@ -513,4 +517,5 @@ def find_hf_info(self, model_id: str) -> _HfExamplesInfo:
raise ValueError(f"No example model defined for {model_id}")


HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
AUTO_EXAMPLE_MODELS = HfExampleModels(_AUTOMATIC_CONVERTED_MODELS)
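For context, a minimal sketch of how one of these automatically converted checkpoints might be exercised through vLLM's offline API. The checkpoint name is taken from the registry entries above, but the output fields are an assumption; this snippet is illustrative and not part of the diff:

from vllm import LLM

# Load a *ForSequenceClassification checkpoint so that the implicit
# as_seq_cls_model conversion re-added by this PR is triggered.
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
outputs = llm.classify(["vLLM makes classification models easy to serve."])
print(outputs[0].outputs.probs)  # per-class probabilities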
7 changes: 5 additions & 2 deletions tests/models/test_initialization.py
@@ -13,10 +13,13 @@
from vllm.v1.engine.core import EngineCore as V1EngineCore

from ..utils import create_new_process_for_each_test
from .registry import HF_EXAMPLE_MODELS
from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS


@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
@pytest.mark.parametrize(
"model_arch",
HF_EXAMPLE_MODELS.get_supported_archs()
& AUTO_EXAMPLE_MODELS.get_supported_archs())
@create_new_process_for_each_test()
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"""The reason for using create_new_process_for_each_test is to avoid
35 changes: 35 additions & 0 deletions tests/models/test_transformers.py
@@ -138,3 +138,38 @@ def test_quantization(
name_0="transformers",
name_1="vllm",
)


@pytest.mark.parametrize(
"model",
["jason9693/Qwen2.5-1.5B-apeach"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_classify(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:
import torch
from transformers import AutoModelForSequenceClassification

with vllm_runner(model,
max_model_len=512,
dtype=dtype,
model_impl="transformers") as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)

with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
hf_outputs = hf_model.classify(example_prompts)

for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)

assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
2 changes: 2 additions & 0 deletions tests/models/utils.py
@@ -335,11 +335,13 @@ class EmbedModelInfo(NamedTuple):
matryoshka_dimensions: Optional[list[int]] = None
architecture: str = ""
dtype: str = "auto"
default_pooling_type: str = "LAST"
enable_test: bool = True


class RerankModelInfo(NamedTuple):
name: str
architecture: str = ""
dtype: str = "auto"
default_pooling_type: str = "LAST"
enable_test: bool = True
20 changes: 20 additions & 0 deletions tests/test_config.py
@@ -292,6 +292,26 @@ def test_get_pooling_config_from_args():
assert asdict(pooling_config) == asdict(override_pooler_config)


@pytest.mark.parametrize(
("model_id", "default_pooling_type"),
[
("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "LAST"), # LLM
("BAAI/bge-base-en", "CLS") # BertModel
])
def test_default_pooling_type(model_id, default_pooling_type):
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
assert model_config.model_info.default_pooling_type == default_pooling_type


@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_get_bert_tokenization_sentence_transformer_config():
57 changes: 42 additions & 15 deletions vllm/config.py
@@ -26,7 +26,7 @@
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self, deprecated, runtime_checkable
from typing_extensions import Self, assert_never, deprecated, runtime_checkable

import vllm.envs as envs
from vllm import version
@@ -551,7 +551,7 @@
# For pooling models, self.task is used to indicate the
# user-selected task
if self.task == "score":
if self.registry.is_cross_encoder_model(self.architectures):
if self._is_classify_task(self.architectures):
self.task = "classify"
else:
self.task = "embed"
@@ -772,6 +772,27 @@
if getattr(pooler_config, k) is None:
setattr(pooler_config, k, v)

# set default pooler config
if pooler_config.pooling_type is None:
default_pooling_type = self.model_info.default_pooling_type
pooler_config.pooling_type = default_pooling_type

if pooler_config.normalize is None:
if self.task in ["classify", "reward"]:
pooler_config.normalize = False
elif self.task == "embed":
pooler_config.normalize = True
else:
assert_never(self.task)

Check failure on line 786 in vllm/config.py (GitHub Actions / pre-commit, 5 identical annotations): Argument 1 to "assert_never" has incompatible type "Literal['auto', 'generate', 'embedding', 'classify', 'score', 'reward', 'transcription', 'draft']"; expected "Never" [arg-type]

if pooler_config.softmax is None:
if self.task == "classify":
pooler_config.softmax = True
elif self.task == "embed":
pooler_config.normalize = False
else:
assert_never(self.task)

Check failure on line 794 in vllm/config.py (GitHub Actions / pre-commit, 5 identical annotations): Argument 1 to "assert_never" has incompatible type "Literal['auto', 'generate', 'embedding', 'score', 'reward', 'transcription', 'draft']"; expected "Never" [arg-type]

if self.is_matryoshka:
if pooler_config.normalize is None:
pooler_config.normalize = True
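
Regarding the pre-commit failures above: mypy still sees self.task as the full task literal at these call sites, so the branches are not exhaustive from the type checker's point of view. Below is a minimal sketch of one way the normalize default could be written to satisfy the checker, assuming a runtime error is acceptable for unexpected tasks; it is not necessarily the fix the PR will adopt:

if pooler_config.normalize is None:
    # Narrow explicitly and raise at runtime instead of calling assert_never(),
    # which mypy rejects because the task literal is not provably exhausted here.
    if self.task in ("classify", "reward"):
        pooler_config.normalize = False
    elif self.task == "embed":
        pooler_config.normalize = True
    else:
        raise ValueError(f"Unexpected task for a pooling model: {self.task!r}")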
@@ -806,21 +827,24 @@
f"one of {get_args(TokenizerMode)}.")
self.tokenizer_mode = tokenizer_mode

def _is_classify_task(self, architectures: list[str]):
for arch in architectures:
if arch.endswith("ForSequenceClassification"):
return True
return self.registry.is_cross_encoder_model(architectures)

def _get_preferred_pooling_task(
self,
architectures: list[str],
) -> _ResolvedTask:
model_id = self.model
if get_pooling_config(model_id, self.revision):
return "embed"
if self.registry.is_cross_encoder_model(architectures):
return "classify"
if self.registry.is_transcription_model(architectures):
return "transcription"

suffix_to_preferred_task: list[tuple[str, _ResolvedTask]] = [
# Other models follow this pattern
("ForSequenceClassification", "classify"),
("EmbeddingModel", "embed"),
("RewardModel", "reward"),
]
@@ -878,11 +902,14 @@
self,
task_option: TaskOption,
) -> dict[RunnerType, list[_ResolvedTask]]:
return {
"generate": self._get_supported_generation_tasks(task_option),
"pooling": self._get_supported_pooling_tasks(task_option),
"draft": ["draft"]
}
if self._is_classify_task(self.architectures):
return {"generate": [], "pooling": ["classify"], "draft": []}
else:
return {
"generate": self._get_supported_generation_tasks(task_option),
"pooling": self._get_supported_pooling_tasks(task_option),
"draft": ["draft"]
}

def _get_supported_runner_types(
self,
@@ -925,12 +952,16 @@
f"Available tasks for runner={task_runner!r}: "
f"{supported_tasks[task_runner]}")

if "classify" in supported_tasks.get("pooling", []):
# When multiple pooling tasks are present, default to
# pooling (eg cross-encoder) for non-standard architectures.
return "pooling"

suffix_to_preferred_runner: list[tuple[str, RunnerType]] = [
("ForCausalLM", "generate"),
("ForConditionalGeneration", "generate"),
("ChatModel", "generate"),
("LMHeadModel", "generate"),
("ForSequenceClassification", "pooling"),
("EmbeddingModel", "pooling"),
("RewardModel", "pooling"),
]
@@ -940,10 +971,6 @@
if arch.endswith(suffix) and pref_runner in supported_runner_types:
return pref_runner

if "classify" in supported_tasks.get("pooling", []):
# When multiple pooling tasks are present, default to
# pooling (eg cross-encoder) for non-standard architectures.
return "pooling"
if "generate" in supported_runner_types:
return "generate"
if "pooling" in supported_runner_types:
28 changes: 24 additions & 4 deletions vllm/model_executor/model_loader/utils.py
@@ -22,7 +22,8 @@
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model)
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.utils import is_pin_memory_available

@@ -238,22 +239,41 @@ def get_model_architecture(
vllm_supported_archs = ModelRegistry.get_supported_archs()
vllm_not_supported = not any(arch in vllm_supported_archs
for arch in architectures)

if vllm_not_supported:
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue

assert model_config.task in ["auto", "classify"]
model_config.task = "classify"
new_arch = arch.replace("ForSequenceClassification", "ForCausalLM")
vllm_supported = not any(arch in vllm_supported_archs
for arch in architectures)
if vllm_supported:
Contributor commented: There is something strange here, vllm_supported has the exact same definition as vllm_not_supported.

Suggested change: replace

vllm_supported = not any(arch in vllm_supported_archs
                         for arch in architectures)
if vllm_supported:

with

if vllm_not_supported:

Contributor Author replied: There is indeed a problem with this logic, thank you for pointing it out. (A corrected sketch appears after this diff.)

architectures = [new_arch]
vllm_not_supported = False
break

if (model_config.model_impl == ModelImpl.TRANSFORMERS or
model_config.model_impl != ModelImpl.VLLM and vllm_not_supported):
architectures = resolve_transformers_arch(model_config, architectures)
logger.debug_once("Resolve transformers arch %s", str(architectures))
elif (model_config.quantization is not None
and model_config.quantization not in mixtral_supported
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]

model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
if model_config.task == "embed":
logger.debug_once("Automatic conversion using `as_embedding_model`.")
model_cls = as_embedding_model(model_cls)
elif model_config.task == "classify":
# Cannot automatically run as_seq_cls_model,
# otherwise it will cause a circular reference on is_cross_encoder_model
pass
logger.debug_once("Automatic conversion using `as_seq_cls_model`.")
model_cls = as_seq_cls_model(model_cls)
elif model_config.task == "reward":
logger.debug_once("Automatic conversion using `as_reward_model`.")
model_cls = as_reward_model(model_cls)

return model_cls, arch
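
Following the review thread above, a minimal sketch of what the corrected conversion check might look like, assuming the intent is to verify that the substituted *ForCausalLM architecture is actually registered before enabling the automatic conversion:

# Illustrative rewrite of the branch flagged in review; not the merged code.
for arch in architectures:
    if not arch.endswith("ForSequenceClassification"):
        continue
    new_arch = arch.replace("ForSequenceClassification", "ForCausalLM")
    if new_arch in vllm_supported_archs:
        # Only convert when the causal-LM counterpart is registered.
        assert model_config.task in ["auto", "classify"]
        model_config.task = "classify"
        architectures = [new_arch]
        vllm_not_supported = False
        break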