vllm-project · noooop · Jul 14, 2025 · Jul 14, 2025 · Jul 14, 2025 · Jul 14, 2025
@@ -263,7 +263,6 @@ def check_available_online(
     "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
-    "Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"),  # noqa: E501
     "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
     "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),  # noqa: E501
     "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
@@ -290,7 +289,6 @@ def check_available_online(
     # [Text-only]
     "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True),
     "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True),  # noqa: E501
-    "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"),  # noqa: E501
     "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
     "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                                                trust_remote_code=True),
@@ -299,7 +297,6 @@ def check_available_online(
                                    hf_overrides={"architectures": ["GteNewModel"]}),  # noqa: E501
     "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
                                                trust_remote_code=True),
-    "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
     "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
     "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
@@ -309,7 +306,6 @@ def check_available_online(
     "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
     "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
     "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
-    "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),  # noqa: E501
     "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True),  # noqa: E501
     "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True),  # noqa: E501
     "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True),  # noqa: E501
@@ -325,12 +321,6 @@ def check_available_online(
 _CROSS_ENCODER_EXAMPLE_MODELS = {
     # [Text-only]
     "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True),  # noqa: E501
-    "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma", # noqa: E501
-                                                      v0_only=True,
-                                                      hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
-                                                                    "classifier_from_token": ["Yes"], # noqa: E501
-                                                                    "method": "no_post_processing"}), # noqa: E501
-    "LlamaForSequenceClassification": _HfExamplesInfo("Skywork/Skywork-Reward-V2-Llama-3.2-1B"), # noqa: E501
     "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501
     "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True),  # noqa: E501
     "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True),  # noqa: E501
@@ -444,6 +434,20 @@ def check_available_online(
     "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),   # noqa: E501
 }
 
+_AUTOMATIC_CONVERED_MODELS = {
+    # Use as_seq_cls_model for automatic conversion
+    "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma",  # noqa: E501
+                                                      v0_only=True,
+                                                      hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
+                                                                    "classifier_from_token": ["Yes"],  # noqa: E501
+                                                                    "method": "no_post_processing"}),  # noqa: E501
+    "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"),  # noqa: E501
+    "LlamaForSequenceClassification": _HfExamplesInfo("Skywork/Skywork-Reward-V2-Llama-3.2-1B"),  # noqa: E501
+    "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
+    "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),  # noqa: E501
+    "Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"),  # noqa: E501
+}
+
 _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
     "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m",
                                   speculative_model="abhigoyal/vllm-eagle-llama-68m-random"),  # noqa: E501
@@ -511,4 +515,5 @@ def find_hf_info(self, model_id: str) -> _HfExamplesInfo:
         raise ValueError(f"No example model defined for {model_id}")
 
 
-HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
+HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
+AUTO_EXAMPLE_MODELS = HfExampleModels(_AUTOMATIC_CONVERED_MODELS)
@@ -13,10 +13,13 @@
 from vllm.v1.engine.core import EngineCore as V1EngineCore
 
 from ..utils import create_new_process_for_each_test
-from .registry import HF_EXAMPLE_MODELS
+from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
 
 
-@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
+@pytest.mark.parametrize(
+    "model_arch",
+    HF_EXAMPLE_MODELS.get_supported_archs() +
+    AUTO_EXAMPLE_MODELS.get_supported_archs())
 @create_new_process_for_each_test()
 def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     """The reason for using create_new_process_for_each_test is to avoid 

@@ -551,7 +551,7 @@ def __post_init__(self) -> None:
         # For pooling models, self.task is used to indicate the
         # user-selected task
         if self.task == "score":
-            if self.registry.is_cross_encoder_model(self.architectures):
+            if self._is_classify_task(self.architectures):
                 self.task = "classify"
             else:
                 self.task = "embed"
@@ -569,7 +569,6 @@ def __post_init__(self) -> None:
         runner_type = self._resolve_runner(self.runner, self.task,
                                            supported_runner_types,
                                            all_supported_tasks)
-
         logger.debug("Selected runner type: %s", runner_type)
         # For pooling models, self.task is used to indicate the
         # user-selected task
@@ -806,21 +805,24 @@ def _verify_tokenizer_mode(self) -> None:
                 f"one of {get_args(TokenizerMode)}.")
         self.tokenizer_mode = tokenizer_mode
 
+    def _is_classify_task(self, architectures):
+        """True for *ForSequenceClassification archs or cross-encoders."""
+        seq_cls_suffix = "ForSequenceClassification"
+        seq_cls = any(a.endswith(seq_cls_suffix) for a in architectures)
+        return seq_cls or self.registry.is_cross_encoder_model(architectures)
+
     def _get_preferred_pooling_task(
         self,
         architectures: list[str],
     ) -> _ResolvedTask:
         model_id = self.model
         if get_pooling_config(model_id, self.revision):
             return "embed"
-        if self.registry.is_cross_encoder_model(architectures):
-            return "classify"
         if self.registry.is_transcription_model(architectures):
             return "transcription"
 
         suffix_to_preferred_task: list[tuple[str, _ResolvedTask]] = [
             # Other models follow this pattern
-            ("ForSequenceClassification", "classify"),
             ("EmbeddingModel", "embed"),
             ("RewardModel", "reward"),
         ]
@@ -878,11 +880,14 @@ def _get_supported_tasks(
         self,
         task_option: TaskOption,
     ) -> dict[RunnerType, list[_ResolvedTask]]:
-        return {
-            "generate": self._get_supported_generation_tasks(task_option),
-            "pooling": self._get_supported_pooling_tasks(task_option),
-            "draft": ["draft"]
-        }
+        # Classification models are pooling-only: no generate/draft tasks.
+        if self._is_classify_task(self.architectures):
+            return {"generate": [], "pooling": ["classify"], "draft": []}
+        return {
+            "generate": self._get_supported_generation_tasks(task_option),
+            "pooling": self._get_supported_pooling_tasks(task_option),
+            "draft": ["draft"],
+        }
 
     def _get_supported_runner_types(
         self,
@@ -925,12 +930,16 @@ def _resolve_runner(
                     f"Available tasks for runner={task_runner!r}: "
                     f"{supported_tasks[task_runner]}")
 
+        if "classify" in supported_tasks.get("pooling", []):
+            # Any model that supports "classify" uses the pooling runner;
+            # this is checked before the architecture-suffix table below.
+            return "pooling"
+
         suffix_to_preferred_runner: list[tuple[str, RunnerType]] = [
             ("ForCausalLM", "generate"),
             ("ForConditionalGeneration", "generate"),
             ("ChatModel", "generate"),
             ("LMHeadModel", "generate"),
-            ("ForSequenceClassification", "pooling"),
             ("EmbeddingModel", "pooling"),
             ("RewardModel", "pooling"),
         ]
@@ -940,10 +949,6 @@ def _resolve_runner(
             if arch.endswith(suffix) and pref_runner in supported_runner_types:
                 return pref_runner
 
-        if "classify" in supported_tasks.get("pooling", []):
-            # When multiple pooling tasks are present, default to
-            # pooling (eg cross-encoder) for non-standard architectures.
-            return "pooling"
         if "generate" in supported_runner_types:
             return "generate"
         if "pooling" in supported_runner_types:

diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
@@ -22,7 +22,7 @@
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.adapters import (as_embedding_model,
-                                                 as_reward_model)
+                                                 as_reward_model, as_seq_cls_model)
 from vllm.model_executor.models.interfaces import SupportsQuant
 from vllm.utils import is_pin_memory_available
 
@@ -238,6 +238,25 @@ def get_model_architecture(
     vllm_supported_archs = ModelRegistry.get_supported_archs()
     vllm_not_supported = not any(arch in vllm_supported_archs
                                  for arch in architectures)
+
+    if vllm_not_supported:
+        # Try automatic conversion via as_seq_cls_model (adapters.py).
+        for arch in architectures:
+            if not arch.endswith("ForSequenceClassification"):
+                continue
+            causal_lm_arch = arch.replace("ForSequenceClassification",
+                                          "ForCausalLM")
+            # Convert only if vLLM implements the causal-LM backbone.
+            if causal_lm_arch not in vllm_supported_archs:
+                continue
+
+            assert model_config.task in ["auto", "classify"]
+            model_config.task = "classify"
+            logger.info("Automatic conversion %s -> %s", causal_lm_arch, arch)
+            architectures = [causal_lm_arch]
+            vllm_not_supported = False
+            break
+
     if (model_config.model_impl == ModelImpl.TRANSFORMERS or
             model_config.model_impl != ModelImpl.VLLM and vllm_not_supported):
         architectures = resolve_transformers_arch(model_config, architectures)
@@ -250,9 +269,7 @@ def get_model_architecture(
     if model_config.task == "embed":
         model_cls = as_embedding_model(model_cls)
     elif model_config.task == "classify":
-        # Cannot automatically run as_seq_cls_model,
-        # otherwise it will cause a circular reference on is_cross_encoder_model
-        pass
+        model_cls = as_seq_cls_model(model_cls)
     elif model_config.task == "reward":
         model_cls = as_reward_model(model_cls)
 

diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
@@ -193,6 +193,7 @@ def __init__(
 
             config = vllm_config.model_config.hf_config
             quant_config = vllm_config.quant_config
+            score_bias: bool = getattr(config, 'score_bias', False)
 
             self.vllm_config = vllm_config
             self.task = vllm_config.model_config.task
@@ -203,7 +204,7 @@ def __init__(
                                            config.num_labels,
                                            quant_config=quant_config,
                                            input_is_parallel=False,
-                                           bias=False,
+                                           bias=score_bias,
                                            prefix=maybe_prefix(
                                                prefix, "score"))
 
@@ -349,13 +350,13 @@ def load_weights_using_from_2_way_softmax(
 
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
     true_id = tokenizer.convert_tokens_to_ids(tokens[1])
-    weight = model.lm_head.weight.data[[true_id]].to(
+    score_weight = model.lm_head.weight.data[[true_id]].to(
         torch.float32) - model.lm_head.weight.data[[false_id]].to(
             torch.float32)
 
     param = model.score.weight
     weight_loader = getattr(param, "weight_loader", default_weight_loader)
-    weight_loader(param, weight)
+    weight_loader(param, score_weight)
 
     del model.lm_head
     loaded_weights.add("score.weight")
@@ -368,15 +369,15 @@ def load_weights_no_post_processing(model,
                                                             torch.Tensor]]):
     from vllm.model_executor.layers.vocab_parallel_embedding import (
         ParallelLMHead)
+    from vllm.model_executor.model_loader.weight_utils import (
+        default_weight_loader)
     from vllm.model_executor.models.utils import AutoWeightsLoader
 
     model_config = model.vllm_config.model_config
     tokens = getattr(model.config, "classifier_from_token", [])
     tokens = cast(list[int], tokens)
     assert len(tokens) > 0
 
-    device = model.score.weight.device
-
     if model.config.tie_word_embeddings:
         model.lm_head = model.model.embed_tokens
     else:
@@ -394,8 +395,11 @@ def load_weights_no_post_processing(model,
                               trust_remote_code=model_config.trust_remote_code)
 
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
-    score_weight = model.lm_head.weight.data[token_ids].to(device)
-    model.score.weight.data.copy_(score_weight)
+    score_weight = model.lm_head.weight.data[token_ids]
+
+    param = model.score.weight
+    weight_loader = getattr(param, "weight_loader", default_weight_loader)
+    weight_loader(param, score_weight)
 
     del model.lm_head
     loaded_weights.add("score.weight")

diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
@@ -43,7 +43,6 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .adapters import as_seq_cls_model
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
@@ -426,6 +425,3 @@ def load_weights(self, weights: Iterable[tuple[str,
                            if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
-
-
-GemmaForSequenceClassification = as_seq_cls_model(GemmaForCausalLM)
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
@@ -19,16 +19,14 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
-from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
 
 from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
@@ -558,35 +556,3 @@ def _is_moe_layer(name: str):
             "experts",
             "router",
         ]])
-
-
-class JambaForSequenceClassification(JambaForCausalLM):
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config, prefix=prefix)
-        config = vllm_config.model_config.hf_config
-        num_labels: int = config.num_labels
-        score_bias: bool = getattr(config, 'score_bias', False)
-        self.score = nn.Linear(config.hidden_size, num_labels, bias=score_bias)
-
-        pooler_config = vllm_config.model_config.pooler_config
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.LAST,
-            normalize=False,
-            softmax=False)
-
-    def pooler(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> Optional[PoolerOutput]:
-        hidden_states = hidden_states.float()
-        logits = self.score(hidden_states)
-        return self._pooler(logits, pooling_metadata)
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        # TODO: The reward weights themselves have float32 accuracy data, we
-        # would like to load them in fp32 to get that extra precision.
-        super().load_weights(weights)
-        self.score = self.score.float()
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
@@ -49,7 +49,6 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .adapters import as_seq_cls_model
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
                     is_pp_missing_parameter,
@@ -646,6 +645,3 @@ def permute(w: torch.Tensor, n_heads: int):
                 name = name.replace(item, mapping[item])
 
         return name, loaded_weight
-
-
-LlamaForSequenceClassification = as_seq_cls_model(LlamaForCausalLM)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
@@ -50,7 +50,6 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .adapters import as_seq_cls_model
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
                     is_pp_missing_parameter,
@@ -496,6 +495,3 @@ def load_weights(self, weights: Iterable[tuple[str,
                            if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
-
-
-Qwen2ForSequenceClassification = as_seq_cls_model(Qwen2ForCausalLM)