Add support for encoder embedding models using MultiModal args #20026

Open · maxdebayser wants to merge 26 commits into base: main

Commits (26)
5dee54d
Add support for encoder embedding models
maxdebayser Jun 23, 2025
b430dba
Use multi-modal support to pass token_type_ids to the model
maxdebayser Jun 23, 2025
aad1052
reduce diff
maxdebayser Jun 24, 2025
3ca7ced
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 1, 2025
7006e8a
Fix cuda graphs for BERT models
maxdebayser Jul 1, 2025
c99df96
Add token_type_ids multi-modal to LLM._cross_encoding_score
maxdebayser Jul 2, 2025
bbe0ea7
fix merge problem
maxdebayser Jul 2, 2025
019496a
fix editing mistake
maxdebayser Jul 2, 2025
6558bdd
fix missing input ids
maxdebayser Jul 2, 2025
33bcc88
fix mistake
maxdebayser Jul 2, 2025
a743268
fix tensor not boolean error
maxdebayser Jul 2, 2025
6310f4d
appease mypy
maxdebayser Jul 2, 2025
1d79887
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 2, 2025
611217a
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 2, 2025
024198b
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 8, 2025
c4dc1a8
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 9, 2025
3f79324
Fix missing args
maxdebayser Jul 9, 2025
f3f075a
fix mm flag in registry test
maxdebayser Jul 9, 2025
268099b
remove model from unsupported list
maxdebayser Jul 10, 2025
5470c4e
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 10, 2025
d19dcd4
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 11, 2025
60696b4
appease linter
maxdebayser Jul 11, 2025
0ce2a36
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 11, 2025
00bfc79
lazy import
maxdebayser Jul 11, 2025
2501649
appease linter
maxdebayser Jul 11, 2025
28fb913
appease linter
maxdebayser Jul 11, 2025
19 changes: 5 additions & 14 deletions tests/models/language/pooling/test_embedding.py
@@ -39,22 +39,13 @@ def v1(run_with_both_engines):
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
# CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("intfloat/multilingual-e5-small",
marks=[pytest.mark.skip_v1]),
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
marks=[pytest.mark.skip_v0]),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
],
)
def test_models(
8 changes: 8 additions & 0 deletions tests/models/language/pooling/test_jina.py
@@ -23,6 +23,14 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


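
The autouse v1 wrapper added here (and in test_jina.py above) delegates to the shared run_with_both_engines fixture so each test in the file is executed once per engine. A simplified, hedged sketch of what such a fixture typically does; the real one lives in the shared conftest and handles skip_v0/skip_v1 markers, which this sketch omits:

import pytest


@pytest.fixture(params=[False, True], ids=["v0", "v1"])
def run_with_both_engines(request, monkeypatch):
    # Toggle the V1 engine via the VLLM_USE_V1 environment variable so the
    # dependent test runs once per engine.
    monkeypatch.setenv("VLLM_USE_V1", "1" if request.param else "0")
    yield
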
6 changes: 3 additions & 3 deletions tests/models/test_registry.py
@@ -51,9 +51,9 @@ def test_registry_imports(model_arch):
("LlamaForCausalLM", False, False, False),
("MllamaForConditionalGeneration", True, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
("BertForSequenceClassification", True, False, True),
("RobertaForSequenceClassification", True, False, True),
("XLMRobertaForSequenceClassification", True, False, True),
])
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
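
The flipped first flag records that BERT-style sequence-classification models are now registered as multi-modal, which is what allows token_type_ids to be passed through the multi-modal input path. A quick check against the public registry (sketch):

from vllm import ModelRegistry

# After this PR, BERT-style cross-encoders report themselves as multi-modal.
assert ModelRegistry.is_multimodal_model("BertForSequenceClassification")
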
9 changes: 8 additions & 1 deletion tests/v1/core/test_kv_cache_utils.py
@@ -5,6 +5,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -61,6 +62,7 @@ def new_kv_cache_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -75,6 +77,7 @@ def new_sliding_window_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -534,6 +537,7 @@ def test_merge_kv_cache_spec():
head_size=full_spec.head_size,
dtype=full_spec.dtype,
use_mla=full_spec.use_mla,
attn_type=AttentionType.DECODER,
sliding_window=1,
),
]
@@ -603,6 +607,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)
# Estimate the maximum model length, 16384 model_len need 8GB
estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
@@ -638,6 +643,7 @@ def test_get_max_concurrency_for_kv_cache_config():
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)

sliding_window_spec = SlidingWindowSpec(
@@ -646,6 +652,7 @@ def test_get_max_concurrency_for_kv_cache_config():
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
sliding_window=1024,
)

@@ -916,4 +923,4 @@ def test_get_kv_cache_config():
],
kv_cache_groups=[
KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
])
])
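
Every KV-cache spec constructed in these tests now passes an explicit attn_type, since V1 must distinguish decoder attention from the encoder-only attention used by the newly supported models. A hedged construction example; field values are illustrative and the FullAttentionSpec import path is assumed from the V1 KV-cache interface module used by these tests:

import torch

from vllm.attention import AttentionType
from vllm.v1.kv_cache_interface import FullAttentionSpec  # import path assumed

spec = FullAttentionSpec(
    block_size=16,
    num_kv_heads=1,
    head_size=64,
    dtype=torch.float16,
    use_mla=False,
    # Decoder attention for ordinary causal LMs; encoder-only models would
    # use a different AttentionType member.
    attn_type=AttentionType.DECODER,
)
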
18 changes: 16 additions & 2 deletions tests/v1/core/test_prefix_caching.py
@@ -8,6 +8,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -53,7 +54,12 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
kv_cache_groups=[
KVCacheGroupSpec(
["layer"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
)
],
)
@@ -67,7 +73,12 @@ def make_kv_cache_config_hybrid_model(block_size: int,
kv_cache_groups=[
KVCacheGroupSpec(
["layer1"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
),
KVCacheGroupSpec(
["layer2"],
@@ -76,6 +87,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
KVCacheGroupSpec(
@@ -85,6 +97,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
],
@@ -1218,6 +1231,7 @@ def test_eagle_with_sliding_window():
dtype=torch.float32,
sliding_window=block_size,
use_mla=False,
attn_type=AttentionType.DECODER,
)
manager = KVCacheManager(
KVCacheConfig(
5 changes: 3 additions & 2 deletions tests/v1/core/test_scheduler.py
@@ -6,6 +6,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -104,7 +105,7 @@ def create_scheduler(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
@@ -1354,7 +1355,7 @@ def create_scheduler_with_priority(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
4 changes: 4 additions & 0 deletions tests/v1/core/test_specialized_manager.py
@@ -3,6 +3,7 @@

import torch

from vllm.attention import AttentionType
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
KVCacheBlock)
@@ -26,6 +27,7 @@ def test_sliding_window_possible_cached_prefix():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
@@ -92,6 +94,7 @@ def test_sliding_window_remove_skipped_blocks():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
@@ -160,6 +163,7 @@ def test_get_num_blocks_to_allocate():
dtype=torch.float32,
sliding_window=4, # Placeholder value, not related to test result
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
4 changes: 3 additions & 1 deletion tests/v1/engine/test_engine_core_client.py
@@ -16,6 +16,7 @@

from tests.utils import multi_gpu_test
from vllm import SamplingParams
from vllm.attention import AttentionType
from vllm.distributed.kv_events import (BlockStored, KVEventBatch,
ZmqEventPublisher)
from vllm.engine.arg_utils import EngineArgs
@@ -544,7 +545,8 @@ def create_mock_executor(vllm_config):
num_kv_heads=1,
head_size=64,
dtype=torch.float16,
use_mla=False)
use_mla=False,
attn_type=AttentionType.DECODER)

mock_executor.get_kv_cache_specs.return_value = [{
"default": mock_spec
2 changes: 1 addition & 1 deletion tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -2,11 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import re

import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import regex as re
import requests

from tests.utils import RemoteOpenAIServer
3 changes: 2 additions & 1 deletion tests/v1/kv_connector/unit/utils.py
@@ -7,6 +7,7 @@
import torch

from vllm import SamplingParams
from vllm.attention import AttentionType
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
ModelConfig, SchedulerConfig, VllmConfig)
from vllm.distributed.kv_transfer.kv_connector.factory import (
@@ -106,7 +107,7 @@ def create_scheduler(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
vllm_config.cache_config.num_gpu_blocks = num_blocks
1 change: 0 additions & 1 deletion tests/v1/test_oracle.py
@@ -13,7 +13,6 @@
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"state-spaces/mamba-130m-hf", # mamba1
"BAAI/bge-m3", # embedding
]

MODEL = "meta-llama/Llama-3.2-1B-Instruct"
3 changes: 2 additions & 1 deletion tests/v1/worker/test_gpu_model_runner.py
@@ -6,7 +6,7 @@
import pytest
import torch

from vllm.attention import Attention
from vllm.attention import Attention, AttentionType
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig, VllmConfig, set_current_vllm_config)
from vllm.platforms import current_platform
@@ -38,6 +38,7 @@ def initialize_kv_cache(runner: GPUModelRunner):
head_size=runner.model_config.get_head_size(),
dtype=runner.kv_cache_dtype,
use_mla=False,
attn_type=AttentionType.DECODER,
)
tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS
kv_cache_config = KVCacheConfig(
3 changes: 2 additions & 1 deletion vllm/engine/arg_utils.py
@@ -1744,7 +1744,8 @@ def _set_default_args_v1(self, usage_context: UsageContext,

if (self.max_num_seqs is None
and usage_context in default_max_num_seqs):
self.max_num_seqs = default_max_num_seqs[usage_context]
self.max_num_seqs = min(default_max_num_seqs[usage_context],
self.max_num_batched_tokens or sys.maxsize)

logger.debug("Setting max_num_seqs to %d for %s usage context.",
self.max_num_seqs, use_context_value)
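
The change above caps the default max_num_seqs at max_num_batched_tokens, since a batch can never hold more sequences than it has token slots. The logic, extracted as a tiny sketch (the helper name is illustrative):

import sys
from typing import Optional


def clamped_default_max_num_seqs(usage_default: int,
                                 max_num_batched_tokens: Optional[int]) -> int:
    # Never default to more sequences than there are batched-token slots;
    # fall back to the usage-context default when max_num_batched_tokens
    # is unset.
    return min(usage_default, max_num_batched_tokens or sys.maxsize)


assert clamped_default_max_num_seqs(1024, 512) == 512
assert clamped_default_max_num_seqs(1024, None) == 1024
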
43 changes: 11 additions & 32 deletions vllm/entrypoints/llm.py
@@ -1295,39 +1295,18 @@ def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if self.llm_engine.model_config.is_multimodal_model:

model_config = self.llm_engine.model_config

for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

parsed_prompts.append(engine_prompt)

else:
model_config = self.llm_engine.model_config

for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

for q, t in input_pairs:
if self.llm_engine.model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
text_pair=t, # type: ignore[arg-type]
**tokenization_kwargs)
else:
# `llm as reranker` models defaults to not using pad_token.
prompt_inputs = tokenizer(
text=q + t, # type: ignore[operator]
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)
parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
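
As the deletions above show, the pad_token-specific tokenizer branch is removed and prompt construction goes through get_score_prompt for both multimodal and text-only cross-encoders, with token_type_ids carried as multi-modal data rather than as a dedicated TokensPrompt field. From the caller's side, scoring is unchanged; a hedged end-to-end sketch, where the model name is an example and LLM.score() with task="score" is the existing public entrypoint whose internals this hunk simplifies:

from vllm import LLM

llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")
outputs = llm.score(
    "What is the capital of France?",
    ["The capital of France is Paris.",
     "The capital of Germany is Berlin."],
)
# One relevance score per (query, document) pair.
print([o.outputs.score for o in outputs])
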