Add support for encoder embedding models #19988

Open

maxdebayser wants to merge 24 commits into main from v1_embeddings_full.

Changes from all commits (24 commits)
5dee54d  Add support for encoder embedding models (maxdebayser, Jun 23, 2025)
7eb9d28  Fix CUDA graphs for BERT models (maxdebayser, Jul 1, 2025)
67691e0  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 1, 2025)
d3099a9  Fix cuda graph initialization of token type ids (maxdebayser, Jul 1, 2025)
613ff3b  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 2, 2025)
20c41e4  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 2, 2025)
ba86026  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 8, 2025)
b4f5ead  Fix missing args (maxdebayser, Jul 9, 2025)
c4060d1  relax assertion (maxdebayser, Jul 9, 2025)
01d2a65  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 9, 2025)
80930d8  fix missing arg (maxdebayser, Jul 9, 2025)
d881f0a  fix missing arg (maxdebayser, Jul 10, 2025)
90a25d0  remove model from unsupported list (maxdebayser, Jul 10, 2025)
6686550  fix missing arg (maxdebayser, Jul 10, 2025)
cc76777  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 10, 2025)
136c9b3  fix tests (maxdebayser, Jul 10, 2025)
b232491  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 14, 2025)
cf5e6b8  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 16, 2025)
e19c738  fix tests (maxdebayser, Jul 16, 2025)
e255f30  fix tests (maxdebayser, Jul 16, 2025)
ee5950c  add missing arg (maxdebayser, Jul 16, 2025)
78a2e57  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 16, 2025)
a5cfc84  add missing arg (maxdebayser, Jul 16, 2025)
63fd783  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 16, 2025)
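
Taken together, these commits let encoder-only embedding models (e.g. BAAI/bge-base-en-v1.5) run on the V1 engine; the test diffs below mostly thread token_type_ids and an explicit attention type through the V1 request and KV-cache plumbing. A minimal offline-usage sketch, assuming the standard LLM entrypoint and the task="embed" option from current vLLM releases (this snippet is not part of the diff):

```python
# Minimal sketch: offline embedding with an encoder-only model on vLLM.
# Assumes the public `LLM` API and `task="embed"`; exact flags may differ.
from vllm import LLM

llm = LLM(model="BAAI/bge-base-en-v1.5", task="embed")

outputs = llm.embed([
    "The capital of France is Paris.",
    "The capital of Germany is Berlin.",
])

for out in outputs:
    vec = out.outputs.embedding  # one embedding vector per prompt
    print(len(vec))
```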
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_rerank.py
@@ -124,4 +124,4 @@ def test_invocations(server: RemoteOpenAIServer):
invocation_output["results"]):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01)
invocations_result["relevance_score"], rel=0.05)
14 changes: 3 additions & 11 deletions tests/models/language/pooling/test_embedding.py
@@ -39,17 +39,9 @@ def v1(run_with_both_engines):
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
# CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("intfloat/multilingual-e5-small",
marks=[pytest.mark.skip_v1]),
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
# [Cross-Encoder]
8 changes: 8 additions & 0 deletions tests/models/language/pooling/test_jina.py
@@ -23,6 +23,14 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
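
This file and tests/models/language/pooling/test_scoring.py below gain the same autouse wrapper around run_with_both_engines. The fixture itself lives in conftest.py and is not part of this diff; a rough sketch of the usual pattern, assuming engine selection via the VLLM_USE_V1 environment variable (an assumption for illustration, not code from this PR):

```python
# Hypothetical sketch of a run_with_both_engines-style fixture.
# Assumes V0/V1 selection is driven by VLLM_USE_V1 and that tests may be
# marked skip_v0/skip_v1; the real conftest.py may differ.
import pytest


@pytest.fixture(params=[True, False], ids=["v1", "v0"])
def run_with_both_engines(request, monkeypatch):
    use_v1 = request.param
    # Honor per-test markers that declare one engine unsupported.
    if use_v1 and request.node.get_closest_marker("skip_v1"):
        pytest.skip("marked as unsupported on the V1 engine")
    if not use_v1 and request.node.get_closest_marker("skip_v0"):
        pytest.skip("marked as unsupported on the V0 engine")
    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
    yield
```

With something like that in place, the one-line autouse v1 fixture in each test module simply forces every test in the file to run once per engine.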
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


21 changes: 11 additions & 10 deletions tests/tokenization/test_detokenize.py
@@ -61,16 +61,17 @@ def _run_incremental_decode(tokenizer,
skip_special_tokens=skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
request = EngineCoreRequest("",
prompt_token_ids,
None,
None,
None,
params,
None,
None,
0.0,
None,
request = EngineCoreRequest(request_id="",
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None)

8 changes: 8 additions & 0 deletions tests/v1/core/test_kv_cache_utils.py
@@ -5,6 +5,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -40,6 +41,7 @@ def make_request(request_id,
return Request(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
@@ -62,6 +64,7 @@ def new_kv_cache_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -76,6 +79,7 @@ def new_sliding_window_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -544,6 +548,7 @@ def test_merge_kv_cache_spec():
head_size=full_spec.head_size,
dtype=full_spec.dtype,
use_mla=full_spec.use_mla,
attn_type=AttentionType.DECODER,
sliding_window=1,
),
]
@@ -613,6 +618,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)
# Estimate the maximum model length, 16384 model_len need 8GB
estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
@@ -648,6 +654,7 @@ def test_get_max_concurrency_for_kv_cache_config():
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)

sliding_window_spec = SlidingWindowSpec(
@@ -656,6 +663,7 @@
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
sliding_window=1024,
)

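
Every KV-cache spec in these tests now passes an explicit attn_type, since the V1 cache plumbing has to distinguish decoder attention from the encoder-only attention used by BERT-style embedding models. A small sketch of the distinction; the keyword arguments follow the test code above, while the ENCODER_ONLY usage and import paths are assumptions about current vLLM rather than lines from this diff:

```python
# Sketch: the same spec type, parameterized by attention type.
import torch

from vllm.attention import AttentionType
from vllm.v1.kv_cache_interface import FullAttentionSpec

# Decoder-style attention, as constructed throughout the tests in this PR.
decoder_spec = FullAttentionSpec(
    block_size=16,
    num_kv_heads=1,
    head_size=64,
    dtype=torch.float16,
    use_mla=False,
    attn_type=AttentionType.DECODER,
)

# For an encoder-only model such as BAAI/bge-base-en-v1.5, one would expect
# ENCODER_ONLY here (no KV reuse across decode steps); assumption, not shown
# in this diff.
encoder_spec = FullAttentionSpec(
    block_size=16,
    num_kv_heads=1,
    head_size=64,
    dtype=torch.float16,
    use_mla=False,
    attn_type=AttentionType.ENCODER_ONLY,
)
```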
19 changes: 17 additions & 2 deletions tests/v1/core/test_prefix_caching.py
@@ -8,6 +8,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -35,6 +36,7 @@ def make_request(request_id,
return Request(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
@@ -54,7 +56,12 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
kv_cache_groups=[
KVCacheGroupSpec(
["layer"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
)
],
)
@@ -68,7 +75,12 @@ def make_kv_cache_config_hybrid_model(block_size: int,
kv_cache_groups=[
KVCacheGroupSpec(
["layer1"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
),
KVCacheGroupSpec(
["layer2"],
@@ -77,6 +89,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
KVCacheGroupSpec(
@@ -86,6 +99,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
],
@@ -1222,6 +1236,7 @@ def test_eagle_with_sliding_window():
dtype=torch.float32,
sliding_window=block_size,
use_mla=False,
attn_type=AttentionType.DECODER,
)
manager = KVCacheManager(
KVCacheConfig(
5 changes: 4 additions & 1 deletion tests/v1/core/test_scheduler.py
@@ -6,6 +6,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -1290,7 +1291,7 @@ def create_scheduler_with_priority(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
@@ -1333,6 +1334,7 @@ def create_requests_with_priority(
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
token_type_ids=None,
sampling_params=sampling_params,
pooling_params=None,
multi_modal_inputs=mm_inputs,
@@ -1819,6 +1821,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
request = Request(
request_id="0",
prompt_token_ids=[0, 1],
token_type_ids=None,
multi_modal_inputs=None,
multi_modal_hashes=None,
multi_modal_placeholders=None,
4 changes: 4 additions & 0 deletions tests/v1/core/test_specialized_manager.py
@@ -3,6 +3,7 @@

import torch

from vllm.attention import AttentionType
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
KVCacheBlock)
@@ -26,6 +27,7 @@ def test_sliding_window_possible_cached_prefix():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
@@ -92,6 +94,7 @@ def test_sliding_window_remove_skipped_blocks():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
@@ -160,6 +163,7 @@ def test_get_num_blocks_to_allocate():
dtype=torch.float32,
sliding_window=4, # Placeholder value, not related to test result
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
4 changes: 3 additions & 1 deletion tests/v1/core/utils.py
@@ -4,6 +4,7 @@

import torch

from vllm.attention import AttentionType
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -102,7 +103,7 @@ def create_scheduler(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
@@ -141,6 +142,7 @@ def create_requests(
request = Request(
request_id=f"{i}",
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
sampling_params=sampling_params,
pooling_params=None,
multi_modal_inputs=mm_inputs,
1 change: 1 addition & 0 deletions tests/v1/engine/test_engine_core.py
@@ -35,6 +35,7 @@ def make_request() -> EngineCoreRequest:
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
prompt_token_ids=PROMPT_TOKENS,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
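
token_type_ids is threaded through the request path so that BERT-style models can distinguish the two segments of a sentence pair, e.g. for cross-encoder scoring. A hedged sketch of what a pooling request with token types might look like, using the keyword form from the tests/tokenization/test_detokenize.py hunk above; the concrete token IDs and the PoolingParams usage are illustrative, not taken from this diff:

```python
# Sketch only: field names follow the keyword form used in this PR's tests;
# the token IDs below are made up for illustration.
from vllm.pooling_params import PoolingParams
from vllm.v1.engine import EngineCoreRequest

request = EngineCoreRequest(
    request_id="score-0",
    prompt_token_ids=[101, 2054, 2003, 102, 1037, 3231, 102],  # [CLS] q [SEP] d [SEP]
    token_type_ids=[0, 0, 0, 0, 1, 1, 1],  # segment A vs. segment B
    mm_inputs=None,
    mm_hashes=None,
    mm_placeholders=None,
    sampling_params=None,
    pooling_params=PoolingParams(),
    eos_token_id=None,
    arrival_time=0.0,
    lora_request=None,
    cache_salt=None,
    data_parallel_rank=None,
)
```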
5 changes: 4 additions & 1 deletion tests/v1/engine/test_engine_core_client.py
@@ -16,6 +16,7 @@

from tests.utils import multi_gpu_test
from vllm import SamplingParams
from vllm.attention import AttentionType
from vllm.distributed.kv_events import (BlockStored, KVEventBatch,
ZmqEventPublisher)
from vllm.engine.arg_utils import EngineArgs
@@ -51,6 +52,7 @@ def make_request(
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
prompt_token_ids=prompt_tokens_ids,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
@@ -544,7 +546,8 @@ def create_mock_executor(vllm_config):
num_kv_heads=1,
head_size=64,
dtype=torch.float16,
use_mla=False)
use_mla=False,
attn_type=AttentionType.DECODER)

mock_executor.get_kv_cache_specs.return_value = [{
"default": mock_spec
1 change: 1 addition & 0 deletions tests/v1/engine/test_fast_incdec_prefix_err.py
@@ -31,6 +31,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
None,
None,
None,
None,
params,
None,
None,
5 changes: 5 additions & 0 deletions tests/v1/engine/test_output_processor.py
@@ -52,6 +52,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
requests = [
EngineCoreRequest(request_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -401,6 +402,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
requests = [
EngineCoreRequest(request_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -566,6 +568,7 @@ def test_stop_token(include_stop_str_in_output: bool,
request = EngineCoreRequest(
request_id=request_id,
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -665,6 +668,7 @@ def test_stop_string(include_stop_str_in_output: bool,
EngineCoreRequest(
request_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -781,6 +785,7 @@ def test_iteration_stats(dummy_test_vectors):
EngineCoreRequest(
request_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,