Add support for encoder embedding models #19988

Open

maxdebayser wants to merge 24 commits into main from v1_embeddings_full.

Changes from all commits (24 commits)
5dee54d  Add support for encoder embedding models (maxdebayser, Jun 23, 2025)
7eb9d28  Fix CUDA graphs for BERT models (maxdebayser, Jul 1, 2025)
67691e0  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 1, 2025)
d3099a9  Fix cuda graph initialization of token type ids (maxdebayser, Jul 1, 2025)
613ff3b  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 2, 2025)
20c41e4  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 2, 2025)
ba86026  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 8, 2025)
b4f5ead  Fix missing args (maxdebayser, Jul 9, 2025)
c4060d1  relax assertion (maxdebayser, Jul 9, 2025)
01d2a65  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 9, 2025)
80930d8  fix missing arg (maxdebayser, Jul 9, 2025)
d881f0a  fix missing arg (maxdebayser, Jul 10, 2025)
90a25d0  remove model from unsupported list (maxdebayser, Jul 10, 2025)
6686550  fix missing arg (maxdebayser, Jul 10, 2025)
cc76777  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 10, 2025)
136c9b3  fix tests (maxdebayser, Jul 10, 2025)
b232491  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 14, 2025)
cf5e6b8  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 16, 2025)
e19c738  fix tests (maxdebayser, Jul 16, 2025)
e255f30  fix tests (maxdebayser, Jul 16, 2025)
ee5950c  add missing arg (maxdebayser, Jul 16, 2025)
78a2e57  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 16, 2025)
a5cfc84  add missing arg (maxdebayser, Jul 16, 2025)
63fd783  Merge branch 'upstream_main' into v1_embeddings_full (maxdebayser, Jul 16, 2025)
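
Taken together, these commits let encoder-only embedding models (e.g. BAAI/bge-base-en-v1.5) run on the V1 engine; the test diffs below mostly thread token_type_ids and an explicit attention type through the V1 request and KV-cache plumbing. A minimal offline-usage sketch, assuming the standard LLM entrypoint and the task="embed" option from current vLLM releases (this snippet is not part of the diff):

```python
# Minimal sketch: offline embedding with an encoder-only model on vLLM.
# Assumes the public `LLM` API and `task="embed"`; exact flags may differ.
from vllm import LLM

llm = LLM(model="BAAI/bge-base-en-v1.5", task="embed")

outputs = llm.embed([
    "The capital of France is Paris.",
    "The capital of Germany is Berlin.",
])

for out in outputs:
    vec = out.outputs.embedding  # one embedding vector per prompt
    print(len(vec))
```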
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_rerank.py
@@ -124,4 +124,4 @@ def test_invocations(server: RemoteOpenAIServer):
invocation_output["results"]):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01)
invocations_result["relevance_score"], rel=0.05)
14 changes: 3 additions & 11 deletions tests/models/language/pooling/test_embedding.py
@@ -39,17 +39,9 @@ def v1(run_with_both_engines):
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
# CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("intfloat/multilingual-e5-small",
marks=[pytest.mark.skip_v1]),
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
# [Cross-Encoder]
8 changes: 8 additions & 0 deletions tests/models/language/pooling/test_jina.py
@@ -23,6 +23,14 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
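
This file and tests/models/language/pooling/test_scoring.py below gain the same autouse wrapper around run_with_both_engines. The fixture itself lives in conftest.py and is not part of this diff; a rough sketch of the usual pattern, assuming engine selection via the VLLM_USE_V1 environment variable (an assumption for illustration, not code from this PR):

```python
# Hypothetical sketch of a run_with_both_engines-style fixture.
# Assumes V0/V1 selection is driven by VLLM_USE_V1 and that tests may be
# marked skip_v0/skip_v1; the real conftest.py may differ.
import pytest


@pytest.fixture(params=[True, False], ids=["v1", "v0"])
def run_with_both_engines(request, monkeypatch):
    use_v1 = request.param
    # Honor per-test markers that declare one engine unsupported.
    if use_v1 and request.node.get_closest_marker("skip_v1"):
        pytest.skip("marked as unsupported on the V1 engine")
    if not use_v1 and request.node.get_closest_marker("skip_v0"):
        pytest.skip("marked as unsupported on the V0 engine")
    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
    yield
```

With something like that in place, the one-line autouse v1 fixture in each test module simply forces every test in the file to run once per engine.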
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


21 changes: 11 additions & 10 deletions tests/tokenization/test_detokenize.py
@@ -61,16 +61,17 @@ def _run_incremental_decode(tokenizer,
skip_special_tokens=skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
request = EngineCoreRequest("",
prompt_token_ids,
None,
None,
None,
params,
None,
None,
0.0,
None,
request = EngineCoreRequest(request_id="",
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None)

8 changes: 8 additions & 0 deletions tests/v1/core/test_kv_cache_utils.py
@@ -5,6 +5,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -40,6 +41,7 @@ def make_request(request_id,
return Request(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
@@ -62,6 +64,7 @@ def new_kv_cache_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -76,6 +79,7 @@ def new_sliding_window_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -544,6 +548,7 @@ def test_merge_kv_cache_spec():
head_size=full_spec.head_size,
dtype=full_spec.dtype,
use_mla=full_spec.use_mla,
attn_type=AttentionType.DECODER,
sliding_window=1,
),
]
@@ -613,6 +618,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)
# Estimate the maximum model length, 16384 model_len need 8GB
estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
@@ -648,6 +654,7 @@ def test_get_max_concurrency_for_kv_cache_config():
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)

sliding_window_spec = SlidingWindowSpec(
@@ -656,6 +663,7 @@
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
sliding_window=1024,
)

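
Every KV-cache spec in these tests now passes an explicit attn_type, since the V1 cache plumbing has to distinguish decoder attention from the encoder-only attention used by BERT-style embedding models. A small sketch of the distinction; the keyword arguments follow the test code above, while the ENCODER_ONLY usage and import paths are assumptions about current vLLM rather than lines from this diff:

```python
# Sketch: the same spec type, parameterized by attention type.
import torch

from vllm.attention import AttentionType
from vllm.v1.kv_cache_interface import FullAttentionSpec

# Decoder-style attention, as constructed throughout the tests in this PR.
decoder_spec = FullAttentionSpec(
    block_size=16,
    num_kv_heads=1,
    head_size=64,
    dtype=torch.float16,
    use_mla=False,
    attn_type=AttentionType.DECODER,
)

# For an encoder-only model such as BAAI/bge-base-en-v1.5, one would expect
# ENCODER_ONLY here (no KV reuse across decode steps); assumption, not shown
# in this diff.
encoder_spec = FullAttentionSpec(
    block_size=16,
    num_kv_heads=1,
    head_size=64,
    dtype=torch.float16,
    use_mla=False,
    attn_type=AttentionType.ENCODER_ONLY,
)
```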
19 changes: 17 additions & 2 deletions tests/v1/core/test_prefix_caching.py
@@ -8,6 +8,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -35,6 +36,7 @@ def make_request(request_id,
return Request(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
@@ -54,7 +56,12 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
kv_cache_groups=[
KVCacheGroupSpec(
["layer"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
)
],
)
@@ -68,7 +75,12 @@ def make_kv_cache_config_hybrid_model(block_size: int,
kv_cache_groups=[
KVCacheGroupSpec(
["layer1"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
),
KVCacheGroupSpec(
["layer2"],
@@ -77,6 +89,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
KVCacheGroupSpec(
@@ -86,6 +99,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
],
@@ -1222,6 +1236,7 @@ def test_eagle_with_sliding_window():
dtype=torch.float32,
sliding_window=block_size,
use_mla=False,
attn_type=AttentionType.DECODER,
)
manager = KVCacheManager(
KVCacheConfig(
5 changes: 4 additions & 1 deletion tests/v1/core/test_scheduler.py
@@ -6,6 +6,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -1290,7 +1291,7 @@ def create_scheduler_with_priority(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
@@ -1333,6 +1334,7 @@ def create_requests_with_priority(
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
token_type_ids=None,
sampling_params=sampling_params,
pooling_params=None,
multi_modal_inputs=mm_inputs,
@@ -1819,6 +1821,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
request = Request(
request_id="0",
prompt_token_ids=[0, 1],
token_type_ids=None,
multi_modal_inputs=None,
multi_modal_hashes=None,
multi_modal_placeholders=None,
4 changes: 4 additions & 0 deletions tests/v1/core/test_specialized_manager.py
@@ -3,6 +3,7 @@

import torch

from vllm.attention import AttentionType
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
KVCacheBlock)
@@ -26,6 +27,7 @@ def test_sliding_window_possible_cached_prefix():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
@@ -92,6 +94,7 @@ def test_sliding_window_remove_skipped_blocks():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
@@ -160,6 +163,7 @@ def test_get_num_blocks_to_allocate():
dtype=torch.float32,
sliding_window=4, # Placeholder value, not related to test result
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
4 changes: 3 additions & 1 deletion tests/v1/core/utils.py
@@ -4,6 +4,7 @@

import torch

from vllm.attention import AttentionType
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -102,7 +103,7 @@ def create_scheduler(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
@@ -141,6 +142,7 @@ def create_requests(
request = Request(
request_id=f"{i}",
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
sampling_params=sampling_params,
pooling_params=None,
multi_modal_inputs=mm_inputs,
1 change: 1 addition & 0 deletions tests/v1/engine/test_engine_core.py
@@ -35,6 +35,7 @@ def make_request() -> EngineCoreRequest:
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
prompt_token_ids=PROMPT_TOKENS,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
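
token_type_ids is threaded through the request path so that BERT-style models can distinguish the two segments of a sentence pair, e.g. for cross-encoder scoring. A hedged sketch of what a pooling request with token types might look like, using the keyword form from the tests/tokenization/test_detokenize.py hunk above; the concrete token IDs and the PoolingParams usage are illustrative, not taken from this diff:

```python
# Sketch only: field names follow the keyword form used in this PR's tests;
# the token IDs below are made up for illustration.
from vllm.pooling_params import PoolingParams
from vllm.v1.engine import EngineCoreRequest

request = EngineCoreRequest(
    request_id="score-0",
    prompt_token_ids=[101, 2054, 2003, 102, 1037, 3231, 102],  # [CLS] q [SEP] d [SEP]
    token_type_ids=[0, 0, 0, 0, 1, 1, 1],  # segment A vs. segment B
    mm_inputs=None,
    mm_hashes=None,
    mm_placeholders=None,
    sampling_params=None,
    pooling_params=PoolingParams(),
    eos_token_id=None,
    arrival_time=0.0,
    lora_request=None,
    cache_salt=None,
    data_parallel_rank=None,
)
```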
5 changes: 4 additions & 1 deletion tests/v1/engine/test_engine_core_client.py
@@ -16,6 +16,7 @@

from tests.utils import multi_gpu_test
from vllm import SamplingParams
from vllm.attention import AttentionType
from vllm.distributed.kv_events import (BlockStored, KVEventBatch,
ZmqEventPublisher)
from vllm.engine.arg_utils import EngineArgs
@@ -51,6 +52,7 @@ def make_request(
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
prompt_token_ids=prompt_tokens_ids,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
@@ -544,7 +546,8 @@ def create_mock_executor(vllm_config):
num_kv_heads=1,
head_size=64,
dtype=torch.float16,
use_mla=False)
use_mla=False,
attn_type=AttentionType.DECODER)

mock_executor.get_kv_cache_specs.return_value = [{
"default": mock_spec
1 change: 1 addition & 0 deletions tests/v1/engine/test_fast_incdec_prefix_err.py
@@ -31,6 +31,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
None,
None,
None,
None,
params,
None,
None,
5 changes: 5 additions & 0 deletions tests/v1/engine/test_output_processor.py
@@ -52,6 +52,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
requests = [
EngineCoreRequest(request_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -401,6 +402,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
requests = [
EngineCoreRequest(request_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -566,6 +568,7 @@ def test_stop_token(include_stop_str_in_output: bool,
request = EngineCoreRequest(
request_id=request_id,
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -665,6 +668,7 @@ def test_stop_string(include_stop_str_in_output: bool,
EngineCoreRequest(
request_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -781,6 +785,7 @@ def test_iteration_stats(dummy_test_vectors):
EngineCoreRequest(
request_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,