Add support for encoder embedding models using MultiModal args #20026

Open · maxdebayser wants to merge 26 commits into base: main

Commits (26)
5dee54d
Add support for encoder embedding models
maxdebayser Jun 23, 2025
b430dba
Use multi-modal support to pass token_type_ids to the model
maxdebayser Jun 23, 2025
aad1052
reduce diff
maxdebayser Jun 24, 2025
3ca7ced
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 1, 2025
7006e8a
Fix cuda graphs for BERT models
maxdebayser Jul 1, 2025
c99df96
Add token_type_ids multi-modal to LLM._cross_encoding_score
maxdebayser Jul 2, 2025
bbe0ea7
fix merge problem
maxdebayser Jul 2, 2025
019496a
fix editing mistake
maxdebayser Jul 2, 2025
6558bdd
fix missing input ids
maxdebayser Jul 2, 2025
33bcc88
fix mistake
maxdebayser Jul 2, 2025
a743268
fix tensor not boolean error
maxdebayser Jul 2, 2025
6310f4d
appease mypy
maxdebayser Jul 2, 2025
1d79887
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 2, 2025
611217a
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 2, 2025
024198b
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 8, 2025
c4dc1a8
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 9, 2025
3f79324
Fix missing args
maxdebayser Jul 9, 2025
f3f075a
fix mm flag in registry test
maxdebayser Jul 9, 2025
268099b
remove model from unsupported list
maxdebayser Jul 10, 2025
5470c4e
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 10, 2025
d19dcd4
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 11, 2025
60696b4
appease linter
maxdebayser Jul 11, 2025
0ce2a36
Merge branch 'upstream_main' into v1_embeddings_full_mm
maxdebayser Jul 11, 2025
00bfc79
lazy import
maxdebayser Jul 11, 2025
2501649
appease linter
maxdebayser Jul 11, 2025
28fb913
appease linter
maxdebayser Jul 11, 2025
19 changes: 5 additions & 14 deletions tests/models/language/pooling/test_embedding.py
@@ -39,22 +39,13 @@ def v1(run_with_both_engines):
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
# CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("intfloat/multilingual-e5-small",
marks=[pytest.mark.skip_v1]),
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
marks=[pytest.mark.skip_v0]),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
],
)
def test_models(
8 changes: 8 additions & 0 deletions tests/models/language/pooling/test_jina.py
@@ -23,6 +23,14 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


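
The autouse v1 wrapper added here (and in test_jina.py above) delegates to the shared run_with_both_engines fixture so each test in the file is executed once per engine. A simplified, hedged sketch of what such a fixture typically does; the real one lives in the shared conftest and handles skip_v0/skip_v1 markers, which this sketch omits:

import pytest


@pytest.fixture(params=[False, True], ids=["v0", "v1"])
def run_with_both_engines(request, monkeypatch):
    # Toggle the V1 engine via the VLLM_USE_V1 environment variable so the
    # dependent test runs once per engine.
    monkeypatch.setenv("VLLM_USE_V1", "1" if request.param else "0")
    yield
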
6 changes: 3 additions & 3 deletions tests/models/test_registry.py
@@ -51,9 +51,9 @@ def test_registry_imports(model_arch):
("LlamaForCausalLM", False, False, False),
("MllamaForConditionalGeneration", True, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
("BertForSequenceClassification", True, False, True),
("RobertaForSequenceClassification", True, False, True),
("XLMRobertaForSequenceClassification", True, False, True),
])
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
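
The flipped first flag records that BERT-style sequence-classification models are now registered as multi-modal, which is what allows token_type_ids to be passed through the multi-modal input path. A quick check against the public registry (sketch):

from vllm import ModelRegistry

# After this PR, BERT-style cross-encoders report themselves as multi-modal.
assert ModelRegistry.is_multimodal_model("BertForSequenceClassification")
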
9 changes: 8 additions & 1 deletion tests/v1/core/test_kv_cache_utils.py
@@ -5,6 +5,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -61,6 +62,7 @@ def new_kv_cache_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -75,6 +77,7 @@ def new_sliding_window_spec(block_size=16,
head_size=head_size,
dtype=dtype,
use_mla=use_mla,
attn_type=AttentionType.DECODER,
sliding_window=sliding_window)


@@ -534,6 +537,7 @@ def test_merge_kv_cache_spec():
head_size=full_spec.head_size,
dtype=full_spec.dtype,
use_mla=full_spec.use_mla,
attn_type=AttentionType.DECODER,
sliding_window=1,
),
]
@@ -603,6 +607,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)
# Estimate the maximum model length, 16384 model_len need 8GB
estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
@@ -638,6 +643,7 @@ def test_get_max_concurrency_for_kv_cache_config():
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
)

sliding_window_spec = SlidingWindowSpec(
@@ -646,6 +652,7 @@ def test_get_max_concurrency_for_kv_cache_config():
head_size=128,
dtype=torch.float16,
use_mla=False,
attn_type=AttentionType.DECODER,
sliding_window=1024,
)

@@ -916,4 +923,4 @@ def test_get_kv_cache_config():
],
kv_cache_groups=[
KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
])
])
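
Every KV-cache spec constructed in these tests now passes an explicit attn_type, since V1 must distinguish decoder attention from the encoder-only attention used by the newly supported models. A hedged construction example; field values are illustrative and the FullAttentionSpec import path is assumed from the V1 KV-cache interface module used by these tests:

import torch

from vllm.attention import AttentionType
from vllm.v1.kv_cache_interface import FullAttentionSpec  # import path assumed

spec = FullAttentionSpec(
    block_size=16,
    num_kv_heads=1,
    head_size=64,
    dtype=torch.float16,
    use_mla=False,
    # Decoder attention for ordinary causal LMs; encoder-only models would
    # use a different AttentionType member.
    attn_type=AttentionType.DECODER,
)
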
18 changes: 16 additions & 2 deletions tests/v1/core/test_prefix_caching.py
@@ -8,6 +8,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
@@ -53,7 +54,12 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
kv_cache_groups=[
KVCacheGroupSpec(
["layer"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
)
],
)
@@ -67,7 +73,12 @@ def make_kv_cache_config_hybrid_model(block_size: int,
kv_cache_groups=[
KVCacheGroupSpec(
["layer1"],
FullAttentionSpec(block_size, 1, 1, torch.float32, False),
FullAttentionSpec(block_size,
1,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER),
),
KVCacheGroupSpec(
["layer2"],
@@ -76,6 +87,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
KVCacheGroupSpec(
@@ -85,6 +97,7 @@ def make_kv_cache_config_hybrid_model(block_size: int,
1,
torch.float32,
False,
attn_type=AttentionType.DECODER,
sliding_window=2 * block_size),
),
],
@@ -1218,6 +1231,7 @@ def test_eagle_with_sliding_window():
dtype=torch.float32,
sliding_window=block_size,
use_mla=False,
attn_type=AttentionType.DECODER,
)
manager = KVCacheManager(
KVCacheConfig(
5 changes: 3 additions & 2 deletions tests/v1/core/test_scheduler.py
@@ -6,6 +6,7 @@
import pytest
import torch

from vllm.attention import AttentionType
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -104,7 +105,7 @@ def create_scheduler(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
@@ -1354,7 +1355,7 @@ def create_scheduler_with_priority(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
cache_config.num_gpu_blocks = num_blocks
4 changes: 4 additions & 0 deletions tests/v1/core/test_specialized_manager.py
@@ -3,6 +3,7 @@

import torch

from vllm.attention import AttentionType
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
KVCacheBlock)
@@ -26,6 +27,7 @@ def test_sliding_window_possible_cached_prefix():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
@@ -92,6 +94,7 @@ def test_sliding_window_remove_skipped_blocks():
dtype=torch.float32,
sliding_window=4,
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
@@ -160,6 +163,7 @@ def test_get_num_blocks_to_allocate():
dtype=torch.float32,
sliding_window=4, # Placeholder value, not related to test result
use_mla=False,
attn_type=AttentionType.DECODER,
)

block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
4 changes: 3 additions & 1 deletion tests/v1/engine/test_engine_core_client.py
@@ -16,6 +16,7 @@

from tests.utils import multi_gpu_test
from vllm import SamplingParams
from vllm.attention import AttentionType
from vllm.distributed.kv_events import (BlockStored, KVEventBatch,
ZmqEventPublisher)
from vllm.engine.arg_utils import EngineArgs
@@ -544,7 +545,8 @@ def create_mock_executor(vllm_config):
num_kv_heads=1,
head_size=64,
dtype=torch.float16,
use_mla=False)
use_mla=False,
attn_type=AttentionType.DECODER)

mock_executor.get_kv_cache_specs.return_value = [{
"default": mock_spec
2 changes: 1 addition & 1 deletion tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -2,11 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import re

import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import regex as re
import requests

from tests.utils import RemoteOpenAIServer
3 changes: 2 additions & 1 deletion tests/v1/kv_connector/unit/utils.py
@@ -7,6 +7,7 @@
import torch

from vllm import SamplingParams
from vllm.attention import AttentionType
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
ModelConfig, SchedulerConfig, VllmConfig)
from vllm.distributed.kv_transfer.kv_connector.factory import (
@@ -106,7 +107,7 @@ def create_scheduler(
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
False, AttentionType.DECODER))
],
)
vllm_config.cache_config.num_gpu_blocks = num_blocks
1 change: 0 additions & 1 deletion tests/v1/test_oracle.py
@@ -13,7 +13,6 @@
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"state-spaces/mamba-130m-hf", # mamba1
"BAAI/bge-m3", # embedding
]

MODEL = "meta-llama/Llama-3.2-1B-Instruct"
3 changes: 2 additions & 1 deletion tests/v1/worker/test_gpu_model_runner.py
@@ -6,7 +6,7 @@
import pytest
import torch

from vllm.attention import Attention
from vllm.attention import Attention, AttentionType
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig, VllmConfig, set_current_vllm_config)
from vllm.platforms import current_platform
@@ -38,6 +38,7 @@ def initialize_kv_cache(runner: GPUModelRunner):
head_size=runner.model_config.get_head_size(),
dtype=runner.kv_cache_dtype,
use_mla=False,
attn_type=AttentionType.DECODER,
)
tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS
kv_cache_config = KVCacheConfig(
3 changes: 2 additions & 1 deletion vllm/engine/arg_utils.py
@@ -1744,7 +1744,8 @@ def _set_default_args_v1(self, usage_context: UsageContext,

if (self.max_num_seqs is None
and usage_context in default_max_num_seqs):
self.max_num_seqs = default_max_num_seqs[usage_context]
self.max_num_seqs = min(default_max_num_seqs[usage_context],
self.max_num_batched_tokens or sys.maxsize)

logger.debug("Setting max_num_seqs to %d for %s usage context.",
self.max_num_seqs, use_context_value)
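
The change above caps the default max_num_seqs at max_num_batched_tokens, since a batch can never hold more sequences than it has token slots. The logic, extracted as a tiny sketch (the helper name is illustrative):

import sys
from typing import Optional


def clamped_default_max_num_seqs(usage_default: int,
                                 max_num_batched_tokens: Optional[int]) -> int:
    # Never default to more sequences than there are batched-token slots;
    # fall back to the usage-context default when max_num_batched_tokens
    # is unset.
    return min(usage_default, max_num_batched_tokens or sys.maxsize)


assert clamped_default_max_num_seqs(1024, 512) == 512
assert clamped_default_max_num_seqs(1024, None) == 1024
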
43 changes: 11 additions & 32 deletions vllm/entrypoints/llm.py
@@ -1295,39 +1295,18 @@ def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if self.llm_engine.model_config.is_multimodal_model:

model_config = self.llm_engine.model_config

for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

parsed_prompts.append(engine_prompt)

else:
model_config = self.llm_engine.model_config

for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

for q, t in input_pairs:
if self.llm_engine.model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
text_pair=t, # type: ignore[arg-type]
**tokenization_kwargs)
else:
# `llm as reranker` models defaults to not using pad_token.
prompt_inputs = tokenizer(
text=q + t, # type: ignore[operator]
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)
parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
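
As the deletions above show, the pad_token-specific tokenizer branch is removed and prompt construction goes through get_score_prompt for both multimodal and text-only cross-encoders, with token_type_ids carried as multi-modal data rather than as a dedicated TokensPrompt field. From the caller's side, scoring is unchanged; a hedged end-to-end sketch, where the model name is an example and LLM.score() with task="score" is the existing public entrypoint whose internals this hunk simplifies:

from vllm import LLM

llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")
outputs = llm.score(
    "What is the capital of France?",
    ["The capital of France is Paris.",
     "The capital of Germany is Berlin."],
)
# One relevance score per (query, document) pair.
print([o.outputs.score for o in outputs])
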