
[CI] Add mteb testing for rerank models #19344


Merged (29 commits) on Jun 16, 2025
9 changes: 3 additions & 6 deletions tests/conftest.py
@@ -1033,12 +1033,9 @@ def encode(self,
req_outputs = self.model.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]

def score(
self,
text_1: Union[str, list[str]],
text_2: Union[str, list[str]],
) -> list[float]:
req_outputs = self.model.score(text_1, text_2)
def score(self, text_1: Union[str, list[str]],
text_2: Union[str, list[str]], *args, **kwargs) -> list[float]:
req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
return [req_output.outputs.score for req_output in req_outputs]

def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
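The widened `score()` wrapper above now forwards arbitrary positional and keyword arguments to the underlying model, which is what later lets the MTEB helpers pass `truncate_prompt_tokens=-1` and `use_tqdm=False` through the test runner. A minimal self-contained sketch of that forwarding; the stub classes below are illustrative stand-ins, not the real `VllmRunner` or `LLM.score`:

```python
from typing import Union


class _StubLLM:
    """Illustrative stand-in for the model object held by the runner."""

    def score(self, text_1, text_2, truncate_prompt_tokens=None, use_tqdm=True):
        # The real call returns request outputs whose .outputs.score carries
        # the value; here we just emit a constant per (text_1, text_2) pair.
        docs = text_2 if isinstance(text_2, list) else [text_2]
        return [0.5 for _ in docs]


class _StubRunner:
    def __init__(self) -> None:
        self.model = _StubLLM()

    def score(self, text_1: Union[str, list[str]],
              text_2: Union[str, list[str]], *args, **kwargs) -> list[float]:
        # Extra arguments such as truncate_prompt_tokens=-1 or use_tqdm=False
        # now reach the underlying model unchanged.
        return self.model.score(text_1, text_2, *args, **kwargs)


print(_StubRunner().score("q", ["d1", "d2"],
                          truncate_prompt_tokens=-1, use_tqdm=False))
```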
@@ -7,34 +7,30 @@
from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
MTEB_EMBED_TOL,
OpenAIClientMtebEncoder,
run_mteb_embed_task,
run_mteb_embed_task_st)
run_mteb_embed_task)
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "BAAI/bge-m3"
DTYPE = "float16"
MAIN_SCORE = 0.7873427091972599
MODEL_NAME = "intfloat/e5-small"
MAIN_SCORE = 0.7422994752439667


@pytest.fixture(scope="module")
def server():
args = [
"--task", "embed", "--dtype", DTYPE, "--enforce-eager",
"--max-model-len", "512"
"--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


def test_mteb(server):
def test_mteb_embed(server):
client = server.get_client()
encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
MODEL_NAME, MTEB_EMBED_TASKS)
st_main_score = MAIN_SCORE

print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
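With the `run_mteb_embed_task_st` fallback removed, this test now compares against a pinned `MAIN_SCORE` for `intfloat/e5-small`. If the model or task list ever changes, the constant has to be regenerated offline; a hedged sketch of doing that with the `run_mteb_embed_task_st` helper that remains in `mteb_utils`:

```python
# One-off helper (not run in CI) to regenerate the pinned baseline, assuming
# mteb and sentence-transformers are installed and the vLLM test tree is on
# the path.
from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
                                                      run_mteb_embed_task_st)

if __name__ == "__main__":
    main_score = run_mteb_embed_task_st("intfloat/e5-small", MTEB_EMBED_TASKS)
    print(f"MAIN_SCORE = {main_score}")  # paste this value into the test
```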
56 changes: 56 additions & 0 deletions tests/entrypoints/openai/correctness/test_mteb_score.py
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

import pytest

from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS,
MTEB_RERANK_TOL,
RerankClientMtebEncoder,
ScoreClientMtebEncoder,
run_mteb_rerank)
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
MAIN_SCORE = 0.33702


@pytest.fixture(scope="module")
def server():
args = [
"--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


def test_mteb_score(server):
url = server.url_for("score")
encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
MTEB_RERANK_LANGS)
st_main_score = MAIN_SCORE

print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)


def test_mteb_rerank(server):
url = server.url_for("rerank")
encoder = RerankClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
MTEB_RERANK_LANGS)
st_main_score = MAIN_SCORE

print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
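For ad-hoc debugging outside pytest, the same evaluation can be driven directly with the utilities imported above; a hedged standalone sketch (it reuses `RemoteOpenAIServer` exactly as the fixture does and assumes the vLLM test tree is importable):

```python
# Standalone sketch, not part of the PR: compute the /score-endpoint main
# score without the pytest fixture machinery.
from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
                                                      MTEB_RERANK_TASKS,
                                                      ScoreClientMtebEncoder,
                                                      run_mteb_rerank)
from tests.utils import RemoteOpenAIServer

MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
args = ["--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"]

if __name__ == "__main__":
    with RemoteOpenAIServer(MODEL_NAME, args) as server:
        encoder = ScoreClientMtebEncoder(MODEL_NAME, server.url_for("score"))
        print(run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS))
```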
201 changes: 190 additions & 11 deletions tests/models/language/pooling/mteb_utils.py
@@ -1,21 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import shutil
from collections.abc import Sequence
from typing import Optional

import mteb
import numpy as np
import pytest
import requests

from tests.models.utils import EmbedModelInfo
from tests.conftest import HfRunner, VllmRunner
from tests.models.utils import EmbedModelInfo, RerankModelInfo

# Most models on the STS12 task (See #17175):
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4

MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 1e-4


class VllmMtebEncoder(mteb.Encoder):

@@ -39,6 +48,27 @@
embeds = embeds[np.argsort(r)]
return embeds

def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]

querys = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]

outputs = self.model.score(querys,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores
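`predict()` deliberately shuffles the query/corpus pairs before scoring and then restores the original order via `np.argsort(r)`, so the reported scores cannot depend on batch order. A tiny self-contained check of that permute/unpermute invariant:

```python
import numpy as np

# Same shuffle/unshuffle pattern as predict() above: permute the inputs with
# r, score them, then restore the original order with np.argsort(r).
rng = np.random.default_rng(seed=42)
scores_in_order = np.array([0.1, 0.2, 0.3, 0.4])

r = rng.permutation(len(scores_in_order))
shuffled = scores_in_order[r]        # order the model actually sees
restored = shuffled[np.argsort(r)]   # order handed back to MTEB

assert np.array_equal(restored, scores_in_order)
```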


class OpenAIClientMtebEncoder(mteb.Encoder):

@@ -62,23 +92,74 @@
return embeds


class ScoreClientMtebEncoder(mteb.Encoder):

def __init__(self, model_name: str, url):
super().__init__()
self.model_name = model_name
self.url = url
self.rng = np.random.default_rng(seed=42)

def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]

outputs = []
for query, corpus, prompt in sentences:
outputs.append(self.get_score(query, corpus))

scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores

def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
}).json()
return response['data'][0]["score"]


class RerankClientMtebEncoder(ScoreClientMtebEncoder):

def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
}).json()
return response['results'][0]["relevance_score"]
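Both client encoders assume specific response shapes: the `/score` endpoint returns the value under `data[0].score`, while `/rerank` returns it under `results[0].relevance_score`. A hedged sketch of exercising both routes with plain `requests`; the base URL and the example query/document text are assumptions, and the tests obtain the URL from `server.url_for(...)` instead:

```python
import requests

BASE = "http://localhost:8000"  # assumption: a vLLM server started with --task score
MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# /score: pairwise scoring of text_1 against text_2 (example texts are made up).
score = requests.post(f"{BASE}/score",
                      json={
                          "model": MODEL,
                          "text_1": "what is a panda?",
                          "text_2": "The giant panda is a bear native to China.",
                          "truncate_prompt_tokens": -1,
                      }).json()
print(score["data"][0]["score"])

# /rerank: rank a list of documents against a query.
rerank = requests.post(f"{BASE}/rerank",
                       json={
                           "model": MODEL,
                           "query": "what is a panda?",
                           "documents": ["The giant panda is a bear native to China."],
                           "truncate_prompt_tokens": -1,
                       }).json()
print(rerank["results"][0]["relevance_score"])
```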


def run_mteb_embed_task(encoder, tasks):
tasks = mteb.get_tasks(tasks=tasks)
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(encoder, verbosity=0, output_folder=None)
results = evaluation.run(
encoder,
verbosity=0,
output_folder=None,
encode_kwargs={
"show_progress_bar": False,
},
)

main_score = results[0].scores["test"][0]["main_score"]
return main_score


def run_mteb_embed_task_st(model_name, tasks):
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(model_name)
return run_mteb_embed_task(model, tasks)


def mteb_test_embed_models(hf_runner,
vllm_runner,
def mteb_test_embed_models(hf_runner: HfRunner,
vllm_runner: VllmRunner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None):
@@ -90,7 +171,7 @@
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype

with vllm_runner(model_info.name,

task="embed",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:
@@ -103,7 +184,7 @@
MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype

with hf_runner(model_info.name,

is_sentence_transformer=True,
dtype="float32") as hf_model:

@@ -118,3 +199,101 @@
print("Difference:", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)


def run_mteb_rerank(cross_encoder, tasks, languages):
results_folder = "tmp_mteb_results"
shutil.rmtree(results_folder, ignore_errors=True)

try:
bm25s = mteb.get_model("bm25s")
tasks = mteb.get_tasks(tasks=tasks, languages=languages)

subset = "default"
eval_splits = ["test"]

evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(
bm25s,
verbosity=0,
eval_splits=eval_splits,
save_predictions=True,
output_folder=f"{results_folder}/stage1",
encode_kwargs={"show_progress_bar": False},
)

results = evaluation.run(
cross_encoder,
verbosity=0,
eval_splits=eval_splits,
top_k=10,
save_predictions=True,
output_folder=f"{results_folder}/stage2",
previous_results=
f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
encode_kwargs={"show_progress_bar": False},
)

main_score = results[0].scores["test"][0]["main_score"]
finally:
shutil.rmtree(results_folder, ignore_errors=True)
return main_score
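Note that the stage-1 `previous_results` path hard-codes `NFCorpus`; the naming appears to follow `<Task>_<subset>_predictions.json`, as that path suggests. A small hedged helper for deriving the path if other tasks were ever added, under that naming assumption:

```python
# Assumption: mteb's save_predictions writes "<Task>_<subset>_predictions.json",
# as the hard-coded NFCorpus path above suggests.
def stage1_predictions_path(results_folder: str, task_name: str,
                            subset: str = "default") -> str:
    return f"{results_folder}/stage1/{task_name}_{subset}_predictions.json"


print(stage1_predictions_path("tmp_mteb_results", "NFCorpus"))
# -> tmp_mteb_results/stage1/NFCorpus_default_predictions.json
```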


def mteb_test_rerank_models(hf_runner: HfRunner,
vllm_runner: VllmRunner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None):
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest.skip("Skipping test.")

vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype

with vllm_runner(model_info.name,

task="score",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:

if model_info.architecture:
assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures)

vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype

with hf_runner(model_info.name, is_cross_encoder=True,

dtype="float32") as hf_model:

hf_model_predict = hf_model.predict

def _predict(
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
):
# vllm and st both remove the prompt, fair comparison.
sentences = [(s[0], s[1]) for s in sentences]

return hf_model_predict(sentences, *args, **kwargs)

hf_model.predict = _predict

if hf_model_callback is not None:
hf_model_callback(hf_model)

st_main_score = run_mteb_rerank(hf_model,
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
st_dtype = next(hf_model.model.parameters()).dtype

print("VLLM:", vllm_dtype, vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
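A hedged sketch of how a per-model test could drive `mteb_test_rerank_models`; the `RerankModelInfo` constructor arguments and the `hf_runner`/`vllm_runner` pytest fixtures are assumptions inferred from how their fields are used above and from `tests/conftest.py`:

```python
import pytest

from tests.models.language.pooling.mteb_utils import mteb_test_rerank_models
from tests.models.utils import RerankModelInfo

# Assumption: RerankModelInfo accepts the model name positionally and provides
# defaults for dtype/architecture/enable_test, mirroring how those fields are
# read in mteb_test_rerank_models above.
RERANK_MODELS = [RerankModelInfo("cross-encoder/ms-marco-MiniLM-L-6-v2")]


@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
                            model_info: RerankModelInfo) -> None:
    # hf_runner / vllm_runner are assumed to be the pytest fixtures that wrap
    # HfRunner and VllmRunner in tests/conftest.py.
    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
```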