
[CI] Add mteb testing for rerank models #19344


Merged: 29 commits, Jun 16, 2025
Changes from 8 commits
2 changes: 1 addition & 1 deletion requirements/test.in
@@ -33,7 +33,7 @@ num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.52.4
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
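The new [bm25s] extra pulls in bm25s (and its PyStemmer dependency, visible in requirements/test.txt below) so that mteb can run a BM25 first-stage retrieval whose candidates the rerank models then re-score. A minimal sketch of that first stage, mirroring run_mteb_rerank later in this diff; the output folder name is illustrative:

import mteb

# BM25 retriever that mteb registers when the bm25s extra is installed.
bm25s = mteb.get_model("bm25s")
tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["en"])

# Stage 1: save BM25 predictions that a cross-encoder later reranks.
mteb.MTEB(tasks=tasks).run(
    bm25s,
    verbosity=0,
    eval_splits=["test"],
    save_predictions=True,
    output_folder="stage1",  # illustrative path
    encode_kwargs={"show_progress_bar": False},
)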
6 changes: 6 additions & 0 deletions requirements/test.txt
@@ -51,6 +51,8 @@ black==24.10.0
# via datamodel-code-generator
blobfile==3.0.0
# via -r requirements/test.in
bm25s==0.2.13
# via mteb
boto3==1.35.57
# via tensorizer
botocore==1.35.57
@@ -344,6 +346,7 @@ numpy==1.26.4
# -r requirements/test.in
# accelerate
# bitsandbytes
# bm25s
# contourpy
# cupy-cuda12x
# datasets
@@ -534,6 +537,8 @@ pyparsing==3.2.0
# via matplotlib
pyrate-limiter==3.7.0
# via schemathesis
pystemmer==3.0.0
# via mteb
pytablewriter==1.2.0
# via lm-eval
pytest==8.3.3
@@ -668,6 +673,7 @@ scikit-learn==1.5.2
# sentence-transformers
scipy==1.13.1
# via
# bm25s
# librosa
# mteb
# scikit-learn
9 changes: 3 additions & 6 deletions tests/conftest.py
@@ -1033,12 +1033,9 @@ def encode(self,
req_outputs = self.model.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]

def score(
self,
text_1: Union[str, list[str]],
text_2: Union[str, list[str]],
) -> list[float]:
req_outputs = self.model.score(text_1, text_2)
def score(self, text_1: Union[str, list[str]],
text_2: Union[str, list[str]], *args, **kwargs) -> list[float]:
req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
return [req_output.outputs.score for req_output in req_outputs]

def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
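The score wrapper on the vLLM test runner now forwards extra positional and keyword arguments to the underlying LLM.score call; the rerank utilities below rely on this to pass truncate_prompt_tokens and use_tqdm. A minimal sketch of the call it ends up making, using the model from the new score test (offline LLM API; runnable only on a machine that can load the model, and the example strings are illustrative):

from vllm import LLM

# The conftest wrapper forwards these kwargs straight to LLM.score,
# exactly as VllmMtebEncoder.predict does later in this diff.
llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")
outputs = llm.score(
    ["what is a panda?"],            # text_1: queries
    ["The giant panda is a bear."],  # text_2: documents
    truncate_prompt_tokens=-1,       # -1 truncates inputs to the model's max length
    use_tqdm=False,                  # keep CI logs quiet
)
print([o.outputs.score for o in outputs])  # one score per query/document pair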
@@ -7,34 +7,30 @@
from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
MTEB_EMBED_TOL,
OpenAIClientMtebEncoder,
run_mteb_embed_task,
run_mteb_embed_task_st)
run_mteb_embed_task)
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "BAAI/bge-m3"
DTYPE = "float16"
MAIN_SCORE = 0.7873427091972599
MODEL_NAME = "intfloat/e5-small"
MAIN_SCORE = 0.7422994752439667


@pytest.fixture(scope="module")
def server():
args = [
"--task", "embed", "--dtype", DTYPE, "--enforce-eager",
"--max-model-len", "512"
"--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


def test_mteb(server):
def test_mteb_embed(server):
client = server.get_client()
encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
MODEL_NAME, MTEB_EMBED_TASKS)
st_main_score = MAIN_SCORE

print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
59 changes: 59 additions & 0 deletions tests/entrypoints/openai/correctness/test_mteb_score.py
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

import pytest

# yapf conflicts with isort for this block
# yapf: disable
from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS,
MTEB_RERANK_TOL,
RerankClientMtebEncoder,
ScoreClientMtebEncoder,
run_mteb_rerank)
# yapf: enable
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
MAIN_SCORE = 0.33702


@pytest.fixture(scope="module")
def server():
args = [
"--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


def test_mteb_score(server):
url = server.url_for("score")
encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
MTEB_RERANK_LANGS)
st_main_score = MAIN_SCORE

print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)


def test_mteb_rerank(server):
url = server.url_for("rerank")
encoder = RerankClientMtebEncoder(MODEL_NAME, url)
vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
MTEB_RERANK_LANGS)
st_main_score = MAIN_SCORE

print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score)
print("Difference: ", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
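Both tests drive the same NFCorpus rerank evaluation, just through different HTTP surfaces: /score takes text_1/text_2 pairs, while /rerank takes a query plus a documents list. A hedged sketch of the two request shapes, mirroring ScoreClientMtebEncoder and RerankClientMtebEncoder in mteb_utils.py; the base URL and example strings are illustrative (the test builds its URLs with server.url_for):

import requests

base = "http://localhost:8000"  # assumed server address; the test uses server.url_for(...)

# /score: pairwise scoring; the response carries data[i].score.
score = requests.post(f"{base}/score", json={
    "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "text_1": "what is a panda?",
    "text_2": "The giant panda is a bear.",
    "truncate_prompt_tokens": -1,
}).json()["data"][0]["score"]

# /rerank: one query against a document list; the response carries results[i].relevance_score.
relevance = requests.post(f"{base}/rerank", json={
    "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "query": "what is a panda?",
    "documents": ["The giant panda is a bear."],
    "truncate_prompt_tokens": -1,
}).json()["results"][0]["relevance_score"]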
191 changes: 182 additions & 9 deletions tests/models/language/pooling/mteb_utils.py
@@ -1,21 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import tempfile
from collections.abc import Sequence
from typing import Optional

import mteb
import numpy as np
import pytest
import requests

from tests.models.utils import EmbedModelInfo
from tests.models.utils import EmbedModelInfo, RerankModelInfo

# Most models on the STS12 task (See #17175):
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
#   result in differences of less than 1e-4
# - Different models result in differences of more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4

# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 1e-4


class VllmMtebEncoder(mteb.Encoder):

@@ -39,6 +48,27 @@ def encode(
embeds = embeds[np.argsort(r)]
return embeds

def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]

queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]

outputs = self.model.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores


class OpenAIClientMtebEncoder(mteb.Encoder):

@@ -62,21 +92,72 @@ def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
return embeds


class ScoreClientMtebEncoder(mteb.Encoder):

def __init__(self, model_name: str, url):
super().__init__()
self.model_name = model_name
self.url = url
self.rng = np.random.default_rng(seed=42)

def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]

outputs = []
for query, corpus, prompt in sentences:
outputs.append(self.get_score(query, corpus))

scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores

def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
}).json()
return response['data'][0]["score"]


class RerankClientMtebEncoder(ScoreClientMtebEncoder):

def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
}).json()
return response['results'][0]["relevance_score"]


def run_mteb_embed_task(encoder, tasks):
tasks = mteb.get_tasks(tasks=tasks)
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(encoder, verbosity=0, output_folder=None)
results = evaluation.run(
encoder,
verbosity=0,
output_folder=None,
encode_kwargs={
"show_progress_bar": False,
},
)

main_score = results[0].scores["test"][0]["main_score"]
return main_score


def run_mteb_embed_task_st(model_name, tasks):
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(model_name)
return run_mteb_embed_task(model, tasks)


def mteb_test_embed_models(hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
@@ -118,3 +199,95 @@ def mteb_test_embed_models(hf_runner,
print("Difference:", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)


def run_mteb_rerank(cross_encoder, tasks, languages):
with tempfile.TemporaryDirectory() as results_folder:
bm25s = mteb.get_model("bm25s")
tasks = mteb.get_tasks(tasks=tasks, languages=languages)

subset = "default"
eval_splits = ["test"]

evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(
bm25s,
verbosity=0,
eval_splits=eval_splits,
save_predictions=True,
output_folder=f"{results_folder}/stage1",
encode_kwargs={"show_progress_bar": False},
)

results = evaluation.run(
cross_encoder,
verbosity=0,
eval_splits=eval_splits,
top_k=10,
save_predictions=True,
output_folder=f"{results_folder}/stage2",
previous_results=
f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
encode_kwargs={"show_progress_bar": False},
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score


def mteb_test_rerank_models(hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None):
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest.skip("Skipping test.")

vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype

with vllm_runner(model_info.name,
task="score",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:

if model_info.architecture:
assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures)

vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype

with hf_runner(model_info.name, is_cross_encoder=True,
dtype="float32") as hf_model:

hf_model_predict = hf_model.predict

def _predict(
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
):
# vllm and st both remove the prompt, fair comparison.
prompts = [(s[0], s[1]) for s in sentences]
return hf_model_predict(prompts, *args, **kwargs)

hf_model.predict = _predict

if hf_model_callback is not None:
hf_model_callback(hf_model)

st_main_score = run_mteb_rerank(hf_model,
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
st_dtype = next(hf_model.model.parameters()).dtype

print("VLLM:", vllm_dtype, vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)

assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
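mteb_test_rerank_models is designed to be called from per-model test files with a RerankModelInfo entry; those files are not part of the commits shown here. A hedged sketch of what such a caller might look like (the RerankModelInfo constructor arguments and the architecture string are assumptions for illustration):

import pytest

from tests.models.utils import RerankModelInfo

from .mteb_utils import mteb_test_rerank_models

RERANK_MODELS = [
    # Hypothetical entry; the field names follow the attributes read above
    # (name, architecture, dtype, enable_test).
    RerankModelInfo("cross-encoder/ms-marco-MiniLM-L-6-v2",
                    architecture="BertForSequenceClassification"),
]


@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
                            model_info: RerankModelInfo) -> None:
    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)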