Skip to content

Commit f40f763

Browse files
authored
[CI] Add mteb testing for rerank models (#19344)
1 parent 26bc46e commit f40f763

File tree

15 files changed

+428
-256
lines changed

15 files changed

+428
-256
lines changed

requirements/test.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ num2words # required for smolvlm test
3333
opencv-python-headless >= 4.11.0 # required for video test
3434
datamodel_code_generator # required for minicpm3 test
3535
lm-eval[api]==0.4.8 # required for model evaluation test
36-
mteb>=1.38.11, <2 # required for mteb test
36+
mteb[bm25s]>=1.38.11, <2 # required for mteb test
3737
transformers==4.52.4
3838
tokenizers==0.21.1
3939
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.

requirements/test.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ black==24.10.0
5151
# via datamodel-code-generator
5252
blobfile==3.0.0
5353
# via -r requirements/test.in
54+
bm25s==0.2.13
55+
# via mteb
5456
boto3==1.35.57
5557
# via tensorizer
5658
botocore==1.35.57
@@ -344,6 +346,7 @@ numpy==1.26.4
344346
# -r requirements/test.in
345347
# accelerate
346348
# bitsandbytes
349+
# bm25s
347350
# contourpy
348351
# cupy-cuda12x
349352
# datasets
@@ -534,6 +537,8 @@ pyparsing==3.2.0
534537
# via matplotlib
535538
pyrate-limiter==3.7.0
536539
# via schemathesis
540+
pystemmer==3.0.0
541+
# via mteb
537542
pytablewriter==1.2.0
538543
# via lm-eval
539544
pytest==8.3.3
@@ -668,6 +673,7 @@ scikit-learn==1.5.2
668673
# sentence-transformers
669674
scipy==1.13.1
670675
# via
676+
# bm25s
671677
# librosa
672678
# mteb
673679
# scikit-learn

tests/conftest.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -727,8 +727,12 @@ def encode(self, prompts: list[str], *args,
727727
**kwargs) -> list[list[torch.Tensor]]:
728728
return self.model.encode(prompts, *args, **kwargs)
729729

730-
def predict(self, prompts: list[list[str]]) -> torch.Tensor:
731-
return self.model.predict(prompts, convert_to_tensor=True)
730+
def predict(self, prompts: list[list[str]], *args,
731+
**kwargs) -> torch.Tensor:
732+
return self.model.predict(prompts,
733+
*args,
734+
convert_to_tensor=True,
735+
**kwargs)
732736

733737
def __enter__(self):
734738
return self
@@ -1037,8 +1041,10 @@ def score(
10371041
self,
10381042
text_1: Union[str, list[str]],
10391043
text_2: Union[str, list[str]],
1044+
*args,
1045+
**kwargs,
10401046
) -> list[float]:
1041-
req_outputs = self.model.score(text_1, text_2)
1047+
req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
10421048
return [req_output.outputs.score for req_output in req_outputs]
10431049

10441050
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:

tests/entrypoints/openai/correctness/test_mteb.py renamed to tests/entrypoints/openai/correctness/test_mteb_embed.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,30 @@
77
from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
88
MTEB_EMBED_TOL,
99
OpenAIClientMtebEncoder,
10-
run_mteb_embed_task,
11-
run_mteb_embed_task_st)
10+
run_mteb_embed_task)
1211
from tests.utils import RemoteOpenAIServer
1312

1413
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
1514

16-
MODEL_NAME = "BAAI/bge-m3"
17-
DTYPE = "float16"
18-
MAIN_SCORE = 0.7873427091972599
15+
MODEL_NAME = "intfloat/e5-small"
16+
MAIN_SCORE = 0.7422994752439667
1917

2018

2119
@pytest.fixture(scope="module")
2220
def server():
2321
args = [
24-
"--task", "embed", "--dtype", DTYPE, "--enforce-eager",
25-
"--max-model-len", "512"
22+
"--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
2623
]
2724

2825
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
2926
yield remote_server
3027

3128

32-
def test_mteb(server):
29+
def test_mteb_embed(server):
3330
client = server.get_client()
3431
encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
3532
vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
36-
st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
37-
MODEL_NAME, MTEB_EMBED_TASKS)
33+
st_main_score = MAIN_SCORE
3834

3935
print("VLLM main score: ", vllm_main_score)
4036
print("SentenceTransformer main score: ", st_main_score)
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
import os
4+
5+
import pytest
6+
7+
# yapf conflicts with isort for this block
8+
# yapf: disable
9+
from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
10+
MTEB_RERANK_TASKS,
11+
MTEB_RERANK_TOL,
12+
RerankClientMtebEncoder,
13+
ScoreClientMtebEncoder,
14+
run_mteb_rerank)
15+
# yapf: enable
16+
from tests.utils import RemoteOpenAIServer
17+
18+
# Quiet vLLM's logging so the mteb evaluation output stays readable.
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

# Cross-encoder reranker model under test.
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Baseline MTEB main score the vLLM result is compared against
# (printed as the "SentenceTransformer main score" in the tests below).
MAIN_SCORE = 0.33437
22+
23+
24+
@pytest.fixture(scope="module")
def server():
    """Launch one shared vLLM OpenAI-compatible server for the rerank model."""
    server_args = [
        "--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as srv:
        yield srv
32+
33+
34+
def test_mteb_score(server):
    """MTEB rerank score via the /score endpoint must match the baseline."""
    score_url = server.url_for("score")
    client = ScoreClientMtebEncoder(MODEL_NAME, score_url)
    main_score = run_mteb_rerank(client, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)
    baseline = MAIN_SCORE

    print("VLLM main score: ", main_score)
    print("SentenceTransformer main score: ", baseline)
    print("Difference: ", baseline - main_score)

    assert baseline == pytest.approx(main_score, abs=MTEB_RERANK_TOL)
46+
47+
48+
def test_mteb_rerank(server):
    """MTEB rerank score via the /rerank endpoint must match the baseline."""
    rerank_url = server.url_for("rerank")
    client = RerankClientMtebEncoder(MODEL_NAME, rerank_url)
    main_score = run_mteb_rerank(client, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS)
    baseline = MAIN_SCORE

    print("VLLM main score: ", main_score)
    print("SentenceTransformer main score: ", baseline)
    print("Difference: ", baseline - main_score)

    assert baseline == pytest.approx(main_score, abs=MTEB_RERANK_TOL)

0 commit comments

Comments
 (0)