
[Frontend][Model] Qwen3Rerank API Server backward compatibility #20239


Open (wants to merge 1 commit into base: main)
142 changes: 134 additions & 8 deletions tests/entrypoints/openai/test_score.py
@@ -7,7 +7,7 @@
import torch.nn.functional as F
from torch import tensor

from vllm.entrypoints.openai.protocol import ScoreResponse
from vllm.entrypoints.openai.protocol import RerankResponse, ScoreResponse

from ...utils import RemoteOpenAIServer

@@ -29,11 +29,35 @@
"name": "BAAI/bge-base-en-v1.5",
"is_cross_encoder": False
},
{
"name": "Qwen/Qwen3-Reranker-0.6B",
"is_cross_encoder": True,
"is_qwen3_reranker": True,
},
]
DTYPE = "half"


def _run_qwen3_reranker_hf(hf_model, text_pairs, instruction):
"""Helper to run Qwen3 reranker with HF, applying the template."""
prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'

[CI: pre-commit] Ruff (E501) at tests/entrypoints/openai/test_score.py:43:81: Line too long (208 > 80)
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"

formatted_pairs = []
for query, doc in text_pairs:
q_formatted = f"{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
d_formatted = f"<Document>: {doc}{suffix}"
formatted_pairs.append([q_formatted, d_formatted])

return hf_model.predict(formatted_pairs).tolist()


def run_transformers(hf_model, model, text_pairs):
if model.get("is_qwen3_reranker"):
# The default instruction used in the server fixture.
default_instruction = "Given a web search query, retrieve relevant passages that answer the query"

[CI: pre-commit] Ruff (E501) at tests/entrypoints/openai/test_score.py:58:81: Line too long (106 > 80)
return _run_qwen3_reranker_hf(hf_model, text_pairs,
default_instruction)
if model["is_cross_encoder"]:
return hf_model.predict(text_pairs).tolist()
else:
@@ -53,21 +77,51 @@

@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
args = ["--enforce-eager", "--max-model-len", "256", "--dtype", DTYPE]
if model.get("is_qwen3_reranker"):
import json
prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'

[CI: pre-commit] Ruff (E501) at tests/entrypoints/openai/test_score.py:83:81: Line too long (212 > 80)
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
default_instruction = "Given a web search query, retrieve relevant passages that answer the query"

[CI: pre-commit] Ruff (E501) at tests/entrypoints/openai/test_score.py:85:81: Line too long (106 > 80)

hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
"score_template": {
"query_template":
f"{prefix}<Instruct>: {{instruction}}\n<Query>: {{query}}\n",
"document_template": f"<Document>: {{document}}{suffix}",
"default_context": {
"instruction": default_instruction
}
}
}
args.extend(["--hf-overrides", json.dumps(hf_overrides)])

with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server


@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
kwargs = {
"dtype": DTYPE,
"is_cross_encoder" if model["is_cross_encoder"]\
else "is_sentence_transformer": True
}
model_name = model["name"]
kwargs = {"dtype": DTYPE}

[CI: pre-commit] Ruff (F841) at tests/entrypoints/openai/test_score.py:109:5: Local variable `kwargs` is assigned to but never used
if model.get("is_qwen3_reranker"):
# For the HF reference, use the pre-converted Sequence Classification
# model to simplify the runner logic.
model_name = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
hf_runner_kwargs = {
"dtype": DTYPE,
"is_cross_encoder": True,
"trust_remote_code": True,
}
elif model["is_cross_encoder"]:
hf_runner_kwargs = {"dtype": DTYPE, "is_cross_encoder": True}
else:
hf_runner_kwargs = {"dtype": DTYPE, "is_sentence_transformer": True}

with hf_runner(model["name"], **kwargs) as hf_model:
with hf_runner(model_name, **hf_runner_kwargs) as hf_model:
yield hf_model


@@ -191,3 +245,75 @@
assert score_response.status_code == 400
assert "Please, select a smaller truncation size." in \
score_response.text

def test_rerank_with_template(self, server: RemoteOpenAIServer,
model: dict[str, Any], runner):
if not model.get("is_qwen3_reranker"):
pytest.skip("Test only for Qwen3 Reranker with template support.")

instruction = "Find the document that is most relevant to the query about national capitals."

[CI: pre-commit] Ruff (E501) at tests/entrypoints/openai/test_score.py:254:81: Line too long (101 > 80)
query = "What is the capital of China?"
documents = [
"The capital of France is Paris.",
"The capital of China is Beijing."
]

# vLLM run with custom instruction via kwargs
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": model["name"],
"query": query,
"documents": documents,
"score_template_kwargs": {
"instruction": instruction
}
Review comment (Contributor, on lines +268 to +270, severity: high): This test uses score_template_kwargs in a /rerank request. Per protocol.py, RerankRequest expects rerank_template_kwargs. Confirm this test aligns with the final protocol definition after the field name unification.

})
rerank_response.raise_for_status()
response_data = RerankResponse.model_validate(rerank_response.json())
vllm_outputs = {
res.document.text: res.relevance_score
for res in response_data.results
}

# HF reference run with the same custom instruction
text_pairs = [[query, doc] for doc in documents]
hf_outputs = _run_qwen3_reranker_hf(runner, text_pairs, instruction)

for i, doc in enumerate(documents):
assert vllm_outputs[doc] == pytest.approx(hf_outputs[i],
rel=0.01)

def test_score_with_template(self, server: RemoteOpenAIServer,
model: dict[str, Any], runner):
if not model.get("is_qwen3_reranker"):
pytest.skip("Test only for Qwen3 Reranker with template support.")

[CI: pre-commit] Ruff (E501) at tests/entrypoints/openai/test_score.py:290:81: Line too long (101 > 80)

instruction = "Find the document that is most relevant to the query about national capitals."
text_1 = "What is the capital of China?"
text_2 = [
"The capital of France is Paris.",
"The capital of China is Beijing."
]

# vLLM run with custom instruction via kwargs
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
"score_template_kwargs": {
"instruction": instruction
}
})
score_response.raise_for_status()
response_data = ScoreResponse.model_validate(score_response.json())
vllm_outputs = [res.score for res in response_data.data]

# HF reference run with the same custom instruction
text_pairs = [[text_1, doc] for doc in text_2]
hf_outputs = _run_qwen3_reranker_hf(runner, text_pairs, instruction)

for i in range(len(vllm_outputs)):
assert vllm_outputs[i] == pytest.approx(hf_outputs[i], rel=0.01)
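The snippet below is an illustrative, standalone sketch (not part of the diff): it reuses the prefix, suffix, and default instruction from the fixtures above to show that the server-side score_template rendering and the manual formatting in _run_qwen3_reranker_hf produce identical prompts, which is why the vLLM scores and the HF reference scores are expected to match within tolerance.

```python
# Illustrative only: both sides of the test render the same Qwen3 reranker
# prompt, so their relevance scores are directly comparable.
PREFIX = ('<|im_start|>system\nJudge whether the Document meets the '
          'requirements based on the Query and the Instruct provided. '
          'Note that the answer can only be "yes" or "no".'
          '<|im_end|>\n<|im_start|>user\n')
SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
INSTRUCTION = ("Given a web search query, retrieve relevant passages "
               "that answer the query")

query_template = PREFIX + "<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {document}" + SUFFIX

query = "What is the capital of China?"
doc = "The capital of China is Beijing."

# What the server renders from score_template plus its default_context.
server_q = query_template.format(instruction=INSTRUCTION, query=query)
server_d = document_template.format(document=doc)

# What _run_qwen3_reranker_hf builds for the HF reference model.
hf_q = f"{PREFIX}<Instruct>: {INSTRUCTION}\n<Query>: {query}\n"
hf_d = f"<Document>: {doc}{SUFFIX}"

assert (server_q, server_d) == (hf_q, hf_d)
```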
22 changes: 20 additions & 2 deletions vllm/entrypoints/openai/protocol.py
@@ -1194,7 +1194,16 @@ class ScoreRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."),
)

score_template: Optional[dict[str, str]] = Field(
Review thread on score_template:
Member: Ah, I see that these endpoints don't use chat templates now. In that case, I prefer separating query_template and document_template into separate arguments to avoid unnecessary nesting.
Author: Copy that 🫡
Author: But here we may need to pass in an instruction; that is, other parameters may need to be applied to the template. Maybe a template plus template kwargs is better?
Member: I think for user convenience it's better to separate the template and the template kwargs, since the kwargs aren't used that often.
Contributor: I think the template is a triple of query, document, and instruction; if the user does not give an instruction, use the default instruction. This should basically be consistent with mteb:
https://github.com/embeddings-benchmark/mteb/blob/4ff1413316727ecec7ddaeb280fe963f06e7c3fb/mteb/evaluation/evaluators/RetrievalEvaluator.py#L310-L329
Contributor: This should use the same API for the entire ecosystem, so I think it should be implemented by Sentence Transformers first.

default=None,
description=("A dictionary containing query_template and "
"document_template to format the scorer input."))
score_template_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=(
"Additional keyword args to pass to the template renderer. "
"Will be accessible by the score template."),
)
# --8<-- [end:score-extra-params]

def to_pooling_params(self):
@@ -1220,7 +1229,16 @@ class RerankRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."),
)

rerank_template: Optional[dict[str, str]] = Field(
default=None,
description=("A dictionary containing query_template and "
"document_template to format the reranker input.")
)
rerank_template_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("A dictionary of key-value pairs to be formatted into "
"the rerank model's template.")
)
Review comment (Contributor, on lines +1232 to +1241, severity: critical): The field names rerank_template and rerank_template_kwargs in RerankRequest are inconsistent with ScoreRequest's score_template and score_template_kwargs. To ensure consistency, unify the field names in RerankRequest to match ScoreRequest. Also, improve the description for rerank_template_kwargs by reusing the description from score_template_kwargs.
Suggested change
rerank_template: Optional[dict[str, str]] = Field(
default=None,
description=("A dictionary containing query_template and "
"document_template to format the reranker input.")
)
rerank_template_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("A dictionary of key-value pairs to be formatted into "
"the rerank model's template.")
)
score_template: Optional[dict[str, str]] = Field(
default=None,
description=("A dictionary containing query_template and "
"document_template to format the reranker input.")
)
score_template_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("Additional keyword args to pass to the template renderer. "
"Will be accessible by the rerank template.")
)

# --8<-- [end:rerank-extra-params]

def to_pooling_params(self):
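For context on how a client would use the new extra params, here is a hedged usage sketch (not from this PR). It assumes a vLLM OpenAI-compatible server started with a score_template via --hf-overrides, as in the test fixture above, reachable at localhost:8000, and it uses the /score route exercised by the tests; the /rerank field names are still being unified per the review comments.

```python
# Hypothetical client call: pass a per-request instruction through
# score_template_kwargs; the server fills it into the configured
# query_template / document_template before scoring.
import requests

payload = {
    "model": "Qwen/Qwen3-Reranker-0.6B",
    "text_1": "What is the capital of China?",
    "text_2": [
        "The capital of France is Paris.",
        "The capital of China is Beijing.",
    ],
    "score_template_kwargs": {
        "instruction": "Find the document that answers the geography question."
    },
}

# Assumes the server is reachable at this base URL.
resp = requests.post("http://localhost:8000/score", json=payload)
resp.raise_for_status()
scores = [item["score"] for item in resp.json()["data"]]
print(scores)
```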
79 changes: 64 additions & 15 deletions vllm/entrypoints/openai/serving_score.py
@@ -54,7 +54,7 @@
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
request_id=str,
request_id: str,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[Union[LoRARequest, None]] = None,
prompt_adapter_request: Optional[Union[PromptAdapterRequest,
@@ -139,31 +139,58 @@

return final_res_batch

async def _cross_encoding_score(
async def _preprocess_score(
self,
tokenizer: Union[AnyTokenizer],
request: Union[RerankRequest, ScoreRequest],
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
request_id=str,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[Union[LoRARequest, None]] = None,
prompt_adapter_request: Optional[Union[PromptAdapterRequest,
None]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
) -> list[PoolingRequestOutput]:

) -> tuple[list[str], list[TokensPrompt]]:
request_prompts: list[str] = []
engine_prompts: list[TokensPrompt] = []

if len(texts_1) == 1:
texts_1 = texts_1 * len(texts_2)

input_pairs = [(t1, t2) for t1, t2 in zip(texts_1, texts_2)]
def identity_processor(t1: str, t2: str) -> tuple[str, str]:
return t1, t2

if isinstance(tokenizer, MistralTokenizer):
raise ValueError(
"MistralTokenizer not supported for cross-encoding")
pair_processor = identity_processor

template_config = (request.score_template
or self.model_config.hf_config.get(
"score_template"))

[CI: pre-commit] mypy at vllm/entrypoints/openai/serving_score.py:161: Item "RerankRequest" of "Union[RerankRequest, ScoreRequest]" has no attribute "score_template" [union-attr]
Review comment (Contributor, on lines +161 to +163, severity: critical): The implementation relies on request.score_template, but for a RerankRequest the field is rerank_template in protocol.py. Fix this in conjunction with the comment on vllm/entrypoints/openai/protocol.py to unify the template-related field names.


if isinstance(template_config, dict):

def template_processor(t1: str, t2: str) -> tuple[str, str]:

[CI: pre-commit] mypy at vllm/entrypoints/openai/serving_score.py:167: Need type annotation for "default_context" [var-annotated]
default_context = template_config.get("default_context", {})
context = default_context.copy() if isinstance(
default_context, dict) else {}

[CI: pre-commit] mypy at vllm/entrypoints/openai/serving_score.py:170: Item "RerankRequest" of "Union[RerankRequest, ScoreRequest]" has no attribute "score_template_kwargs" [union-attr]
if request.score_template_kwargs:
context.update(request.score_template_kwargs)
Review comment (Contributor, on lines +171 to +172, severity: critical): This code relies on request.score_template_kwargs, but for a RerankRequest the field is rerank_template_kwargs in protocol.py. Fix this in conjunction with the comment on vllm/entrypoints/openai/protocol.py to unify the template-related field names.


context['query'] = t1
context['document'] = t2

query_template = template_config.get("query_template",
"{query}")
doc_template = template_config.get("document_template",
"{document}")

formatted_t1 = query_template.format(
**context) if "query_template" in template_config else t1
formatted_t2 = doc_template.format(
**context
) if "document_template" in template_config else t2
return formatted_t1, formatted_t2

pair_processor = template_processor

input_pairs = [
pair_processor(t1, t2) for t1, t2 in zip(texts_1, texts_2)
]

tokenize_async = make_async(tokenizer.__call__,
executor=self._tokenizer_executor)
@@ -186,6 +213,28 @@

request_prompts.append(request_prompt)
engine_prompts.append(engine_prompt)
return request_prompts, engine_prompts

async def _cross_encoding_score(
self,
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
request_id: str,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[Union[LoRARequest, None]] = None,
prompt_adapter_request: Optional[Union[PromptAdapterRequest,
None]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
) -> list[PoolingRequestOutput]:

if isinstance(tokenizer, MistralTokenizer):
raise ValueError(
"MistralTokenizer not supported for cross-encoding")

request_prompts, engine_prompts = await self._preprocess_score(
request, tokenizer, texts_1, texts_2, tokenization_kwargs)

# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
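To summarize the new preprocessing path, the following is a minimal, self-contained sketch of the pair-formatting behaviour that template_processor implements. The function name, signature, and example values are illustrative rather than the actual vLLM code, but the precedence matches the diff: default_context first, then request template kwargs, then the query/document texts.

```python
# Simplified sketch of the template_processor logic in _preprocess_score
# (names and structure are illustrative, not the actual vLLM code).
from typing import Any, Optional


def format_pair(
    t1: str,
    t2: str,
    template: Optional[dict[str, Any]],
    template_kwargs: Optional[dict[str, Any]],
) -> tuple[str, str]:
    if not isinstance(template, dict):
        # No template configured: score the raw pair.
        return t1, t2

    # Start from the template's defaults, let per-request kwargs override
    # them, then inject the query/document texts.
    context: dict[str, Any] = dict(template.get("default_context") or {})
    if template_kwargs:
        context.update(template_kwargs)
    context.update(query=t1, document=t2)

    q = (template["query_template"].format(**context)
         if "query_template" in template else t1)
    d = (template["document_template"].format(**context)
         if "document_template" in template else t2)
    return q, d


# Example: mirrors the Qwen3 reranker configuration used in the tests.
template = {
    "query_template": "<Instruct>: {instruction}\n<Query>: {query}\n",
    "document_template": "<Document>: {document}",
    "default_context": {"instruction": "Retrieve relevant passages."},
}
print(format_pair("What is the capital of China?",
                  "The capital of China is Beijing.",
                  template, {"instruction": "Focus on capitals."}))
```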