7
7
import torch .nn .functional as F
8
8
from torch import tensor
9
9
10
- from vllm .entrypoints .openai .protocol import ScoreResponse
10
+ from vllm .entrypoints .openai .protocol import RerankResponse , ScoreResponse
11
11
12
12
from ...utils import RemoteOpenAIServer
13
13
@@ -29,11 +29,35 @@ def v1(run_with_both_engines):
29
29
"name" : "BAAI/bge-base-en-v1.5" ,
30
30
"is_cross_encoder" : False
31
31
},
32
+ {
33
+ "name" : "Qwen/Qwen3-Reranker-0.6B" ,
34
+ "is_cross_encoder" : True ,
35
+ "is_qwen3_reranker" : True ,
36
+ },
32
37
]
33
38
# Dtype name used for both the vLLM server ("--dtype" argument) and the
# HF reference runner ("dtype" keyword).
DTYPE = "half"
34
39
35
40
41
+ def _run_qwen3_reranker_hf (hf_model , text_pairs , instruction ):
42
+ """Helper to run Qwen3 reranker with HF, applying the template."""
43
+ prefix = '<|im_start|>system\n Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n <|im_start|>user\n '
44
+ suffix = "<|im_end|>\n <|im_start|>assistant\n <think>\n \n </think>\n \n "
45
+
46
+ formatted_pairs = []
47
+ for query , doc in text_pairs :
48
+ q_formatted = f"{ prefix } <Instruct>: { instruction } \n <Query>: { query } \n "
49
+ d_formatted = f"<Document>: { doc } { suffix } "
50
+ formatted_pairs .append ([q_formatted , d_formatted ])
51
+
52
+ return hf_model .predict (formatted_pairs ).tolist ()
53
+
54
+
36
55
def run_transformers (hf_model , model , text_pairs ):
56
+ if model .get ("is_qwen3_reranker" ):
57
+ # The default instruction used in the server fixture.
58
+ default_instruction = "Given a web search query, retrieve relevant passages that answer the query"
59
+ return _run_qwen3_reranker_hf (hf_model , text_pairs ,
60
+ default_instruction )
37
61
if model ["is_cross_encoder" ]:
38
62
return hf_model .predict (text_pairs ).tolist ()
39
63
else :
@@ -53,21 +77,51 @@ def model(request):
53
77
54
78
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
    """Start a ``RemoteOpenAIServer`` for ``model`` and yield it.

    For entries flagged ``is_qwen3_reranker`` an ``--hf-overrides`` JSON
    payload is appended to the server arguments. It carries the
    architecture override, the classifier tokens and a score template
    with a default instruction.

    Args:
        model: Model description dict; ``"name"`` is required and
            ``"is_qwen3_reranker"`` is optional.

    Yields:
        The running ``RemoteOpenAIServer`` instance.
    """
    args = ["--enforce-eager", "--max-model-len", "256", "--dtype", DTYPE]
    if model.get("is_qwen3_reranker"):
        import json

        # Whitespace is significant: no padding after newlines or around
        # the placeholders.
        prefix = (
            '<|im_start|>system\n'
            'Judge whether the Document meets the requirements based on '
            'the Query and the Instruct provided. Note that the answer can '
            'only be "yes" or "no".<|im_end|>\n'
            '<|im_start|>user\n'
        )
        suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
        default_instruction = (
            "Given a web search query, retrieve relevant passages that "
            "answer the query")

        # "{instruction}", "{query}" and "{document}" are sent to the server
        # as literal placeholders, so they are plain (non f-string) text.
        hf_overrides = {
            "architectures": ["Qwen3ForSequenceClassification"],
            "classifier_from_token": ["no", "yes"],
            "is_original_qwen3_reranker": True,
            "score_template": {
                "query_template":
                prefix + "<Instruct>: {instruction}\n<Query>: {query}\n",
                "document_template": "<Document>: {document}" + suffix,
                "default_context": {
                    "instruction": default_instruction
                },
            },
        }
        args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    with RemoteOpenAIServer(model["name"], args) as remote_server:
        yield remote_server
60
104
61
105
62
106
@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
    """Yield the HF reference model matching ``model``.

    Args:
        model: Model description dict with ``"name"`` and
            ``"is_cross_encoder"``; ``"is_qwen3_reranker"`` is optional.
        hf_runner: Factory used as a context manager to load the HF model.

    Yields:
        The object produced by entering ``hf_runner(...)``.
    """
    model_name = model["name"]
    # Every variant shares the dtype; only the loader flags differ.
    hf_runner_kwargs: dict[str, Any] = {"dtype": DTYPE}

    if model.get("is_qwen3_reranker"):
        # For the HF reference, use the pre-converted Sequence Classification
        # model to simplify the runner logic.
        model_name = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
        hf_runner_kwargs["is_cross_encoder"] = True
        hf_runner_kwargs["trust_remote_code"] = True
    elif model["is_cross_encoder"]:
        hf_runner_kwargs["is_cross_encoder"] = True
    else:
        hf_runner_kwargs["is_sentence_transformer"] = True

    with hf_runner(model_name, **hf_runner_kwargs) as hf_model:
        yield hf_model
72
126
73
127
@@ -191,3 +245,75 @@ def test_score_max_model_len(self, server: RemoteOpenAIServer,
191
245
assert score_response .status_code == 400
192
246
assert "Please, select a smaller truncation size." in \
193
247
score_response .text
248
+
249
+ def test_rerank_with_template (self , server : RemoteOpenAIServer ,
250
+ model : dict [str , Any ], runner ):
251
+ if not model .get ("is_qwen3_reranker" ):
252
+ pytest .skip ("Test only for Qwen3 Reranker with template support." )
253
+
254
+ instruction = "Find the document that is most relevant to the query about national capitals."
255
+ query = "What is the capital of China?"
256
+ documents = [
257
+ "The capital of France is Paris." ,
258
+ "The capital of China is Beijing."
259
+ ]
260
+
261
+ # vLLM run with custom instruction via kwargs
262
+ rerank_response = requests .post (
263
+ server .url_for ("rerank" ),
264
+ json = {
265
+ "model" : model ["name" ],
266
+ "query" : query ,
267
+ "documents" : documents ,
268
+ "score_template_kwargs" : {
269
+ "instruction" : instruction
270
+ }
271
+ })
272
+ rerank_response .raise_for_status ()
273
+ response_data = RerankResponse .model_validate (rerank_response .json ())
274
+ vllm_outputs = {
275
+ res .document .text : res .relevance_score
276
+ for res in response_data .results
277
+ }
278
+
279
+ # HF reference run with the same custom instruction
280
+ text_pairs = [[query , doc ] for doc in documents ]
281
+ hf_outputs = _run_qwen3_reranker_hf (runner , text_pairs , instruction )
282
+
283
+ for i , doc in enumerate (documents ):
284
+ assert vllm_outputs [doc ] == pytest .approx (hf_outputs [i ],
285
+ rel = 0.01 )
286
+
287
+ def test_score_with_template (self , server : RemoteOpenAIServer ,
288
+ model : dict [str , Any ], runner ):
289
+ if not model .get ("is_qwen3_reranker" ):
290
+ pytest .skip ("Test only for Qwen3 Reranker with template support." )
291
+
292
+ instruction = "Find the document that is most relevant to the query about national capitals."
293
+ text_1 = "What is the capital of China?"
294
+ text_2 = [
295
+ "The capital of France is Paris." ,
296
+ "The capital of China is Beijing."
297
+ ]
298
+
299
+ # vLLM run with custom instruction via kwargs
300
+ score_response = requests .post (
301
+ server .url_for ("score" ),
302
+ json = {
303
+ "model" : model ["name" ],
304
+ "text_1" : text_1 ,
305
+ "text_2" : text_2 ,
306
+ "score_template_kwargs" : {
307
+ "instruction" : instruction
308
+ }
309
+ })
310
+ score_response .raise_for_status ()
311
+ response_data = ScoreResponse .model_validate (score_response .json ())
312
+ vllm_outputs = [res .score for res in response_data .data ]
313
+
314
+ # HF reference run with the same custom instruction
315
+ text_pairs = [[text_1 , doc ] for doc in text_2 ]
316
+ hf_outputs = _run_qwen3_reranker_hf (runner , text_pairs , instruction )
317
+
318
+ for i in range (len (vllm_outputs )):
319
+ assert vllm_outputs [i ] == pytest .approx (hf_outputs [i ], rel = 0.01 )
0 commit comments