@@ -8,7 +8,7 @@ vLLM Judge uses a single `evaluate()` method that adapts to your needs:

``` python
result = await judge.evaluate(
-    response="...",    # What to evaluate
+    content="...",     # What to evaluate
    criteria="...",    # What to evaluate for
    # Optional parameters to control evaluation
)
@@ -23,13 +23,13 @@ The simplest form - just provide text and criteria:
``` python
# Basic evaluation
result = await judge.evaluate(
-    response="The Earth is the third planet from the Sun.",
+    content="The Earth is the third planet from the Sun.",
    criteria="scientific accuracy"
)

# Multiple criteria
result = await judge.evaluate(
-    response="Dear customer, thank you for your feedback...",
+    content="Dear customer, thank you for your feedback...",
    criteria="professionalism, empathy, and clarity"
)
```
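
Reading the returned result is not shown in this hunk; the sketch below assumes the result object exposes `decision`, `score`, and `reasoning` attributes, which is an assumption rather than something confirmed by this diff.

``` python
# Minimal sketch of inspecting an evaluation result.
# Attribute names (decision, score, reasoning) are assumptions.
result = await judge.evaluate(
    content="The Earth is the third planet from the Sun.",
    criteria="scientific accuracy"
)
print(result.decision)   # assumed: overall verdict or label
print(result.score)      # assumed: numeric score when a scale is given
print(result.reasoning)  # assumed: the judge's explanation
```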
@@ -51,14 +51,14 @@ Control the scoring range:
``` python
# 5-point scale
result = await judge.evaluate(
-    response="The product works as advertised.",
+    content="The product works as advertised.",
    criteria="review helpfulness",
    scale=(1, 5)
)

# 100-point scale for fine-grained scoring
result = await judge.evaluate(
-    response=essay_text,
+    content=essay_text,
    criteria="writing quality",
    scale=(0, 100)
)
@@ -70,7 +70,7 @@ Provide evaluation guidance as text:

``` python
result = await judge.evaluate(
-    response="I hate this product!",
+    content="I hate this product!",
    criteria="sentiment analysis",
    rubric="Classify as 'positive', 'neutral', or 'negative' based on emotional tone"
)
@@ -83,7 +83,7 @@ Define specific score meanings:

``` python
result = await judge.evaluate(
-    response=code_snippet,
+    content=code_snippet,
    criteria="code quality",
    scale=(1, 10),
    rubric={
@@ -104,7 +104,7 @@ Compare two responses by providing a dictionary:
``` python
# Compare two responses
result = await judge.evaluate(
-    response={
+    content={
        "a": "Python is great for beginners due to its simple syntax.",
        "b": "Python's intuitive syntax makes it ideal for newcomers."
    },
@@ -114,7 +114,7 @@ result = await judge.evaluate(

# With additional context
result = await judge.evaluate(
-    response={
+    content={
        "a": customer_response_1,
        "b": customer_response_2
    },
@@ -131,7 +131,7 @@ Add context to improve evaluation accuracy:

``` python
result = await judge.evaluate(
-    response="Just use the default settings.",
+    content="Just use the default settings.",
    criteria="helpfulness",
    context="User asked how to configure advanced security settings"
)
@@ -144,7 +144,7 @@ Guide the evaluation with examples:

``` python
result = await judge.evaluate(
-    response="Your code has a bug on line 5.",
+    content="Your code has a bug on line 5.",
    criteria="constructive feedback quality",
    scale=(1, 10),
    examples=[
@@ -169,7 +169,7 @@ Take full control of the evaluator's persona:
``` python
# Expert evaluator
result = await judge.evaluate(
-    response=medical_advice,
+    content=medical_advice,
    criteria="medical accuracy and safety",
    system_prompt="""You are a licensed medical professional reviewing
    health information for accuracy and potential harm. Be extremely
@@ -178,7 +178,7 @@ result = await judge.evaluate(

# Specific domain expert
result = await judge.evaluate(
-    response=legal_document,
+    content=legal_document,
    criteria="legal compliance",
    system_prompt="""You are a corporate lawyer specializing in GDPR
    compliance. Evaluate for regulatory adherence."""
@@ -193,7 +193,7 @@ When you provide a scale, you get numeric scoring:

``` python
result = await judge.evaluate(
-    response="Great product!",
+    content="Great product!",
    criteria="review quality",
    scale=(1, 5)
)
@@ -208,7 +208,7 @@ Without a scale but with category rubric:

``` python
result = await judge.evaluate(
-    response="This might be considered offensive.",
+    content="This might be considered offensive.",
    criteria="content moderation",
    rubric="Classify as 'safe', 'warning', or 'unsafe'"
)
@@ -223,7 +223,7 @@ For yes/no evaluations:

``` python
result = await judge.evaluate(
-    response=user_message,
+    content=user_message,
    criteria="spam detection",
    rubric="Determine if this is 'spam' or 'not spam'"
)
@@ -237,7 +237,7 @@ You can request both classification and scoring:

``` python
result = await judge.evaluate(
-    response=essay,
+    content=essay,
    criteria="academic quality",
    rubric="""
    Grade the essay:
@@ -263,7 +263,7 @@ result = await judge.evaluate(
async def qa_check(response: str, threshold: float = 7.0):
    """Check if response meets quality threshold."""
    result = await judge.evaluate(
-        response=response,
+        content=response,
        criteria="helpfulness, accuracy, and professionalism",
        scale=(1, 10)
    )
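
The hunk cuts `qa_check` off after the `evaluate()` call; a minimal sketch of how the threshold check might be completed is below, assuming the result exposes a numeric `score` attribute.

``` python
async def qa_check(response: str, threshold: float = 7.0):
    """Check if response meets quality threshold."""
    result = await judge.evaluate(
        content=response,
        criteria="helpfulness, accuracy, and professionalism",
        scale=(1, 10)
    )
    # Assumed attribute: result.score carries the numeric rating.
    passed = result.score is not None and result.score >= threshold
    return passed, result
```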
@@ -283,7 +283,7 @@ async def qa_check(response: str, threshold: float = 7.0):
async def compare_models(prompt: str, response_a: str, response_b: str):
    """Compare two model responses."""
    result = await judge.evaluate(
-        response={"a": response_a, "b": response_b},
+        content={"a": response_a, "b": response_b},
        criteria="helpfulness, accuracy, and clarity",
        context=f"User prompt: {prompt}"
    )
@@ -310,7 +310,7 @@ async def comprehensive_evaluation(content: str):
    results = {}
    for aspect, criteria in aspects.items():
        result = await judge.evaluate(
-            response=content,
+            content=content,
            criteria=criteria,
            scale=(1, 10)
        )
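
The loop body ends here in the diff; one way the per-aspect results might be collected and returned is sketched below, with `aspects` passed in explicitly only to keep the example self-contained and `result.score` assumed to hold the numeric rating.

``` python
# Minimal sketch of aggregating per-aspect scores.
# aspects is a parameter here purely for self-containment;
# result.score is an assumed attribute.
async def comprehensive_evaluation(content: str, aspects: dict):
    results = {}
    for aspect, criteria in aspects.items():
        result = await judge.evaluate(
            content=content,
            criteria=criteria,
            scale=(1, 10)
        )
        results[aspect] = result.score
    return results
```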