Skip to content

Commit 18eb389

Browse files
Improve default system prompt & JSON description. Add tests.
1 parent 14c7a96 commit 18eb389

26 files changed

+1875
-131
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,6 @@ cython_debug/
166166
.idea/.gitignore
167167
.idea/misc.xml
168168
.idea/modules.xml
169-
.idea/trustyai-service-v2.iml
170169
.idea/vcs.xml
171170
.idea/inspectionProfiles/profiles_settings.xml
172171

examples/basic_test.ipynb

Lines changed: 41 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 5,
5+
"execution_count": 1,
66
"metadata": {},
77
"outputs": [],
88
"source": [
@@ -31,7 +31,7 @@
3131
},
3232
{
3333
"cell_type": "code",
34-
"execution_count": 7,
34+
"execution_count": 2,
3535
"metadata": {},
3636
"outputs": [],
3737
"source": [
@@ -40,7 +40,7 @@
4040
},
4141
{
4242
"cell_type": "code",
43-
"execution_count": 8,
43+
"execution_count": 3,
4444
"metadata": {},
4545
"outputs": [],
4646
"source": [
@@ -50,20 +50,20 @@
5050
},
5151
{
5252
"cell_type": "code",
53-
"execution_count": 9,
53+
"execution_count": 4,
5454
"metadata": {},
5555
"outputs": [
5656
{
5757
"data": {
5858
"text/plain": [
59-
"{'decision': False,\n",
60-
" 'reasoning': 'The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.',\n",
59+
"{'decision': 'PASS',\n",
60+
" 'reasoning': 'The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.',\n",
6161
" 'score': None,\n",
6262
" 'metadata': {'model': 'qwen2',\n",
63-
" 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n \"score\": null\\n}'}}"
63+
" 'raw_response': '{\\n \"decision\": \"PASS\",\\n \"reasoning\": \"The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.\",\\n \"score\": null\\n}'}}"
6464
]
6565
},
66-
"execution_count": 9,
66+
"execution_count": 4,
6767
"metadata": {},
6868
"output_type": "execute_result"
6969
}
@@ -74,47 +74,47 @@
7474
},
7575
{
7676
"cell_type": "code",
77-
"execution_count": 10,
77+
"execution_count": 5,
7878
"metadata": {},
7979
"outputs": [
8080
{
8181
"data": {
8282
"text/plain": [
83-
"{'decision': 5,\n",
84-
" 'reasoning': 'The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.',\n",
85-
" 'score': 5.0,\n",
83+
"{'decision': 'False',\n",
84+
" 'reasoning': 'The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.',\n",
85+
" 'score': 0.2,\n",
8686
" 'metadata': {'model': 'qwen2',\n",
87-
" 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n \"score\": 5\\n}'}}"
87+
" 'raw_response': '{\\n \"decision\": \"False\",\\n \"reasoning\": \"The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.\",\\n \"score\": 0.2\\n}'}}"
8888
]
8989
},
90-
"execution_count": 10,
90+
"execution_count": 5,
9191
"metadata": {},
9292
"output_type": "execute_result"
9393
}
9494
],
9595
"source": [
9696
"res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n",
9797
" criteria=\"Check the professional tone.\",\n",
98-
" rubric=\"Assign a score between 0 and 10 based on the professional tone. 0 is the worst and 10 is the best.\")\n",
98+
" rubric=\"Assign a score between 0 and 1 based on the professional tone. 0 is the worst and 1 is the best.\")\n",
9999
"res.model_dump()"
100100
]
101101
},
102102
{
103103
"cell_type": "code",
104-
"execution_count": 11,
104+
"execution_count": 8,
105105
"metadata": {},
106106
"outputs": [
107107
{
108108
"data": {
109109
"text/plain": [
110-
"{'decision': 5,\n",
111-
" 'reasoning': 'The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.',\n",
112-
" 'score': 5.0,\n",
110+
"{'decision': 'True',\n",
111+
" 'reasoning': 'The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.',\n",
112+
" 'score': 1.0,\n",
113113
" 'metadata': {'model': 'qwen2',\n",
114-
" 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n \"score\": 5\\n}'}}"
114+
" 'raw_response': '{\\n \"decision\": \"True\",\\n \"reasoning\": \"The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.\",\\n \"score\": 1\\n}'}}"
115115
]
116116
},
117-
"execution_count": 11,
117+
"execution_count": 8,
118118
"metadata": {},
119119
"output_type": "execute_result"
120120
}
@@ -124,15 +124,17 @@
124124
" criteria=\"Check the professional tone.\",\n",
125125
" rubric={\n",
126126
" 0: \"The response is not professional.\",\n",
127-
" 5: \"The response is somewhat professional.\",\n",
128-
" 10: \"The response is very professional.\"\n",
129-
" })\n",
127+
" 0.5: \"The response is somewhat professional.\",\n",
128+
" 1: \"The response is very professional.\"\n",
129+
" },\n",
130+
" scale=(0, 1)\n",
131+
" )\n",
130132
"res.model_dump()"
131133
]
132134
},
133135
{
134136
"cell_type": "code",
135-
"execution_count": 12,
137+
"execution_count": 9,
136138
"metadata": {},
137139
"outputs": [],
138140
"source": [
@@ -146,20 +148,20 @@
146148
},
147149
{
148150
"cell_type": "code",
149-
"execution_count": 13,
151+
"execution_count": 10,
150152
"metadata": {},
151153
"outputs": [
152154
{
153155
"data": {
154156
"text/plain": [
155157
"{'decision': 'moderate',\n",
156-
" 'reasoning': 'The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.',\n",
158+
" 'reasoning': 'The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.',\n",
157159
" 'score': 5.0,\n",
158160
" 'metadata': {'model': 'qwen2',\n",
159-
" 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n \"score\": 5\\n}'}}"
161+
" 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.\",\\n \"score\": 5\\n}'}}"
160162
]
161163
},
162-
"execution_count": 13,
164+
"execution_count": 10,
163165
"metadata": {},
164166
"output_type": "execute_result"
165167
}
@@ -172,20 +174,20 @@
172174
},
173175
{
174176
"cell_type": "code",
175-
"execution_count": 14,
177+
"execution_count": 11,
176178
"metadata": {},
177179
"outputs": [
178180
{
179181
"data": {
180182
"text/plain": [
181183
"{'decision': 'non-professional',\n",
182-
" 'reasoning': 'The response uses informal and expletive language, which is not appropriate for a professional context.',\n",
183-
" 'score': 1.0,\n",
184+
" 'reasoning': \"The phrase 'Holy shit, this is a great!' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\n",
185+
" 'score': 2.0,\n",
184186
" 'metadata': {'model': 'qwen2',\n",
185-
" 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n \"score\": 1\\n}'}}"
187+
" 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The phrase \\'Holy shit, this is a great!\\' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\\n \"score\": 2\\n}'}}"
186188
]
187189
},
188-
"execution_count": 14,
190+
"execution_count": 11,
189191
"metadata": {},
190192
"output_type": "execute_result"
191193
}
@@ -198,22 +200,22 @@
198200
},
199201
{
200202
"cell_type": "code",
201-
"execution_count": 15,
203+
"execution_count": 12,
202204
"metadata": {},
203205
"outputs": [
204206
{
205207
"data": {
206208
"text/plain": [
207-
"{'decision': True,\n",
208-
" 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n",
209+
"{'decision': 'PASS',\n",
210+
" 'reasoning': 'The statement is accurate and complete as it correctly identifies Paris as the capital of France.',\n",
209211
" 'score': None,\n",
210212
" 'metadata': {'model': 'qwen2',\n",
211-
" 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n \"score\": null\\n}',\n",
213+
" 'raw_response': '{\\n \"decision\": \"PASS\",\\n \"reasoning\": \"The statement is accurate and complete as it correctly identifies Paris as the capital of France.\",\\n \"score\": null\\n}',\n",
212214
" 'template_vars': {'input': 'What is the capital of France?'},\n",
213215
" 'template_engine': 'format'}}"
214216
]
215217
},
216-
"execution_count": 15,
218+
"execution_count": 12,
217219
"metadata": {},
218220
"output_type": "execute_result"
219221
}

examples/templating.ipynb

Lines changed: 26 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -20,22 +20,22 @@
2020
},
2121
{
2222
"cell_type": "code",
23-
"execution_count": 4,
23+
"execution_count": 3,
2424
"metadata": {},
2525
"outputs": [
2626
{
2727
"data": {
2828
"text/plain": [
29-
"{'decision': 5,\n",
30-
" 'reasoning': 'The function is concise but uses a naive recursive approach which is inefficient and can lead to a stack overflow for large values of n. It lacks error handling and docstrings for documentation.',\n",
31-
" 'score': 5.0,\n",
29+
"{'decision': 'FAIL',\n",
30+
" 'reasoning': \"The function contains a typo ('fib' instead of 'fibonacci' in the recursive call) and lacks a proper docstring or comments, which are essential for production code. The code is also not optimized and may lead to a stack overflow for large values of n due to repeated calculations.\",\n",
31+
" 'score': 3.0,\n",
3232
" 'metadata': {'model': 'qwen2',\n",
33-
" 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The function is concise but uses a naive recursive approach which is inefficient and can lead to a stack overflow for large values of n. It lacks error handling and docstrings for documentation.\",\\n \"score\": 5\\n}',\n",
33+
" 'raw_response': '{\\n \"decision\": \"FAIL\",\\n \"reasoning\": \"The function contains a typo (\\'fib\\' instead of \\'fibonacci\\' in the recursive call) and lacks a proper docstring or comments, which are essential for production code. The code is also not optimized and may lead to a stack overflow for large values of n due to repeated calculations.\",\\n \"score\": 3\\n}',\n",
3434
" 'template_vars': {'language': 'Python', 'use_case': 'production deployment'},\n",
3535
" 'template_engine': 'format'}}"
3636
]
3737
},
38-
"execution_count": 4,
38+
"execution_count": 3,
3939
"metadata": {},
4040
"output_type": "execute_result"
4141
}
@@ -56,7 +56,7 @@
5656
},
5757
{
5858
"cell_type": "code",
59-
"execution_count": 5,
59+
"execution_count": 4,
6060
"metadata": {},
6161
"outputs": [],
6262
"source": [
@@ -76,7 +76,7 @@
7676
},
7777
{
7878
"cell_type": "code",
79-
"execution_count": 6,
79+
"execution_count": 5,
8080
"metadata": {},
8181
"outputs": [],
8282
"source": [
@@ -100,22 +100,22 @@
100100
},
101101
{
102102
"cell_type": "code",
103-
"execution_count": 7,
103+
"execution_count": 6,
104104
"metadata": {},
105105
"outputs": [
106106
{
107107
"data": {
108108
"text/plain": [
109-
"{'decision': 1,\n",
110-
" 'reasoning': \"The function does not perform any data science tasks and incorrectly uses the name 'fib' instead of 'fibonacci' in the recursive call, which will cause an error.\",\n",
111-
" 'score': None,\n",
109+
"{'decision': 'False',\n",
110+
" 'reasoning': 'The code is a recursive implementation of the Fibonacci sequence, but it lacks an appropriate base case check and is inefficient for larger values of n due to repeated calculations. This makes it unsuitable for data science applications where performance and efficiency are crucial.',\n",
111+
" 'score': 1.0,\n",
112112
" 'metadata': {'model': 'qwen2',\n",
113-
" 'raw_response': '{\\n \"decision\": 1,\\n \"reasoning\": \"The function does not perform any data science tasks and incorrectly uses the name \\'fib\\' instead of \\'fibonacci\\' in the recursive call, which will cause an error.\",\\n \"score\": null\\n}',\n",
113+
" 'raw_response': '{\\n \"decision\": \"False\",\\n \"reasoning\": \"The code is a recursive implementation of the Fibonacci sequence, but it lacks an appropriate base case check and is inefficient for larger values of n due to repeated calculations. This makes it unsuitable for data science applications where performance and efficiency are crucial.\",\\n \"score\": 1\\n}',\n",
114114
" 'template_vars': {'language': 'Python', 'purpose': 'data science'},\n",
115115
" 'template_engine': 'format'}}"
116116
]
117117
},
118-
"execution_count": 7,
118+
"execution_count": 6,
119119
"metadata": {},
120120
"output_type": "execute_result"
121121
}
@@ -126,22 +126,22 @@
126126
},
127127
{
128128
"cell_type": "code",
129-
"execution_count": 8,
129+
"execution_count": 7,
130130
"metadata": {},
131131
"outputs": [
132132
{
133133
"data": {
134134
"text/plain": [
135-
"{'decision': 5,\n",
136-
" 'reasoning': 'The function is a correct implementation of the Fibonacci sequence, but it lacks an optimization for performance and does not consider edge cases like negative numbers or non-integer inputs.',\n",
137-
" 'score': 5.0,\n",
135+
"{'decision': 'false',\n",
136+
" 'reasoning': 'The function is a correct implementation of the Fibonacci sequence, but it is not efficient for large values of n due to its exponential time complexity. This makes it unsuitable for web frontend applications where performance is critical.',\n",
137+
" 'score': 1.0,\n",
138138
" 'metadata': {'model': 'qwen2',\n",
139-
" 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The function is a correct implementation of the Fibonacci sequence, but it lacks an optimization for performance and does not consider edge cases like negative numbers or non-integer inputs.\",\\n \"score\": 5\\n}',\n",
139+
" 'raw_response': '{\\n \"decision\": \"false\",\\n \"reasoning\": \"The function is a correct implementation of the Fibonacci sequence, but it is not efficient for large values of n due to its exponential time complexity. This makes it unsuitable for web frontend applications where performance is critical.\",\\n \"score\": 1\\n}',\n",
140140
" 'template_vars': {'language': 'JavaScript', 'purpose': 'web frontend'},\n",
141141
" 'template_engine': 'format'}}"
142142
]
143143
},
144-
"execution_count": 8,
144+
"execution_count": 7,
145145
"metadata": {},
146146
"output_type": "execute_result"
147147
}
@@ -152,7 +152,7 @@
152152
},
153153
{
154154
"cell_type": "code",
155-
"execution_count": 9,
155+
"execution_count": 8,
156156
"metadata": {},
157157
"outputs": [],
158158
"source": [
@@ -191,25 +191,25 @@
191191
},
192192
{
193193
"cell_type": "code",
194-
"execution_count": 10,
194+
"execution_count": 9,
195195
"metadata": {},
196196
"outputs": [
197197
{
198198
"data": {
199199
"text/plain": [
200200
"{'decision': 'DECENT',\n",
201-
" 'reasoning': 'The code is simple and returns a JSON response, but it lacks RESTful design principles, error handling, and proper documentation. There is no indication of authentication or authorization, which are critical for API security.',\n",
202-
" 'score': 4.0,\n",
201+
" 'reasoning': 'The API endpoint follows a basic RESTful design by using the GET method for retrieving users. However, it lacks error handling and documentation, which are crucial for a robust API. Additionally, there is no mention of authentication and authorization, which are essential for securing the API.',\n",
202+
" 'score': 5.0,\n",
203203
" 'metadata': {'model': 'qwen2',\n",
204-
" 'raw_response': '{\\n \"decision\": \"DECENT\",\\n \"reasoning\": \"The code is simple and returns a JSON response, but it lacks RESTful design principles, error handling, and proper documentation. There is no indication of authentication or authorization, which are critical for API security.\",\\n \"score\": 4\\n}',\n",
204+
" 'raw_response': '{\\n \"decision\": \"DECENT\",\\n \"reasoning\": \"The API endpoint follows a basic RESTful design by using the GET method for retrieving users. However, it lacks error handling and documentation, which are crucial for a robust API. Additionally, there is no mention of authentication and authorization, which are essential for securing the API.\",\\n \"score\": 5\\n}',\n",
205205
" 'template_vars': {'aspects': ['RESTful design',\n",
206206
" 'Error handling',\n",
207207
" 'Documentation'],\n",
208208
" 'security_critical': True},\n",
209209
" 'template_engine': 'jinja2'}}"
210210
]
211211
},
212-
"execution_count": 10,
212+
"execution_count": 9,
213213
"metadata": {},
214214
"output_type": "execute_result"
215215
}

src/vllm_judge/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
ModelSpecificMetric
1818
)
1919
from vllm_judge.templating import TemplateProcessor
20-
from vllm_judge.metrics import (
20+
from vllm_judge.builtin_metrics import (
2121
# General metrics
2222
HELPFULNESS,
2323
ACCURACY,

src/vllm_judge/api/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,3 @@
1-
"""
2-
API module for vLLM Judge.
3-
"""
41
from vllm_judge.api.server import app, create_app, start_server
52
from vllm_judge.api.client import JudgeClient
63
from vllm_judge.api.models import (

src/vllm_judge/api/client.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,3 @@
1-
"""
2-
HTTP client for vLLM Judge API.
3-
"""
41
import asyncio
52
from typing import Union, Dict, List, Optional, Tuple, Any, AsyncIterator
63
import httpx

0 commit comments

Comments
 (0)