|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 5, |
| 5 | + "execution_count": 1, |
6 | 6 | "metadata": {},
|
7 | 7 | "outputs": [],
|
8 | 8 | "source": [
|
|
31 | 31 | },
|
32 | 32 | {
|
33 | 33 | "cell_type": "code",
|
34 |
| - "execution_count": 7, |
| 34 | + "execution_count": 2, |
35 | 35 | "metadata": {},
|
36 | 36 | "outputs": [],
|
37 | 37 | "source": [
|
|
40 | 40 | },
|
41 | 41 | {
|
42 | 42 | "cell_type": "code",
|
43 |
| - "execution_count": 8, |
| 43 | + "execution_count": 3, |
44 | 44 | "metadata": {},
|
45 | 45 | "outputs": [],
|
46 | 46 | "source": [
|
|
50 | 50 | },
|
51 | 51 | {
|
52 | 52 | "cell_type": "code",
|
53 |
| - "execution_count": 9, |
| 53 | + "execution_count": 4, |
54 | 54 | "metadata": {},
|
55 | 55 | "outputs": [
|
56 | 56 | {
|
57 | 57 | "data": {
|
58 | 58 | "text/plain": [
|
59 |
| - "{'decision': False,\n", |
60 |
| - " 'reasoning': 'The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.',\n", |
| 59 | + "{'decision': 'PASS',\n", |
| 60 | + " 'reasoning': 'The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.',\n", |
61 | 61 | " 'score': None,\n",
|
62 | 62 | " 'metadata': {'model': 'qwen2',\n",
|
63 |
| - " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n \"score\": null\\n}'}}" |
| 63 | + " 'raw_response': '{\\n \"decision\": \"PASS\",\\n \"reasoning\": \"The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.\",\\n \"score\": null\\n}'}}" |
64 | 64 | ]
|
65 | 65 | },
|
66 |
| - "execution_count": 9, |
| 66 | + "execution_count": 4, |
67 | 67 | "metadata": {},
|
68 | 68 | "output_type": "execute_result"
|
69 | 69 | }
|
|
74 | 74 | },
|
75 | 75 | {
|
76 | 76 | "cell_type": "code",
|
77 |
| - "execution_count": 10, |
| 77 | + "execution_count": 5, |
78 | 78 | "metadata": {},
|
79 | 79 | "outputs": [
|
80 | 80 | {
|
81 | 81 | "data": {
|
82 | 82 | "text/plain": [
|
83 |
| - "{'decision': 5,\n", |
84 |
| - " 'reasoning': 'The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.',\n", |
85 |
| - " 'score': 5.0,\n", |
| 83 | + "{'decision': 'False',\n", |
| 84 | + " 'reasoning': 'The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.',\n", |
| 85 | + " 'score': 0.2,\n", |
86 | 86 | " 'metadata': {'model': 'qwen2',\n",
|
87 |
| - " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n \"score\": 5\\n}'}}" |
| 87 | + " 'raw_response': '{\\n \"decision\": \"False\",\\n \"reasoning\": \"The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.\",\\n \"score\": 0.2\\n}'}}" |
88 | 88 | ]
|
89 | 89 | },
|
90 |
| - "execution_count": 10, |
| 90 | + "execution_count": 5, |
91 | 91 | "metadata": {},
|
92 | 92 | "output_type": "execute_result"
|
93 | 93 | }
|
94 | 94 | ],
|
95 | 95 | "source": [
|
96 | 96 | "res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n",
|
97 | 97 | " criteria=\"Check the professional tone.\",\n",
|
98 |
| - " rubric=\"Assign a score between 0 and 10 based on the professional tone. 0 is the worst and 10 is the best.\")\n", |
| 98 | + " rubric=\"Assign a score between 0 and 1 based on the professional tone. 0 is the worst and 1 is the best.\")\n", |
99 | 99 | "res.model_dump()"
|
100 | 100 | ]
|
101 | 101 | },
|
102 | 102 | {
|
103 | 103 | "cell_type": "code",
|
104 |
| - "execution_count": 11, |
| 104 | + "execution_count": 8, |
105 | 105 | "metadata": {},
|
106 | 106 | "outputs": [
|
107 | 107 | {
|
108 | 108 | "data": {
|
109 | 109 | "text/plain": [
|
110 |
| - "{'decision': 5,\n", |
111 |
| - " 'reasoning': 'The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.',\n", |
112 |
| - " 'score': 5.0,\n", |
| 110 | + "{'decision': 'True',\n", |
| 111 | + " 'reasoning': 'The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.',\n", |
| 112 | + " 'score': 1.0,\n", |
113 | 113 | " 'metadata': {'model': 'qwen2',\n",
|
114 |
| - " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n \"score\": 5\\n}'}}" |
| 114 | + " 'raw_response': '{\\n \"decision\": \"True\",\\n \"reasoning\": \"The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.\",\\n \"score\": 1\\n}'}}" |
115 | 115 | ]
|
116 | 116 | },
|
117 |
| - "execution_count": 11, |
| 117 | + "execution_count": 8, |
118 | 118 | "metadata": {},
|
119 | 119 | "output_type": "execute_result"
|
120 | 120 | }
|
|
124 | 124 | " criteria=\"Check the professional tone.\",\n",
|
125 | 125 | " rubric={\n",
|
126 | 126 | " 0: \"The response is not professional.\",\n",
|
127 |
| - " 5: \"The response is somewhat professional.\",\n", |
128 |
| - " 10: \"The response is very professional.\"\n", |
129 |
| - " })\n", |
| 127 | + " 0.5: \"The response is somewhat professional.\",\n", |
| 128 | + " 1: \"The response is very professional.\"\n", |
| 129 | + " },\n", |
| 130 | + " scale=(0, 1)\n", |
| 131 | + " )\n", |
130 | 132 | "res.model_dump()"
|
131 | 133 | ]
|
132 | 134 | },
|
133 | 135 | {
|
134 | 136 | "cell_type": "code",
|
135 |
| - "execution_count": 12, |
| 137 | + "execution_count": 9, |
136 | 138 | "metadata": {},
|
137 | 139 | "outputs": [],
|
138 | 140 | "source": [
|
|
146 | 148 | },
|
147 | 149 | {
|
148 | 150 | "cell_type": "code",
|
149 |
| - "execution_count": 13, |
| 151 | + "execution_count": 10, |
150 | 152 | "metadata": {},
|
151 | 153 | "outputs": [
|
152 | 154 | {
|
153 | 155 | "data": {
|
154 | 156 | "text/plain": [
|
155 | 157 | "{'decision': 'moderate',\n",
|
156 |
| - " 'reasoning': 'The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.',\n", |
| 158 | + " 'reasoning': 'The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.',\n", |
157 | 159 | " 'score': 5.0,\n",
|
158 | 160 | " 'metadata': {'model': 'qwen2',\n",
|
159 |
| - " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n \"score\": 5\\n}'}}" |
| 161 | + " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.\",\\n \"score\": 5\\n}'}}" |
160 | 162 | ]
|
161 | 163 | },
|
162 |
| - "execution_count": 13, |
| 164 | + "execution_count": 10, |
163 | 165 | "metadata": {},
|
164 | 166 | "output_type": "execute_result"
|
165 | 167 | }
|
|
172 | 174 | },
|
173 | 175 | {
|
174 | 176 | "cell_type": "code",
|
175 |
| - "execution_count": 14, |
| 177 | + "execution_count": 11, |
176 | 178 | "metadata": {},
|
177 | 179 | "outputs": [
|
178 | 180 | {
|
179 | 181 | "data": {
|
180 | 182 | "text/plain": [
|
181 | 183 | "{'decision': 'non-professional',\n",
|
182 |
| - " 'reasoning': 'The response uses informal and expletive language, which is not appropriate for a professional context.',\n", |
183 |
| - " 'score': 1.0,\n", |
| 184 | + " 'reasoning': \"The phrase 'Holy shit, this is a great!' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\n", |
| 185 | + " 'score': 2.0,\n", |
184 | 186 | " 'metadata': {'model': 'qwen2',\n",
|
185 |
| - " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n \"score\": 1\\n}'}}" |
| 187 | + " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The phrase \\'Holy shit, this is a great!\\' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\\n \"score\": 2\\n}'}}" |
186 | 188 | ]
|
187 | 189 | },
|
188 |
| - "execution_count": 14, |
| 190 | + "execution_count": 11, |
189 | 191 | "metadata": {},
|
190 | 192 | "output_type": "execute_result"
|
191 | 193 | }
|
|
198 | 200 | },
|
199 | 201 | {
|
200 | 202 | "cell_type": "code",
|
201 |
| - "execution_count": 15, |
| 203 | + "execution_count": 12, |
202 | 204 | "metadata": {},
|
203 | 205 | "outputs": [
|
204 | 206 | {
|
205 | 207 | "data": {
|
206 | 208 | "text/plain": [
|
207 |
| - "{'decision': True,\n", |
208 |
| - " 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n", |
| 209 | + "{'decision': 'PASS',\n", |
| 210 | + " 'reasoning': 'The statement is accurate and complete as it correctly identifies Paris as the capital of France.',\n", |
209 | 211 | " 'score': None,\n",
|
210 | 212 | " 'metadata': {'model': 'qwen2',\n",
|
211 |
| - " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n \"score\": null\\n}',\n", |
| 213 | + " 'raw_response': '{\\n \"decision\": \"PASS\",\\n \"reasoning\": \"The statement is accurate and complete as it correctly identifies Paris as the capital of France.\",\\n \"score\": null\\n}',\n", |
212 | 214 | " 'template_vars': {'input': 'What is the capital of France?'},\n",
|
213 | 215 | " 'template_engine': 'format'}}"
|
214 | 216 | ]
|
215 | 217 | },
|
216 |
| - "execution_count": 15, |
| 218 | + "execution_count": 12, |
217 | 219 | "metadata": {},
|
218 | 220 | "output_type": "execute_result"
|
219 | 221 | }
|
|
0 commit comments