 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
 from typing import Union
 import uuid
 
+from google.genai import types as genai_types
 from pydantic import ValidationError
 
+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
 
 
 logger = logging.getLogger("google_adk." + __name__)
 
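The rest of this diff repeatedly applies the same optional-dependency pattern: eval-only modules are imported lazily inside the methods that use them, and a missing module is re-raised with `MISSING_EVAL_DEPENDENCIES_MESSAGE`. A minimal standalone sketch of that pattern, with a placeholder message and the assumed full module path rather than anything taken from the diff:

```python
# Minimal sketch of the lazy-import guard used throughout this change.
# The message text below is a placeholder, not the real
# MISSING_EVAL_DEPENDENCIES_MESSAGE constant.
MISSING_EVAL_DEPENDENCIES_MESSAGE = (
    "Eval dependencies are missing. Install the eval extras to use this API."
)


def _load_evaluation_generator():
  """Imports an eval-only module on first use instead of at package import."""
  try:
    # Assumed absolute module path; the diff uses the relative form
    # `from .evaluation_generator import EvaluationGenerator`.
    from google.adk.evaluation.evaluation_generator import EvaluationGenerator
  except ModuleNotFoundError as e:
    # Re-raise with an actionable message instead of a bare import failure.
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
  return EvaluationGenerator
```

This keeps `import google.adk` working even when pandas, tabulate, and the evaluator modules are not installed; the failure only surfaces when an eval API is actually called.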
@@ -96,6 +98,7 @@ async def evaluate_eval_set(
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -109,14 +112,22 @@ async def evaluate_eval_set(
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
     """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     eval_case_responses_list = await EvaluationGenerator.generate_responses(
         eval_set=eval_set,
         agent_module_path=agent_module,
         repeat_num=num_runs,
         agent_name=agent_name,
     )
 
+    failures = []
+
     for eval_case_responses in eval_case_responses_list:
       actual_invocations = [
           invocation
@@ -139,10 +150,25 @@ async def evaluate_eval_set(
             )
         )
 
-        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
-            f" but got {evaluation_result.overall_score}."
-        )
+        if print_detailed_results:
+          AgentEvaluator._print_details(
+              evaluation_result=evaluation_result,
+              metric_name=metric_name,
+              threshold=threshold,
+          )
+
+        # Gather all the failures.
+        if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+          failures.append(
+              f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+              f" but got {evaluation_result.overall_score}."
+          )
+
+    assert not failures, (
+        "Following are all the test failures. If you are looking to get more"
+        " details on the failures, then please re-run this test with"
+        " `print_detailed_results` set to `True`.\n{}".format("\n".join(failures))
+    )
 
   @staticmethod
   async def evaluate(
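For context, a hedged sketch of how a test might call the updated `evaluate_eval_set`. The fixture paths, agent module, and criteria values are invented for illustration, and the metric keys are assumed to match ADK's default metric names rather than being read from this diff:

```python
# Hypothetical pytest-style usage of the updated evaluate_eval_set.
import json

import pytest

from google.adk.evaluation.agent_evaluator import AgentEvaluator
from google.adk.evaluation.eval_set import EvalSet


@pytest.mark.asyncio
async def test_weather_agent_eval_set():
  # Placeholder fixture file; EvalSet is a pydantic model in ADK.
  with open("tests/fixtures/weather_agent.evalset.json") as f:
    eval_set = EvalSet.model_validate(json.load(f))

  # With print_detailed_results=True (the new default), a per-invocation table
  # is printed for every metric before any failures are asserted.
  await AgentEvaluator.evaluate_eval_set(
      agent_module="tests.fixtures.weather_agent",
      eval_set=eval_set,
      criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8},
      num_runs=2,
      print_detailed_results=True,
  )
```

Because failures are now collected into a list and asserted once at the end, a single run reports every failing metric instead of stopping at the first one.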
@@ -158,9 +184,10 @@ async def evaluate(
       agent_module: The path to python module that contains the definition of
         the agent. There is convention in place here, where the code is going to
         look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
-        full path to the file containing eval dataset, or a directory that is
-        recursively explored for all files that have a `.test.json` suffix.
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing the full path to the file containing the eval
+        dataset, or a directory that is recursively explored for all files
+        that have a `.test.json` suffix.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
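A short hypothetical illustration of the two accepted forms of `eval_dataset_file_path_or_dir`; the agent module and paths below are placeholders, not files from this repository:

```python
# Hypothetical usage of AgentEvaluator.evaluate with a file and a directory.
import asyncio

from google.adk.evaluation.agent_evaluator import AgentEvaluator


async def main():
  # Point at a single eval file...
  await AgentEvaluator.evaluate(
      agent_module="tests.fixtures.weather_agent",
      eval_dataset_file_path_or_dir="tests/fixtures/weather_agent/simple.test.json",
  )

  # ...or at a directory that is explored recursively for `*.test.json` files.
  await AgentEvaluator.evaluate(
      agent_module="tests.fixtures.weather_agent",
      eval_dataset_file_path_or_dir="tests/fixtures/weather_agent/",
      num_runs=2,
  )


if __name__ == "__main__":
  asyncio.run(main())
```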
@@ -358,6 +385,11 @@ def _validate_input(eval_dataset, criteria):
 
   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
       return TrajectoryEvaluator(threshold=threshold)
     elif (
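To make the dispatch above concrete, a rough standalone sketch of how a criteria dict maps metric names to evaluators. The metric key strings are assumptions based on ADK's documented defaults, not constants shown in this diff:

```python
# Standalone sketch mirroring _get_metric_evaluator's dispatch; the metric key
# names below are assumed defaults, not values read from this diff.
from google.adk.evaluation.response_evaluator import ResponseEvaluator
from google.adk.evaluation.trajectory_evaluator import TrajectoryEvaluator

criteria = {
    "tool_trajectory_avg_score": 1.0,  # tool-call trajectory must match exactly
    "response_match_score": 0.8,  # minimum final-response similarity score
}

for metric_name, threshold in criteria.items():
  if metric_name == "tool_trajectory_avg_score":
    evaluator = TrajectoryEvaluator(threshold=threshold)
  else:
    evaluator = ResponseEvaluator(threshold=threshold, metric_name=metric_name)
  print(type(evaluator).__name__, metric_name, threshold)
```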
@@ -367,3 +399,60 @@ def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
       return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
 
     raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      from pandas import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""
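As a quick illustration of the text extraction done by `_convert_content_to_text`: the helper keeps only parts that carry text and joins them with newlines. The prompt below is made up; `Content` and `Part` come from the google-genai SDK:

```python
# Illustration of extracting text from a genai Content, mirroring the helper.
from google.genai import types as genai_types

content = genai_types.Content(
    role="user",
    parts=[
        genai_types.Part(text="What is the weather in Paris?"),
        genai_types.Part(text="Answer in one sentence."),
    ],
)

# Keep only parts that carry text and join them with newlines, as the helper does.
print("\n".join(p.text for p in content.parts if p.text))
```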