13
13
# limitations under the License.
14
14
15
15
import asyncio
16
+ import collections
16
17
from contextlib import asynccontextmanager
17
18
from datetime import datetime
18
19
import logging
19
20
import os
20
21
import tempfile
22
+ from typing import AsyncGenerator
23
+ from typing import Coroutine
21
24
from typing import Optional
22
25
23
26
import click
27
30
from . import cli_create
28
31
from . import cli_deploy
29
32
from .. import version
33
+ from ..evaluation .local_eval_set_results_manager import LocalEvalSetResultsManager
34
+ from ..sessions .in_memory_session_service import InMemorySessionService
30
35
from .cli import run_cli
31
36
from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
32
37
from .fast_api import get_fast_api_app
@@ -306,7 +311,7 @@ def cli_eval(
306
311
EvalMetric (metric_name = metric_name , threshold = threshold )
307
312
)
308
313
309
- print (f"Using evaluation creiteria : { evaluation_criteria } " )
314
+ print (f"Using evaluation criteria : { evaluation_criteria } " )
310
315
311
316
root_agent = get_root_agent (agent_module_file_path )
312
317
reset_func = try_get_reset_func (agent_module_file_path )
@@ -325,21 +330,47 @@ def cli_eval(
325
330
e for e in eval_set .eval_cases if e .eval_id in eval_case_ids
326
331
]
327
332
328
- eval_set_id_to_eval_cases [eval_set_file_path ] = eval_cases
333
+ eval_set_id_to_eval_cases [eval_set . eval_set_id ] = eval_cases
329
334
330
335
async def _collect_eval_results () -> list [EvalCaseResult ]:
331
- return [
332
- result
333
- async for result in run_evals (
334
- eval_set_id_to_eval_cases , root_agent , reset_func , eval_metrics
335
- )
336
- ]
336
+ session_service = InMemorySessionService ()
337
+ eval_case_results = []
338
+ async for eval_case_result in run_evals (
339
+ eval_set_id_to_eval_cases ,
340
+ root_agent ,
341
+ reset_func ,
342
+ eval_metrics ,
343
+ session_service = session_service ,
344
+ ):
345
+ eval_case_result .session_details = await session_service .get_session (
346
+ app_name = os .path .basename (agent_module_file_path ),
347
+ user_id = eval_case_result .user_id ,
348
+ session_id = eval_case_result .session_id ,
349
+ )
350
+ eval_case_results .append (eval_case_result )
351
+ return eval_case_results
337
352
338
353
try :
339
354
eval_results = asyncio .run (_collect_eval_results ())
340
355
except ModuleNotFoundError :
341
356
raise click .ClickException (MISSING_EVAL_DEPENDENCIES_MESSAGE )
342
357
358
+ # Write eval set results.
359
+ local_eval_set_results_manager = LocalEvalSetResultsManager (
360
+ agent_dir = os .path .dirname (agent_module_file_path )
361
+ )
362
+ eval_set_id_to_eval_results = collections .defaultdict (list )
363
+ for eval_case_result in eval_results :
364
+ eval_set_id = eval_case_result .eval_set_id
365
+ eval_set_id_to_eval_results [eval_set_id ].append (eval_case_result )
366
+
367
+ for eval_set_id , eval_case_results in eval_set_id_to_eval_results .items ():
368
+ local_eval_set_results_manager .save_eval_set_result (
369
+ app_name = os .path .basename (agent_module_file_path ),
370
+ eval_set_id = eval_set_id ,
371
+ eval_case_results = eval_case_results ,
372
+ )
373
+
343
374
print ("*********************************************************************" )
344
375
eval_run_summary = {}
345
376
0 commit comments