|
13 | 13 | from typing_extensions import NotRequired, Required, TypeGuard
|
14 | 14 | from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
|
15 | 15 | from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
|
16 |
| -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException |
| 16 | +from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException |
17 | 17 | from azure.ai.evaluation._model_configurations import (
|
18 | 18 | AzureAIProject,
|
19 | 19 | AzureOpenAIModelConfiguration,
|
@@ -481,6 +481,115 @@ def raise_exception(msg, target):
|
481 | 481 | ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
482 | 482 | )
|
483 | 483 |
|
| 484 | +def _extract_text_from_content(content): |
| 485 | + text = [] |
| 486 | + for msg in content: |
| 487 | + if 'text' in msg: |
| 488 | + text.append(msg['text']) |
| 489 | + return text |
| 490 | + |
def _get_conversation_history(query):
    """Group a flat converter-formatted message list into alternating turns.

    Consecutive messages with the same role are merged into a single turn.
    The history must end with an unanswered user turn, i.e. there must be
    exactly one more user turn than agent turns.

    :param query: list of message dicts with 'role' and 'content' keys;
        entries without a 'role' are skipped.
    :return: dict with 'user_queries' and 'agent_responses', each a list of
        turns, where each turn is a list of text-lists extracted from the
        consecutive messages of that turn.
    :raises EvaluationException: if user turns do not outnumber agent turns
        by exactly one (malformed history).
    """
    all_user_queries = []
    cur_user_query = []
    all_agent_responses = []
    cur_agent_response = []
    for msg in query:
        if 'role' not in msg:
            continue
        if msg['role'] == 'user' and 'content' in msg:
            # A user message closes any in-progress agent turn.
            if cur_agent_response:
                all_agent_responses.append(cur_agent_response)
                cur_agent_response = []
            text_in_msg = _extract_text_from_content(msg['content'])
            if text_in_msg:
                cur_user_query.append(text_in_msg)

        if msg['role'] == 'assistant' and 'content' in msg:
            # An assistant message closes any in-progress user turn.
            if cur_user_query:
                all_user_queries.append(cur_user_query)
                cur_user_query = []
            text_in_msg = _extract_text_from_content(msg['content'])
            if text_in_msg:
                cur_agent_response.append(text_in_msg)
    # Flush trailing partial turns.
    if cur_user_query:
        all_user_queries.append(cur_user_query)
    if cur_agent_response:
        all_agent_responses.append(cur_agent_response)

    if len(all_user_queries) != len(all_agent_responses) + 1:
        raise EvaluationException(
            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
        )

    return {
        'user_queries': all_user_queries,
        'agent_responses': all_agent_responses
    }
| 532 | + |
| 533 | +def _pretty_format_conversation_history(conversation_history): |
| 534 | + """Formats the conversation history for better readability.""" |
| 535 | + formatted_history = "" |
| 536 | + for i, (user_query, agent_response) in enumerate(zip(conversation_history['user_queries'], conversation_history['agent_responses']+[None])): |
| 537 | + formatted_history+=f"User turn {i+1}:\n" |
| 538 | + for msg in user_query: |
| 539 | + formatted_history+=" " + "\n ".join(msg) |
| 540 | + formatted_history+="\n\n" |
| 541 | + if agent_response: |
| 542 | + formatted_history+=f"Agent turn {i+1}:\n" |
| 543 | + for msg in agent_response: |
| 544 | + formatted_history+=" " + "\n ".join(msg) |
| 545 | + formatted_history+="\n\n" |
| 546 | + return formatted_history |
| 547 | + |
def reformat_conversation_history(query, logger=None):
    """Reformats the conversation history to a more compact representation.

    :param query: list of converter-formatted messages (dicts with 'role'/'content').
    :param logger: optional logger used to warn when parsing fails.
    :return: a formatted transcript string on success, or the original query
        unchanged if it cannot be parsed.
    """
    try:
        conversation_history = _get_conversation_history(query)
        return _pretty_format_conversation_history(conversation_history)
    # Narrow catch: a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    except Exception:
        # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
        # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
        # From our tests the negative impact on IntentResolution is:
        #   Higher intra model variance (0.142 vs 0.046)
        #   Higher inter model variance (0.345 vs 0.607)
        #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
        #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
        if logger:
            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
        return query
| 564 | + |
| 565 | +def _get_agent_response(agent_response_msgs): |
| 566 | + """Extracts the text from the agent response content.""" |
| 567 | + agent_response_text = [] |
| 568 | + for msg in agent_response_msgs: |
| 569 | + if 'role' in msg and msg['role'] == 'assistant' and 'content' in msg: |
| 570 | + text = _extract_text_from_content(msg['content']) |
| 571 | + if text: |
| 572 | + agent_response_text.extend(text) |
| 573 | + return agent_response_text |
| 574 | + |
def reformat_agent_response(response, logger=None):
    """Reformats an agent response message list into a single text string.

    :param response: list of converter-formatted assistant messages, or None.
    :param logger: optional logger used to warn when extraction fails.
    :return: newline-joined assistant text; "" for empty input; the original
        response unchanged if nothing could be extracted or parsing fails.
    """
    try:
        if response is None or response == []:
            return ""
        agent_response = _get_agent_response(response)
        if agent_response == []:
            # If no message could be extracted, likely the format changed, fallback to the original response in that case
            if logger:
                logger.warning(f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}")
            return response
        return "\n".join(agent_response)
    # Narrow catch: a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    except Exception:
        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
        if logger:
            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
        return response
| 592 | + |
484 | 593 | def upload(path: str, container_client: ContainerClient, logger=None):
|
485 | 594 | """Upload files or directories to Azure Blob Storage using a container client.
|
486 | 595 |
|
|
0 commit comments