
Commit e1dfeb4

Update IntentResolution evaluator: now more reliable, faster and cheaper (#41560)
* Update IntentResolution evaluator. The new evaluator is a significant improvement over the previous version: it has lower intra-model variance (0.046 vs 0.118), lower inter-model variance (0.345 vs 0.688), a higher percentage of most frequent score (75.4% vs 64.5%), is 2x faster to execute, and is cheaper (as it has fewer input and output tokens)
* Added unit tests, fixed corner cases, and updated the changelog
* Update speller exceptions
* Add logger
1 parent d66a7b0 commit e1dfeb4

File tree: 7 files changed, +825 -108 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,9 @@
 ### Features Added
 
 ### Bugs Fixed
+
+- Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
+
 - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
 
 ## 1.8.0 (2025-05-29)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 110 additions & 1 deletion
@@ -13,7 +13,7 @@
 from typing_extensions import NotRequired, Required, TypeGuard
 from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -481,6 +481,115 @@ def raise_exception(msg, target):
             ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
 
+def _extract_text_from_content(content):
+    text = []
+    for msg in content:
+        if 'text' in msg:
+            text.append(msg['text'])
+    return text
+
+def _get_conversation_history(query):
+    all_user_queries = []
+    cur_user_query = []
+    all_agent_responses = []
+    cur_agent_response = []
+    for msg in query:
+        if not 'role' in msg:
+            continue
+        if msg['role'] == 'user' and 'content' in msg:
+            if cur_agent_response != []:
+                all_agent_responses.append(cur_agent_response)
+                cur_agent_response = []
+            text_in_msg = _extract_text_from_content(msg['content'])
+            if text_in_msg:
+                cur_user_query.append(text_in_msg)
+
+        if msg['role'] == 'assistant' and 'content' in msg:
+            if cur_user_query != []:
+                all_user_queries.append(cur_user_query)
+                cur_user_query = []
+            text_in_msg = _extract_text_from_content(msg['content'])
+            if text_in_msg:
+                cur_agent_response.append(text_in_msg)
+    if cur_user_query != []:
+        all_user_queries.append(cur_user_query)
+    if cur_agent_response != []:
+        all_agent_responses.append(cur_agent_response)
+
+    if len(all_user_queries) != len(all_agent_responses) + 1:
+        raise EvaluationException(
+            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    return {
+        'user_queries': all_user_queries,
+        'agent_responses': all_agent_responses
+    }
+
+def _pretty_format_conversation_history(conversation_history):
+    """Formats the conversation history for better readability."""
+    formatted_history = ""
+    for i, (user_query, agent_response) in enumerate(zip(conversation_history['user_queries'], conversation_history['agent_responses'] + [None])):
+        formatted_history += f"User turn {i+1}:\n"
+        for msg in user_query:
+            formatted_history += "  " + "\n  ".join(msg)
+            formatted_history += "\n\n"
+        if agent_response:
+            formatted_history += f"Agent turn {i+1}:\n"
+            for msg in agent_response:
+                formatted_history += "  " + "\n  ".join(msg)
+                formatted_history += "\n\n"
+    return formatted_history
+
+def reformat_conversation_history(query, logger=None):
+    """Reformats the conversation history to a more compact representation."""
+    try:
+        conversation_history = _get_conversation_history(query)
+        return _pretty_format_conversation_history(conversation_history)
+    except:
+        # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
+        # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
+        # From our tests the negative impact on IntentResolution is:
+        #   Higher intra model variance (0.142 vs 0.046)
+        #   Higher inter model variance (0.345 vs 0.607)
+        #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
+        #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
+        if logger:
+            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
+        return query
+
+def _get_agent_response(agent_response_msgs):
+    """Extracts the text from the agent response content."""
+    agent_response_text = []
+    for msg in agent_response_msgs:
+        if 'role' in msg and msg['role'] == 'assistant' and 'content' in msg:
+            text = _extract_text_from_content(msg['content'])
+            if text:
+                agent_response_text.extend(text)
+    return agent_response_text
+
+def reformat_agent_response(response, logger=None):
+    try:
+        if response is None or response == []:
+            return ""
+        agent_response = _get_agent_response(response)
+        if agent_response == []:
+            # If no message could be extracted, likely the format changed, fallback to the original response in that case
+            if logger:
+                logger.warning(f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}")
+            return response
+        return "\n".join(agent_response)
+    except:
+        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
+        return response
+
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.
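
For context, a minimal usage sketch of the two new helpers on an agent-style message list. The message shape below (a role plus a list of content parts carrying "text" fields) is an assumed example of the converter format these helpers parse, and the import path simply mirrors the file location in this diff; the module is internal, so this is illustrative rather than a supported public API:

import logging

from azure.ai.evaluation._common.utils import (
    reformat_conversation_history,
    reformat_agent_response,
)

logger = logging.getLogger(__name__)

# Assumed converter-style messages: the conversation must end on a user turn,
# since _get_conversation_history expects one more user query than agent response.
query = [
    {"role": "user", "content": [{"type": "text", "text": "What is the weather in Paris?"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "Let me check that for you."}]},
    {"role": "user", "content": [{"type": "text", "text": "Will it rain tomorrow?"}]},
]
response = [
    {"role": "assistant", "content": [{"type": "text", "text": "Light rain is expected tomorrow afternoon."}]},
]

# Produces a compact "User turn 1 / Agent turn 1 / User turn 2" transcript;
# if parsing fails for any reason, the original query is returned and a warning is logged.
print(reformat_conversation_history(query, logger))

# Joins the assistant's text parts into a single newline-separated string ("" for an empty response).
print(reformat_agent_response(response, logger))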

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

Lines changed: 12 additions & 8 deletions
@@ -3,16 +3,19 @@
 # ---------------------------------------------------------
 import os
 import math
+import logging
 from typing import Dict, Union, List, Optional
 
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation, Message
-from ..._common.utils import check_score_is_valid
+from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
 from azure.ai.evaluation._common._experimental import experimental
 
+logger = logging.getLogger(__name__)
+
 @experimental
 class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
@@ -116,7 +119,7 @@ def __call__( # pylint: disable=docstring-missing-param
         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
         """
         return super().__call__(*args, **kwargs)
-
+
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
         """Do intent resolution evaluation.
@@ -135,10 +138,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
             )
+        # reformat query and response to the format expected by the prompty flow
+        eval_input['query'] = reformat_conversation_history(eval_input["query"], logger)
+        eval_input['response'] = reformat_agent_response(eval_input["response"], logger)
+
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
         if isinstance(llm_output, dict):
-            score = llm_output.get("resolution_score", math.nan)
+            score = llm_output.get("score", math.nan)
             if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
                 raise EvaluationException(
                     message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
@@ -150,17 +157,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             score = float(score)
             score_result = 'pass' if score >= self.threshold else 'fail'
 
-            #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
-            if 'explanation' in llm_output: llm_output.pop("explanation")
-            if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
-
             response_dict = {
                 f"{self._result_key}" : score,
                 f"{self._result_key}_result" : score_result,
                 f"{self._result_key}_threshold" : self.threshold,
                 f"{self._result_key}_reason" : reason,
-                f"additional_details" : llm_output
             }
             return response_dict
         # If llm_output is not a dictionary, return NaN for the score. This should never happen
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
         return {self._result_key: math.nan}
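
For reference, a minimal sketch of how the updated evaluator is called end to end. The model configuration values are placeholders, and the output key names assume the evaluator's _result_key is "intent_resolution" (the diff only shows them as f"{self._result_key}..."); treat this as an illustration rather than the exact public contract:

from azure.ai.evaluation import IntentResolutionEvaluator

# Placeholder Azure OpenAI configuration; substitute a real endpoint, key and deployment.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

evaluator = IntentResolutionEvaluator(model_config=model_config)

# Plain strings still work; message-list inputs are now flattened by
# reformat_conversation_history / reformat_agent_response before the prompty call.
result = evaluator(
    query="What are the opening hours of the Eiffel Tower?",
    response="The Eiffel Tower is open from 9:00 AM to 11:45 PM.",
)

# Assuming _result_key == "intent_resolution", the returned dict carries the score,
# a pass/fail result against the threshold, the threshold itself, and the reasoning.
print(result["intent_resolution"], result["intent_resolution_result"])
print(result["intent_resolution_reason"])

Note that, per the removed lines above, the additional_details field is no longer included in the returned dictionary after this change.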
