
Commit e1dfeb4

Update IntentResolution evaluator: now more reliable, faster and cheaper (#41560)
* Update IntentResolution evaluator. The new evaluator is a significant improvement over the previous version: it has lower intra-model variance (0.046 vs 0.118), lower inter-model variance (0.345 vs 0.688), a higher percentage of most frequent score (75.4% vs 64.5%), is 2x faster to execute, and is cheaper (as it has fewer input and output tokens)
* Added unit tests, fixed corner cases, and updated the changelog
* Update speller exceptions
* Add logger
1 parent d66a7b0 commit e1dfeb4

File tree: 7 files changed, +825 -108 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,9 @@
 ### Features Added
 
 ### Bugs Fixed
+
+- Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
+
 - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
 
 ## 1.8.0 (2025-05-29)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 110 additions & 1 deletion
@@ -13,7 +13,7 @@
 from typing_extensions import NotRequired, Required, TypeGuard
 from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -481,6 +481,115 @@ def raise_exception(msg, target):
             ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
 
+def _extract_text_from_content(content):
+    text = []
+    for msg in content:
+        if 'text' in msg:
+            text.append(msg['text'])
+    return text
+
+def _get_conversation_history(query):
+    all_user_queries = []
+    cur_user_query = []
+    all_agent_responses = []
+    cur_agent_response = []
+    for msg in query:
+        if not 'role' in msg:
+            continue
+        if msg['role'] == 'user' and 'content' in msg:
+            if cur_agent_response != []:
+                all_agent_responses.append(cur_agent_response)
+                cur_agent_response = []
+            text_in_msg = _extract_text_from_content(msg['content'])
+            if text_in_msg:
+                cur_user_query.append(text_in_msg)
+
+        if msg['role'] == 'assistant' and 'content' in msg:
+            if cur_user_query != []:
+                all_user_queries.append(cur_user_query)
+                cur_user_query = []
+            text_in_msg = _extract_text_from_content(msg['content'])
+            if text_in_msg:
+                cur_agent_response.append(text_in_msg)
+    if cur_user_query != []:
+        all_user_queries.append(cur_user_query)
+    if cur_agent_response != []:
+        all_agent_responses.append(cur_agent_response)
+
+    if len(all_user_queries) != len(all_agent_responses) + 1:
+        raise EvaluationException(
+            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    return {
+        'user_queries': all_user_queries,
+        'agent_responses': all_agent_responses
+    }
+
+def _pretty_format_conversation_history(conversation_history):
+    """Formats the conversation history for better readability."""
+    formatted_history = ""
+    for i, (user_query, agent_response) in enumerate(zip(conversation_history['user_queries'], conversation_history['agent_responses'] + [None])):
+        formatted_history += f"User turn {i+1}:\n"
+        for msg in user_query:
+            formatted_history += "  " + "\n  ".join(msg)
+            formatted_history += "\n\n"
+        if agent_response:
+            formatted_history += f"Agent turn {i+1}:\n"
+            for msg in agent_response:
+                formatted_history += "  " + "\n  ".join(msg)
+                formatted_history += "\n\n"
+    return formatted_history
+
+def reformat_conversation_history(query, logger=None):
+    """Reformats the conversation history to a more compact representation."""
+    try:
+        conversation_history = _get_conversation_history(query)
+        return _pretty_format_conversation_history(conversation_history)
+    except:
+        # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
+        # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
+        # From our tests the negative impact on IntentResolution is:
+        #   Higher intra model variance (0.142 vs 0.046)
+        #   Higher inter model variance (0.345 vs 0.607)
+        #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
+        #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
+        if logger:
+            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
+        return query
+
+def _get_agent_response(agent_response_msgs):
+    """Extracts the text from the agent response content."""
+    agent_response_text = []
+    for msg in agent_response_msgs:
+        if 'role' in msg and msg['role'] == 'assistant' and 'content' in msg:
+            text = _extract_text_from_content(msg['content'])
+            if text:
+                agent_response_text.extend(text)
+    return agent_response_text
+
+def reformat_agent_response(response, logger=None):
+    try:
+        if response is None or response == []:
+            return ""
+        agent_response = _get_agent_response(response)
+        if agent_response == []:
+            # If no message could be extracted, likely the format changed, fallback to the original response in that case
+            if logger:
+                logger.warning(f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}")
+            return response
+        return "\n".join(agent_response)
+    except:
+        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
+        return response
+
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.
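
For context, a minimal usage sketch of the two new helpers on an agent-style message list. The message shape below (a role plus a list of content parts carrying "text" fields) is an assumed example of the converter format these helpers parse, and the import path simply mirrors the file location in this diff; the module is internal, so this is illustrative rather than a supported public API:

import logging

from azure.ai.evaluation._common.utils import (
    reformat_conversation_history,
    reformat_agent_response,
)

logger = logging.getLogger(__name__)

# Assumed converter-style messages: the conversation must end on a user turn,
# since _get_conversation_history expects one more user query than agent response.
query = [
    {"role": "user", "content": [{"type": "text", "text": "What is the weather in Paris?"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "Let me check that for you."}]},
    {"role": "user", "content": [{"type": "text", "text": "Will it rain tomorrow?"}]},
]
response = [
    {"role": "assistant", "content": [{"type": "text", "text": "Light rain is expected tomorrow afternoon."}]},
]

# Produces a compact "User turn 1 / Agent turn 1 / User turn 2" transcript;
# if parsing fails for any reason, the original query is returned and a warning is logged.
print(reformat_conversation_history(query, logger))

# Joins the assistant's text parts into a single newline-separated string ("" for an empty response).
print(reformat_agent_response(response, logger))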

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

Lines changed: 12 additions & 8 deletions
@@ -3,16 +3,19 @@
 # ---------------------------------------------------------
 import os
 import math
+import logging
 from typing import Dict, Union, List, Optional
 
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation, Message
-from ..._common.utils import check_score_is_valid
+from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
 from azure.ai.evaluation._common._experimental import experimental
 
+logger = logging.getLogger(__name__)
+
 @experimental
 class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
@@ -116,7 +119,7 @@ def __call__( # pylint: disable=docstring-missing-param
         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
         """
         return super().__call__(*args, **kwargs)
-
+
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
         """Do intent resolution evaluation.
@@ -135,10 +138,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
             )
+        # reformat query and response to the format expected by the prompty flow
+        eval_input['query'] = reformat_conversation_history(eval_input["query"], logger)
+        eval_input['response'] = reformat_agent_response(eval_input["response"], logger)
+
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
         if isinstance(llm_output, dict):
-            score = llm_output.get("resolution_score", math.nan)
+            score = llm_output.get("score", math.nan)
             if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
                 raise EvaluationException(
                     message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
@@ -150,17 +157,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             score = float(score)
             score_result = 'pass' if score >= self.threshold else 'fail'
 
-            #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
-            if 'explanation' in llm_output: llm_output.pop("explanation")
-            if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
-
             response_dict = {
                 f"{self._result_key}" : score,
                 f"{self._result_key}_result" : score_result,
                 f"{self._result_key}_threshold" : self.threshold,
                 f"{self._result_key}_reason" : reason,
-                f"additional_details" : llm_output
             }
             return response_dict
         # If llm_output is not a dictionary, return NaN for the score. This should never happen
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
         return {self._result_key: math.nan}
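
For reference, a minimal sketch of how the updated evaluator is called end to end. The model configuration values are placeholders, and the output key names assume the evaluator's _result_key is "intent_resolution" (the diff only shows them as f"{self._result_key}..."); treat this as an illustration rather than the exact public contract:

from azure.ai.evaluation import IntentResolutionEvaluator

# Placeholder Azure OpenAI configuration; substitute a real endpoint, key and deployment.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

evaluator = IntentResolutionEvaluator(model_config=model_config)

# Plain strings still work; message-list inputs are now flattened by
# reformat_conversation_history / reformat_agent_response before the prompty call.
result = evaluator(
    query="What are the opening hours of the Eiffel Tower?",
    response="The Eiffel Tower is open from 9:00 AM to 11:45 PM.",
)

# Assuming _result_key == "intent_resolution", the returned dict carries the score,
# a pass/fail result against the threshold, the threshold itself, and the reasoning.
print(result["intent_resolution"], result["intent_resolution_result"])
print(result["intent_resolution_reason"])

Note that, per the removed lines above, the additional_details field is no longer included in the returned dictionary after this change.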
