
Commit 4f6d36d

TaskAdherence V2 prompt updates (#41616)

ghyadavsingankit and singankit authored

* Merging all commits to one
* Adding black
* updating CHANGELOG.md
* updating CHANGELOG.md to also have Features Section
* updating CHANGELOG.md to also have Features Section
* updating _version.py
* run black
* Update CHANGELOG.md
* Update _version.py

Co-authored-by: Ankit Singhal <30610298+singankit@users.noreply.github.com>

1 parent 416eee7 · commit 4f6d36d

6 files changed: +659 −87 lines


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

8 additions & 0 deletions

@@ -1,5 +1,13 @@
 # Release History
 
+## 1.10.0 (Unreleased)
+
+### Features Added
+
+### Bugs Fixed
+
+- Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
+
 ## 1.9.0 (2025-07-02)
 
 ### Features Added
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

69 additions & 10 deletions

@@ -494,14 +494,17 @@ def _extract_text_from_content(content):
     return text
 
 
-def _get_conversation_history(query):
+def _get_conversation_history(query, include_system_messages=False):
     all_user_queries = []
     cur_user_query = []
     all_agent_responses = []
     cur_agent_response = []
+    system_message = None
     for msg in query:
         if not "role" in msg:
             continue
+        if include_system_messages and msg["role"] == "system" and "content" in msg:
+            system_message = msg.get("content", "")
         if msg["role"] == "user" and "content" in msg:
             if cur_agent_response != []:
                 all_agent_responses.append(cur_agent_response)
@@ -530,13 +533,18 @@ def _get_conversation_history(query):
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
-
-    return {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    if include_system_messages:
+        result["system_message"] = system_message
+    return result
 
 
 def _pretty_format_conversation_history(conversation_history):
     """Formats the conversation history for better readability."""
     formatted_history = ""
+    if "system_message" in conversation_history and conversation_history["system_message"] is not None:
+        formatted_history += "SYSTEM_PROMPT:\n"
+        formatted_history += " " + conversation_history["system_message"] + "\n\n"
     for i, (user_query, agent_response) in enumerate(
         zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
     ):
@@ -552,10 +560,10 @@ def _pretty_format_conversation_history(conversation_history):
     return formatted_history
 
 
-def reformat_conversation_history(query, logger=None):
+def reformat_conversation_history(query, logger=None, include_system_messages=False):
    """Reformats the conversation history to a more compact representation."""
     try:
-        conversation_history = _get_conversation_history(query)
+        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
         return _pretty_format_conversation_history(conversation_history)
     except:
         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
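Taken together, these hunks make the reformatted history carry the system prompt. A minimal sketch of the new behavior, assuming the role/content message schema this module already parses; the conversation below is hypothetical, and the per-turn formatting after the SYSTEM_PROMPT prefix comes from code outside this diff:

```python
from azure.ai.evaluation._common.utils import reformat_conversation_history

# Hypothetical conversation in the role/content schema parsed above.
query = [
    {"role": "system", "content": "You are a concise travel assistant."},
    {"role": "user", "content": [{"type": "text", "text": "Find me a flight to Paris."}]},
    {"role": "assistant", "content": [{"type": "text", "text": "Here are three options."}]},
]

# With include_system_messages=True, the output now starts with:
#   SYSTEM_PROMPT:
#    You are a concise travel assistant.
# followed by the existing user/agent turn formatting.
print(reformat_conversation_history(query, include_system_messages=True))
```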
@@ -570,22 +578,53 @@ def reformat_conversation_history(query, logger=None):
         return query
 
 
-def _get_agent_response(agent_response_msgs):
-    """Extracts the text from the agent response content."""
+def _get_agent_response(agent_response_msgs, include_tool_messages=False):
+    """Extracts formatted agent response including text, and optionally tool calls/results."""
     agent_response_text = []
+    tool_results = {}
+
+    # First pass: collect tool results
+    if include_tool_messages:
+        for msg in agent_response_msgs:
+            if msg.get("role") == "tool" and "tool_call_id" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_result":
+                        result = content.get("tool_result")
+                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+    # Second pass: parse assistant messages and tool calls
     for msg in agent_response_msgs:
-        if "role" in msg and msg["role"] == "assistant" and "content" in msg:
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
             text = _extract_text_from_content(msg["content"])
             if text:
                 agent_response_text.extend(text)
+            if include_tool_messages:
+                for content in msg.get("content", []):
+                    # Todo: Verify if this is the correct way to handle tool calls
+                    if content.get("type") == "tool_call":
+                        if "tool_call" in content and "function" in content.get("tool_call", {}):
+                            tc = content.get("tool_call", {})
+                            func_name = tc.get("function", {}).get("name", "")
+                            args = tc.get("function", {}).get("arguments", {})
+                            tool_call_id = tc.get("id")
+                        else:
+                            tool_call_id = content.get("tool_call_id")
+                            func_name = content.get("name", "")
+                            args = content.get("arguments", {})
+                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                        agent_response_text.append(call_line)
+                        if tool_call_id in tool_results:
+                            agent_response_text.append(tool_results[tool_call_id])
+
     return agent_response_text
 
 
-def reformat_agent_response(response, logger=None):
+def reformat_agent_response(response, logger=None, include_tool_messages=False):
     try:
         if response is None or response == []:
             return ""
-        agent_response = _get_agent_response(response)
+        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
         if agent_response == []:
             # If no message could be extracted, likely the format changed, fallback to the original response in that case
             if logger:
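The two passes above pair each assistant tool call with its result via `tool_call_id`. A hedged sketch of the rendering under that schema; the payload below is hypothetical, and the exact joining of the returned lines happens in code not shown in this diff:

```python
from azure.ai.evaluation._common.utils import reformat_agent_response

# Hypothetical agent response in the converter schema handled above.
response = [
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Let me check the weather."},
            {
                "type": "tool_call",
                "tool_call": {
                    "id": "call_1",
                    "function": {"name": "get_weather", "arguments": {"city": "Paris"}},
                },
            },
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "22C, sunny"}],
    },
]

# Per the first/second pass above, the tool call and its matched result render as:
#   [TOOL_CALL] get_weather(city="Paris")
#   [TOOL_RESULT] 22C, sunny
print(reformat_agent_response(response, include_tool_messages=True))
```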
@@ -602,6 +641,26 @@ def reformat_agent_response(response, logger=None):
         return response
 
 
+def reformat_tool_definitions(tool_definitions, logger=None):
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception as e:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+            )
+        return tool_definitions
+
+
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.
 
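The new helper flattens each tool definition into a one-line summary for the prompt. A quick sketch with a hypothetical definition, whose expected output follows directly from the code above:

```python
from azure.ai.evaluation._common.utils import reformat_tool_definitions

# Hypothetical tool definition in the name/description/parameters shape read above.
tool_definitions = [
    {
        "name": "get_weather",
        "description": "Look up current weather for a city.",
        "parameters": {"properties": {"city": {"type": "string"}}},
    }
]

print(reformat_tool_definitions(tool_definitions))
# TOOL_DEFINITIONS:
# - get_weather: Look up current weather for a city. (inputs: city)
```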

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

15 additions & 9 deletions

@@ -3,16 +3,19 @@
 # ---------------------------------------------------------
 import os
 import math
+import logging
 from typing import Dict, Union, List, Optional
 
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
-from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental
 
+logger = logging.getLogger(__name__)
+
 
 @experimental
 class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
@@ -140,20 +143,23 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
             )
-
+        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+            eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-
-        score = math.nan
-        if llm_output:
-            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
-
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
             score_result = "pass" if score >= self.threshold else "fail"
-
+            reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
                 f"{self._result_key}_threshold": self.threshold,
                 f"{self._result_key}_reason": reason,
+                # Uncomment the following line in the next iteration after UI contracts are validated.
+                # f"{self._result_key}_additional_details": llm_output
             }
-
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
         return {self._result_key: math.nan}
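With the V2 prompt, `_do_eval` expects the flow to return structured JSON (a dict with `score` and `explanation`) instead of free text parsed by `parse_quality_evaluator_reason_score`. A standalone sketch of the new result mapping; `llm_output`, `result_key`, and the threshold value are hypothetical stand-ins for the instance state not shown in this diff:

```python
import math

# Hypothetical structured output from the V2 prompt flow.
llm_output = {"score": 4, "explanation": "The response follows the stated task."}
result_key = "task_adherence"  # stand-in for self._result_key
threshold = 3                  # stand-in for self.threshold

if isinstance(llm_output, dict):
    score = float(llm_output.get("score", math.nan))
    result = {
        f"{result_key}": score,
        f"{result_key}_result": "pass" if score >= threshold else "fail",
        f"{result_key}_threshold": threshold,
        f"{result_key}_reason": llm_output.get("explanation", ""),
    }
else:
    result = {result_key: math.nan}  # non-dict output falls back to NaN

print(result)
# {'task_adherence': 4.0, 'task_adherence_result': 'pass',
#  'task_adherence_threshold': 3, 'task_adherence_reason': '...'}
```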
