
Commit 4f6d36d

TaskAdherence V2 prompt updates (#41616)

ghyadavsingankit and singankit authored

* Merging all commits to one
* Adding black
* updating CHANGELOG.md
* updating CHANGELOG.md to also have Features Section
* updating CHANGELOG.md to also have Features Section
* updating _version.py
* run black
* Update CHANGELOG.md
* Update _version.py

Co-authored-by: Ankit Singhal <30610298+singankit@users.noreply.github.com>

1 parent 416eee7 · commit 4f6d36d

6 files changed: +659 −87 lines


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

8 additions & 0 deletions

@@ -1,5 +1,13 @@
 # Release History
 
+## 1.10.0 (Unreleased)
+
+### Features Added
+
+### Bugs Fixed
+
+- Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
+
 ## 1.9.0 (2025-07-02)
 
 ### Features Added
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

69 additions & 10 deletions

@@ -494,14 +494,17 @@ def _extract_text_from_content(content):
     return text
 
 
-def _get_conversation_history(query):
+def _get_conversation_history(query, include_system_messages=False):
     all_user_queries = []
     cur_user_query = []
     all_agent_responses = []
     cur_agent_response = []
+    system_message = None
     for msg in query:
         if not "role" in msg:
             continue
+        if include_system_messages and msg["role"] == "system" and "content" in msg:
+            system_message = msg.get("content", "")
         if msg["role"] == "user" and "content" in msg:
             if cur_agent_response != []:
                 all_agent_responses.append(cur_agent_response)
@@ -530,13 +533,18 @@ def _get_conversation_history(query):
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
-
-    return {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    if include_system_messages:
+        result["system_message"] = system_message
+    return result
 
 
 def _pretty_format_conversation_history(conversation_history):
     """Formats the conversation history for better readability."""
     formatted_history = ""
+    if "system_message" in conversation_history and conversation_history["system_message"] is not None:
+        formatted_history += "SYSTEM_PROMPT:\n"
+        formatted_history += " " + conversation_history["system_message"] + "\n\n"
     for i, (user_query, agent_response) in enumerate(
         zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
     ):
@@ -552,10 +560,10 @@ def _pretty_format_conversation_history(conversation_history):
     return formatted_history
 
 
-def reformat_conversation_history(query, logger=None):
+def reformat_conversation_history(query, logger=None, include_system_messages=False):
    """Reformats the conversation history to a more compact representation."""
     try:
-        conversation_history = _get_conversation_history(query)
+        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
         return _pretty_format_conversation_history(conversation_history)
     except:
         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
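Taken together, these hunks make the reformatted history carry the system prompt. A minimal sketch of the new behavior, assuming the role/content message schema this module already parses; the conversation below is hypothetical, and the per-turn formatting after the SYSTEM_PROMPT prefix comes from code outside this diff:

```python
from azure.ai.evaluation._common.utils import reformat_conversation_history

# Hypothetical conversation in the role/content schema parsed above.
query = [
    {"role": "system", "content": "You are a concise travel assistant."},
    {"role": "user", "content": [{"type": "text", "text": "Find me a flight to Paris."}]},
    {"role": "assistant", "content": [{"type": "text", "text": "Here are three options."}]},
]

# With include_system_messages=True, the output now starts with:
#   SYSTEM_PROMPT:
#    You are a concise travel assistant.
# followed by the existing user/agent turn formatting.
print(reformat_conversation_history(query, include_system_messages=True))
```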
@@ -570,22 +578,53 @@ def reformat_conversation_history(query, logger=None):
         return query
 
 
-def _get_agent_response(agent_response_msgs):
-    """Extracts the text from the agent response content."""
+def _get_agent_response(agent_response_msgs, include_tool_messages=False):
+    """Extracts formatted agent response including text, and optionally tool calls/results."""
     agent_response_text = []
+    tool_results = {}
+
+    # First pass: collect tool results
+    if include_tool_messages:
+        for msg in agent_response_msgs:
+            if msg.get("role") == "tool" and "tool_call_id" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_result":
+                        result = content.get("tool_result")
+                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+    # Second pass: parse assistant messages and tool calls
     for msg in agent_response_msgs:
-        if "role" in msg and msg["role"] == "assistant" and "content" in msg:
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
             text = _extract_text_from_content(msg["content"])
             if text:
                 agent_response_text.extend(text)
+            if include_tool_messages:
+                for content in msg.get("content", []):
+                    # Todo: Verify if this is the correct way to handle tool calls
+                    if content.get("type") == "tool_call":
+                        if "tool_call" in content and "function" in content.get("tool_call", {}):
+                            tc = content.get("tool_call", {})
+                            func_name = tc.get("function", {}).get("name", "")
+                            args = tc.get("function", {}).get("arguments", {})
+                            tool_call_id = tc.get("id")
+                        else:
+                            tool_call_id = content.get("tool_call_id")
+                            func_name = content.get("name", "")
+                            args = content.get("arguments", {})
+                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                        agent_response_text.append(call_line)
+                        if tool_call_id in tool_results:
+                            agent_response_text.append(tool_results[tool_call_id])
+
     return agent_response_text
 
 
-def reformat_agent_response(response, logger=None):
+def reformat_agent_response(response, logger=None, include_tool_messages=False):
     try:
         if response is None or response == []:
             return ""
-        agent_response = _get_agent_response(response)
+        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
         if agent_response == []:
             # If no message could be extracted, likely the format changed, fallback to the original response in that case
             if logger:
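The two passes above pair each assistant tool call with its result via `tool_call_id`. A hedged sketch of the rendering under that schema; the payload below is hypothetical, and the exact joining of the returned lines happens in code not shown in this diff:

```python
from azure.ai.evaluation._common.utils import reformat_agent_response

# Hypothetical agent response in the converter schema handled above.
response = [
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Let me check the weather."},
            {
                "type": "tool_call",
                "tool_call": {
                    "id": "call_1",
                    "function": {"name": "get_weather", "arguments": {"city": "Paris"}},
                },
            },
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "22C, sunny"}],
    },
]

# Per the first/second pass above, the tool call and its matched result render as:
#   [TOOL_CALL] get_weather(city="Paris")
#   [TOOL_RESULT] 22C, sunny
print(reformat_agent_response(response, include_tool_messages=True))
```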
@@ -602,6 +641,26 @@ def reformat_agent_response(response, logger=None):
         return response
 
 
+def reformat_tool_definitions(tool_definitions, logger=None):
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception as e:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+            )
+        return tool_definitions
+
+
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.
 
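The new helper flattens each tool definition into a one-line summary for the prompt. A quick sketch with a hypothetical definition, whose expected output follows directly from the code above:

```python
from azure.ai.evaluation._common.utils import reformat_tool_definitions

# Hypothetical tool definition in the name/description/parameters shape read above.
tool_definitions = [
    {
        "name": "get_weather",
        "description": "Look up current weather for a city.",
        "parameters": {"properties": {"city": {"type": "string"}}},
    }
]

print(reformat_tool_definitions(tool_definitions))
# TOOL_DEFINITIONS:
# - get_weather: Look up current weather for a city. (inputs: city)
```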

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

15 additions & 9 deletions

@@ -3,16 +3,19 @@
 # ---------------------------------------------------------
 import os
 import math
+import logging
 from typing import Dict, Union, List, Optional
 
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
-from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental
 
+logger = logging.getLogger(__name__)
+
 
 @experimental
 class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
@@ -140,20 +143,23 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
             )
-
+        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+            eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-
-        score = math.nan
-        if llm_output:
-            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
-
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
             score_result = "pass" if score >= self.threshold else "fail"
-
+            reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
                 f"{self._result_key}_threshold": self.threshold,
                 f"{self._result_key}_reason": reason,
+                # Uncomment the following line in the next iteration after UI contracts are validated.
+                # f"{self._result_key}_additional_details": llm_output
             }
-
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
         return {self._result_key: math.nan}
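With the V2 prompt, `_do_eval` expects the flow to return structured JSON (a dict with `score` and `explanation`) instead of free text parsed by `parse_quality_evaluator_reason_score`. A standalone sketch of the new result mapping; `llm_output`, `result_key`, and the threshold value are hypothetical stand-ins for the instance state not shown in this diff:

```python
import math

# Hypothetical structured output from the V2 prompt flow.
llm_output = {"score": 4, "explanation": "The response follows the stated task."}
result_key = "task_adherence"  # stand-in for self._result_key
threshold = 3                  # stand-in for self.threshold

if isinstance(llm_output, dict):
    score = float(llm_output.get("score", math.nan))
    result = {
        f"{result_key}": score,
        f"{result_key}_result": "pass" if score >= threshold else "fail",
        f"{result_key}_threshold": threshold,
        f"{result_key}_reason": llm_output.get("explanation", ""),
    }
else:
    result = {result_key: math.nan}  # non-dict output falls back to NaN

print(result)
# {'task_adherence': 4.0, 'task_adherence_result': 'pass',
#  'task_adherence_threshold': 3, 'task_adherence_reason': '...'}
```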
