From 41de91a6384087ca6aa66f9e2c73370f78699df9 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Sun, 22 Jun 2025 22:40:37 +0300 Subject: [PATCH 01/23] support 5 levels, evaluate all tools at once --- .../_tool_call_accuracy.py | 270 ++++++++---------- .../tool_call_accuracy.prompty | 126 ++++++-- 2 files changed, 226 insertions(+), 170 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 4caa6547a3bb..20b1455c4627 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -8,9 +8,7 @@ from typing import Dict, List, Union, TypeVar, cast from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase -from azure.ai.evaluation._common.utils import remove_optional_singletons, parse_quality_evaluator_reason_score from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException -from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS from azure.ai.evaluation._common._experimental import experimental logger = logging.getLogger(__name__) @@ -24,9 +22,12 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): - Parameter correctness according to tool definitions - Parameter value extraction from the conversation - The evaluator uses a binary scoring system (0 or 1): - - Score 0: The tool call is irrelevant or contains information not in the conversation/definition - - Score 1: The tool call is relevant with properly extracted parameters from the conversation + The evaluator uses a scoring rubric of 1 to 5: + - Score 1: The tool calls are irrelevant + - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed + - Score 3: The tool calls are relevant, but there were unncessary, excessive tool calls made + - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded + - Score 5: The tool calls are relevant, and all parameters were correctly passed This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing user needs while properly following tool definitions and using information present in the @@ -63,19 +64,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ _PROMPTY_FILE = "tool_call_accuracy.prompty" - _RESULT_KEY = "tool_call_accurate" - _AGGREGATE_RESULT_KEY = "tool_call_accuracy" + _RESULT_KEY = "tool_calls_success_level" - _MAX_TOOL_CALL_ACCURACY_SCORE = 1.0 - _MIN_TOOL_CALL_ACCURACY_SCORE = 0.0 - _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 0.8 + _MAX_TOOL_CALL_ACCURACY_SCORE = 5 + _MIN_TOOL_CALL_ACCURACY_SCORE = 1 + _DEFAULT_TOOL_CALL_ACCURACY_THRESHOLD = 3 id = "id" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override def __init__(self, model_config, *, - threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, + threshold=_DEFAULT_TOOL_CALL_ACCURACY_THRESHOLD, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) @@ -161,12 +161,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): # TODO : Support 
classes that represents tool calls, messages etc once client side definitions are available if tool_calls is None: # Extract tool calls from response if not provided - tool_calls = [] - if isinstance(response, list): - for message in response: - if message.get("role") == "assistant": - tool_calls.extend([content for content in message.get("content") - if content.get("type") == "tool_call"]) + tool_calls = self._parse_response(response) if len(tool_calls) == 0: raise EvaluationException( message="response does not have tool calls. Either provide tool_calls or response with tool calls.", @@ -177,36 +172,26 @@ def _convert_kwargs_to_eval_input(self, **kwargs): if not isinstance(tool_calls, list): tool_calls = [tool_calls] - + if not isinstance(tool_definitions, list): tool_definitions = [tool_definitions] - eval_inputs = [] - # TODO : When evaluating an agent tool that depends on the output of a previous tool call, - # we need to provide the output of the previous tool call as part of messages. - for tool_call in tool_calls: - if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": # TODO assuming dict here but it can be a class - function_name = tool_call.get("name") - tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name] - if len(tool_definition) > 0: - tool_definition = tool_definition - else: - raise EvaluationException( - message="Tool definition not found", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - ) - eval_inputs.append({"query": query, "tool_call": tool_call, "tool_definition": tool_definition}) - else: - raise EvaluationException( - message="Tool definition not found", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - ) - - return eval_inputs + needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions) + if len(needed_tool_definitions) == 0: + raise EvaluationException( + message="No tool definitions found for the provided tool calls.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + ) + + eval_input = { + "query": query, + "tool_calls": tool_calls, + "tool_definitions": needed_tool_definitions + } + + return eval_input @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] @@ -219,69 +204,51 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. 
:rtype: Dict """ + # Instead of splitting into per-tool-call, pass all tool calls at once + try: + eval_input = self._convert_kwargs_to_eval_input(**kwargs) + + # If no tool calls were made or tool call type is not supported, return not applicable result + except EvaluationException as e: + return self._not_applicable_result(eval_input) + + # Single LLM call for all tool calls llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) - score = math.nan - if llm_output: - score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]") - if score >= 0 and score <= 1: - return { - self._result_key: bool(float(score)), - f"{self._result_key}_reason": reason, - "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"), - } - raise EvaluationException( - message="Tool call accuracy evaluator: Invalid score returned from LLM.", + if isinstance(llm_output, dict): + score = llm_output.get(self._RESULT_KEY, None) + if not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE): + raise EvaluationException( + message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", + internal_message="Invalid score value.", + category=ErrorCategory.FAILED_EXECUTION, + blame=ErrorBlame.SYSTEM_ERROR, + ) + + # Format the output + reason = llm_output.get("chain_of_thought", "") + score = float(score) + score_result = 'pass' if score >= self.threshold else 'fail' + response_dict = { + self._result_key: score, + f"{self._result_key}_result": score_result, + f"{self._result_key}_threshold": self.threshold, + f"{self._result_key}_reason": reason, + 'applicable': True, + 'per_tool_call_details': llm_output.get('additional_details', {}), + 'excess_tool_calls': llm_output.get('excess_tool_calls', {}), + 'missing_tool_calls': llm_output.get('missing_tool_calls', {}), + } + return response_dict + + else: + raise EvaluationException( + message="Tool call accuracy evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.INVALID_VALUE, + category=ErrorCategory.FAILED_EXECUTION, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) - async def _real_call(self, **kwargs): - """The asynchronous call where real end-to-end evaluation logic is performed. - - :keyword kwargs: The inputs to evaluate. - :type kwargs: Dict - :return: The evaluation result. - :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]] - """ - # Convert inputs into list of evaluable inputs. - eval_input_list = self._convert_kwargs_to_eval_input(**kwargs) - if len(eval_input_list) == 0: - return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT, - f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT, - f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold, - f"{self._AGGREGATE_RESULT_KEY}_reason": - "No tool calls were made.", - "per_tool_call_details": [] - } - - per_turn_results = [] - # Evaluate all inputs. 
- for eval_input in eval_input_list: - if self._is_applicable_tool(eval_input): - per_turn_results.append(await self._do_eval(eval_input)) - else: - per_turn_results.append(self._not_applicable_result(eval_input)) - - return self._aggregate_results(per_turn_results=per_turn_results) - - def _is_applicable_tool(self, eval_input): - """Determine if a given tool should be evaluated, since we only evaluate tools that - have sufficient context available. - - :type eval_input: Dict - :return: True if the tool call should be evaluated - :rtype: bool - """ - tool_definition = eval_input.get("tool_definition") - if tool_definition is None or len(tool_definition) != 1: - return False - tool_type = tool_definition[0].get("type") - if tool_type is None or tool_type != "function": - return False - return True - def _not_applicable_result(self, eval_input): """Return a result indicating that the tool call is not applicable for evaluation. @@ -290,53 +257,66 @@ def _not_applicable_result(self, eval_input): :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float]] """ + # If no tool calls were made or tool call type is not supported, return not applicable result return { - f"{self._result_key}": self._NOT_APPLICABLE_RESULT, - f"{self._result_key}_reason": "Tool call not supported for evaluation", - "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"), - } - - def _aggregate_results(self, per_turn_results): - """Aggregate the evaluation results of each conversation turn into a single result. - - Exact implementation might need to vary slightly depending on the results produced. - Default behavior is to average the all number-based outputs. + self._result_key: self._NOT_APPLICABLE_RESULT, + f"{self._result_key}_result": 'pass', + f"{self._result_key}_threshold": self.threshold, + f"{self._result_key}_reason": 'Not applicable. No tool calls were made or tool call type is not supported.', + "applicable": False, + "per_tool_call_details": {}, + "excess_tool_calls": {}, + "missing_tool_calls": {}, - :param per_turn_results: List of evaluation results for each turn in the conversation. - :type per_turn_results: List[Dict] - :return: A dictionary containing aggregated results, with numeric metrics having their - means as top-level values in the dictionary, and all original - values (including non-numerics) located in under the "evaluation_per_turn" key, - which each sub-key being a metric and each sub-value being a the list of that metric's - per-turn values. - :rtype: AggregateResult[T_EvalValue] + } + + def _parse_response(self, response): + """Parse the response to extract tool calls and results. + :param response: The response to parse. + :type response: Union[str, List[dict]] + :return: List of tool calls extracted from the response. + :rtype: List[dict] """ - - aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {} - evaluation_per_turn: Dict[str, List[T_EvalValue]] = {} - - # Go over each turn, and rotate the results into a - # metric: List[values] format for the evals_per_turn dictionary. 
- - num_evaluated = len([per_turn_result for per_turn_result in per_turn_results - if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT]) - if num_evaluated == 0: - # None of the invoked tools were applicable, return not applicable result - # (If a tool fails evaluation, we'll throw an exception) - return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT, - f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT, - f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold, - f"{self._AGGREGATE_RESULT_KEY}_reason": - "Tool call accuracy evaluation is not yet supported for the invoked tools.", - "per_tool_call_details": [] - } - # ignore not_applicable results, where the _result_key will be "not applicable" - score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated - aggregated[self._AGGREGATE_RESULT_KEY] = score - aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT - aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold - aggregated["per_tool_call_details"] = per_turn_results - return aggregated + tool_calls = [] + tool_results = [] + if isinstance(response, list): + for message in response: + if message.get("role") == "assistant": + tool_calls.extend([content for content in message.get("content") + if content.get("type") == "tool_call"]) + tool_results.extend([content for content in message.get("content") + if content.get("type") == "tool_result"]) + # Format the tool calls and results + for i in range(min(len(tool_calls), len(tool_results))): + if isinstance(tool_calls[i], dict) and tool_calls[i].get("type") == "tool_call": + if tool_results[i]["tool_call_id"] == tool_calls[i]["tool_call_id"]: + tool_calls[i]["tool_result"] = tool_results[i] + + return tool_calls + + def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): + """Extract the tool definitions that are needed for the provided tool calls. + :param tool_calls: List of tool calls to evaluate. + :type tool_calls: List[dict] + :param tool_definitions: List of tool definitions to use for evaluation. + :type tool_definitions: List[dict] + :return: List of tool definitions that are needed for the provided tool calls. 
+ :rtype: List[dict] + """ + for tool_call in tool_calls: + if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": + tool_name = tool_call.get("name") + tool_definition = [tool for tool in tool_definitions if tool.get("name") == tool_name] + if len(tool_definition) > 0: + needed_tool_definitions.extend(tool_definition) + else: + raise EvaluationException( + message=f"Tool definition for {tool_name} not found", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + ) + return needed_tool_definitions @override def __call__( # pylint: disable=docstring-missing-param diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index 6b964cc54c0c..6d64fd69d6e1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -4,21 +4,21 @@ description: Evaluates Tool Call Accuracy for tool used by agent model: api: chat parameters: - temperature: 0.0 - max_tokens: 800 + temperature: 0 + max_tokens: 3000 top_p: 1.0 presence_penalty: 0 frequency_penalty: 0 response_format: - type: text + type: json_object inputs: query: - type: array - tool_call: - type: object - tool_definition: - type: object + type: List + tool_calls: + type: List + tool_definitions: + type: Dict --- system: @@ -27,7 +27,7 @@ system: ### Your are an expert in evaluating the accuracy of a tool call considering relevance and potential usefulness including syntactic and semantic correctness of a proposed tool call from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. - **Data**: Your input data include CONVERSATION , TOOL CALL and TOOL DEFINITION. -- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways, and you need to be very precise in your evaluation. user: # Definition @@ -40,32 +40,108 @@ user: 4. Potential Value: Is the information this tool call might provide likely to be useful in advancing the conversation or addressing the user expressed or implied needs? 5. Context Appropriateness: Does the tool call make sense at this point in the conversation, given what has been discussed so far? - # Ratings -## [Tool Call Accuracy: 0] (Irrelevant) +## [Tool Call Accuracy: 1] (Irrelevant) +**Definition:** +Tool calls were not relevant to the user's query, resulting in anirrelevant or unhelpful final output. +This level is a 'fail'. + +**Example:** + The user's query is asking for most popular hotels in New York, but the agent calls a tool that does search in local files on a machine. This tool is not relevant to the user query, so this case is a Level 1 'fail'. 
+ + +## [Tool Call Accuracy: 2] (Partially Relevant - No output) +**Definition:** +Tool calls were somewhat related to the user's query, but the agent was not able to reach a final output that addresses the user query due to one or more of the following: +• Tools returned errors, and no retrials for the tool call were successful. +• Parameters passed to the tool were incorrect. +• Not enough tools were called to fully address the query (missing tool calls). +This level is a 'fail'. + +**Example:** + The user asks for the coordinates of Chicago. The agent calls the correct tool that retrieves the coordinates -which is the relevant tool for the user query- but passes 'New York' instead of 'Chicago' as the parameter to the tool. So this is a Level 2 'fail'. + +**Example:** + The user asks for the coordinates of Chicago. The agent calls the correct tool that retrieves the coordinates -which is the relevant tool for the user query- and passes 'Chicago' as the parameter to the tool which is also correct, but the tool returns an error so the agent can't reach the correct answer to the user's query. This is a Level 2 'fail'. + +**Example:** + The user asks a question that needs 3 tool calls for it to be answered. The agent calls only one of the three required tool calls. So this case is a Level 2 'fail'. + + +## [Tool Call Accuracy: 3] (Slightly Correct - Reached Output) +**Definition:** +Tool calls were relevant and led to a correct output. However, multiple excessive, unnecessary tool calls were made. +This level is a 'pass'. + +**Example:** + The user asked to do a modification in the database. The agent called the tool multiple times, resulting in multiple modifications in the database instead of one. This is a level 3 'pass'. + +**Example:** + The user asked for popular hotels in a certain place. The agent calls the same tool with the same parameters multiple times, even though a single tool call that returns an output is sufficient. So there were unnecessary tool calls. This is a Level 3 'pass'. + + +## [Tool Call Accuracy: 4] (Mostly Correct - Reached output) **Definition:** - 1. The TOOL CALL is not relevant and will not help resolve the user's need. - 2. TOOL CALL include parameters values that are not present or inferred from CONVERSATION. - 3. TOOL CALL has parameters that is not present in TOOL DEFINITION. +Tool calls were fully relevant and efficient: +• Correct tools were called with the correct parameters, whether they are extracted from the conversation history or the current user query. +• A tool returned an error, but the agent retried calling the tool and successfully got an output. +This level is a 'pass'. + +**Example:** + The user asks for the weather forecast in a certain place. The agent calls the correct tool that retrieves the weather forecast with the correct parameters, but the tool returns an error. The agent re-calls the tool once again and it returns the correct output. This is a Level 4 'pass'. + -## [Tool Call Accuracy: 1] (Relevant) +## [Tool Call Accuracy: 5] (Optimal Solution - Reached output) **Definition:** - 1. The TOOL CALL is directly relevant and very likely to help resolve the user's need. - 2. TOOL CALL include parameters values that are present or inferred from CONVERSATION. - 3. TOOL CALL has parameters that is present in TOOL DEFINITION. +Tool calls were fully relevant and efficient: +• Correct tools were called with the correct parameters, whether they are extracted from the conversation history or the current user query. 
+• No unnecessary or excessive tool calls were made.
+• No errors occurred in any of the tools.
+• The agent was able to reach the final output that addresses the user's query without facing any issues.
+This level is a 'pass'.
+
+**Example:**
+  The user asks for the distance between two places. The agent correctly calls the tools that retrieve the coordinates for the two places respectively, then calls the tool that calculates the distance between the two sets of coordinates, passing the correct arguments to all the tools, without calling other tools excessively or unnecessarily. This is the optimal solution for the user's query. This is a Level 5 'pass'.
+
+**Example:**
+  The user asks for the distance between two places. The agent retrieves the needed coordinates from the outputs of the tool calls in the conversation history, and then correctly passes these coordinates to the tool that calculates the distance to output it to the user. This is also an optimal solution for the user's query. This is a Level 5 'pass'.
+
+
+
+# IMPORTANT NOTE
+There is a clear distinction between 'pass' levels and 'fail' levels. The distinction is that the tools are called correctly in order to reach the required output. If the agent was not able to reach the final output that addresses the user query, it cannot be either of the 'pass' levels, and vice versa. It is crucial that you ensure you are rating the agent's response with the correct level based on the tool calls made to address the user's query.

 # Data
 CONVERSATION : {{query}}
-TOOL CALL: {{tool_call}}
+TOOL CALLS: {{tool_calls}}

 TOOL DEFINITION: {{tool_definition}}


 # Tasks
-## Please provide your assessment Score for the previous CONVERSATION , TOOL CALL and TOOL DEFINITION based on the Definitions above. Your output should include the following information:
-- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
-- **Explanation**: a very short explanation of why you think the input Data should get that Score.
-- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "0", "1") based on the levels of the definitions.
-
+## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
+Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
  - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query.
  - tool_calls_success_level: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
  - tool_calls_sucess_result: 'pass' or 'fail' based on the evaluation level of the tool call accuracy. Levels 1 and 2 are a 'fail', levels 3, 4 and 5 are a 'pass'.
  - additional_details: a dictionary that contains the following keys:
    - tool_calls_made_by_agent: total number of tool calls made by the agent
    - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
    - per_tool_details: a list of dictionaries, each containing:
      - tool_name: name of the tool
      - total_calls_required: total number of calls required for the tool
      - correct_calls_made_by_agent: number of correct calls made by the agent
      - correct_tool_percentage: percentage of correct calls made by the agent for this tool. It is a value between 0.0 and 1.0
      - tool_call_errors: number of errors encountered during the tool call
      - tool_success_result: 'pass' or 'fail' based on the evaluation of the tool call accuracy for this tool
  - excess_tool_calls: a dictionary with the following keys:
    - total: total number of excess, unnecessary tool calls made by the agent
    - per_tool_details: a list of dictionaries, each containing:
      - tool_name: name of the tool
      - excess_count: number of excess calls made for this query
  - missing_tool_calls: a dictionary with the following keys:
    - total: total number of missing tool calls that should have been made by the agent to be able to answer the query
    - per_tool_details: a list of dictionaries, each containing:
      - tool_name: name of the tool
      - missing_count: number of missing calls for this query
 
-## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score.
 # Output
\ No newline at end of file

From 6a1e2b3836c438906872459c9217467f9b57535d Mon Sep 17 00:00:00 2001
From: Salma Elshafey
Date: Mon, 23 Jun 2025 12:09:20 +0300
Subject: [PATCH 02/23] Update sample notebook and change log

---
 sdk/evaluation/azure-ai-evaluation/CHANGELOG.md       | 2 ++
 sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md | 3 ---
 .../_tool_call_accuracy/_tool_call_accuracy.py        | 9 +++++----
 .../samples/agent_evaluators/tool_call_accuracy.ipynb | 11 +++++++----
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 6c71825e6f1c..0e6dd5c50cf7 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -8,6 +8,8 @@

 - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.

+- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a number in the range [0-1]. The score range is now [1-5].
+
 - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold.
[#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415) ## 1.8.0 (2025-05-29) diff --git a/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md b/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md index 7d8a17a549cd..a48a4973fdc5 100644 --- a/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md +++ b/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md @@ -46,9 +46,6 @@ This guide walks you through how to investigate failures, common errors in the ` - Risk and safety evaluators depend on the Azure AI Studio safety evaluation backend service. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaisafetyeval-regionsupport). - If you encounter a 403 Unauthorized error when using safety evaluators, verify that you have the `Contributor` role assigned to your Azure AI project. `Contributor` role is currently required to run safety evaluations. -### Troubleshoot Quality Evaluator Issues -- For `ToolCallAccuracyEvaluator`, if your input did not have a tool to evaluate, the current behavior is to output `null`. - ## Handle Simulation Errors ### Adversarial Simulation Supported Regions diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 20b1455c4627..b7c73422b35e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -64,7 +64,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ _PROMPTY_FILE = "tool_call_accuracy.prompty" - _RESULT_KEY = "tool_calls_success_level" + _RESULT_KEY = "tool_call_accuracy" _MAX_TOOL_CALL_ACCURACY_SCORE = 5 _MIN_TOOL_CALL_ACCURACY_SCORE = 1 @@ -161,7 +161,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): # TODO : Support classes that represents tool calls, messages etc once client side definitions are available if tool_calls is None: # Extract tool calls from response if not provided - tool_calls = self._parse_response(response) + tool_calls = self._parse_tools_from_response(response) if len(tool_calls) == 0: raise EvaluationException( message="response does not have tool calls. Either provide tool_calls or response with tool calls.", @@ -216,7 +216,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) if isinstance(llm_output, dict): - score = llm_output.get(self._RESULT_KEY, None) + score = llm_output.get("tool_calls_success_level", None) if not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE): raise EvaluationException( message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", @@ -270,7 +270,7 @@ def _not_applicable_result(self, eval_input): } - def _parse_response(self, response): + def _parse_tools_from_response(self, response): """Parse the response to extract tool calls and results. :param response: The response to parse. 
:type response: Union[str, List[dict]] @@ -303,6 +303,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): :return: List of tool definitions that are needed for the provided tool calls. :rtype: List[dict] """ + needed_tool_definitions = [] for tool_call in tool_calls: if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": tool_name = tool_call.get("name") diff --git a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb index bf8695d2122c..c08365505d6f 100644 --- a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb +++ b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb @@ -13,7 +13,7 @@ "source": [ "### Getting Started\n", "\n", - "This sample demonstrates how to use Intent Resolution Evaluator\n", + "This sample demonstrates how to use Tool Call Accuracy Evaluator\n", "Before running the sample:\n", "```bash\n", "pip install azure-ai-projects azure-identity azure-ai-evaluation\n", @@ -39,9 +39,12 @@ "- Parameter value extraction from the conversation\n", "- Potential usefulness of the tool call\n", "\n", - "The evaluator uses a binary scoring system (0 or 1):\n", - " - Score 0: The tool call is irrelevant or contains information not in the conversation/definition\n", - " - Score 1: The tool call is relevant with properly extracted parameters from the conversation\n", + "The evaluator uses a scoring rubric of 1 to 5:\n", + " - Score 1: The tool calls are irrelevant\n", + " - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed\n", + " - Score 3: The tool calls are relevant, but there were unncessary, excessive tool calls made\n", + " - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded\n", + " - Score 5: The tool calls are relevant, and all parameters were correctly passed and no excessive calls were made.\n", "\n", "This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing query while properly following tool definitions and using information present in the conversation history." 
] From 0dad199e5d8797f34b163855ebb2eb4d91ac7437 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 23 Jun 2025 15:59:50 +0300 Subject: [PATCH 03/23] Add missing import --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index b7c73422b35e..076b165cbcbe 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -9,6 +9,7 @@ from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +from ..._common.utils import check_score_is_valid from azure.ai.evaluation._common._experimental import experimental logger = logging.getLogger(__name__) @@ -64,7 +65,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ _PROMPTY_FILE = "tool_call_accuracy.prompty" - _RESULT_KEY = "tool_call_accuracy" + _RESULT_KEY = "tool_call_accurate" _MAX_TOOL_CALL_ACCURACY_SCORE = 5 _MIN_TOOL_CALL_ACCURACY_SCORE = 1 From e4b1a37711daf090b05b01fad6d57b6bc5741a82 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Tue, 24 Jun 2025 00:01:42 +0300 Subject: [PATCH 04/23] Modify test cases to match the new output format --- .../_evaluators/_common/_base_eval.py | 2 +- .../_tool_call_accuracy.py | 97 +++---- .../test_tool_call_accuracy_evaluator.py | 263 +++++++----------- 3 files changed, 146 insertions(+), 216 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 60ecd6b1edbe..c88b4b8e47ce 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -288,7 +288,7 @@ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]: return multi_modal_converter - def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]: + def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]: """Convert an arbitrary input into a list of inputs for evaluators. It is assumed that evaluators generally make use of their inputs in one of two ways. 
Either they receive a collection of keyname inputs that are all single values diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 076b165cbcbe..7727918e1658 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -69,14 +69,14 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): _MAX_TOOL_CALL_ACCURACY_SCORE = 5 _MIN_TOOL_CALL_ACCURACY_SCORE = 1 - _DEFAULT_TOOL_CALL_ACCURACY_THRESHOLD = 3 + _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3 id = "id" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override def __init__(self, model_config, *, - threshold=_DEFAULT_TOOL_CALL_ACCURACY_THRESHOLD, + threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) @@ -138,61 +138,40 @@ def _convert_kwargs_to_eval_input(self, **kwargs): """ # TODO add warning that only tool calls of type function are supported # Collect inputs - tool_calls = kwargs.get("tool_calls", None) + tool_calls = kwargs.get("tool_calls") tool_definitions = kwargs.get("tool_definitions") - query = kwargs.get("query", None) - response = kwargs.get("response", None) - - if response is None and tool_calls is None: - raise EvaluationException( - message="Either response or tool_calls must be provided.", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.MISSING_FIELD, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - ) - - if tool_definitions is None: - raise EvaluationException( - message="Tool definitions must be provided.", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.MISSING_FIELD, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - ) + query = kwargs.get("query") + response = kwargs.get("response") # TODO : Support classes that represents tool calls, messages etc once client side definitions are available - if tool_calls is None: - # Extract tool calls from response if not provided - tool_calls = self._parse_tools_from_response(response) - if len(tool_calls) == 0: - raise EvaluationException( - message="response does not have tool calls. 
Either provide tool_calls or response with tool calls.", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.MISSING_FIELD, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - ) + if response: + parsed_tool_calls = self._parse_tools_from_response(response) + if parsed_tool_calls: + tool_calls = parsed_tool_calls + + if not tool_calls: + return {"error_message": "No tool calls found in response or provided tool_calls."} + if not tool_definitions or len(tool_definitions) == 0: + return {"error_message": "Tool definitions must be provided."} if not isinstance(tool_calls, list): tool_calls = [tool_calls] - if not isinstance(tool_definitions, list): tool_definitions = [tool_definitions] - needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions) + try: + needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions) + except EvaluationException as e: + return {"error_message": "Tool definitions for all tool calls must be provided."} if len(needed_tool_definitions) == 0: - raise EvaluationException( - message="No tool definitions found for the provided tool calls.", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - ) - - eval_input = { + return {"error_message": "Tool definitions for all tool calls must be provided."} + + return { "query": query, "tool_calls": tool_calls, "tool_definitions": needed_tool_definitions } - return eval_input @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] @@ -205,20 +184,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ - # Instead of splitting into per-tool-call, pass all tool calls at once - try: - eval_input = self._convert_kwargs_to_eval_input(**kwargs) - - # If no tool calls were made or tool call type is not supported, return not applicable result - except EvaluationException as e: - return self._not_applicable_result(eval_input) - # Single LLM call for all tool calls llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) if isinstance(llm_output, dict): score = llm_output.get("tool_calls_success_level", None) - if not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE): + if not score or not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE): raise EvaluationException( message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", internal_message="Invalid score value.", @@ -249,8 +220,26 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.FAILED_EXECUTION, target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) + + async def _real_call(self, **kwargs): + """The asynchronous call where real end-to-end evaluation logic is performed. - def _not_applicable_result(self, eval_input): + :keyword kwargs: The inputs to evaluate. + :type kwargs: Dict + :return: The evaluation result. + :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]] + """ + # Convert inputs into list of evaluable inputs. 
+ eval_input = self._convert_kwargs_to_eval_input(**kwargs) + if isinstance(eval_input, dict) and eval_input.get('error_message'): + # If there is an error message, return not applicable result + return self._not_applicable_result(eval_input.get('error_message')) + # Do the evaluation + result = await self._do_eval(eval_input) + # Return the result + return result + + def _not_applicable_result(self, error_message): """Return a result indicating that the tool call is not applicable for evaluation. :param eval_input: The input to the evaluator. @@ -263,7 +252,7 @@ def _not_applicable_result(self, eval_input): self._result_key: self._NOT_APPLICABLE_RESULT, f"{self._result_key}_result": 'pass', f"{self._result_key}_threshold": self.threshold, - f"{self._result_key}_reason": 'Not applicable. No tool calls were made or tool call type is not supported.', + f"{self._result_key}_reason": error_message, "applicable": False, "per_tool_call_details": {}, "excess_tool_calls": {}, @@ -308,7 +297,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): for tool_call in tool_calls: if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": tool_name = tool_call.get("name") - tool_definition = [tool for tool in tool_definitions if tool.get("name") == tool_name] + tool_definition = [tool for tool in tool_definitions if tool.get("name") == tool_name and tool.get("type", "function") == "function"] if len(tool_definition) > 0: needed_tool_definitions.extend(tool_definition) else: diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 9d6fa8e03573..50997438ed92 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -5,14 +5,43 @@ from azure.ai.evaluation._exceptions import EvaluationException -# Use tool_call_id convenience to specify whether eval result is good, bad, or invalid +# This mock should return a dictionary that mimics the output of the prompty (the _flow call), +# which is then processed by the _do_eval method. async def flow_side_effect(timeout, **kwargs): - if "good" in kwargs.get("tool_call").get("tool_call_id"): - return """Let's think step by step. You're totally right! Tool is the best ever. 1""" - elif "bad" in kwargs.get("tool_call").get("tool_call_id"): - return """Let's think step by step. You're wrong! Tool is not good. 0""" - else: - return """Let's think Or not. Tool is...who knows. 
hello""" + tool_calls = kwargs.get("tool_calls", []) + + good_calls = sum(1 for tc in tool_calls if "good" in tc.get("tool_call_id", "")) + bad_calls = sum(1 for tc in tool_calls if "bad" in tc.get("tool_call_id", "")) + invalid_calls = sum(1 for tc in tool_calls if "invalid" in tc.get("tool_call_id", "")) + total_calls = len(tool_calls) + + if invalid_calls > 0: + # Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid + return { + "chain_of_thought": "The tool calls were very correct that I returned a huge number!", + "tool_calls_success_level": 25, + "additional_details": {}, + "excess_tool_calls": {}, + "missing_tool_calls": {} + } + + score = 1 # Default score for "all bad" + if total_calls > 0: + if good_calls == total_calls: + score = 5 # All good + elif good_calls > 0: + score = 3 # Mixed good and bad + + return { + "chain_of_thought": f"Evaluated {total_calls} tool calls with {good_calls} correct calls.", + "tool_calls_success_level": score, + "additional_details": { + "tool_calls_made_by_agent": total_calls, + "correct_tool_calls_made_by_agent": good_calls + }, + "excess_tool_calls": {"total": 0}, + "missing_tool_calls": {"total": 0} + } @pytest.mark.usefixtures("mock_model_config") @@ -22,15 +51,14 @@ def test_evaluate_tools_valid1(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - # Test evaluation with valid input, one good tool call and one bad + # Test evaluation with one good and one bad tool call query="Where is the Eiffel Tower?" - response="The Eiffel Tower is in Paris." tool_calls=[ { "type": "tool_call", "tool_call_id": "call_good", "name": "fetch_weather", - "arguments": {"location": "Tokyo"}, + "arguments": {"location": "Paris"}, }, { "type": "tool_call", @@ -63,37 +91,27 @@ def test_evaluate_tools_valid1(self, mock_model_config): }, }, ] - result = evaluator(query=query, response=response, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._AGGREGATE_RESULT_KEY + key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) - assert result[key] == 0.5 - assert result[f"{key}_result"] == "fail" + assert result[key] == 3.0 # Mixed good/bad gets score 3 + assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert f"{key}_reason" in result + assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." 
assert "per_tool_call_details" in result - assert len(result["per_tool_call_details"]) == 2 - for tool_call in result["per_tool_call_details"]: - assert "tool_call_accurate" in tool_call - assert "tool_call_accurate_reason" in tool_call - assert "tool_call_id" in tool_call - if tool_call["tool_call_id"] == "call_good": - assert tool_call["tool_call_accurate"] is True - assert len(tool_call["tool_call_accurate_reason"]) > 0 - elif tool_call["tool_call_id"] == "call_bad": - assert tool_call["tool_call_accurate"] is False - assert len(tool_call["tool_call_accurate_reason"]) > 0 - else: - pytest.fail() - + assert "excess_tool_calls" in result + assert "missing_tool_calls" in result + assert result["applicable"] is True def test_evaluate_tools_valid2(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - # Test evaluation with valid input, one good tool call and one bad + # Test evaluation with two bad tool calls query="Where is the Eiffel Tower?" - response="The Eiffel Tower is in Paris." tool_calls=[ { "type": "tool_call", @@ -132,49 +150,39 @@ def test_evaluate_tools_valid2(self, mock_model_config): }, }, ] - result = evaluator(query=query, response=response, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._AGGREGATE_RESULT_KEY + key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) - assert result[key] == 0.0 + assert result[key] == 1.0 # All bad gets score 1 assert result[f"{key}_result"] == "fail" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert f"{key}_reason" in result + assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." assert "per_tool_call_details" in result - assert len(result["per_tool_call_details"]) == 2 - for tool_call in result["per_tool_call_details"]: - assert "tool_call_accurate" in tool_call - assert "tool_call_accurate_reason" in tool_call - assert "tool_call_id" in tool_call - if tool_call["tool_call_id"] == "call_good": - assert tool_call["tool_call_accurate"] is False - assert len(tool_call["tool_call_accurate_reason"]) > 0 - elif tool_call["tool_call_id"] == "call_bad": - assert tool_call["tool_call_accurate"] is False - assert len(tool_call["tool_call_accurate_reason"]) > 0 - else: - pytest.fail() - + assert "excess_tool_calls" in result + assert "missing_tool_calls" in result + assert result["applicable"] is True def test_evaluate_tools_valid3(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - # Test evaluation with valid input, one good tool call and one bad + # Test evaluation with two good tool calls query="Where is the Eiffel Tower?" - response="The Eiffel Tower is in Paris." 
tool_calls=[ { "type": "tool_call", "tool_call_id": "call_good", "name": "fetch_weather", - "arguments": {"location": "Tokyo"}, + "arguments": {"location": "Paris"}, }, { "type": "tool_call", "tool_call_id": "call_good", "name": "buy_jacket", - "arguments": {"type": "raincoat"}, + "arguments": {"type": "jacket"}, }, ] tool_definitions=[ @@ -201,51 +209,35 @@ def test_evaluate_tools_valid3(self, mock_model_config): }, }, ] - result = evaluator(query=query, response=response, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._AGGREGATE_RESULT_KEY + key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) - assert result[key] == 1.0 + assert result[key] == 5.0 # All good gets score 5 assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert f"{key}_reason" in result + assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." assert "per_tool_call_details" in result - assert len(result["per_tool_call_details"]) == 2 - for tool_call in result["per_tool_call_details"]: - assert "tool_call_accurate" in tool_call - assert "tool_call_accurate_reason" in tool_call - assert "tool_call_id" in tool_call - if tool_call["tool_call_id"] == "call_good": - assert tool_call["tool_call_accurate"] is True - assert len(tool_call["tool_call_accurate_reason"]) > 0 - elif tool_call["tool_call_id"] == "call_bad": - assert tool_call["tool_call_accurate"] is True - assert len(tool_call["tool_call_accurate_reason"]) > 0 - else: - pytest.fail() + assert "excess_tool_calls" in result + assert "missing_tool_calls" in result + assert result["applicable"] is True def test_evaluate_tools_one_eval_fails(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: - evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - # Test evaluation with valid input, one good tool call and one bad + # Test evaluation with an invalid tool call ID to trigger failure query="Where is the Eiffel Tower?" - response="The Eiffel Tower is in Paris." 
tool_calls=[ { "type": "tool_call", - "tool_call_id": "call_good", + "tool_call_id": "call_invalid", "name": "fetch_weather", "arguments": {"location": "Tokyo"}, }, - { - "type": "tool_call", - "tool_call_id": "call_invalid", - "name": "buy_jacket", - "arguments": {"type": "raincoat"}, - }, ] tool_definitions=[ { @@ -259,29 +251,17 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config): }, }, }, - { - "name": "buy_jacket", - "type": "function", - "description": "Buy a jacket of the given type.", - "parameters": { - "type": "object", - "properties": { - "type": {"type": "string", "description": "The type of jacket to buy."} - }, - }, - }, ] - result = evaluator(query=query, response=response, tool_calls=tool_calls, tool_definitions=tool_definitions) - # if one tool call evaluation fails, we'll fail the whole thing - assert "Tool call accuracy evaluator" in str(exc_info.value) + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Invalid score value" in str(exc_info.value) def test_evaluate_tools_some_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - # Test evaluation with valid input, one good tool call and one bad + # Test with one function tool and one non-function tool query="Where is the Eiffel Tower?" - response="The Eiffel Tower is in Paris." tool_calls=[ { "type": "tool_call", @@ -310,7 +290,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): }, { "name": "buy_jacket", - "type": "another_built_in", + "type": "another_built_in", # This tool will be filtered out "description": "Buy a jacket of the given type.", "parameters": { "type": "object", @@ -320,36 +300,25 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): }, }, ] - result = evaluator(query=query, response=response, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._AGGREGATE_RESULT_KEY + key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) - assert result[key] == 1.0 + assert result[key] == "not applicable" assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert "per_tool_call_details" in result - assert len(result["per_tool_call_details"]) == 2 - for tool_call in result["per_tool_call_details"]: - assert "tool_call_accurate" in tool_call - assert "tool_call_accurate_reason" in tool_call - assert "tool_call_id" in tool_call - if tool_call["tool_call_id"] == "call_good": - assert tool_call["tool_call_accurate"] is True - assert len(tool_call["tool_call_accurate_reason"]) > 0 - elif tool_call["tool_call_id"] == "call_bad": - assert tool_call["tool_call_accurate"] == "not applicable" - assert tool_call["tool_call_accurate_reason"] == "Tool call not supported for evaluation" - else: - pytest.fail() + assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided." 
+ assert result["per_tool_call_details"] == {} + assert result["excess_tool_calls"] == {} + assert result["missing_tool_calls"] == {} + assert result["applicable"] is False def test_evaluate_tools_all_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - - # Test evaluation with valid input, one good tool call and one bad + + # Test with only non-function tools query="Where is the Eiffel Tower?" - response="The Eiffel Tower is in Paris." tool_calls=[ { "type": "tool_call", @@ -357,17 +326,11 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): "name": "fetch_weather", "arguments": {"location": "Tokyo"}, }, - { - "type": "tool_call", - "tool_call_id": "call_good", - "name": "buy_jacket", - "arguments": {"type": "raincoat"}, - }, ] tool_definitions=[ { "name": "fetch_weather", - "type": "some_built_in", + "type": "some_built_in", # Not a 'function' type "description": "Fetches the weather information for the specified location.", "parameters": { "type": "object", @@ -376,37 +339,26 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): }, }, }, - { - "name": "buy_jacket", - "type": "another_built_in", - "description": "Buy a jacket of the given type.", - "parameters": { - "type": "object", - "properties": { - "type": {"type": "string", "description": "The type of jacket to buy."} - }, - }, - }, ] - result = evaluator(query=query, response=response, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._AGGREGATE_RESULT_KEY + key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) assert result[key] == "not applicable" - assert result[f"{key}_result"] == "not applicable" + assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert "per_tool_call_details" in result - assert len(result["per_tool_call_details"]) == 0 - assert result[f"{key}_reason"] == "Tool call accuracy evaluation is not yet supported for the invoked tools." + assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided." + assert result["per_tool_call_details"] == {} + assert result["excess_tool_calls"] == {} + assert result["missing_tool_calls"] == {} + assert result["applicable"] is False def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - # Test evaluation with valid input, one good tool call and one bad + # Test with no tool calls provided query="Where is the Eiffel Tower?" - response="The Eiffel Tower is in Paris." 
tool_calls=[] tool_definitions=[ { @@ -420,27 +372,16 @@ def test_evaluate_tools_no_tools(self, mock_model_config): }, }, }, - { - "name": "buy_jacket", - "type": "another_built_in", - "description": "Buy a jacket of the given type.", - "parameters": { - "type": "object", - "properties": { - "type": {"type": "string", "description": "The type of jacket to buy."} - }, - }, - }, ] - result = evaluator(query=query, response=response, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - key = ToolCallAccuracyEvaluator._AGGREGATE_RESULT_KEY + key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) assert result[key] == "not applicable" - assert result[f"{key}_result"] == "not applicable" + assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert "per_tool_call_details" in result - assert len(result["per_tool_call_details"]) == 0 - assert result[f"{key}_reason"] == "No tool calls were made." - + assert result[f"{key}_reason"] == "No tool calls found in response or provided tool_calls." + assert result["per_tool_call_details"] == {} + assert result["excess_tool_calls"] == {} + assert result["missing_tool_calls"] == {} + assert result["applicable"] is False \ No newline at end of file From a40c91bf59bd18b106eb9456bdd1342fb6a16833 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Tue, 24 Jun 2025 00:20:40 +0300 Subject: [PATCH 05/23] Modify other test file to match the new output format --- .../tests/unittests/test_agent_evaluators.py | 162 ++++++++---------- 1 file changed, 70 insertions(+), 92 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py index d8485bb04c00..f42bcb747f7a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py @@ -9,109 +9,87 @@ class TestEvaluate: def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=mock_model_config) - # Test tool_calls provided but missing response - with pytest.raises(EvaluationException) as exc_info: - tool_call_accuracy( - query="Where is the Eiffel Tower?", - response="The Eiffel Tower is in Paris.", - tool_calls="Test", - tool_definitions={ - "name": "fetch_weather", - "description": "Fetches the weather information for the specified location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to fetch weather for." - } + # Test with missing tool_calls and response + result = tool_call_accuracy( + query="Where is the Eiffel Tower?", + tool_definitions=[{ + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for." } } } - ) + }] + ) + assert not result["applicable"] + assert result["tool_call_accurate"] == "not applicable" + assert "No tool calls found in response or provided tool_calls." 
in result["tool_call_accurate_reason"] # Test with missing tool_definitions - with pytest.raises(EvaluationException) as exc_info: - tool_call_accuracy( - query="Where is the Eiffel Tower?", - tool_calls={ - "type": "tool_call", - "tool_call": { - "id": "call_K21dwOxgCN2syn4qjutMVV7Z", - "type": "function", - "function": { - "name": "fetch_weather", - "arguments": { - "location": "Tokyo" - } - } - } + result = tool_call_accuracy( + query="Where is the Eiffel Tower?", + tool_definitions=[], + tool_calls=[{ + "type": "tool_call", + "name": "fetch_weather", + "arguments": { + "location": "Tokyo" } - ) - assert "Tool definitions must be provided." in str(exc_info.value) - - # Test with missing tool_cools - with pytest.raises(EvaluationException) as exc_info: - tool_call_accuracy( - query="Where is the Eiffel Tower?", - tool_definitions={ - "name": "fetch_weather", - "description": "Fetches the weather information for the specified location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to fetch weather for." - } - } - } - } - ) + }] + ) + assert not result["applicable"] + assert result["tool_call_accurate"] == "not applicable" + assert "Tool definitions must be provided." in result["tool_call_accurate_reason"] - assert "Either response or tool_calls must be provided." in str(exc_info.value) - - # Test response provided but missing tool_calls - with pytest.raises(EvaluationException) as exc_info: - tool_call_accuracy( - query="Where is the Eiffel Tower?", - response="The Eiffel Tower is in Paris.", - tool_definitions={ - "name": "fetch_weather", - "description": "Fetches the weather information for the specified location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to fetch weather for." - } + # Test with response that has no tool calls + result = tool_call_accuracy( + query="Where is the Eiffel Tower?", + response="The Eiffel Tower is in Paris.", + tool_definitions=[{ + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for." } } } - ) - - assert "response does not have tool calls. Either provide tool_calls or response with tool calls." in str(exc_info.value) + }] + ) + assert not result["applicable"] + assert result["tool_call_accurate"] == "not applicable" + assert "No tool calls found in response or provided tool_calls." in result["tool_call_accurate_reason"] - # Test tool_calls provided but missing response - with pytest.raises(EvaluationException) as exc_info: - tool_call_accuracy( - query="Where is the Eiffel Tower?", - response="The Eiffel Tower is in Paris.", - tool_calls="Test", - tool_definitions={ - "name": "fetch_weather", - "description": "Fetches the weather information for the specified location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to fetch weather for." 
- } + # Test with tool call for which definition is not provided + result = tool_call_accuracy( + query="Where is the Eiffel Tower?", + tool_calls=[{ + "type": "tool_call", + "name": "some_other_tool", + "arguments": {} + }], + tool_definitions=[{ + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for." } } } - ) - - assert "Tool definition not found" in str(exc_info.value) \ No newline at end of file + }] + ) + assert not result["applicable"] + assert result["tool_call_accurate"] == "not applicable" + assert "Tool definitions for all tool calls must be provided." in result["tool_call_accurate_reason"] \ No newline at end of file From ed0ecf99c3652915b63fb123d0e2341ee716f07c Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Tue, 24 Jun 2025 11:45:04 +0300 Subject: [PATCH 06/23] Fixed parsing of results --- .../_tool_call_accuracy.py | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 7727918e1658..3a039e2580ff 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -268,19 +268,29 @@ def _parse_tools_from_response(self, response): :rtype: List[dict] """ tool_calls = [] - tool_results = [] + tool_results_map = {} if isinstance(response, list): - for message in response: - if message.get("role") == "assistant": - tool_calls.extend([content for content in message.get("content") - if content.get("type") == "tool_call"]) - tool_results.extend([content for content in message.get("content") - if content.get("type") == "tool_result"]) - # Format the tool calls and results - for i in range(min(len(tool_calls), len(tool_results))): - if isinstance(tool_calls[i], dict) and tool_calls[i].get("type") == "tool_call": - if tool_results[i]["tool_call_id"] == tool_calls[i]["tool_call_id"]: - tool_calls[i]["tool_result"] = tool_results[i] + for message in response: + print(message) + # Extract tool calls from assistant messages + if message.get("role") == "assistant" and isinstance(message.get("content"), list): + for content_item in message.get("content"): + if isinstance(content_item, dict) and content_item.get("type") == "tool_call": + tool_calls.append(content_item) + + # Extract tool results from tool messages + elif message.get("role") == "tool" and message.get("tool_call_id"): + tool_call_id = message.get("tool_call_id") + if isinstance(message.get("content"), list) and len(message.get("content")) > 0: + result_content = message.get("content")[0] + if isinstance(result_content, dict) and result_content.get("type") == "tool_result": + tool_results_map[tool_call_id] = result_content + + # Attach results to their corresponding calls + for tool_call in tool_calls: + tool_call_id = tool_call.get("tool_call_id") + if tool_call_id in tool_results_map: + tool_call["tool_result"] = tool_results_map[tool_call_id]['tool_result'] return tool_calls @@ -297,7 +307,9 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): for tool_call in tool_calls: if 
isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": tool_name = tool_call.get("name") - tool_definition = [tool for tool in tool_definitions if tool.get("name") == tool_name and tool.get("type", "function") == "function"] + tool_definition = [tool for tool in tool_definitions + if tool.get("name") == tool_name and + tool.get("type", "function") == "function"] if len(tool_definition) > 0: needed_tool_definitions.extend(tool_definition) else: From 9bc900bd80593c5d94f6e20457e186c615257b46 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Tue, 24 Jun 2025 12:42:23 +0300 Subject: [PATCH 07/23] Change key name in output --- .../_tool_call_accuracy/tool_call_accuracy.prompty | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index 6d64fd69d6e1..fb431b23d1e8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -126,7 +126,7 @@ Your output should consist only of a JSON object, as provided in the examples, t - additional_details: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent - - per_tool_details: a list of dictionaries, each containing: + - details: a list of dictionaries, each containing: - tool_name: name of the tool - total_calls_required: total number of calls required for the tool - correct_calls_made_by_agent: number of correct calls made by the agent @@ -135,12 +135,12 @@ Your output should consist only of a JSON object, as provided in the examples, t - tool_success_result: 'pass' or 'fail' based on the evaluation of the tool call accuracy for this tool - excess_tool_calls: a dictionary with the following keys: - total: total number of excess, unnecessary tool calls made by the agent - - per_tool_details: a list of dictionaries, each containing: + - details: a list of dictionaries, each containing: - tool_name: name of the tool - excess_count: number of excess calls made for this query - missing_tool_calls: a dictionary with the following keys: - total: total number of missing tool calls that should have been made by the agent to be able to answer the query - - per_tool_details: a list of dictionaries, each containing: + - details: a list of dictionaries, each containing: - tool_name: name of the tool - missing_count: number of missing calls for this query From eaf493af6cfb8e9b519d9c808e1e8631c267cec7 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Tue, 24 Jun 2025 13:26:09 +0300 Subject: [PATCH 08/23] Spell check fixes --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 2 +- .../_tool_call_accuracy/tool_call_accuracy.prompty | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a039e2580ff..376f453a62d3 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -26,7 +26,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): The evaluator uses a scoring rubric of 1 to 5: - Score 1: The tool calls are irrelevant - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed - - Score 3: The tool calls are relevant, but there were unncessary, excessive tool calls made + - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded - Score 5: The tool calls are relevant, and all parameters were correctly passed diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index fb431b23d1e8..e6c67c7d1f0e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -43,7 +43,7 @@ user: # Ratings ## [Tool Call Accuracy: 1] (Irrelevant) **Definition:** -Tool calls were not relevant to the user's query, resulting in anirrelevant or unhelpful final output. +Tool calls were not relevant to the user's query, resulting in an irrelevant or unhelpful final output. This level is a 'fail'. **Example:** @@ -122,7 +122,7 @@ TOOL DEFINITION: {{tool_definition}} Your output should consist only of a JSON object, as provided in the examples, that has the following keys: - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query. - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. - - tool_calls_sucess_result: 'pass' or 'fail' based on the evaluation level of the tool call accuracy. Levels 1 and 2 are a 'fail', levels 3, 4 and 5 are a 'pass'. + - tool_calls_success_result: 'pass' or 'fail' based on the evaluation level of the tool call accuracy. Levels 1 and 2 are a 'fail', levels 3, 4 and 5 are a 'pass'. 
- additional_details: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent From 196563936cc563725b18fddd6c9c80ba4eed4ade Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Tue, 24 Jun 2025 22:13:40 +0300 Subject: [PATCH 09/23] Minor prompt update --- .../_tool_call_accuracy/tool_call_accuracy.prompty | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index e6c67c7d1f0e..8286d9656984 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -108,8 +108,9 @@ This level is a 'pass'. -# IMPORTANT NOTE -There is a clear distinction between 'pass' levels and 'fail' levels. The distinction is that the tools are called correctly in order to reach the required output. If the agent was not able to reach the final output that addresses the user query, it cannot be either of the 'pass' levels, and vice versa. It is crucial that you ensure you are rating the agent's response with the correct level based on the tool calls made to address the user's query. +# IMPORTANT NOTES +- There is a clear distinction between 'pass' levels and 'fail' levels. The distinction is that the tools are called correctly in order to reach the required output. If the agent was not able to reach the final output that addresses the user query, it cannot be either of the 'pass' levels, and vice versa. It is crucial that you ensure you are rating the agent's response with the correct level based on the tool calls made to address the user's query. +- You are NOT concerned with the correctness of the result of the tool. As long as the tool did not return an error, then the tool output is correct and accurate. Do not look into the correctness of the tool's result. 
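For concreteness, a judge response that follows the output format described in the hunk above might look like the sketch below. This is an illustrative instance only: the tool name is borrowed from the fetch_weather examples used elsewhere in this patch series, and every count, level, and reasoning string is hypothetical.

    # Hypothetical instance of the JSON object the prompt asks the judge model to return.
    # Key names mirror the output description above; all values are made up for illustration.
    example_judge_output = {
        "chain_of_thought": "Let's think step by step: the agent called fetch_weather once, "
                            "with the location taken directly from the user's query...",
        "tool_calls_success_level": 5,
        "tool_calls_success_result": "pass",
        "additional_details": {
            "tool_calls_made_by_agent": 1,
            "correct_tool_calls_made_by_agent": 1,
            "details": [
                {
                    "tool_name": "fetch_weather",
                    "total_calls_required": 1,
                    "correct_calls_made_by_agent": 1,
                    "correct_tool_percentage": 1.0,
                    "tool_call_errors": 0,
                    "tool_success_result": "pass",
                }
            ],
        },
        "excess_tool_calls": {"total": 0, "details": []},
        "missing_tool_calls": {"total": 0, "details": []},
    }

Levels 1 and 2 map to a 'fail' and levels 3 through 5 to a 'pass', so tool_calls_success_result is derived from tool_calls_success_level rather than judged independently.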
# Data CONVERSATION : {{query}} From 886524076f8750f0bbb1b9e8f54664f689d51853 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Wed, 25 Jun 2025 09:12:46 +0300 Subject: [PATCH 10/23] Update result key to tool_call_accuracy --- .../_tool_call_accuracy/_tool_call_accuracy.py | 2 +- .../_tool_call_accuracy/test_new_evaluator.ipynb | 0 .../tests/unittests/test_agent_evaluators.py | 16 ++++++++-------- 3 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/test_new_evaluator.ipynb diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 376f453a62d3..68715c9a18b3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -65,7 +65,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ _PROMPTY_FILE = "tool_call_accuracy.prompty" - _RESULT_KEY = "tool_call_accurate" + _RESULT_KEY = "tool_call_accuracy" _MAX_TOOL_CALL_ACCURACY_SCORE = 5 _MIN_TOOL_CALL_ACCURACY_SCORE = 1 diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/test_new_evaluator.ipynb b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/test_new_evaluator.ipynb new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py index f42bcb747f7a..a7df53a45606 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py @@ -27,8 +27,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accurate"] == "not applicable" - assert "No tool calls found in response or provided tool_calls." in result["tool_call_accurate_reason"] + assert result["tool_call_accuracy"] == "not applicable" + assert "No tool calls found in response or provided tool_calls." in result["tool_call_accuracy_reason"] # Test with missing tool_definitions result = tool_call_accuracy( @@ -43,8 +43,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accurate"] == "not applicable" - assert "Tool definitions must be provided." in result["tool_call_accurate_reason"] + assert result["tool_call_accuracy"] == "not applicable" + assert "Tool definitions must be provided." in result["tool_call_accuracy_reason"] # Test with response that has no tool calls result = tool_call_accuracy( @@ -65,8 +65,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accurate"] == "not applicable" - assert "No tool calls found in response or provided tool_calls." in result["tool_call_accurate_reason"] + assert result["tool_call_accuracy"] == "not applicable" + assert "No tool calls found in response or provided tool_calls." 
in result["tool_call_accuracy_reason"] # Test with tool call for which definition is not provided result = tool_call_accuracy( @@ -91,5 +91,5 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accurate"] == "not applicable" - assert "Tool definitions for all tool calls must be provided." in result["tool_call_accurate_reason"] \ No newline at end of file + assert result["tool_call_accuracy"] == "not applicable" + assert "Tool definitions for all tool calls must be provided." in result["tool_call_accuracy_reason"] \ No newline at end of file From fcd1cb81290ff7f456bc7fe0e4d001f4bcc3d6a5 Mon Sep 17 00:00:00 2001 From: Salma El-Shafey <76866105+salma-elshafey@users.noreply.github.com> Date: Wed, 25 Jun 2025 11:43:51 +0300 Subject: [PATCH 11/23] Delete test_new_evaluator.ipynb --- .../_evaluators/_tool_call_accuracy/test_new_evaluator.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/test_new_evaluator.ipynb diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/test_new_evaluator.ipynb b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/test_new_evaluator.ipynb deleted file mode 100644 index e69de29bb2d1..000000000000 From 67fc87d6b7374c948f04761a664210902c10c0fd Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Wed, 25 Jun 2025 23:17:46 +0300 Subject: [PATCH 12/23] Added field names and messages as constants --- .../azure-ai-evaluation/CHANGELOG.md | 2 +- .../_tool_call_accuracy.py | 27 ++++++++---- .../tests/unittests/test_agent_evaluators.py | 16 +++---- .../test_tool_call_accuracy_evaluator.py | 44 +++++++++---------- 4 files changed, 49 insertions(+), 40 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 0e6dd5c50cf7..8ee7f6070737 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -8,7 +8,7 @@ - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens. -- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance. and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a number in the range [0-1]. The number range is now [1-5]. +- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance. and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5]. - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. 
[#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 68715c9a18b3..1c6aecf36108 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -71,6 +71,15 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): _MIN_TOOL_CALL_ACCURACY_SCORE = 1 _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3 + _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls." + _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided." + _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided." + _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5." + + _LLM_SCORE_KEY = "tool_calls_success_level" + _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls" + _MISSING_TOOL_CALLS_KEY = "missing_tool_calls" + id = "id" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @@ -150,9 +159,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_calls = parsed_tool_calls if not tool_calls: - return {"error_message": "No tool calls found in response or provided tool_calls."} + return {"error_message": self._NO_TOOL_CALLS_MESSAGE} if not tool_definitions or len(tool_definitions) == 0: - return {"error_message": "Tool definitions must be provided."} + return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE} if not isinstance(tool_calls, list): tool_calls = [tool_calls] @@ -162,9 +171,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs): try: needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions) except EvaluationException as e: - return {"error_message": "Tool definitions for all tool calls must be provided."} + return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} if len(needed_tool_definitions) == 0: - return {"error_message": "Tool definitions for all tool calls must be provided."} + return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} return { "query": query, @@ -188,7 +197,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) if isinstance(llm_output, dict): - score = llm_output.get("tool_calls_success_level", None) + score = llm_output.get(self._LLM_SCORE_KEY, None) if not score or not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE): raise EvaluationException( message=f"Invalid score value: {score}. 
Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", @@ -208,8 +217,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_reason": reason, 'applicable': True, 'per_tool_call_details': llm_output.get('additional_details', {}), - 'excess_tool_calls': llm_output.get('excess_tool_calls', {}), - 'missing_tool_calls': llm_output.get('missing_tool_calls', {}), + self._EXCESS_TOOL_CALLS_KEY: llm_output.get(self._EXCESS_TOOL_CALLS_KEY, {}), + self._MISSING_TOOL_CALLS_KEY: llm_output.get(self._MISSING_TOOL_CALLS_KEY, {}), } return response_dict @@ -255,8 +264,8 @@ def _not_applicable_result(self, error_message): f"{self._result_key}_reason": error_message, "applicable": False, "per_tool_call_details": {}, - "excess_tool_calls": {}, - "missing_tool_calls": {}, + self._EXCESS_TOOL_CALLS_KEY: {}, + self._MISSING_TOOL_CALLS_KEY: {}, } diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py index a7df53a45606..617d9397c68a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py @@ -27,8 +27,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accuracy"] == "not applicable" - assert "No tool calls found in response or provided tool_calls." in result["tool_call_accuracy_reason"] + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] # Test with missing tool_definitions result = tool_call_accuracy( @@ -43,8 +43,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accuracy"] == "not applicable" - assert "Tool definitions must be provided." in result["tool_call_accuracy_reason"] + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] # Test with response that has no tool calls result = tool_call_accuracy( @@ -65,8 +65,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accuracy"] == "not applicable" - assert "No tool calls found in response or provided tool_calls." in result["tool_call_accuracy_reason"] + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] # Test with tool call for which definition is not provided result = tool_call_accuracy( @@ -91,5 +91,5 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): }] ) assert not result["applicable"] - assert result["tool_call_accuracy"] == "not applicable" - assert "Tool definitions for all tool calls must be provided." 
in result["tool_call_accuracy_reason"] \ No newline at end of file + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + assert ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 50997438ed92..9dd052fe714f 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -21,8 +21,8 @@ async def flow_side_effect(timeout, **kwargs): "chain_of_thought": "The tool calls were very correct that I returned a huge number!", "tool_calls_success_level": 25, "additional_details": {}, - "excess_tool_calls": {}, - "missing_tool_calls": {} + ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {}, + ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {} } score = 1 # Default score for "all bad" @@ -39,8 +39,8 @@ async def flow_side_effect(timeout, **kwargs): "tool_calls_made_by_agent": total_calls, "correct_tool_calls_made_by_agent": good_calls }, - "excess_tool_calls": {"total": 0}, - "missing_tool_calls": {"total": 0} + ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {"total": 0}, + ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {"total": 0} } @@ -102,8 +102,8 @@ def test_evaluate_tools_valid1(self, mock_model_config): assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." assert "per_tool_call_details" in result - assert "excess_tool_calls" in result - assert "missing_tool_calls" in result + assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result + assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result assert result["applicable"] is True def test_evaluate_tools_valid2(self, mock_model_config): @@ -161,8 +161,8 @@ def test_evaluate_tools_valid2(self, mock_model_config): assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." assert "per_tool_call_details" in result - assert "excess_tool_calls" in result - assert "missing_tool_calls" in result + assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result + assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result assert result["applicable"] is True def test_evaluate_tools_valid3(self, mock_model_config): @@ -220,8 +220,8 @@ def test_evaluate_tools_valid3(self, mock_model_config): assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." 
assert "per_tool_call_details" in result - assert "excess_tool_calls" in result - assert "missing_tool_calls" in result + assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result + assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result assert result["applicable"] is True def test_evaluate_tools_one_eval_fails(self, mock_model_config): @@ -304,13 +304,13 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert result[key] == "not applicable" + assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided." + assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["per_tool_call_details"] == {} - assert result["excess_tool_calls"] == {} - assert result["missing_tool_calls"] == {} + assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} + assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} assert result["applicable"] is False def test_evaluate_tools_all_not_applicable(self, mock_model_config): @@ -344,13 +344,13 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert result[key] == "not applicable" + assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided." + assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["per_tool_call_details"] == {} - assert result["excess_tool_calls"] == {} - assert result["missing_tool_calls"] == {} + assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} + assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} assert result["applicable"] is False def test_evaluate_tools_no_tools(self, mock_model_config): @@ -377,11 +377,11 @@ def test_evaluate_tools_no_tools(self, mock_model_config): key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert result[key] == "not applicable" + assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == "No tool calls found in response or provided tool_calls." 
+ assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE assert result["per_tool_call_details"] == {} - assert result["excess_tool_calls"] == {} - assert result["missing_tool_calls"] == {} + assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} + assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} assert result["applicable"] is False \ No newline at end of file From fd2429f89261c49909c305af4ce24232b640d22e Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Sun, 29 Jun 2025 18:50:44 +0300 Subject: [PATCH 13/23] Additional note in prompt --- .../tool_call_accuracy.prompty | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index 8286d9656984..865ce02c4b19 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -4,8 +4,7 @@ description: Evaluates Tool Call Accuracy for tool used by agent model: api: chat parameters: - temperature: 0 - max_tokens: 3000 + max_completion_tokens: 3000 top_p: 1.0 presence_penalty: 0 frequency_penalty: 0 @@ -40,17 +39,18 @@ user: 4. Potential Value: Is the information this tool call might provide likely to be useful in advancing the conversation or addressing the user expressed or implied needs? 5. Context Appropriateness: Does the tool call make sense at this point in the conversation, given what has been discussed so far? + # Ratings ## [Tool Call Accuracy: 1] (Irrelevant) **Definition:** -Tool calls were not relevant to the user's query, resulting in an irrelevant or unhelpful final output. +Tool calls were not relevant to the user's query, resulting in anirrelevant or unhelpful final output. This level is a 'fail'. **Example:** The user's query is asking for most popular hotels in New York, but the agent calls a tool that does search in local files on a machine. This tool is not relevant to the user query, so this case is a Level 1 'fail'. -## [Tool Call Accuracy: 2] (Partially Relevant - No output) +## [Tool Call Accuracy: 2] (Partially Relevant - No correct output) **Definition:** Tool calls were somewhat related to the user's query, but the agent was not able to reach a final output that addresses the user query due to one or more of the following: • Tools returned errors, and no retrials for the tool call were successful. @@ -70,7 +70,7 @@ This level is a 'fail'. ## [Tool Call Accuracy: 3] (Slightly Correct - Reached Output) **Definition:** -Tool calls were relevant and led to a correct output. However, multiple excessive, unnecessary tool calls were made. +Tool calls were relevant, correct and grounded parameters were passed so that led to a correct output. However, multiple excessive, unnecessary tool calls were made. This level is a 'pass'. **Example:** @@ -83,7 +83,7 @@ This level is a 'pass'. ## [Tool Call Accuracy: 4] (Mostly Correct - Reached output) **Definition:** Tool calls were fully relevant and efficient: -• Correct tools were called with the correct parameters, whether they are extracted from the conversation history or the current user query. 
+• Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query. • A tool returned an error, but the agent retried calling the tool and successfully got an output. This level is a 'pass'. @@ -94,7 +94,7 @@ This level is a 'pass'. ## [Tool Call Accuracy: 5] (Optimal Solution - Reached output) **Definition:** Tool calls were fully relevant and efficient: -• Correct tools were called with the correct parameters, whether they are extracted from the conversation history or the current user query. +• Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query. • No unnecessary or excessive tool calls were made. • No errors occurred in any of the tools. • The agent was able to reach the final output that addresses the user's query without facing any issues. @@ -110,7 +110,8 @@ This level is a 'pass'. # IMPORTANT NOTES - There is a clear distinction between 'pass' levels and 'fail' levels. The distinction is that the tools are called correctly in order to reach the required output. If the agent was not able to reach the final output that addresses the user query, it cannot be either of the 'pass' levels, and vice versa. It is crucial that you ensure you are rating the agent's response with the correct level based on the tool calls made to address the user's query. -- You are NOT concerned with the correctness of the result of the tool. As long as the tool did not return an error, then the tool output is correct and accurate. Do not look into the correctness of the tool's result. +- "Correct output" means correct tool with the correct, grounded parameters. You are NOT concerned with the correctness of the result of the tool. As long as the parameters passed were correct and the tool did not return an error, then the tool output is correct and accurate. +- Ensure that every single parameter that is passed to the tools is correct and grounded from the user query or the conversation history. If the agent passes incorrect parameters or completely makes them up, then this is a fail, even if somehow the agent reaches a correct result. # Data CONVERSATION : {{query}} @@ -123,7 +124,7 @@ TOOL DEFINITION: {{tool_definition}} Your output should consist only of a JSON object, as provided in the examples, that has the following keys: - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query. - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. - - tool_calls_success_result: 'pass' or 'fail' based on the evaluation level of the tool call accuracy. Levels 1 and 2 are a 'fail', levels 3, 4 and 5 are a 'pass'. + - tool_calls_sucess_result: 'pass' or 'fail' based on the evaluation level of the tool call accuracy. Levels 1 and 2 are a 'fail', levels 3, 4 and 5 are a 'pass'. 
- additional_details: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent From 6c9e3424130ffca60fa02d43e99a5670914953e5 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 30 Jun 2025 13:22:31 +0300 Subject: [PATCH 14/23] Re-add the temperature to the prompty file --- .../_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index 865ce02c4b19..139d71c1d202 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -4,7 +4,8 @@ description: Evaluates Tool Call Accuracy for tool used by agent model: api: chat parameters: - max_completion_tokens: 3000 + temperature: 0.0 + max_tokens: 3000 top_p: 1.0 presence_penalty: 0 frequency_penalty: 0 From d0f637ea0869a3fca5ccc8a650f303a578ef4d6a Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 30 Jun 2025 23:56:23 +0300 Subject: [PATCH 15/23] Removed 'applicable' field and print statement --- .../_tool_call_accuracy/_tool_call_accuracy.py | 5 +---- .../tests/unittests/test_agent_evaluators.py | 4 ---- .../tests/unittests/test_tool_call_accuracy_evaluator.py | 8 +------- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 1c6aecf36108..840744c1ab68 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -215,7 +215,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_result": score_result, f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": reason, - 'applicable': True, 'per_tool_call_details': llm_output.get('additional_details', {}), self._EXCESS_TOOL_CALLS_KEY: llm_output.get(self._EXCESS_TOOL_CALLS_KEY, {}), self._MISSING_TOOL_CALLS_KEY: llm_output.get(self._MISSING_TOOL_CALLS_KEY, {}), @@ -250,7 +249,7 @@ async def _real_call(self, **kwargs): def _not_applicable_result(self, error_message): """Return a result indicating that the tool call is not applicable for evaluation. - +pr :param eval_input: The input to the evaluator. :type eval_input: Dict :return: A dictionary containing the result of the evaluation. 
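For orientation, the dictionary assembled by _not_applicable_result at this point in the series has roughly the shape sketched below when the inputs cannot be scored. The values are illustrative: tool_call_accuracy stands in for the configured result key, 3 for the default threshold, and the reason string for whichever of the predefined messages applies.

    # Illustrative shape of the not-applicable result; keys follow the surrounding hunks,
    # values are examples rather than the only possible outputs.
    not_applicable_result = {
        "tool_call_accuracy": "not applicable",
        "tool_call_accuracy_result": "pass",
        "tool_call_accuracy_threshold": 3,
        "tool_call_accuracy_reason": "No tool calls found in response or provided tool_calls.",
        "per_tool_call_details": {},
        "excess_tool_calls": {},
        "missing_tool_calls": {},
    }

The result field still reads 'pass' on this path; the reason string, rather than the score, is what tells a caller that no evaluation was actually performed.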
@@ -262,7 +261,6 @@ def _not_applicable_result(self, error_message): f"{self._result_key}_result": 'pass', f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": error_message, - "applicable": False, "per_tool_call_details": {}, self._EXCESS_TOOL_CALLS_KEY: {}, self._MISSING_TOOL_CALLS_KEY: {}, @@ -280,7 +278,6 @@ def _parse_tools_from_response(self, response): tool_results_map = {} if isinstance(response, list): for message in response: - print(message) # Extract tool calls from assistant messages if message.get("role") == "assistant" and isinstance(message.get("content"), list): for content_item in message.get("content"): diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py index 617d9397c68a..280aa9df03da 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py @@ -26,7 +26,6 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } }] ) - assert not result["applicable"] assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] @@ -42,7 +41,6 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } }] ) - assert not result["applicable"] assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] @@ -64,7 +62,6 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } }] ) - assert not result["applicable"] assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] @@ -90,6 +87,5 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } }] ) - assert not result["applicable"] assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 9dd052fe714f..73adba4d521f 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -104,7 +104,6 @@ def test_evaluate_tools_valid1(self, mock_model_config): assert "per_tool_call_details" in result assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result - assert result["applicable"] is True def test_evaluate_tools_valid2(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -163,7 +162,6 @@ def test_evaluate_tools_valid2(self, mock_model_config): assert "per_tool_call_details" in result assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result assert 
ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result - assert result["applicable"] is True def test_evaluate_tools_valid3(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -222,7 +220,6 @@ def test_evaluate_tools_valid3(self, mock_model_config): assert "per_tool_call_details" in result assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result - assert result["applicable"] is True def test_evaluate_tools_one_eval_fails(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: @@ -311,7 +308,6 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): assert result["per_tool_call_details"] == {} assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} - assert result["applicable"] is False def test_evaluate_tools_all_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -351,7 +347,6 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): assert result["per_tool_call_details"] == {} assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} - assert result["applicable"] is False def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -383,5 +378,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config): assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE assert result["per_tool_call_details"] == {} assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} - assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} - assert result["applicable"] is False \ No newline at end of file + assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} \ No newline at end of file From 4c27dfffc50a53653795be264d0974a4422d8371 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Tue, 1 Jul 2025 07:35:10 +0300 Subject: [PATCH 16/23] Move excess/missing tool calls fields under additional details --- .../_tool_call_accuracy.py | 6 ------ .../tool_call_accuracy.prompty | 20 +++++++++---------- .../test_tool_call_accuracy_evaluator.py | 18 +---------------- 3 files changed, 11 insertions(+), 33 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 840744c1ab68..64a933f0d91d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -77,8 +77,6 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5." 
_LLM_SCORE_KEY = "tool_calls_success_level" - _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls" - _MISSING_TOOL_CALLS_KEY = "missing_tool_calls" id = "id" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @@ -216,8 +214,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": reason, 'per_tool_call_details': llm_output.get('additional_details', {}), - self._EXCESS_TOOL_CALLS_KEY: llm_output.get(self._EXCESS_TOOL_CALLS_KEY, {}), - self._MISSING_TOOL_CALLS_KEY: llm_output.get(self._MISSING_TOOL_CALLS_KEY, {}), } return response_dict @@ -262,8 +258,6 @@ def _not_applicable_result(self, error_message): f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": error_message, "per_tool_call_details": {}, - self._EXCESS_TOOL_CALLS_KEY: {}, - self._MISSING_TOOL_CALLS_KEY: {}, } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index 139d71c1d202..c6e0b42b163a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -136,15 +136,15 @@ Your output should consist only of a JSON object, as provided in the examples, t - correct_tool_percentage: percentage of correct calls made by the agent for this tool. It is a value between 0.0 and 1.0 - tool_call_errors: number of errors encountered during the tool call - tool_success_result: 'pass' or 'fail' based on the evaluation of the tool call accuracy for this tool - - excess_tool_calls: a dictionary with the following keys: - - total: total number of excess, unnecessary tool calls made by the agent - - details: a list of dictionaries, each containing: - - tool_name: name of the tool - - excess_count: number of excess calls made for this query - - missing_tool_calls: a dictionary with the following keys: - - total: total number of missing tool calls that should have been made by the agent to be able to answer the query - - details: a list of dictionaries, each containing: - - tool_name: name of the tool - - missing_count: number of missing calls for this query + - excess_tool_calls: a dictionary with the following keys: + - total: total number of excess, unnecessary tool calls made by the agent + - details: a list of dictionaries, each containing: + - tool_name: name of the tool + - excess_count: number of excess calls made for this query + - missing_tool_calls: a dictionary with the following keys: + - total: total number of missing tool calls that should have been made by the agent to be able to answer the query + - details: a list of dictionaries, each containing: + - tool_name: name of the tool + - missing_count: number of missing calls for this query # Output \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 73adba4d521f..cb21300963e5 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -21,8 +21,6 @@ 
async def flow_side_effect(timeout, **kwargs): "chain_of_thought": "The tool calls were very correct that I returned a huge number!", "tool_calls_success_level": 25, "additional_details": {}, - ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {}, - ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {} } score = 1 # Default score for "all bad" @@ -39,8 +37,6 @@ async def flow_side_effect(timeout, **kwargs): "tool_calls_made_by_agent": total_calls, "correct_tool_calls_made_by_agent": good_calls }, - ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {"total": 0}, - ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {"total": 0} } @@ -102,8 +98,6 @@ def test_evaluate_tools_valid1(self, mock_model_config): assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." assert "per_tool_call_details" in result - assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result - assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result def test_evaluate_tools_valid2(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -160,8 +154,6 @@ def test_evaluate_tools_valid2(self, mock_model_config): assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." assert "per_tool_call_details" in result - assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result - assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result def test_evaluate_tools_valid3(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -218,8 +210,6 @@ def test_evaluate_tools_valid3(self, mock_model_config): assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." 
assert "per_tool_call_details" in result - assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result - assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result def test_evaluate_tools_one_eval_fails(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: @@ -306,8 +296,6 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["per_tool_call_details"] == {} - assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} - assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} def test_evaluate_tools_all_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -345,8 +333,6 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["per_tool_call_details"] == {} - assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} - assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -376,6 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result["per_tool_call_details"] == {} - assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {} - assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {} \ No newline at end of file + assert result["per_tool_call_details"] == {} \ No newline at end of file From 3fa14f06c2f13a3763ddaaf41f6c5ce0cd97accf Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Wed, 2 Jul 2025 21:45:13 +0300 Subject: [PATCH 17/23] Typo fix and removal of redundant field in the prompt --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 4 +--- .../_tool_call_accuracy/tool_call_accuracy.prompty | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 64a933f0d91d..f156c2fb6439 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -182,8 +182,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] - """Do a relevance evaluation. - + """Do a tool call accuracy evaluation. :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method, including context and other fields depending on the child class. 
@@ -245,7 +244,6 @@ async def _real_call(self, **kwargs): def _not_applicable_result(self, error_message): """Return a result indicating that the tool call is not applicable for evaluation. -pr :param eval_input: The input to the evaluator. :type eval_input: Dict :return: A dictionary containing the result of the evaluation. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index c6e0b42b163a..c013f34702b1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -125,7 +125,6 @@ TOOL DEFINITION: {{tool_definition}} Your output should consist only of a JSON object, as provided in the examples, that has the following keys: - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query. - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. - - tool_calls_sucess_result: 'pass' or 'fail' based on the evaluation level of the tool call accuracy. Levels 1 and 2 are a 'fail', levels 3, 4 and 5 are a 'pass'. 
- additional_details: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent From 2c3ce5080fc317a12263653e841a455fa16b73a6 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 7 Jul 2025 11:53:24 +0300 Subject: [PATCH 18/23] Modify per_tool_call_details field's name to details --- .../_tool_call_accuracy/_tool_call_accuracy.py | 4 ++-- .../_tool_call_accuracy/tool_call_accuracy.prompty | 4 ++-- .../unittests/test_tool_call_accuracy_evaluator.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index f156c2fb6439..05e9e7728e9f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -212,7 +212,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_result": score_result, f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": reason, - 'per_tool_call_details': llm_output.get('additional_details', {}), + 'details': llm_output.get('details', {}), } return response_dict @@ -255,7 +255,7 @@ def _not_applicable_result(self, error_message): f"{self._result_key}_result": 'pass', f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": error_message, - "per_tool_call_details": {}, + "details": {}, } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index c013f34702b1..a4766441535b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -125,10 +125,10 @@ TOOL DEFINITION: {{tool_definition}} Your output should consist only of a JSON object, as provided in the examples, that has the following keys: - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query. - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. 
- - additional_details: a dictionary that contains the following keys: + - details: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent - - details: a list of dictionaries, each containing: + - per_tool_call_details: a list of dictionaries, each containing: - tool_name: name of the tool - total_calls_required: total number of calls required for the tool - correct_calls_made_by_agent: number of correct calls made by the agent diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index cb21300963e5..e83ee2f39991 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -97,7 +97,7 @@ def test_evaluate_tools_valid1(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." - assert "per_tool_call_details" in result + assert "details" in result def test_evaluate_tools_valid2(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -153,7 +153,7 @@ def test_evaluate_tools_valid2(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." - assert "per_tool_call_details" in result + assert "details" in result def test_evaluate_tools_valid3(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -209,7 +209,7 @@ def test_evaluate_tools_valid3(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." 
- assert "per_tool_call_details" in result + assert "details" in result def test_evaluate_tools_one_eval_fails(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: @@ -295,7 +295,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result["per_tool_call_details"] == {} + assert result["details"] == {} def test_evaluate_tools_all_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -332,7 +332,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result["per_tool_call_details"] == {} + assert result["details"] == {} def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -362,4 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result["per_tool_call_details"] == {} \ No newline at end of file + assert result["details"] == {} \ No newline at end of file From 6525a6f8a2d8f20f5a12eb45a6ecfed665c495ab Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Wed, 16 Jul 2025 18:44:32 +0300 Subject: [PATCH 19/23] Revert "Modify per_tool_call_details field's name to details" This reverts commit 2c3ce5080fc317a12263653e841a455fa16b73a6. 
--- .../_tool_call_accuracy/_tool_call_accuracy.py | 4 ++-- .../_tool_call_accuracy/tool_call_accuracy.prompty | 4 ++-- .../unittests/test_tool_call_accuracy_evaluator.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 05e9e7728e9f..f156c2fb6439 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -212,7 +212,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_result": score_result, f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": reason, - 'details': llm_output.get('details', {}), + 'per_tool_call_details': llm_output.get('additional_details', {}), } return response_dict @@ -255,7 +255,7 @@ def _not_applicable_result(self, error_message): f"{self._result_key}_result": 'pass', f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": error_message, - "details": {}, + "per_tool_call_details": {}, } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index a4766441535b..c013f34702b1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -125,10 +125,10 @@ TOOL DEFINITION: {{tool_definition}} Your output should consist only of a JSON object, as provided in the examples, that has the following keys: - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query. - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. 
- - details: a dictionary that contains the following keys: + - additional_details: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent - - per_tool_call_details: a list of dictionaries, each containing: + - details: a list of dictionaries, each containing: - tool_name: name of the tool - total_calls_required: total number of calls required for the tool - correct_calls_made_by_agent: number of correct calls made by the agent diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index e83ee2f39991..cb21300963e5 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -97,7 +97,7 @@ def test_evaluate_tools_valid1(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." - assert "details" in result + assert "per_tool_call_details" in result def test_evaluate_tools_valid2(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -153,7 +153,7 @@ def test_evaluate_tools_valid2(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." - assert "details" in result + assert "per_tool_call_details" in result def test_evaluate_tools_valid3(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -209,7 +209,7 @@ def test_evaluate_tools_valid3(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." 
- assert "details" in result + assert "per_tool_call_details" in result def test_evaluate_tools_one_eval_fails(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: @@ -295,7 +295,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result["details"] == {} + assert result["per_tool_call_details"] == {} def test_evaluate_tools_all_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -332,7 +332,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result["details"] == {} + assert result["per_tool_call_details"] == {} def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -362,4 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result["details"] == {} \ No newline at end of file + assert result["per_tool_call_details"] == {} \ No newline at end of file From e72b084a892b5a078cf04cbefcbe89d6e5043936 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Wed, 16 Jul 2025 19:09:53 +0300 Subject: [PATCH 20/23] Revert 'Merge branch 'main' into selshafey/improve_tool_call_accuracy' --- .../_tool_call_accuracy/_tool_call_accuracy.py | 4 ++-- .../_tool_call_accuracy/tool_call_accuracy.prompty | 4 ++-- .../unittests/test_tool_call_accuracy_evaluator.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index f156c2fb6439..05e9e7728e9f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -212,7 +212,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_result": score_result, f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": reason, - 'per_tool_call_details': llm_output.get('additional_details', {}), + 'details': llm_output.get('details', {}), } return response_dict @@ -255,7 +255,7 @@ def _not_applicable_result(self, error_message): f"{self._result_key}_result": 'pass', f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": error_message, - "per_tool_call_details": {}, + "details": {}, } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index 
c013f34702b1..a4766441535b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -125,10 +125,10 @@ TOOL DEFINITION: {{tool_definition}} Your output should consist only of a JSON object, as provided in the examples, that has the following keys: - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query. - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. - - additional_details: a dictionary that contains the following keys: + - details: a dictionary that contains the following keys: - tool_calls_made_by_agent: total number of tool calls made by the agent - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent - - details: a list of dictionaries, each containing: + - per_tool_call_details: a list of dictionaries, each containing: - tool_name: name of the tool - total_calls_required: total number of calls required for the tool - correct_calls_made_by_agent: number of correct calls made by the agent diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index cb21300963e5..e83ee2f39991 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -97,7 +97,7 @@ def test_evaluate_tools_valid1(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." - assert "per_tool_call_details" in result + assert "details" in result def test_evaluate_tools_valid2(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -153,7 +153,7 @@ def test_evaluate_tools_valid2(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." - assert "per_tool_call_details" in result + assert "details" in result def test_evaluate_tools_valid3(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -209,7 +209,7 @@ def test_evaluate_tools_valid3(self, mock_model_config): assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." 
- assert "per_tool_call_details" in result + assert "details" in result def test_evaluate_tools_one_eval_fails(self, mock_model_config): with pytest.raises(EvaluationException) as exc_info: @@ -295,7 +295,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result["per_tool_call_details"] == {} + assert result["details"] == {} def test_evaluate_tools_all_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -332,7 +332,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - assert result["per_tool_call_details"] == {} + assert result["details"] == {} def test_evaluate_tools_no_tools(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) @@ -362,4 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config): assert result[f"{key}_result"] == "pass" assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result["per_tool_call_details"] == {} \ No newline at end of file + assert result["details"] == {} \ No newline at end of file From a79b3a1a284a76a6f0a09289b4a680eb48859ebb Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Wed, 16 Jul 2025 21:00:52 +0300 Subject: [PATCH 21/23] Black reformat --- .../_evaluators/_common/_base_eval.py | 97 ++++++-- .../_tool_call_accuracy.py | 100 ++++++--- .../tool_call_accuracy.prompty | 2 +- .../tests/unittests/test_agent_evaluators.py | 138 +++++++----- .../test_tool_call_accuracy_evaluator.py | 209 +++++++++++++----- 5 files changed, 382 insertions(+), 164 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 9d75d3b70944..9b81738c994c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -4,14 +4,34 @@ import inspect from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional +from typing import ( + Any, + Callable, + Dict, + Generic, + List, + TypedDict, + TypeVar, + Union, + cast, + final, + Optional, +) from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop from typing_extensions import ParamSpec, TypeAlias, get_overloads -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +from azure.ai.evaluation._exceptions import ( + ErrorBlame, + ErrorCategory, + ErrorTarget, + EvaluationException, +) from azure.ai.evaluation._common.utils import remove_optional_singletons -from azure.ai.evaluation._constants import _AggregationType, EVALUATION_PASS_FAIL_MAPPING +from azure.ai.evaluation._constants import ( + _AggregationType, + EVALUATION_PASS_FAIL_MAPPING, 
+) from azure.ai.evaluation._model_configurations import Conversation from azure.ai.evaluation._common._experimental import experimental @@ -101,14 +121,18 @@ def __init__( not_singleton_inputs: List[str] = ["conversation", "kwargs"], eval_last_turn: bool = False, conversation_aggregation_type: _AggregationType = _AggregationType.MEAN, - conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None, + conversation_aggregator_override: Optional[ + Callable[[List[float]], float] + ] = None, _higher_is_better: Optional[bool] = True, ): self._not_singleton_inputs = not_singleton_inputs self._eval_last_turn = eval_last_turn self._singleton_inputs = self._derive_singleton_inputs() self._async_evaluator = AsyncEvaluatorBase(self._real_call) - self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type) + self._conversation_aggregation_function = GetAggregator( + conversation_aggregation_type + ) self._higher_is_better = _higher_is_better self._threshold = threshold if conversation_aggregator_override is not None: @@ -170,13 +194,18 @@ def _derive_singleton_inputs(self) -> List[str]: singletons = [] for call_signature in call_signatures: params = call_signature.parameters - if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs): + if any( + not_singleton_input in params + for not_singleton_input in self._not_singleton_inputs + ): continue # exclude self since it is not a singleton input singletons.extend([p for p in params if p != "self"]) return singletons - def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]: + def _derive_conversation_converter( + self, + ) -> Callable[[Dict], List[DerivedEvalInput]]: """Produce the function that will be used to convert conversations to a list of evaluable inputs. This uses the inputs derived from the _derive_singleton_inputs function to determine which aspects of a conversation ought to be extracted. @@ -235,7 +264,9 @@ def converter(conversation: Dict) -> List[DerivedEvalInput]: return converter - def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]: + def _derive_multi_modal_conversation_converter( + self, + ) -> Callable[[Dict], List[Dict[str, Any]]]: """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs. This uses the inputs derived from the _derive_singleton_inputs function to determine which aspects of a conversation ought to be extracted. @@ -269,12 +300,16 @@ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]: if len(user_messages) != len(assistant_messages): raise EvaluationException( message="Mismatched number of user and assistant messages.", - internal_message=("Mismatched number of user and assistant messages."), + internal_message=( + "Mismatched number of user and assistant messages." + ), ) if len(assistant_messages) > 1: raise EvaluationException( message="Conversation can have only one assistant message.", - internal_message=("Conversation can have only one assistant message."), + internal_message=( + "Conversation can have only one assistant message." 
+ ), ) eval_conv_inputs = [] for user_msg, assist_msg in zip(user_messages, assistant_messages): @@ -283,12 +318,16 @@ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]: conv_messages.append(system_messages[0]) conv_messages.append(user_msg) conv_messages.append(assist_msg) - eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)}) + eval_conv_inputs.append( + {"conversation": Conversation(messages=conv_messages)} + ) return eval_conv_inputs return multi_modal_converter - def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]: + def _convert_kwargs_to_eval_input( + self, **kwargs + ) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]: """Convert an arbitrary input into a list of inputs for evaluators. It is assumed that evaluators generally make use of their inputs in one of two ways. Either they receive a collection of keyname inputs that are all single values @@ -353,11 +392,17 @@ def _is_multi_modal_conversation(self, conversation: Dict) -> bool: if "content" in message: content = message.get("content", "") if isinstance(content, list): - if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content): + if any( + item.get("type") == "image_url" + and "url" in item.get("image_url", {}) + for item in content + ): return True return False - def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]: + def _aggregate_results( + self, per_turn_results: List[DoEvalResult[T_EvalValue]] + ) -> AggregateResult[T_EvalValue]: """Aggregate the evaluation results of each conversation turn into a single result. Exact implementation might need to vary slightly depending on the results produced. @@ -387,12 +432,16 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) # Find and average all numeric values for metric, values in evaluation_per_turn.items(): if all(isinstance(value, (int, float)) for value in values): - aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values)) + aggregated[metric] = self._conversation_aggregation_function( + cast(List[Union[int, float]], values) + ) # Slap the per-turn results back in. aggregated["evaluation_per_turn"] = evaluation_per_turn return aggregated - async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: + async def _real_call( + self, **kwargs + ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """The asynchronous call where real end-to-end evaluation logic is performed. :keyword kwargs: The inputs to evaluate. @@ -445,7 +494,9 @@ def _to_async(self) -> "AsyncEvaluatorBase": @experimental @final - def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None: + def _set_conversation_aggregation_type( + self, conversation_aggregation_type: _AggregationType + ) -> None: """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a multi-turn conversation into a single top-level result. @@ -454,11 +505,15 @@ def _set_conversation_aggregation_type(self, conversation_aggregation_type: _Agg results of a conversation to produce a single result. 
:type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType """ - self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type) + self._conversation_aggregation_function = GetAggregator( + conversation_aggregation_type + ) @experimental @final - def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None: + def _set_conversation_aggregator( + self, aggregator: Callable[[List[float]], float] + ) -> None: """Set the conversation aggregator function directly. This function will be applied to all numeric outputs of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not @@ -488,7 +543,9 @@ class AsyncEvaluatorBase: to ensure that no one ever needs to extend or otherwise modify this class directly. """ - def __init__(self, real_call): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION + def __init__( + self, real_call + ): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION self._real_call = real_call # Don't look at my shame. Nothing to see here.... diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 1e0df36d33ef..01c986800c20 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -8,7 +8,12 @@ from typing import Dict, List, Union, TypeVar, cast from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +from azure.ai.evaluation._exceptions import ( + ErrorBlame, + ErrorCategory, + ErrorTarget, + EvaluationException, +) from ..._common.utils import check_score_is_valid from azure.ai.evaluation._common._experimental import experimental @@ -74,7 +79,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls." _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided." - _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided." + _TOOL_DEFINITIONS_MISSING_MESSAGE = ( + "Tool definitions for all tool calls must be provided." + ) _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5." 
_LLM_SCORE_KEY = "tool_calls_success_level" @@ -83,11 +90,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override - def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs): + def __init__( + self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs + ): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self.threshold = threshold - super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs) + super().__init__( + model_config=model_config, + prompty_file=prompty_path, + result_key=self._RESULT_KEY, + **kwargs, + ) @overload def __call__( @@ -164,7 +178,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_definitions = [tool_definitions] try: - needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions) + needed_tool_definitions = self._extract_needed_tool_definitions( + tool_calls, tool_definitions + ) except EvaluationException as e: return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} if len(needed_tool_definitions) == 0: @@ -173,9 +189,8 @@ def _convert_kwargs_to_eval_input(self, **kwargs): return { "query": query, "tool_calls": tool_calls, - "tool_definitions": needed_tool_definitions + "tool_definitions": needed_tool_definitions, } - @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override] @@ -192,35 +207,39 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t if isinstance(llm_output, dict): score = llm_output.get(self._LLM_SCORE_KEY, None) - if not score or not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE): + if not score or not check_score_is_valid( + score, + ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, + ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE, + ): raise EvaluationException( message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].", internal_message="Invalid score value.", category=ErrorCategory.FAILED_EXECUTION, blame=ErrorBlame.SYSTEM_ERROR, ) - + # Format the output reason = llm_output.get("chain_of_thought", "") score = float(score) - score_result = 'pass' if score >= self.threshold else 'fail' + score_result = "pass" if score >= self.threshold else "fail" response_dict = { self._result_key: score, f"{self._result_key}_result": score_result, f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": reason, - 'details': llm_output.get('details', {}), + "details": llm_output.get("details", {}), } return response_dict - + else: raise EvaluationException( - message="Tool call accuracy evaluator returned invalid output.", - blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, - target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - ) - + message="Tool call accuracy evaluator returned invalid output.", + blame=ErrorBlame.SYSTEM_ERROR, + category=ErrorCategory.FAILED_EXECUTION, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + ) + async def _real_call(self, **kwargs): """The asynchronous call where real end-to-end evaluation logic is performed. 
@@ -231,14 +250,14 @@ async def _real_call(self, **kwargs): """ # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) - if isinstance(eval_input, dict) and eval_input.get('error_message'): + if isinstance(eval_input, dict) and eval_input.get("error_message"): # If there is an error message, return not applicable result - return self._not_applicable_result(eval_input.get('error_message')) + return self._not_applicable_result(eval_input.get("error_message")) # Do the evaluation result = await self._do_eval(eval_input) # Return the result return result - + def _not_applicable_result(self, error_message): """Return a result indicating that the tool call is not applicable for evaluation. :param eval_input: The input to the evaluator. @@ -249,13 +268,12 @@ def _not_applicable_result(self, error_message): # If no tool calls were made or tool call type is not supported, return not applicable result return { self._result_key: self._NOT_APPLICABLE_RESULT, - f"{self._result_key}_result": 'pass', + f"{self._result_key}_result": "pass", f"{self._result_key}_threshold": self.threshold, f"{self._result_key}_reason": error_message, "details": {}, - } - + def _parse_tools_from_response(self, response): """Parse the response to extract tool calls and results. :param response: The response to parse. @@ -266,29 +284,40 @@ def _parse_tools_from_response(self, response): tool_calls = [] tool_results_map = {} if isinstance(response, list): - for message in response: + for message in response: # Extract tool calls from assistant messages - if message.get("role") == "assistant" and isinstance(message.get("content"), list): + if message.get("role") == "assistant" and isinstance( + message.get("content"), list + ): for content_item in message.get("content"): - if isinstance(content_item, dict) and content_item.get("type") == "tool_call": + if ( + isinstance(content_item, dict) + and content_item.get("type") == "tool_call" + ): tool_calls.append(content_item) # Extract tool results from tool messages elif message.get("role") == "tool" and message.get("tool_call_id"): tool_call_id = message.get("tool_call_id") - if isinstance(message.get("content"), list) and len(message.get("content")) > 0: + if ( + isinstance(message.get("content"), list) + and len(message.get("content")) > 0 + ): result_content = message.get("content")[0] - if isinstance(result_content, dict) and result_content.get("type") == "tool_result": + if ( + isinstance(result_content, dict) + and result_content.get("type") == "tool_result" + ): tool_results_map[tool_call_id] = result_content # Attach results to their corresponding calls for tool_call in tool_calls: tool_call_id = tool_call.get("tool_call_id") if tool_call_id in tool_results_map: - tool_call["tool_result"] = tool_results_map[tool_call_id]['tool_result'] + tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"] return tool_calls - + def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): """Extract the tool definitions that are needed for the provided tool calls. :param tool_calls: List of tool calls to evaluate. 
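The _parse_tools_from_response logic reformatted just above pairs "tool_call" content items from assistant messages with "tool" role messages via tool_call_id. A minimal sketch of the agent response shape it handles, with hypothetical placeholder values:

# Hypothetical agent response; tool names, ids, and values are placeholders for illustration.
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "fetch_weather",
                "arguments": {"location": "Paris"},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "Sunny, 25 degrees Celsius"}],
    },
]
# After parsing, the matching result is attached to its call:
# {"type": "tool_call", "tool_call_id": "call_1", "name": "fetch_weather",
#  "arguments": {"location": "Paris"}, "tool_result": "Sunny, 25 degrees Celsius"}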
@@ -302,9 +331,12 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): for tool_call in tool_calls: if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": tool_name = tool_call.get("name") - tool_definition = [tool for tool in tool_definitions - if tool.get("name") == tool_name and - tool.get("type", "function") == "function"] + tool_definition = [ + tool + for tool in tool_definitions + if tool.get("name") == tool_name + and tool.get("type", "function") == "function" + ] if len(tool_definition) > 0: needed_tool_definitions.extend(tool_definition) else: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty index a4766441535b..193d2174e9bb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty @@ -44,7 +44,7 @@ user: # Ratings ## [Tool Call Accuracy: 1] (Irrelevant) **Definition:** -Tool calls were not relevant to the user's query, resulting in anirrelevant or unhelpful final output. +Tool calls were not relevant to the user's query, resulting in an irrelevant or unhelpful final output. This level is a 'fail'. **Example:** diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py index 280aa9df03da..9365c347d2cf 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py @@ -12,80 +12,108 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): # Test with missing tool_calls and response result = tool_call_accuracy( query="Where is the Eiffel Tower?", - tool_definitions=[{ - "name": "fetch_weather", - "description": "Fetches the weather information for the specified location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to fetch weather for." 
- } - } + tool_definitions=[ + { + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, } - }] + ], + ) + assert ( + result[ToolCallAccuracyEvaluator._RESULT_KEY] + == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + ) + assert ( + ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE + in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] # Test with missing tool_definitions result = tool_call_accuracy( query="Where is the Eiffel Tower?", tool_definitions=[], - tool_calls=[{ - "type": "tool_call", - "name": "fetch_weather", - "arguments": { - "location": "Tokyo" + tool_calls=[ + { + "type": "tool_call", + "name": "fetch_weather", + "arguments": {"location": "Tokyo"}, } - }] + ], + ) + assert ( + result[ToolCallAccuracyEvaluator._RESULT_KEY] + == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + ) + assert ( + ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE + in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] # Test with response that has no tool calls result = tool_call_accuracy( query="Where is the Eiffel Tower?", response="The Eiffel Tower is in Paris.", - tool_definitions=[{ - "name": "fetch_weather", - "description": "Fetches the weather information for the specified location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to fetch weather for." - } - } + tool_definitions=[ + { + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, } - }] + ], + ) + assert ( + result[ToolCallAccuracyEvaluator._RESULT_KEY] + == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + ) + assert ( + ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE + in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] # Test with tool call for which definition is not provided result = tool_call_accuracy( query="Where is the Eiffel Tower?", - tool_calls=[{ - "type": "tool_call", - "name": "some_other_tool", - "arguments": {} - }], - tool_definitions=[{ - "name": "fetch_weather", - "description": "Fetches the weather information for the specified location.", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to fetch weather for." 
- } - } + tool_calls=[ + {"type": "tool_call", "name": "some_other_tool", "arguments": {}} + ], + tool_definitions=[ + { + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, + }, } - }] + ], + ) + assert ( + result[ToolCallAccuracyEvaluator._RESULT_KEY] + == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT + ) + assert ( + ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] ) - assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - assert ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index c834b838a729..5f7e06e4589f 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -9,10 +9,12 @@ # which is then processed by the _do_eval method. async def flow_side_effect(timeout, **kwargs): tool_calls = kwargs.get("tool_calls", []) - + good_calls = sum(1 for tc in tool_calls if "good" in tc.get("tool_call_id", "")) bad_calls = sum(1 for tc in tool_calls if "bad" in tc.get("tool_call_id", "")) - invalid_calls = sum(1 for tc in tool_calls if "invalid" in tc.get("tool_call_id", "")) + invalid_calls = sum( + 1 for tc in tool_calls if "invalid" in tc.get("tool_call_id", "") + ) total_calls = len(tool_calls) if invalid_calls > 0: @@ -29,13 +31,13 @@ async def flow_side_effect(timeout, **kwargs): score = 5 # All good elif good_calls > 0: score = 3 # Mixed good and bad - + return { "chain_of_thought": f"Evaluated {total_calls} tool calls with {good_calls} correct calls.", "tool_calls_success_level": score, "additional_details": { "tool_calls_made_by_agent": total_calls, - "correct_tool_calls_made_by_agent": good_calls + "correct_tool_calls_made_by_agent": good_calls, }, } @@ -48,8 +50,8 @@ def test_evaluate_tools_valid1(self, mock_model_config): evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with one good and one bad tool call - query="Where is the Eiffel Tower?" - tool_calls=[ + query = "Where is the Eiffel Tower?" 
+ tool_calls = [ { "type": "tool_call", "tool_call_id": "call_good", @@ -70,7 +72,12 @@ def test_evaluate_tools_valid1(self, mock_model_config): "description": "Fetches the weather information for the specified location.", "parameters": { "type": "object", - "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, }, }, { @@ -79,18 +86,30 @@ def test_evaluate_tools_valid1(self, mock_model_config): "description": "Buy a jacket of the given type.", "parameters": { "type": "object", - "properties": {"type": {"type": "string", "description": "The type of jacket to buy."}}, + "properties": { + "type": { + "type": "string", + "description": "The type of jacket to buy.", + } + }, }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator( + query=query, tool_calls=tool_calls, tool_definitions=tool_definitions + ) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) + assert ( + key in result and f"{key}_result" in result and f"{key}_threshold" in result + ) assert result[key] == 3.0 # Mixed good/bad gets score 3 assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert ( + result[f"{key}_threshold"] + == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + ) assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." assert "details" in result @@ -100,8 +119,8 @@ def test_evaluate_tools_valid2(self, mock_model_config): evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with two bad tool calls - query="Where is the Eiffel Tower?" - tool_calls=[ + query = "Where is the Eiffel Tower?" 
+ tool_calls = [ { "type": "tool_call", "tool_call_id": "call_bad", @@ -122,7 +141,12 @@ def test_evaluate_tools_valid2(self, mock_model_config): "description": "Fetches the weather information for the specified location.", "parameters": { "type": "object", - "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, }, }, { @@ -131,18 +155,30 @@ def test_evaluate_tools_valid2(self, mock_model_config): "description": "Buy a jacket of the given type.", "parameters": { "type": "object", - "properties": {"type": {"type": "string", "description": "The type of jacket to buy."}}, + "properties": { + "type": { + "type": "string", + "description": "The type of jacket to buy.", + } + }, }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator( + query=query, tool_calls=tool_calls, tool_definitions=tool_definitions + ) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) + assert ( + key in result and f"{key}_result" in result and f"{key}_threshold" in result + ) assert result[key] == 1.0 # All bad gets score 1 assert result[f"{key}_result"] == "fail" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert ( + result[f"{key}_threshold"] + == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + ) assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." assert "details" in result @@ -152,8 +188,8 @@ def test_evaluate_tools_valid3(self, mock_model_config): evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with two good tool calls - query="Where is the Eiffel Tower?" - tool_calls=[ + query = "Where is the Eiffel Tower?" 
+ tool_calls = [ { "type": "tool_call", "tool_call_id": "call_good", @@ -174,7 +210,12 @@ def test_evaluate_tools_valid3(self, mock_model_config): "description": "Fetches the weather information for the specified location.", "parameters": { "type": "object", - "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, }, }, { @@ -183,18 +224,30 @@ def test_evaluate_tools_valid3(self, mock_model_config): "description": "Buy a jacket of the given type.", "parameters": { "type": "object", - "properties": {"type": {"type": "string", "description": "The type of jacket to buy."}}, + "properties": { + "type": { + "type": "string", + "description": "The type of jacket to buy.", + } + }, }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator( + query=query, tool_calls=tool_calls, tool_definitions=tool_definitions + ) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result) + assert ( + key in result and f"{key}_result" in result and f"{key}_threshold" in result + ) assert result[key] == 5.0 # All good gets score 5 assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert ( + result[f"{key}_threshold"] + == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + ) assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." assert "details" in result @@ -205,8 +258,8 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config): evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test evaluation with an invalid tool call ID to trigger failure - query="Where is the Eiffel Tower?" - tool_calls=[ + query = "Where is the Eiffel Tower?" + tool_calls = [ { "type": "tool_call", "tool_call_id": "call_invalid", @@ -222,13 +275,18 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config): "parameters": { "type": "object", "properties": { - "location": {"type": "string", "description": "The location to fetch weather for."} + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } }, }, }, ] - evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) - + evaluator( + query=query, tool_calls=tool_calls, tool_definitions=tool_definitions + ) + assert "Invalid score value" in str(exc_info.value) def test_evaluate_tools_some_not_applicable(self, mock_model_config): @@ -236,8 +294,8 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test with one function tool and one non-function tool - query="Where is the Eiffel Tower?" - tool_calls=[ + query = "Where is the Eiffel Tower?" 
+ tool_calls = [ { "type": "tool_call", "tool_call_id": "call_good", @@ -258,36 +316,54 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): "description": "Fetches the weather information for the specified location.", "parameters": { "type": "object", - "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, }, }, { "name": "buy_jacket", - "type": "another_built_in", # This tool will be filtered out + "type": "another_built_in", # This tool will be filtered out "description": "Buy a jacket of the given type.", "parameters": { "type": "object", - "properties": {"type": {"type": "string", "description": "The type of jacket to buy."}}, + "properties": { + "type": { + "type": "string", + "description": "The type of jacket to buy.", + } + }, }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator( + query=query, tool_calls=tool_calls, tool_definitions=tool_definitions + ) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + assert ( + result[f"{key}_threshold"] + == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + ) + assert ( + result[f"{key}_reason"] + == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + ) assert result["details"] == {} def test_evaluate_tools_all_not_applicable(self, mock_model_config): evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) evaluator._flow = MagicMock(side_effect=flow_side_effect) - + # Test with only non-function tools - query="Where is the Eiffel Tower?" - tool_calls=[ + query = "Where is the Eiffel Tower?" 
+ tool_calls = [ { "type": "tool_call", "tool_call_id": "call_good", @@ -298,22 +374,35 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): tool_definitions = [ { "name": "fetch_weather", - "type": "some_built_in", # Not a 'function' type + "type": "some_built_in", # Not a 'function' type "description": "Fetches the weather information for the specified location.", "parameters": { "type": "object", - "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator( + query=query, tool_calls=tool_calls, tool_definitions=tool_definitions + ) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + assert ( + result[f"{key}_threshold"] + == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + ) + assert ( + result[f"{key}_reason"] + == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE + ) assert result["details"] == {} def test_evaluate_tools_no_tools(self, mock_model_config): @@ -321,25 +410,37 @@ def test_evaluate_tools_no_tools(self, mock_model_config): evaluator._flow = MagicMock(side_effect=flow_side_effect) # Test with no tool calls provided - query="Where is the Eiffel Tower?" - tool_calls=[] - tool_definitions=[ + query = "Where is the Eiffel Tower?" 
+ tool_calls = [] + tool_definitions = [ { "name": "fetch_weather", "type": "function", "description": "Fetches the weather information for the specified location.", "parameters": { "type": "object", - "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for.", + } + }, }, }, ] - result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + result = evaluator( + query=query, tool_calls=tool_calls, tool_definitions=tool_definitions + ) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - assert result["details"] == {} \ No newline at end of file + assert ( + result[f"{key}_threshold"] + == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + ) + assert ( + result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE + ) + assert result["details"] == {} From 440b6c1f087fd73a1f6f0f5d07988c5bc542abc0 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Thu, 17 Jul 2025 00:01:09 +0300 Subject: [PATCH 22/23] Reformat with black --- .../_evaluators/_common/_base_eval.py | 63 ++++--------- .../_tool_call_accuracy.py | 34 ++----- .../tests/unittests/test_agent_evaluators.py | 24 ++--- .../test_tool_call_accuracy_evaluator.py | 88 +++++-------------- 4 files changed, 48 insertions(+), 161 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index 9b81738c994c..f6890b6aceb5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -121,18 +121,14 @@ def __init__( not_singleton_inputs: List[str] = ["conversation", "kwargs"], eval_last_turn: bool = False, conversation_aggregation_type: _AggregationType = _AggregationType.MEAN, - conversation_aggregator_override: Optional[ - Callable[[List[float]], float] - ] = None, + conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None, _higher_is_better: Optional[bool] = True, ): self._not_singleton_inputs = not_singleton_inputs self._eval_last_turn = eval_last_turn self._singleton_inputs = self._derive_singleton_inputs() self._async_evaluator = AsyncEvaluatorBase(self._real_call) - self._conversation_aggregation_function = GetAggregator( - conversation_aggregation_type - ) + self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type) self._higher_is_better = _higher_is_better self._threshold = threshold if conversation_aggregator_override is not None: @@ -194,10 +190,7 @@ def _derive_singleton_inputs(self) -> List[str]: singletons = [] for call_signature in call_signatures: params = call_signature.parameters - if any( - not_singleton_input in params - for not_singleton_input in self._not_singleton_inputs - ): + if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs): continue # exclude self since it is not a singleton input singletons.extend([p for p in params if p != "self"]) 
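Note on the `_derive_singleton_inputs` hunk above: it inspects an evaluator's `__call__` signature(s) and keeps every parameter that is neither `self` nor one of the declared non-singleton inputs. A minimal, self-contained sketch of that idea follows; the evaluator class and helper names here are illustrative only, not SDK code, and the real method walks multiple overload signatures rather than a single one.

import inspect

class _ExampleEvaluator:
    # Hypothetical stand-in for an evaluator; not a class from the SDK.
    def __call__(self, *, query: str, response: str):
        ...

def derive_singleton_inputs(evaluator, not_singleton_inputs=("conversation", "kwargs")):
    singletons = []
    params = inspect.signature(evaluator.__call__).parameters
    # Signatures that accept conversation-style (non-singleton) inputs are skipped.
    if any(name in params for name in not_singleton_inputs):
        return singletons
    singletons.extend(p for p in params if p != "self")
    return singletons

print(derive_singleton_inputs(_ExampleEvaluator()))  # ['query', 'response']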
@@ -300,16 +293,12 @@ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]: if len(user_messages) != len(assistant_messages): raise EvaluationException( message="Mismatched number of user and assistant messages.", - internal_message=( - "Mismatched number of user and assistant messages." - ), + internal_message=("Mismatched number of user and assistant messages."), ) if len(assistant_messages) > 1: raise EvaluationException( message="Conversation can have only one assistant message.", - internal_message=( - "Conversation can have only one assistant message." - ), + internal_message=("Conversation can have only one assistant message."), ) eval_conv_inputs = [] for user_msg, assist_msg in zip(user_messages, assistant_messages): @@ -318,16 +307,12 @@ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]: conv_messages.append(system_messages[0]) conv_messages.append(user_msg) conv_messages.append(assist_msg) - eval_conv_inputs.append( - {"conversation": Conversation(messages=conv_messages)} - ) + eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)}) return eval_conv_inputs return multi_modal_converter - def _convert_kwargs_to_eval_input( - self, **kwargs - ) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]: + def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]: """Convert an arbitrary input into a list of inputs for evaluators. It is assumed that evaluators generally make use of their inputs in one of two ways. Either they receive a collection of keyname inputs that are all single values @@ -392,17 +377,11 @@ def _is_multi_modal_conversation(self, conversation: Dict) -> bool: if "content" in message: content = message.get("content", "") if isinstance(content, list): - if any( - item.get("type") == "image_url" - and "url" in item.get("image_url", {}) - for item in content - ): + if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content): return True return False - def _aggregate_results( - self, per_turn_results: List[DoEvalResult[T_EvalValue]] - ) -> AggregateResult[T_EvalValue]: + def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]: """Aggregate the evaluation results of each conversation turn into a single result. Exact implementation might need to vary slightly depending on the results produced. @@ -432,16 +411,12 @@ def _aggregate_results( # Find and average all numeric values for metric, values in evaluation_per_turn.items(): if all(isinstance(value, (int, float)) for value in values): - aggregated[metric] = self._conversation_aggregation_function( - cast(List[Union[int, float]], values) - ) + aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values)) # Slap the per-turn results back in. aggregated["evaluation_per_turn"] = evaluation_per_turn return aggregated - async def _real_call( - self, **kwargs - ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: + async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """The asynchronous call where real end-to-end evaluation logic is performed. :keyword kwargs: The inputs to evaluate. 
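The `_aggregate_results` docstring above describes rolling per-turn numeric metrics up into a single conversation-level value through the configured aggregation function (mean by default via `GetAggregator`). A rough sketch of that roll-up, assuming a plain list of per-turn result dicts; the function and sample data below are illustrative, not the SDK implementation.

from statistics import mean

def aggregate_turns(per_turn_results):
    # Group each metric's values across turns.
    evaluation_per_turn = {}
    for turn in per_turn_results:
        for metric, value in turn.items():
            evaluation_per_turn.setdefault(metric, []).append(value)
    aggregated = {}
    for metric, values in evaluation_per_turn.items():
        # Only all-numeric metrics are averaged; string outputs (reasons, labels)
        # remain in the per-turn breakdown.
        if all(isinstance(v, (int, float)) for v in values):
            aggregated[metric] = mean(values)
    aggregated["evaluation_per_turn"] = evaluation_per_turn
    return aggregated

print(aggregate_turns([{"score": 4.0, "reason": "ok"}, {"score": 5.0, "reason": "good"}]))
# {'score': 4.5, 'evaluation_per_turn': {'score': [4.0, 5.0], 'reason': ['ok', 'good']}}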
@@ -494,9 +469,7 @@ def _to_async(self) -> "AsyncEvaluatorBase": @experimental @final - def _set_conversation_aggregation_type( - self, conversation_aggregation_type: _AggregationType - ) -> None: + def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None: """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a multi-turn conversation into a single top-level result. @@ -505,15 +478,11 @@ def _set_conversation_aggregation_type( results of a conversation to produce a single result. :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType """ - self._conversation_aggregation_function = GetAggregator( - conversation_aggregation_type - ) + self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type) @experimental @final - def _set_conversation_aggregator( - self, aggregator: Callable[[List[float]], float] - ) -> None: + def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None: """Set the conversation aggregator function directly. This function will be applied to all numeric outputs of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not @@ -543,9 +512,7 @@ class AsyncEvaluatorBase: to ensure that no one ever needs to extend or otherwise modify this class directly. """ - def __init__( - self, real_call - ): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION + def __init__(self, real_call): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION self._real_call = real_call # Don't look at my shame. Nothing to see here.... diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 01c986800c20..2aa57c0d360a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -79,9 +79,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls." _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided." - _TOOL_DEFINITIONS_MISSING_MESSAGE = ( - "Tool definitions for all tool calls must be provided." - ) + _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided." _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5." 
_LLM_SCORE_KEY = "tool_calls_success_level" @@ -90,9 +88,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override - def __init__( - self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs - ): + def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self.threshold = threshold @@ -178,9 +174,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs): tool_definitions = [tool_definitions] try: - needed_tool_definitions = self._extract_needed_tool_definitions( - tool_calls, tool_definitions - ) + needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions) except EvaluationException as e: return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE} if len(needed_tool_definitions) == 0: @@ -286,28 +280,17 @@ def _parse_tools_from_response(self, response): if isinstance(response, list): for message in response: # Extract tool calls from assistant messages - if message.get("role") == "assistant" and isinstance( - message.get("content"), list - ): + if message.get("role") == "assistant" and isinstance(message.get("content"), list): for content_item in message.get("content"): - if ( - isinstance(content_item, dict) - and content_item.get("type") == "tool_call" - ): + if isinstance(content_item, dict) and content_item.get("type") == "tool_call": tool_calls.append(content_item) # Extract tool results from tool messages elif message.get("role") == "tool" and message.get("tool_call_id"): tool_call_id = message.get("tool_call_id") - if ( - isinstance(message.get("content"), list) - and len(message.get("content")) > 0 - ): + if isinstance(message.get("content"), list) and len(message.get("content")) > 0: result_content = message.get("content")[0] - if ( - isinstance(result_content, dict) - and result_content.get("type") == "tool_result" - ): + if isinstance(result_content, dict) and result_content.get("type") == "tool_result": tool_results_map[tool_call_id] = result_content # Attach results to their corresponding calls @@ -334,8 +317,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): tool_definition = [ tool for tool in tool_definitions - if tool.get("name") == tool_name - and tool.get("type", "function") == "function" + if tool.get("name") == tool_name and tool.get("type", "function") == "function" ] if len(tool_definition) > 0: needed_tool_definitions.extend(tool_definition) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py index 9365c347d2cf..3b3580817eb5 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py @@ -28,10 +28,7 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } ], ) - assert ( - result[ToolCallAccuracyEvaluator._RESULT_KEY] - == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - ) + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ( ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] @@ -49,10 +46,7 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, 
mock_model_config): } ], ) - assert ( - result[ToolCallAccuracyEvaluator._RESULT_KEY] - == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - ) + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ( ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] @@ -78,10 +72,7 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } ], ) - assert ( - result[ToolCallAccuracyEvaluator._RESULT_KEY] - == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - ) + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ( ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] @@ -90,9 +81,7 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): # Test with tool call for which definition is not provided result = tool_call_accuracy( query="Where is the Eiffel Tower?", - tool_calls=[ - {"type": "tool_call", "name": "some_other_tool", "arguments": {}} - ], + tool_calls=[{"type": "tool_call", "name": "some_other_tool", "arguments": {}}], tool_definitions=[ { "name": "fetch_weather", @@ -109,10 +98,7 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config): } ], ) - assert ( - result[ToolCallAccuracyEvaluator._RESULT_KEY] - == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT - ) + assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert ( ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 5f7e06e4589f..a82577a96bd1 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -12,9 +12,7 @@ async def flow_side_effect(timeout, **kwargs): good_calls = sum(1 for tc in tool_calls if "good" in tc.get("tool_call_id", "")) bad_calls = sum(1 for tc in tool_calls if "bad" in tc.get("tool_call_id", "")) - invalid_calls = sum( - 1 for tc in tool_calls if "invalid" in tc.get("tool_call_id", "") - ) + invalid_calls = sum(1 for tc in tool_calls if "invalid" in tc.get("tool_call_id", "")) total_calls = len(tool_calls) if invalid_calls > 0: @@ -95,21 +93,14 @@ def test_evaluate_tools_valid1(self, mock_model_config): }, }, ] - result = evaluator( - query=query, tool_calls=tool_calls, tool_definitions=tool_definitions - ) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert ( - key in result and f"{key}_result" in result and f"{key}_threshold" in result - ) + assert key in result and f"{key}_result" in result and f"{key}_threshold" in result assert result[key] == 3.0 # Mixed good/bad gets score 3 assert result[f"{key}_result"] == "pass" - assert ( - result[f"{key}_threshold"] - == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - ) + assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls." 
assert "details" in result @@ -164,21 +155,14 @@ def test_evaluate_tools_valid2(self, mock_model_config): }, }, ] - result = evaluator( - query=query, tool_calls=tool_calls, tool_definitions=tool_definitions - ) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert ( - key in result and f"{key}_result" in result and f"{key}_threshold" in result - ) + assert key in result and f"{key}_result" in result and f"{key}_threshold" in result assert result[key] == 1.0 # All bad gets score 1 assert result[f"{key}_result"] == "fail" - assert ( - result[f"{key}_threshold"] - == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - ) + assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls." assert "details" in result @@ -233,21 +217,14 @@ def test_evaluate_tools_valid3(self, mock_model_config): }, }, ] - result = evaluator( - query=query, tool_calls=tool_calls, tool_definitions=tool_definitions - ) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None - assert ( - key in result and f"{key}_result" in result and f"{key}_threshold" in result - ) + assert key in result and f"{key}_result" in result and f"{key}_threshold" in result assert result[key] == 5.0 # All good gets score 5 assert result[f"{key}_result"] == "pass" - assert ( - result[f"{key}_threshold"] - == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - ) + assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE assert f"{key}_reason" in result assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls." 
assert "details" in result @@ -283,9 +260,7 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config): }, }, ] - evaluator( - query=query, tool_calls=tool_calls, tool_definitions=tool_definitions - ) + evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) assert "Invalid score value" in str(exc_info.value) @@ -339,22 +314,14 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config): }, }, ] - result = evaluator( - query=query, tool_calls=tool_calls, tool_definitions=tool_definitions - ) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert ( - result[f"{key}_threshold"] - == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - ) - assert ( - result[f"{key}_reason"] - == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - ) + assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["details"] == {} def test_evaluate_tools_all_not_applicable(self, mock_model_config): @@ -387,22 +354,14 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config): }, }, ] - result = evaluator( - query=query, tool_calls=tool_calls, tool_definitions=tool_definitions - ) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert ( - result[f"{key}_threshold"] - == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - ) - assert ( - result[f"{key}_reason"] - == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE - ) + assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE assert result["details"] == {} def test_evaluate_tools_no_tools(self, mock_model_config): @@ -428,19 +387,12 @@ def test_evaluate_tools_no_tools(self, mock_model_config): }, }, ] - result = evaluator( - query=query, tool_calls=tool_calls, tool_definitions=tool_definitions - ) + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT assert result[f"{key}_result"] == "pass" - assert ( - result[f"{key}_threshold"] - == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE - ) - assert ( - result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE - ) + assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE + assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE assert result["details"] == {} From e690217341dc5328012a060488c74820ba67c441 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Thu, 17 Jul 2025 09:47:38 +0300 Subject: [PATCH 23/23] To re-trigger build pipelines --- .../_tool_call_accuracy/_tool_call_accuracy.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 2aa57c0d360a..8799fd422f0a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -25,16 +25,16 @@ @experimental class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining: - - Relevance to the conversation - - Parameter correctness according to tool definitions - - Parameter value extraction from the conversation + - Relevance to the conversation. + - Parameter correctness according to tool definitions. + - Parameter value extraction from the conversation. The evaluator uses a scoring rubric of 1 to 5: - Score 1: The tool calls are irrelevant - - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed - - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made - - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded - - Score 5: The tool calls are relevant, and all parameters were correctly passed + - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed. + - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made. + - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded. + - Score 5: The tool calls are relevant, and all parameters were correctly passed. This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing user needs while properly following tool definitions and using information present in the