Commit 4c27dff

Salma Elshafey committed
Move excess/missing tool calls fields under additional details

1 parent d0f637e

3 files changed: +11 −33 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 0 additions & 6 deletions
@@ -77,8 +77,6 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."

     _LLM_SCORE_KEY = "tool_calls_success_level"
-    _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls"
-    _MISSING_TOOL_CALLS_KEY = "missing_tool_calls"

     id = "id"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
@@ -216,8 +214,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
             f"{self._result_key}_threshold": self.threshold,
             f"{self._result_key}_reason": reason,
             'per_tool_call_details': llm_output.get('additional_details', {}),
-            self._EXCESS_TOOL_CALLS_KEY: llm_output.get(self._EXCESS_TOOL_CALLS_KEY, {}),
-            self._MISSING_TOOL_CALLS_KEY: llm_output.get(self._MISSING_TOOL_CALLS_KEY, {}),
         }
         return response_dict

@@ -262,8 +258,6 @@ def _not_applicable_result(self, error_message):
             f"{self._result_key}_threshold": self.threshold,
             f"{self._result_key}_reason": error_message,
             "per_tool_call_details": {},
-            self._EXCESS_TOOL_CALLS_KEY: {},
-            self._MISSING_TOOL_CALLS_KEY: {},

         }

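For context, a hedged sketch of the result shape after this change, assuming the evaluator's result key is tool_call_accuracy: the excess/missing tool call data now travels inside the LLM's additional_details payload, which _do_eval already surfaces as per_tool_call_details, instead of appearing as top-level result keys. Values below are invented; only the key layout follows the diff.

# Illustrative only: values are made up, the key layout follows this commit.
response_dict = {
    "tool_call_accuracy": 5.0,
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_reason": "Evaluated 2 tool calls with 2 correct calls.",
    # excess_tool_calls / missing_tool_calls are no longer top-level keys;
    # they arrive inside additional_details, exposed as per_tool_call_details.
    "per_tool_call_details": {
        "excess_tool_calls": {"total": 0, "details": []},
        "missing_tool_calls": {"total": 0, "details": []},
    },
}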
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

Lines changed: 10 additions & 10 deletions
@@ -136,15 +136,15 @@ Your output should consist only of a JSON object, as provided in the examples, t
     - correct_tool_percentage: percentage of correct calls made by the agent for this tool. It is a value between 0.0 and 1.0
     - tool_call_errors: number of errors encountered during the tool call
     - tool_success_result: 'pass' or 'fail' based on the evaluation of the tool call accuracy for this tool
-- excess_tool_calls: a dictionary with the following keys:
-  - total: total number of excess, unnecessary tool calls made by the agent
-  - details: a list of dictionaries, each containing:
-    - tool_name: name of the tool
-    - excess_count: number of excess calls made for this query
-- missing_tool_calls: a dictionary with the following keys:
-  - total: total number of missing tool calls that should have been made by the agent to be able to answer the query
-  - details: a list of dictionaries, each containing:
-    - tool_name: name of the tool
-    - missing_count: number of missing calls for this query
+  - excess_tool_calls: a dictionary with the following keys:
+    - total: total number of excess, unnecessary tool calls made by the agent
+    - details: a list of dictionaries, each containing:
+      - tool_name: name of the tool
+      - excess_count: number of excess calls made for this query
+  - missing_tool_calls: a dictionary with the following keys:
+    - total: total number of missing tool calls that should have been made by the agent to be able to answer the query
+    - details: a list of dictionaries, each containing:
+      - tool_name: name of the tool
+      - missing_count: number of missing calls for this query

 # Output

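To make the reorganized schema concrete, here is a hedged example of the JSON the prompt now asks the model to emit. The nesting of excess_tool_calls and missing_tool_calls under additional_details is inferred from the commit message, the per-tool keys are elided, and the tool name fetch_weather is hypothetical.

# Hypothetical llm_output matching the schema described above; values invented.
llm_output = {
    "chain_of_thought": "The agent called fetch_weather twice for the same city.",
    "tool_calls_success_level": 4,
    "additional_details": {
        # per-tool entries (correct_tool_percentage, tool_call_errors, ...) omitted
        "excess_tool_calls": {
            "total": 1,
            "details": [{"tool_name": "fetch_weather", "excess_count": 1}],
        },
        "missing_tool_calls": {"total": 0, "details": []},
    },
}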
sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 1 addition & 17 deletions
@@ -21,8 +21,6 @@ async def flow_side_effect(timeout, **kwargs):
             "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
             "tool_calls_success_level": 25,
             "additional_details": {},
-            ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {},
-            ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {}
         }

     score = 1  # Default score for "all bad"
@@ -39,8 +37,6 @@ async def flow_side_effect(timeout, **kwargs):
                 "tool_calls_made_by_agent": total_calls,
                 "correct_tool_calls_made_by_agent": good_calls
             },
-            ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {"total": 0},
-            ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {"total": 0}
         }

@@ -102,8 +98,6 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
         assert "per_tool_call_details" in result
-        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
-        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result

     def test_evaluate_tools_valid2(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -160,8 +154,6 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
         assert "per_tool_call_details" in result
-        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
-        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result

     def test_evaluate_tools_valid3(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -218,8 +210,6 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
         assert "per_tool_call_details" in result
-        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
-        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result

     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
@@ -306,8 +296,6 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
-        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}

     def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -345,8 +333,6 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
-        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}

     def test_evaluate_tools_no_tools(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -376,6 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
-        assert result["per_tool_call_details"] == {}
-        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
-        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
+        assert result["per_tool_call_details"] == {}

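One practical consequence for callers: code that previously read the dropped top-level keys now has to look inside per_tool_call_details. A minimal sketch, assuming the nesting shown in the prompty hunk above:

# Hypothetical accessor; key names come from the diff, the nesting is assumed.
details = result.get("per_tool_call_details", {})
excess_total = details.get("excess_tool_calls", {}).get("total", 0)
missing_total = details.get("missing_tool_calls", {}).get("total", 0)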