Skip to content

Commit e72b084

Browse files
Author: Salma Elshafey (committed)
Revert 'Merge branch 'main' into selshafey/improve_tool_call_accuracy'
1 parent 6525a6f commit e72b084

File tree

3 files changed

+10
-10
lines changed

3 files changed

+10
-10
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
212212
f"{self._result_key}_result": score_result,
213213
f"{self._result_key}_threshold": self.threshold,
214214
f"{self._result_key}_reason": reason,
215-
'per_tool_call_details': llm_output.get('additional_details', {}),
215+
'details': llm_output.get('details', {}),
216216
}
217217
return response_dict
218218

@@ -255,7 +255,7 @@ def _not_applicable_result(self, error_message):
255255
f"{self._result_key}_result": 'pass',
256256
f"{self._result_key}_threshold": self.threshold,
257257
f"{self._result_key}_reason": error_message,
258-
"per_tool_call_details": {},
258+
"details": {},
259259

260260
}
261261

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,10 @@ TOOL DEFINITION: {{tool_definition}}
125125
Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
126126
- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query.
127127
- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
128-
- additional_details: a dictionary that contains the following keys:
128+
- details: a dictionary that contains the following keys:
129129
- tool_calls_made_by_agent: total number of tool calls made by the agent
130130
- correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
131-
- details: a list of dictionaries, each containing:
131+
- per_tool_call_details: a list of dictionaries, each containing:
132132
- tool_name: name of the tool
133133
- total_calls_required: total number of calls required for the tool
134134
- correct_calls_made_by_agent: number of correct calls made by the agent

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def test_evaluate_tools_valid1(self, mock_model_config):
9797
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
9898
assert f"{key}_reason" in result
9999
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
100-
assert "per_tool_call_details" in result
100+
assert "details" in result
101101

102102
def test_evaluate_tools_valid2(self, mock_model_config):
103103
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -153,7 +153,7 @@ def test_evaluate_tools_valid2(self, mock_model_config):
153153
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
154154
assert f"{key}_reason" in result
155155
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
156-
assert "per_tool_call_details" in result
156+
assert "details" in result
157157

158158
def test_evaluate_tools_valid3(self, mock_model_config):
159159
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -209,7 +209,7 @@ def test_evaluate_tools_valid3(self, mock_model_config):
209209
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
210210
assert f"{key}_reason" in result
211211
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
212-
assert "per_tool_call_details" in result
212+
assert "details" in result
213213

214214
def test_evaluate_tools_one_eval_fails(self, mock_model_config):
215215
with pytest.raises(EvaluationException) as exc_info:
@@ -295,7 +295,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
295295
assert result[f"{key}_result"] == "pass"
296296
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
297297
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
298-
assert result["per_tool_call_details"] == {}
298+
assert result["details"] == {}
299299

300300
def test_evaluate_tools_all_not_applicable(self, mock_model_config):
301301
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -332,7 +332,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
332332
assert result[f"{key}_result"] == "pass"
333333
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
334334
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
335-
assert result["per_tool_call_details"] == {}
335+
assert result["details"] == {}
336336

337337
def test_evaluate_tools_no_tools(self, mock_model_config):
338338
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -362,4 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
362362
assert result[f"{key}_result"] == "pass"
363363
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
364364
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
365-
assert result["per_tool_call_details"] == {}
365+
assert result["details"] == {}

0 commit comments

Comments (0)