
Commit 67fc87d

Author: Salma Elshafey (committed)

Added field names and messages as constants

1 parent 8865240 · commit 67fc87d

4 files changed: +49 -40 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 
 - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
 
-- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a number in the range [0-1]. The number range is now [1-5].
+- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
 
 - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
 
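The MeteorScoreEvaluator entry above comes down to when the score is cast to an integer. A minimal sketch of the behavior it describes, with illustrative variable and threshold values rather than the evaluator's actual code:

```python
# Illustration of the bug described in the changelog entry above.
# The variable names and the 0.5 threshold are illustrative only.
score = 0.9375    # decimal score produced by a threshold-based evaluator
threshold = 0.5

# Old behavior: truncating to int before the comparison turned 0.9375 into 0,
# so the result was reported as "fail" even though the score beat the threshold.
old_result = "pass" if int(score) >= threshold else "fail"   # -> "fail"

# Fixed behavior: compare the decimal score directly.
new_result = "pass" if score >= threshold else "fail"        # -> "pass"

print(old_result, new_result)  # fail pass
```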

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 18 additions & 9 deletions
@@ -71,6 +71,15 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _MIN_TOOL_CALL_ACCURACY_SCORE = 1
     _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3
 
+    _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+    _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+    _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+    _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
+
+    _LLM_SCORE_KEY = "tool_calls_success_level"
+    _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls"
+    _MISSING_TOOL_CALLS_KEY = "missing_tool_calls"
+
     id = "id"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
@@ -150,9 +159,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             tool_calls = parsed_tool_calls
 
         if not tool_calls:
-            return {"error_message": "No tool calls found in response or provided tool_calls."}
+            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
         if not tool_definitions or len(tool_definitions) == 0:
-            return {"error_message": "Tool definitions must be provided."}
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
@@ -162,9 +171,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
         try:
             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
         except EvaluationException as e:
-            return {"error_message": "Tool definitions for all tool calls must be provided."}
+            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
         if len(needed_tool_definitions) == 0:
-            return {"error_message": "Tool definitions for all tool calls must be provided."}
+            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
 
         return {
             "query": query,
@@ -188,7 +197,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         if isinstance(llm_output, dict):
-            score = llm_output.get("tool_calls_success_level", None)
+            score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE):
                 raise EvaluationException(
                     message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
@@ -208,8 +217,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_reason": reason,
                 'applicable': True,
                 'per_tool_call_details': llm_output.get('additional_details', {}),
-                'excess_tool_calls': llm_output.get('excess_tool_calls', {}),
-                'missing_tool_calls': llm_output.get('missing_tool_calls', {}),
+                self._EXCESS_TOOL_CALLS_KEY: llm_output.get(self._EXCESS_TOOL_CALLS_KEY, {}),
+                self._MISSING_TOOL_CALLS_KEY: llm_output.get(self._MISSING_TOOL_CALLS_KEY, {}),
             }
             return response_dict

@@ -255,8 +264,8 @@ def _not_applicable_result(self, error_message):
             f"{self._result_key}_reason": error_message,
             "applicable": False,
             "per_tool_call_details": {},
-            "excess_tool_calls": {},
-            "missing_tool_calls": {},
+            self._EXCESS_TOOL_CALLS_KEY: {},
+            self._MISSING_TOOL_CALLS_KEY: {},
 
         }
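Taken together, the changes in this file move every result-dict key and user-facing message onto the class, so the evaluator and the tests below share one source of truth. A simplified sketch of that pattern with a toy class (the real evaluator has more fields and logic):

```python
# Toy illustration of the constants pattern introduced above. The constant values
# mirror the evaluator's, but the class itself is a simplified stand-in.
class ToyToolCallEvaluator:
    _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
    _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls"
    _MISSING_TOOL_CALLS_KEY = "missing_tool_calls"

    def _not_applicable_result(self, error_message: str) -> dict:
        # Keys come from the class constants rather than string literals,
        # so a renamed key or reworded message only changes in one place.
        return {
            "reason": error_message,
            "applicable": False,
            self._EXCESS_TOOL_CALLS_KEY: {},
            self._MISSING_TOOL_CALLS_KEY: {},
        }


result = ToyToolCallEvaluator()._not_applicable_result(ToyToolCallEvaluator._NO_TOOL_CALLS_MESSAGE)
# Tests can assert against the same constants instead of repeating the literals:
assert result[ToyToolCallEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
assert ToyToolCallEvaluator._NO_TOOL_CALLS_MESSAGE in result["reason"]
```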

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py

Lines changed: 8 additions & 8 deletions
@@ -27,8 +27,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "No tool calls found in response or provided tool_calls." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
 
         # Test with missing tool_definitions
         result = tool_call_accuracy(
@@ -43,8 +43,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "Tool definitions must be provided." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
 
         # Test with response that has no tool calls
         result = tool_call_accuracy(
@@ -65,8 +65,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "No tool calls found in response or provided tool_calls." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
 
         # Test with tool call for which definition is not provided
         result = tool_call_accuracy(
@@ -91,5 +91,5 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "Tool definitions for all tool calls must be provided." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 22 additions & 22 deletions
@@ -21,8 +21,8 @@ async def flow_side_effect(timeout, **kwargs):
             "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
             "tool_calls_success_level": 25,
             "additional_details": {},
-            "excess_tool_calls": {},
-            "missing_tool_calls": {}
+            ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {},
+            ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {}
         }
 
     score = 1  # Default score for "all bad"
@@ -39,8 +39,8 @@ async def flow_side_effect(timeout, **kwargs):
             "tool_calls_made_by_agent": total_calls,
             "correct_tool_calls_made_by_agent": good_calls
         },
-        "excess_tool_calls": {"total": 0},
-        "missing_tool_calls": {"total": 0}
+        ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {"total": 0},
+        ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {"total": 0}
     }
 
 
@@ -102,8 +102,8 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
         assert "per_tool_call_details" in result
-        assert "excess_tool_calls" in result
-        assert "missing_tool_calls" in result
+        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
+        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result
         assert result["applicable"] is True
 
     def test_evaluate_tools_valid2(self, mock_model_config):
@@ -161,8 +161,8 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
         assert "per_tool_call_details" in result
-        assert "excess_tool_calls" in result
-        assert "missing_tool_calls" in result
+        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
+        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result
         assert result["applicable"] is True
 
     def test_evaluate_tools_valid3(self, mock_model_config):
@@ -220,8 +220,8 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
         assert "per_tool_call_details" in result
-        assert "excess_tool_calls" in result
-        assert "missing_tool_calls" in result
+        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
+        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result
         assert result["applicable"] is True
 
     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
@@ -304,13 +304,13 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
 
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
-        assert result[key] == "not applicable"
+        assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided."
+        assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result["excess_tool_calls"] == {}
-        assert result["missing_tool_calls"] == {}
+        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
+        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
         assert result["applicable"] is False
 
     def test_evaluate_tools_all_not_applicable(self, mock_model_config):
@@ -344,13 +344,13 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
 
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
-        assert result[key] == "not applicable"
+        assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided."
+        assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result["excess_tool_calls"] == {}
-        assert result["missing_tool_calls"] == {}
+        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
+        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
         assert result["applicable"] is False
 
     def test_evaluate_tools_no_tools(self, mock_model_config):
@@ -377,11 +377,11 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
 
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
-        assert result[key] == "not applicable"
+        assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert result[f"{key}_reason"] == "No tool calls found in response or provided tool_calls."
+        assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
        assert result["per_tool_call_details"] == {}
-        assert result["excess_tool_calls"] == {}
-        assert result["missing_tool_calls"] == {}
+        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
+        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
         assert result["applicable"] is False
