Commit 4c27dff

Salma Elshafey committed
Move excess/missing tool calls fields under additional details

1 parent d0f637e

3 files changed: +11 −33 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 0 additions & 6 deletions
@@ -77,8 +77,6 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."

     _LLM_SCORE_KEY = "tool_calls_success_level"
-    _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls"
-    _MISSING_TOOL_CALLS_KEY = "missing_tool_calls"

     id = "id"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
@@ -216,8 +214,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
             f"{self._result_key}_threshold": self.threshold,
             f"{self._result_key}_reason": reason,
             'per_tool_call_details': llm_output.get('additional_details', {}),
-            self._EXCESS_TOOL_CALLS_KEY: llm_output.get(self._EXCESS_TOOL_CALLS_KEY, {}),
-            self._MISSING_TOOL_CALLS_KEY: llm_output.get(self._MISSING_TOOL_CALLS_KEY, {}),
         }
         return response_dict

@@ -262,8 +258,6 @@ def _not_applicable_result(self, error_message):
             f"{self._result_key}_threshold": self.threshold,
             f"{self._result_key}_reason": error_message,
             "per_tool_call_details": {},
-            self._EXCESS_TOOL_CALLS_KEY: {},
-            self._MISSING_TOOL_CALLS_KEY: {},

         }

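For context, a hedged sketch of the result shape after this change, assuming the evaluator's result key is tool_call_accuracy: the excess/missing tool call data now travels inside the LLM's additional_details payload, which _do_eval already surfaces as per_tool_call_details, instead of appearing as top-level result keys. Values below are invented; only the key layout follows the diff.

# Illustrative only: values are made up, the key layout follows this commit.
response_dict = {
    "tool_call_accuracy": 5.0,
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_reason": "Evaluated 2 tool calls with 2 correct calls.",
    # excess_tool_calls / missing_tool_calls are no longer top-level keys;
    # they arrive inside additional_details, exposed as per_tool_call_details.
    "per_tool_call_details": {
        "excess_tool_calls": {"total": 0, "details": []},
        "missing_tool_calls": {"total": 0, "details": []},
    },
}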
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

Lines changed: 10 additions & 10 deletions
@@ -136,15 +136,15 @@ Your output should consist only of a JSON object, as provided in the examples, t
     - correct_tool_percentage: percentage of correct calls made by the agent for this tool. It is a value between 0.0 and 1.0
     - tool_call_errors: number of errors encountered during the tool call
     - tool_success_result: 'pass' or 'fail' based on the evaluation of the tool call accuracy for this tool
-- excess_tool_calls: a dictionary with the following keys:
-  - total: total number of excess, unnecessary tool calls made by the agent
-  - details: a list of dictionaries, each containing:
-    - tool_name: name of the tool
-    - excess_count: number of excess calls made for this query
-- missing_tool_calls: a dictionary with the following keys:
-  - total: total number of missing tool calls that should have been made by the agent to be able to answer the query
-  - details: a list of dictionaries, each containing:
-    - tool_name: name of the tool
-    - missing_count: number of missing calls for this query
+  - excess_tool_calls: a dictionary with the following keys:
+    - total: total number of excess, unnecessary tool calls made by the agent
+    - details: a list of dictionaries, each containing:
+      - tool_name: name of the tool
+      - excess_count: number of excess calls made for this query
+  - missing_tool_calls: a dictionary with the following keys:
+    - total: total number of missing tool calls that should have been made by the agent to be able to answer the query
+    - details: a list of dictionaries, each containing:
+      - tool_name: name of the tool
+      - missing_count: number of missing calls for this query

 # Output

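To make the reorganized schema concrete, here is a hedged example of the JSON the prompt now asks the model to emit. The nesting of excess_tool_calls and missing_tool_calls under additional_details is inferred from the commit message, the per-tool keys are elided, and the tool name fetch_weather is hypothetical.

# Hypothetical llm_output matching the schema described above; values invented.
llm_output = {
    "chain_of_thought": "The agent called fetch_weather twice for the same city.",
    "tool_calls_success_level": 4,
    "additional_details": {
        # per-tool entries (correct_tool_percentage, tool_call_errors, ...) omitted
        "excess_tool_calls": {
            "total": 1,
            "details": [{"tool_name": "fetch_weather", "excess_count": 1}],
        },
        "missing_tool_calls": {"total": 0, "details": []},
    },
}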
sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 1 addition & 17 deletions
@@ -21,8 +21,6 @@ async def flow_side_effect(timeout, **kwargs):
             "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
             "tool_calls_success_level": 25,
             "additional_details": {},
-            ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {},
-            ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {}
         }

     score = 1  # Default score for "all bad"
@@ -39,8 +37,6 @@ async def flow_side_effect(timeout, **kwargs):
                 "tool_calls_made_by_agent": total_calls,
                 "correct_tool_calls_made_by_agent": good_calls
             },
-            ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {"total": 0},
-            ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {"total": 0}
         }

@@ -102,8 +98,6 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
         assert "per_tool_call_details" in result
-        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
-        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result

     def test_evaluate_tools_valid2(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -160,8 +154,6 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
         assert "per_tool_call_details" in result
-        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
-        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result

     def test_evaluate_tools_valid3(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -218,8 +210,6 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
         assert "per_tool_call_details" in result
-        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
-        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result

     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
@@ -306,8 +296,6 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
-        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}

     def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -345,8 +333,6 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
-        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}

     def test_evaluate_tools_no_tools(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -376,6 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
-        assert result["per_tool_call_details"] == {}
-        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
-        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
+        assert result["per_tool_call_details"] == {}

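One practical consequence for callers: code that previously read the dropped top-level keys now has to look inside per_tool_call_details. A minimal sketch, assuming the nesting shown in the prompty hunk above:

# Hypothetical accessor; key names come from the diff, the nesting is assumed.
details = result.get("per_tool_call_details", {})
excess_total = details.get("excess_tool_calls", {}).get("total", 0)
missing_total = details.get("missing_tool_calls", {}).get("total", 0)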