Skip to content

Commit e72b084

Browse files
Author: Salma Elshafey (committed)
Revert 'Merge branch 'main' into selshafey/improve_tool_call_accuracy'
1 parent 6525a6f commit e72b084

File tree

3 files changed

+10
-10
lines changed

3 files changed

+10
-10
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
212212
f"{self._result_key}_result": score_result,
213213
f"{self._result_key}_threshold": self.threshold,
214214
f"{self._result_key}_reason": reason,
215-
'per_tool_call_details': llm_output.get('additional_details', {}),
215+
'details': llm_output.get('details', {}),
216216
}
217217
return response_dict
218218

@@ -255,7 +255,7 @@ def _not_applicable_result(self, error_message):
255255
f"{self._result_key}_result": 'pass',
256256
f"{self._result_key}_threshold": self.threshold,
257257
f"{self._result_key}_reason": error_message,
258-
"per_tool_call_details": {},
258+
"details": {},
259259

260260
}
261261

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,10 @@ TOOL DEFINITION: {{tool_definition}}
125125
Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
126126
- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query.
127127
- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
128-
- additional_details: a dictionary that contains the following keys:
128+
- details: a dictionary that contains the following keys:
129129
- tool_calls_made_by_agent: total number of tool calls made by the agent
130130
- correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
131-
- details: a list of dictionaries, each containing:
131+
- per_tool_call_details: a list of dictionaries, each containing:
132132
- tool_name: name of the tool
133133
- total_calls_required: total number of calls required for the tool
134134
- correct_calls_made_by_agent: number of correct calls made by the agent

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def test_evaluate_tools_valid1(self, mock_model_config):
9797
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
9898
assert f"{key}_reason" in result
9999
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
100-
assert "per_tool_call_details" in result
100+
assert "details" in result
101101

102102
def test_evaluate_tools_valid2(self, mock_model_config):
103103
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -153,7 +153,7 @@ def test_evaluate_tools_valid2(self, mock_model_config):
153153
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
154154
assert f"{key}_reason" in result
155155
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
156-
assert "per_tool_call_details" in result
156+
assert "details" in result
157157

158158
def test_evaluate_tools_valid3(self, mock_model_config):
159159
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -209,7 +209,7 @@ def test_evaluate_tools_valid3(self, mock_model_config):
209209
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
210210
assert f"{key}_reason" in result
211211
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
212-
assert "per_tool_call_details" in result
212+
assert "details" in result
213213

214214
def test_evaluate_tools_one_eval_fails(self, mock_model_config):
215215
with pytest.raises(EvaluationException) as exc_info:
@@ -295,7 +295,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
295295
assert result[f"{key}_result"] == "pass"
296296
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
297297
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
298-
assert result["per_tool_call_details"] == {}
298+
assert result["details"] == {}
299299

300300
def test_evaluate_tools_all_not_applicable(self, mock_model_config):
301301
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -332,7 +332,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
332332
assert result[f"{key}_result"] == "pass"
333333
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
334334
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
335-
assert result["per_tool_call_details"] == {}
335+
assert result["details"] == {}
336336

337337
def test_evaluate_tools_no_tools(self, mock_model_config):
338338
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -362,4 +362,4 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
362362
assert result[f"{key}_result"] == "pass"
363363
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
364364
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
365-
assert result["per_tool_call_details"] == {}
365+
assert result["details"] == {}

0 commit comments

Comments (0)