Skip to content

Commit eac1d35

Browse files
authored
Query optional for RAI service evaluators (#41372)
* make query optional * add _OPTIONAL_PARAMS * updates from feedback * add _evaluate_query to evaluators init * update tests * update assets * test recording updates * asset update * black fix * test config fix * black fix
1 parent fb3837b commit eac1d35

File tree

16 files changed

+100
-35
lines changed

16 files changed

+100
-35
lines changed

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## 1.10.0 (Unreleased)
44

5+
### Breaking Changes
6+
- Added `_evaluate_query` parameter to `RaiServiceEvaluatorBase` class with a default value of `False`. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. Existing code that relies on queries being evaluated will need to explicitly set `_evaluate_query=True` to maintain the previous behavior.
7+
58
### Features Added
69

710
### Bugs Fixed

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"AssetsRepo": "Azure/azure-sdk-assets",
33
"AssetsRepoPrefixPath": "python",
44
"TagPrefix": "python/evaluation/azure-ai-evaluation",
5-
"Tag": "python/evaluation/azure-ai-evaluation_e9c7adf5b1"
5+
"Tag": "python/evaluation/azure-ai-evaluation_4a5c8207cc"
66
}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,21 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
8181

8282
id = "code_vulnerability"
8383
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
84+
_OPTIONAL_PARAMS = ["query"]
8485

8586
@override
8687
def __init__(
8788
self,
8889
credential,
8990
azure_ai_project,
91+
*,
92+
_evaluate_query: bool = True,
9093
):
9194
super().__init__(
9295
eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
9396
azure_ai_project=azure_ai_project,
9497
credential=credential,
98+
_evaluate_query=_evaluate_query,
9599
)
96100

97101
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
3636
aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
3737
when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
3838
:type eval_last_turn: bool
39-
:param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
40-
to produce a single result.
39+
:param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
4140
Default is ~azure.ai.evaluation._AggregationType.MEAN.
4241
:type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
4342
:param threshold: The threshold for the evaluation. Default is 3.
4443
:type threshold: Optional[int]
44+
:param _evaluate_query: If True, the query will be included in the evaluation data when evaluating
45+
query-response pairs. If False, only the response will be evaluated. Default is False.
46+
:type _evaluate_query: bool
4547
:param _higher_is_better: If True, higher scores are better. Default is True.
4648
:type _higher_is_better: Optional[bool]
4749
"""
@@ -55,6 +57,7 @@ def __init__(
5557
eval_last_turn: bool = False,
5658
conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
5759
threshold: int = 3,
60+
_evaluate_query: bool = False,
5861
_higher_is_better: Optional[bool] = False,
5962
):
6063
super().__init__(
@@ -67,6 +70,7 @@ def __init__(
6770
self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
6871
self._credential = credential
6972
self._threshold = threshold
73+
self._evaluate_query = _evaluate_query
7074
self._higher_is_better = _higher_is_better
7175

7276
@override
@@ -101,7 +105,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
101105
:return: The evaluation result.
102106
:rtype: Dict
103107
"""
104-
if "query" in eval_input and "response" in eval_input:
108+
if "response" in eval_input:
105109
return await self._evaluate_query_response(eval_input)
106110

107111
conversation = eval_input.get("conversation", None)
@@ -131,17 +135,20 @@ async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
131135
return result
132136

133137
async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
134-
query = str(eval_input.get("query", None))
135-
response = str(eval_input.get("response", None))
136-
if query is None or response is None:
138+
query = eval_input.get("query", None)
139+
response = eval_input.get("response", None)
140+
if response is None:
137141
raise EvaluationException(
138142
message="Not implemented",
139143
internal_message=(
140-
"Reached query/response evaluation without supplying query or response."
144+
"Reached query/response evaluation without supplying response."
141145
+ " This should have failed earlier."
142146
),
143147
)
144-
input_data = {"query": query, "response": response}
148+
input_data = {"response": str(response)}
149+
150+
if query is not None and self._evaluate_query:
151+
input_data["query"] = str(query)
145152

146153
if "context" in self._singleton_inputs:
147154
context = eval_input.get("context", None)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
3333
:type self_harm_threshold: int
3434
:param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
3535
:type hate_unfairness_threshold: int
36+
:param _evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
37+
:type _evaluate_query: bool
3638
:param kwargs: Additional arguments to pass to the evaluator.
3739
:type kwargs: Any
3840
:return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -68,6 +70,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
6870

6971
id = "content_safety"
7072
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
73+
_OPTIONAL_PARAMS = ["query"]
7174

7275
def __init__(
7376
self,
@@ -78,6 +81,7 @@ def __init__(
7881
sexual_threshold: int = 3,
7982
self_harm_threshold: int = 3,
8083
hate_unfairness_threshold: int = 3,
84+
_evaluate_query: bool = False,
8185
**kwargs,
8286
):
8387
# Type checking
@@ -91,10 +95,16 @@ def __init__(
9195
raise TypeError(f"{name} must be an int, got {type(value)}")
9296

9397
evaluators = [
94-
ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
95-
SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
96-
SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
97-
HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
98+
ViolenceEvaluator(
99+
credential, azure_ai_project, threshold=violence_threshold, _evaluate_query=_evaluate_query
100+
),
101+
SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, _evaluate_query=_evaluate_query),
102+
SelfHarmEvaluator(
103+
credential, azure_ai_project, threshold=self_harm_threshold, _evaluate_query=_evaluate_query
104+
),
105+
HateUnfairnessEvaluator(
106+
credential, azure_ai_project, threshold=hate_unfairness_threshold, _evaluate_query=_evaluate_query
107+
),
98108
]
99109
super().__init__(evaluators=evaluators, **kwargs)
100110

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
8282

8383
id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
8484
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
85+
_OPTIONAL_PARAMS = ["query"]
8586

8687
@override
8788
def __init__(
@@ -90,6 +91,7 @@ def __init__(
9091
azure_ai_project,
9192
*,
9293
threshold: int = 3,
94+
_evaluate_query: bool = False,
9395
):
9496
super().__init__(
9597
eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@ def __init__(
98100
conversation_aggregation_type=_AggregationType.MAX,
99101
threshold=threshold,
100102
_higher_is_better=False,
103+
_evaluate_query=_evaluate_query,
101104
)
102105

103106
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
6767

6868
id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
6969
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
70+
_OPTIONAL_PARAMS = ["query"]
7071

7172
@override
7273
def __init__(
@@ -75,6 +76,7 @@ def __init__(
7576
azure_ai_project,
7677
*,
7778
threshold: int = 3,
79+
_evaluate_query: bool = False,
7880
):
7981
super().__init__(
8082
eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@ def __init__(
8385
conversation_aggregation_type=_AggregationType.MAX,
8486
threshold=threshold,
8587
_higher_is_better=False,
88+
_evaluate_query=_evaluate_query,
8689
)
8790

8891
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
7878

7979
id = "azureml://registries/azureml/models/Sexual-Content-Evaluator/versions/3"
8080
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
81+
_OPTIONAL_PARAMS = ["query"]
8182

8283
@override
8384
def __init__(
@@ -86,6 +87,7 @@ def __init__(
8687
azure_ai_project,
8788
*,
8889
threshold: int = 3,
90+
_evaluate_query: bool = False,
8991
):
9092
super().__init__(
9193
eval_metric=EvaluationMetrics.SEXUAL,
@@ -94,6 +96,7 @@ def __init__(
9496
conversation_aggregation_type=_AggregationType.MAX,
9597
threshold=threshold,
9698
_higher_is_better=False,
99+
_evaluate_query=_evaluate_query,
97100
)
98101

99102
@overload
@@ -146,7 +149,7 @@ def __call__( # pylint: disable=docstring-missing-param
146149
key "messages". Conversation turns are expected
147150
to be dictionaries with keys "content" and "role".
148151
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
149-
:return: The fluency score.
152+
:return: The sexual score.
150153
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
151154
"""
152155
return super().__call__(*args, **kwargs)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
7878

7979
id = "azureml://registries/azureml/models/Violent-Content-Evaluator/versions/3"
8080
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
81+
_OPTIONAL_PARAMS = ["query"]
8182

8283
@override
8384
def __init__(
@@ -86,6 +87,7 @@ def __init__(
8687
azure_ai_project,
8788
*,
8889
threshold: int = 3,
90+
_evaluate_query: bool = False,
8991
):
9092
super().__init__(
9193
eval_metric=EvaluationMetrics.VIOLENCE,
@@ -94,6 +96,7 @@ def __init__(
9496
conversation_aggregation_type=_AggregationType.MAX,
9597
threshold=threshold,
9698
_higher_is_better=False,
99+
_evaluate_query=_evaluate_query,
97100
)
98101

99102
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,21 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
5252

5353
id = "eci"
5454
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
55+
_OPTIONAL_PARAMS = ["query"]
5556

5657
@override
5758
def __init__(
5859
self,
5960
credential,
6061
azure_ai_project,
62+
*,
63+
_evaluate_query: bool = False,
6164
):
6265
super().__init__(
6366
eval_metric=_InternalEvaluationMetrics.ECI,
6467
azure_ai_project=azure_ai_project,
6568
credential=credential,
69+
_evaluate_query=_evaluate_query,
6670
)
6771

6872
@overload

0 commit comments

Comments
 (0)