Skip to content

Commit eac1d35

Browse files
authored
Query optional for RAI service evaluators (#41372)
* make query optional * add _OPTIONAL_PARAMS * updates from feedback * add _evaluate_query to evaluators init * update tests * update assets * test recording updates * asset update * black fix * test config fix * black fix
1 parent fb3837b commit eac1d35

File tree

16 files changed

+100
-35
lines changed

16 files changed

+100
-35
lines changed

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## 1.10.0 (Unreleased)
44

5+
### Breaking Changes
6+
- Added `_evaluate_query` parameter to `RaiServiceEvaluatorBase` class with a default value of `False`. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. Existing code that relies on queries being evaluated will need to explicitly set `_evaluate_query=True` to maintain the previous behavior.
7+
58
### Features Added
69

710
### Bugs Fixed

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"AssetsRepo": "Azure/azure-sdk-assets",
33
"AssetsRepoPrefixPath": "python",
44
"TagPrefix": "python/evaluation/azure-ai-evaluation",
5-
"Tag": "python/evaluation/azure-ai-evaluation_e9c7adf5b1"
5+
"Tag": "python/evaluation/azure-ai-evaluation_4a5c8207cc"
66
}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,21 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
8181

8282
id = "code_vulnerability"
8383
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
84+
_OPTIONAL_PARAMS = ["query"]
8485

8586
@override
8687
def __init__(
8788
self,
8889
credential,
8990
azure_ai_project,
91+
*,
92+
_evaluate_query: bool = True,
9093
):
9194
super().__init__(
9295
eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
9396
azure_ai_project=azure_ai_project,
9497
credential=credential,
98+
_evaluate_query=_evaluate_query,
9599
)
96100

97101
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
3636
aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
3737
when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
3838
:type eval_last_turn: bool
39-
:param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
40-
to produce a single result.
39+
:param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
4140
Default is ~azure.ai.evaluation._AggregationType.MEAN.
4241
:type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
4342
:param threshold: The threshold for the evaluation. Default is 3.
4443
:type threshold: Optional[int]
44+
:param _evaluate_query: If True, the query will be included in the evaluation data when evaluating
45+
query-response pairs. If False, only the response will be evaluated. Default is False.
46+
:type _evaluate_query: bool
4547
:param _higher_is_better: If True, higher scores are better. Default is True.
4648
:type _higher_is_better: Optional[bool]
4749
"""
@@ -55,6 +57,7 @@ def __init__(
5557
eval_last_turn: bool = False,
5658
conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
5759
threshold: int = 3,
60+
_evaluate_query: bool = False,
5861
_higher_is_better: Optional[bool] = False,
5962
):
6063
super().__init__(
@@ -67,6 +70,7 @@ def __init__(
6770
self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
6871
self._credential = credential
6972
self._threshold = threshold
73+
self._evaluate_query = _evaluate_query
7074
self._higher_is_better = _higher_is_better
7175

7276
@override
@@ -101,7 +105,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
101105
:return: The evaluation result.
102106
:rtype: Dict
103107
"""
104-
if "query" in eval_input and "response" in eval_input:
108+
if "response" in eval_input:
105109
return await self._evaluate_query_response(eval_input)
106110

107111
conversation = eval_input.get("conversation", None)
@@ -131,17 +135,20 @@ async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
131135
return result
132136

133137
async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
134-
query = str(eval_input.get("query", None))
135-
response = str(eval_input.get("response", None))
136-
if query is None or response is None:
138+
query = eval_input.get("query", None)
139+
response = eval_input.get("response", None)
140+
if response is None:
137141
raise EvaluationException(
138142
message="Not implemented",
139143
internal_message=(
140-
"Reached query/response evaluation without supplying query or response."
144+
"Reached query/response evaluation without supplying response."
141145
+ " This should have failed earlier."
142146
),
143147
)
144-
input_data = {"query": query, "response": response}
148+
input_data = {"response": str(response)}
149+
150+
if query is not None and self._evaluate_query:
151+
input_data["query"] = str(query)
145152

146153
if "context" in self._singleton_inputs:
147154
context = eval_input.get("context", None)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
3333
:type self_harm_threshold: int
3434
:param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
3535
:type hate_unfairness_threshold: int
36+
:param _evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
37+
:type _evaluate_query: bool
3638
:param kwargs: Additional arguments to pass to the evaluator.
3739
:type kwargs: Any
3840
:return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -68,6 +70,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
6870

6971
id = "content_safety"
7072
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
73+
_OPTIONAL_PARAMS = ["query"]
7174

7275
def __init__(
7376
self,
@@ -78,6 +81,7 @@ def __init__(
7881
sexual_threshold: int = 3,
7982
self_harm_threshold: int = 3,
8083
hate_unfairness_threshold: int = 3,
84+
_evaluate_query: bool = False,
8185
**kwargs,
8286
):
8387
# Type checking
@@ -91,10 +95,16 @@ def __init__(
9195
raise TypeError(f"{name} must be an int, got {type(value)}")
9296

9397
evaluators = [
94-
ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
95-
SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
96-
SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
97-
HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
98+
ViolenceEvaluator(
99+
credential, azure_ai_project, threshold=violence_threshold, _evaluate_query=_evaluate_query
100+
),
101+
SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, _evaluate_query=_evaluate_query),
102+
SelfHarmEvaluator(
103+
credential, azure_ai_project, threshold=self_harm_threshold, _evaluate_query=_evaluate_query
104+
),
105+
HateUnfairnessEvaluator(
106+
credential, azure_ai_project, threshold=hate_unfairness_threshold, _evaluate_query=_evaluate_query
107+
),
98108
]
99109
super().__init__(evaluators=evaluators, **kwargs)
100110

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
8282

8383
id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
8484
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
85+
_OPTIONAL_PARAMS = ["query"]
8586

8687
@override
8788
def __init__(
@@ -90,6 +91,7 @@ def __init__(
9091
azure_ai_project,
9192
*,
9293
threshold: int = 3,
94+
_evaluate_query: bool = False,
9395
):
9496
super().__init__(
9597
eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@ def __init__(
98100
conversation_aggregation_type=_AggregationType.MAX,
99101
threshold=threshold,
100102
_higher_is_better=False,
103+
_evaluate_query=_evaluate_query,
101104
)
102105

103106
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
6767

6868
id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
6969
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
70+
_OPTIONAL_PARAMS = ["query"]
7071

7172
@override
7273
def __init__(
@@ -75,6 +76,7 @@ def __init__(
7576
azure_ai_project,
7677
*,
7778
threshold: int = 3,
79+
_evaluate_query: bool = False,
7880
):
7981
super().__init__(
8082
eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@ def __init__(
8385
conversation_aggregation_type=_AggregationType.MAX,
8486
threshold=threshold,
8587
_higher_is_better=False,
88+
_evaluate_query=_evaluate_query,
8689
)
8790

8891
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
7878

7979
id = "azureml://registries/azureml/models/Sexual-Content-Evaluator/versions/3"
8080
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
81+
_OPTIONAL_PARAMS = ["query"]
8182

8283
@override
8384
def __init__(
@@ -86,6 +87,7 @@ def __init__(
8687
azure_ai_project,
8788
*,
8889
threshold: int = 3,
90+
_evaluate_query: bool = False,
8991
):
9092
super().__init__(
9193
eval_metric=EvaluationMetrics.SEXUAL,
@@ -94,6 +96,7 @@ def __init__(
9496
conversation_aggregation_type=_AggregationType.MAX,
9597
threshold=threshold,
9698
_higher_is_better=False,
99+
_evaluate_query=_evaluate_query,
97100
)
98101

99102
@overload
@@ -146,7 +149,7 @@ def __call__( # pylint: disable=docstring-missing-param
146149
key "messages". Conversation turns are expected
147150
to be dictionaries with keys "content" and "role".
148151
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
149-
:return: The fluency score.
152+
:return: The sexual score.
150153
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
151154
"""
152155
return super().__call__(*args, **kwargs)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
7878

7979
id = "azureml://registries/azureml/models/Violent-Content-Evaluator/versions/3"
8080
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
81+
_OPTIONAL_PARAMS = ["query"]
8182

8283
@override
8384
def __init__(
@@ -86,6 +87,7 @@ def __init__(
8687
azure_ai_project,
8788
*,
8889
threshold: int = 3,
90+
_evaluate_query: bool = False,
8991
):
9092
super().__init__(
9193
eval_metric=EvaluationMetrics.VIOLENCE,
@@ -94,6 +96,7 @@ def __init__(
9496
conversation_aggregation_type=_AggregationType.MAX,
9597
threshold=threshold,
9698
_higher_is_better=False,
99+
_evaluate_query=_evaluate_query,
97100
)
98101

99102
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,21 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
5252

5353
id = "eci"
5454
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
55+
_OPTIONAL_PARAMS = ["query"]
5556

5657
@override
5758
def __init__(
5859
self,
5960
credential,
6061
azure_ai_project,
62+
*,
63+
_evaluate_query: bool = False,
6164
):
6265
super().__init__(
6366
eval_metric=_InternalEvaluationMetrics.ECI,
6467
azure_ai_project=azure_ai_project,
6568
credential=credential,
69+
_evaluate_query=_evaluate_query,
6670
)
6771

6872
@overload

0 commit comments

Comments
 (0)