Commit 0bd05df

ankursharmas authored and copybara-github committed
feat: Add Safety evaluator metric
We add a new metric to ADK Eval for evaluating the safety of an Agent's response. We delegate the actual implementation to the Vertex Gen AI Eval SDK, so using this metric requires a GCP project. As part of this change, we also created a simple facade for the Vertex Gen AI Eval SDK, refactored out of the existing response evaluator.

PiperOrigin-RevId: 778580406
1 parent 62c4a85 commit 0bd05df

File tree

10 files changed (+544, -230 lines)

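Since the new metric delegates to the Vertex Gen AI Eval SDK, it needs Google Cloud credentials plus a project id and location at eval time (the facade in vertex_ai_eval_facade.py below raises a ValueError when they are missing). A minimal setup sketch, with placeholder values:

import os

# Placeholders: point these at the GCP project and region you want the
# Vertex Gen AI Eval SDK to use; an .env file works as well.
os.environ["GOOGLE_CLOUD_PROJECT"] = "my-gcp-project"
os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1"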

src/google/adk/cli/cli_eval.py

Lines changed: 4 additions & 0 deletions

@@ -41,6 +41,7 @@

TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
RESPONSE_MATCH_SCORE_KEY = "response_match_score"
+SAFETY_V1_KEY = "safety_v1"
# This evaluation is not very stable.
# This is always optional unless explicitly specified.
RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -260,6 +261,7 @@ async def run_evals(
def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
  try:
    from ..evaluation.response_evaluator import ResponseEvaluator
+    from ..evaluation.safety_evaluator import SafetyEvaluatorV1
    from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
@@ -272,5 +274,7 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
    return ResponseEvaluator(
        threshold=eval_metric.threshold, metric_name=eval_metric.metric_name
    )
+  elif eval_metric.metric_name == SAFETY_V1_KEY:
+    return SafetyEvaluatorV1(eval_metric)

  raise ValueError(f"Unsupported eval metric: {eval_metric}")
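Illustrative only: with the dispatch above, an EvalMetric whose metric_name is "safety_v1" resolves to the new evaluator. The threshold is a placeholder, and importing the private _get_evaluator helper is done here purely for demonstration:

from google.adk.cli.cli_eval import _get_evaluator
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.safety_evaluator import SafetyEvaluatorV1

# "safety_v1" matches SAFETY_V1_KEY, so the new elif branch is taken.
evaluator = _get_evaluator(EvalMetric(metric_name="safety_v1", threshold=0.8))
assert isinstance(evaluator, SafetyEvaluatorV1)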

src/google/adk/evaluation/agent_evaluator.py

Lines changed: 8 additions & 0 deletions

@@ -30,6 +30,7 @@

from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from .eval_case import IntermediateData
+from .eval_metrics import EvalMetric
from .eval_set import EvalSet
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
@@ -46,11 +47,13 @@
# This is always optional unless explicitly specified.
RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
RESPONSE_MATCH_SCORE_KEY = "response_match_score"
+SAFETY_V1_KEY = "safety_v1"

ALLOWED_CRITERIA = [
    TOOL_TRAJECTORY_SCORE_KEY,
    RESPONSE_EVALUATION_SCORE_KEY,
    RESPONSE_MATCH_SCORE_KEY,
+    SAFETY_V1_KEY,
]


@@ -387,6 +390,7 @@ def _validate_input(eval_dataset, criteria):
def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
  try:
    from .response_evaluator import ResponseEvaluator
+    from .safety_evaluator import SafetyEvaluatorV1
    from .trajectory_evaluator import TrajectoryEvaluator
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
@@ -397,6 +401,10 @@ def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
      or metric_name == RESPONSE_EVALUATION_SCORE_KEY
  ):
    return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
+  elif metric_name == SAFETY_V1_KEY:
+    return SafetyEvaluatorV1(
+        eval_metric=EvalMetric(threshold=threshold, metric_name=metric_name)
+    )

  raise ValueError(f"Unsupported eval metric: {metric_name}")
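For users of AgentEvaluator, the net effect is that "safety_v1" becomes an allowed criterion. A hedged sketch of a pytest-style test, assuming the usual test_config.json convention next to the eval set; module names, paths, and thresholds are placeholders:

# test_config.json placed alongside the eval set (placeholder thresholds):
#   {
#     "criteria": {
#       "tool_trajectory_avg_score": 1.0,
#       "response_match_score": 0.8,
#       "safety_v1": 0.8
#     }
#   }
import pytest

from google.adk.evaluation.agent_evaluator import AgentEvaluator


@pytest.mark.asyncio
async def test_agent_responses_are_safe():
  # "safety_v1" in the criteria above is mapped to SafetyEvaluatorV1 by the
  # code in this file; the run needs GOOGLE_CLOUD_PROJECT/LOCATION set.
  await AgentEvaluator.evaluate(
      agent_module="my_agent",  # placeholder: package that exposes the agent
      eval_dataset_file_path_or_dir="tests/my_agent.test.json",  # placeholder
  )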

src/google/adk/evaluation/response_evaluator.py

Lines changed: 18 additions & 84 deletions

@@ -14,26 +14,34 @@

from __future__ import annotations

-import os
from typing import Optional

-from google.genai import types as genai_types
-import pandas as pd
from typing_extensions import override
-from vertexai import Client as VertexAiClient
from vertexai import types as vertexai_types

from .eval_case import Invocation
from .eval_metrics import EvalMetric
-from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
-from .evaluator import PerInvocationResult
from .final_response_match_v1 import RougeEvaluator
+from .vertex_ai_eval_facade import _VertexAiEvalFacade


class ResponseEvaluator(Evaluator):
-  """Runs response evaluation for agents."""
+  """Evaluates Agent's responses.
+
+  This class supports two metrics:
+  1) response_evaluation_score
+  This metric evaluates how coherent agent's resposne was.
+
+  Value range of this metric is [1,5], with values closer to 5 more desirable.
+
+  2) response_match_score:
+  This metric evaluates if agent's final response matches a golden/expected
+  final response.
+
+  Value range for this metric is [0,1], with values closer to 1 more desirable.
+  """

  def __init__(
      self,
@@ -77,80 +85,6 @@ def evaluate_invocations(
          actual_invocations, expected_invocations
      )

-    total_score = 0.0
-    num_invocations = 0
-    per_invocation_results = []
-    for actual, expected in zip(actual_invocations, expected_invocations):
-      prompt = self._get_text(expected.user_content)
-      reference = self._get_text(expected.final_response)
-      response = self._get_text(actual.final_response)
-
-      eval_case = {
-          "prompt": prompt,
-          "reference": reference,
-          "response": response,
-      }
-
-      eval_case_result = ResponseEvaluator._perform_eval(
-          pd.DataFrame([eval_case]), [self._metric_name]
-      )
-      score = self._get_score(eval_case_result)
-      per_invocation_results.append(
-          PerInvocationResult(
-              actual_invocation=actual,
-              expected_invocation=expected,
-              score=score,
-              eval_status=self._get_eval_status(score),
-          )
-      )
-
-      if score:
-        total_score += score
-        num_invocations += 1
-
-    if per_invocation_results:
-      overall_score = (
-          total_score / num_invocations if num_invocations > 0 else None
-      )
-      return EvaluationResult(
-          overall_score=overall_score,
-          overall_eval_status=self._get_eval_status(overall_score),
-          per_invocation_results=per_invocation_results,
-      )
-
-    return EvaluationResult()
-
-  def _get_text(self, content: Optional[genai_types.Content]) -> str:
-    if content and content.parts:
-      return "\n".join([p.text for p in content.parts if p.text])
-
-    return ""
-
-  def _get_score(self, eval_result) -> Optional[float]:
-    if eval_result and eval_result.summary_metrics:
-      return eval_result.summary_metrics[0].mean_score
-
-    return None
-
-  def _get_eval_status(self, score: Optional[float]):
-    if score:
-      return (
-          EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
-      )
-
-    return EvalStatus.NOT_EVALUATED
-
-  @staticmethod
-  def _perform_eval(dataset, metrics):
-    """This method hides away the call to external service.
-
-    Primarily helps with unit testing.
-    """
-    project_id = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))
-    location = os.environ.get("GOOGLE_CLOUD_REGION")
-    client = VertexAiClient(project=project_id, location=location)
-
-    return client.evals.evaluate(
-        dataset=vertexai_types.EvaluationDataset(eval_dataset_df=dataset),
-        metrics=metrics,
-    )
+    return _VertexAiEvalFacade(
+        threshold=self._threshold, metric_name=self._metric_name
+    ).evaluate_invocations(actual_invocations, expected_invocations)
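For reference, a minimal usage sketch of the refactored evaluator; the texts and threshold are placeholders. The response_match_score path shown here is computed locally by RougeEvaluator, while response_evaluation_score goes through the new _VertexAiEvalFacade and therefore needs the GCP variables set:

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.response_evaluator import ResponseEvaluator
from google.genai import types as genai_types


def _content(text: str) -> genai_types.Content:
  # Small helper to build the Content objects that Invocation expects.
  return genai_types.Content(parts=[genai_types.Part(text=text)])


expected = Invocation(
    user_content=_content("What is the capital of France?"),
    final_response=_content("The capital of France is Paris."),
)
actual = Invocation(
    user_content=_content("What is the capital of France?"),
    final_response=_content("Paris is the capital of France."),
)

evaluator = ResponseEvaluator(threshold=0.8, metric_name="response_match_score")
result = evaluator.evaluate_invocations([actual], [expected])
print(result.overall_score, result.overall_eval_status)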
src/google/adk/evaluation/safety_evaluator.py

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing_extensions import override
+from vertexai import types as vertexai_types
+
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .evaluator import EvaluationResult
+from .evaluator import Evaluator
+from .vertex_ai_eval_facade import _VertexAiEvalFacade
+
+
+class SafetyEvaluatorV1(Evaluator):
+  """Evaluates safety (harmlessness) of an Agent's Response.
+
+  The class delegates the responsibility to Vertex Gen AI Eval SDK. The V1
+  suffix in the class name is added to convey that there could be other versions
+  of the safety metric as well, and those metrics could use a different strategy
+  to evaluate safety.
+
+  Using this class requires a GCP project. Please set GOOGLE_CLOUD_PROJECT and
+  GOOGLE_CLOUD_LOCATION in your .env file.
+
+  Value range of the metric is [0, 1], with values closer to 1 to be more
+  desirable (safe).
+  """
+
+  def __init__(self, eval_metric: EvalMetric):
+    self._eval_metric = eval_metric
+
+  @override
+  def evaluate_invocations(
+      self,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    return _VertexAiEvalFacade(
+        threshold=self._eval_metric.threshold,
+        metric_name=vertexai_types.PrebuiltMetric.SAFETY,
+    ).evaluate_invocations(actual_invocations, expected_invocations)
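A hedged sketch of calling the new evaluator directly; the threshold and texts are placeholders, and GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION must be set because the call goes through the Vertex facade:

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.safety_evaluator import SafetyEvaluatorV1
from google.genai import types as genai_types

turn = Invocation(
    user_content=genai_types.Content(parts=[genai_types.Part(text="Hi there")]),
    final_response=genai_types.Content(
        parts=[genai_types.Part(text="Hello! How can I help you today?")]
    ),
)

safety = SafetyEvaluatorV1(EvalMetric(metric_name="safety_v1", threshold=0.8))
# The facade builds the prompt/reference columns from the expected invocation
# and the scored response from the actual one.
result = safety.evaluate_invocations([turn], [turn])
print(result.overall_score)  # in [0, 1]; closer to 1 means safer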
src/google/adk/evaluation/vertex_ai_eval_facade.py

Lines changed: 147 additions & 0 deletions

@@ -0,0 +1,147 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import os
+from typing import Optional
+
+from google.genai import types as genai_types
+import pandas as pd
+from typing_extensions import override
+from vertexai import Client as VertexAiClient
+from vertexai import types as vertexai_types
+
+from .eval_case import Invocation
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .evaluator import Evaluator
+from .evaluator import PerInvocationResult
+
+_ERROR_MESSAGE_SUFFIX = """
+You should specify both project id and location. This metric uses Vertex Gen AI
+Eval SDK, and it requires google cloud credentials.
+
+If using an .env file add the values there, or explicitly set in the code using
+the template below:
+
+os.environ['GOOGLE_CLOUD_LOCATION'] = <LOCATION>
+os.environ['GOOGLE_CLOUD_PROJECT'] = <PROJECT ID>
+"""
+
+
+class _VertexAiEvalFacade(Evaluator):
+  """Simple facade for Vertex Gen AI Eval SDK.
+
+  Vertex Gen AI Eval SDK exposes quite a few metrics that are valuable for
+  agentic evals. This class helps us to access those metrics.
+
+  Using this class requires a GCP project. Please set GOOGLE_CLOUD_PROJECT and
+  GOOGLE_CLOUD_LOCATION in your .env file.
+  """
+
+  def __init__(
+      self, threshold: float, metric_name: vertexai_types.PrebuiltMetric
+  ):
+    self._threshold = threshold
+    self._metric_name = metric_name
+
+  @override
+  def evaluate_invocations(
+      self,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    total_score = 0.0
+    num_invocations = 0
+    per_invocation_results = []
+    for actual, expected in zip(actual_invocations, expected_invocations):
+      prompt = self._get_text(expected.user_content)
+      reference = self._get_text(expected.final_response)
+      response = self._get_text(actual.final_response)
+      eval_case = {
+          "prompt": prompt,
+          "reference": reference,
+          "response": response,
+      }
+
+      eval_case_result = _VertexAiEvalFacade._perform_eval(
+          dataset=pd.DataFrame([eval_case]), metrics=[self._metric_name]
+      )
+      score = self._get_score(eval_case_result)
+      per_invocation_results.append(
+          PerInvocationResult(
+              actual_invocation=actual,
+              expected_invocation=expected,
+              score=score,
+              eval_status=self._get_eval_status(score),
+          )
+      )
+
+      if score:
+        total_score += score
+        num_invocations += 1
+
+    if per_invocation_results:
+      overall_score = (
+          total_score / num_invocations if num_invocations > 0 else None
+      )
+      return EvaluationResult(
+          overall_score=overall_score,
+          overall_eval_status=self._get_eval_status(overall_score),
+          per_invocation_results=per_invocation_results,
+      )
+
+    return EvaluationResult()
+
+  def _get_text(self, content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  def _get_score(self, eval_result) -> Optional[float]:
+    if eval_result and eval_result.summary_metrics:
+      return eval_result.summary_metrics[0].mean_score
+
+    return None
+
+  def _get_eval_status(self, score: Optional[float]):
+    if score:
+      return (
+          EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
+      )
+
+    return EvalStatus.NOT_EVALUATED
+
+  @staticmethod
+  def _perform_eval(dataset, metrics):
+    """This method hides away the call to external service.
+
+    Primarily helps with unit testing.
+    """
+    project_id = os.environ.get("GOOGLE_CLOUD_PROJECT", None)
+    location = os.environ.get("GOOGLE_CLOUD_LOCATION", None)
+
+    if not project_id:
+      raise ValueError("Missing project id." + _ERROR_MESSAGE_SUFFIX)
+    if not location:
+      raise ValueError("Missing location." + _ERROR_MESSAGE_SUFFIX)
+
+    client = VertexAiClient(project=project_id, location=location)
+
+    return client.evals.evaluate(
+        dataset=vertexai_types.EvaluationDataset(eval_dataset_df=dataset),
+        metrics=metrics,
+    )
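Because _perform_eval isolates the call to the external service, unit tests can stub it instead of hitting Vertex. A sketch of such a test; the fake result only mimics the summary_metrics/mean_score shape the facade reads and is not a real SDK type:

from unittest import mock

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.vertex_ai_eval_facade import _VertexAiEvalFacade
from google.genai import types as genai_types
from vertexai import types as vertexai_types


def _content(text: str) -> genai_types.Content:
  return genai_types.Content(parts=[genai_types.Part(text=text)])


def test_facade_passes_when_mean_score_meets_threshold():
  # Shape-only stand-in for the evaluation result returned by the SDK.
  fake_result = mock.Mock()
  fake_result.summary_metrics = [mock.Mock(mean_score=0.9)]

  invocation = Invocation(
      user_content=_content("Hello"),
      final_response=_content("Hi, how can I help?"),
  )

  with mock.patch.object(
      _VertexAiEvalFacade, "_perform_eval", return_value=fake_result
  ):
    facade = _VertexAiEvalFacade(
        threshold=0.8, metric_name=vertexai_types.PrebuiltMetric.SAFETY
    )
    result = facade.evaluate_invocations([invocation], [invocation])

  assert result.overall_eval_status == EvalStatus.PASSED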

0 commit comments