
Commit b0d88bf

ankursharmas authored and copybara-github committed
feat: BaseEvalService declaration and surrounding data models
Also, adds a metric registry.

PiperOrigin-RevId: 778186012
1 parent 17d6042 commit b0d88bf

6 files changed: +386, -3 lines changed

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from abc import ABC
from abc import abstractmethod
from typing import AsyncGenerator
from typing import Optional

from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field

from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_result import EvalCaseResult


class EvaluateConfig(BaseModel):
  """Contains configurations needed to run evaluations."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  eval_metrics: list[EvalMetric] = Field(
      description="""The list of metrics to be used in Eval.""",
  )


class InferenceConfig(BaseModel):
  """Contains configurations needed to run inferences."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  labels: Optional[dict[str, str]] = Field(
      default=None,
      description="""Labels with user-defined metadata to break down billed
charges.""",
  )


class InferenceRequest(BaseModel):
  """Represents a request to perform inferences for the eval cases in an eval set."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  app_name: str = Field(
      description="""The name of the app to which the eval case belongs."""
  )

  eval_set_id: str = Field(description="""Id of the eval set.""")

  eval_case_ids: Optional[list[str]] = Field(
      default=None,
      description="""Ids of the eval cases for which inferences need to be
generated.

All the eval case ids should belong to the EvalSet.

If the list of eval case ids is empty or not specified, then inferences are
generated for all the eval cases in the eval set.
""",
  )

  inference_config: InferenceConfig = Field(
      description="""The config to use for inferencing.""",
  )


class InferenceResult(BaseModel):
  """Contains inference results for a single eval case."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  app_name: str = Field(
      description="""The name of the app to which the eval case belongs."""
  )

  eval_set_id: str = Field(description="""Id of the eval set.""")

  eval_case_id: str = Field(
      description="""Id of the eval case for which inferences were generated.""",
  )

  inferences: list[Invocation] = Field(
      description="""Inferences obtained from the Agent for the eval case."""
  )

  session_id: Optional[str] = Field(
      description="""Id of the inference session."""
  )


class EvaluateRequest(BaseModel):
  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  inference_results: list[InferenceResult] = Field(
      description="""A list of inferences that need to be evaluated.""",
  )

  evaluate_config: EvaluateConfig = Field(
      description="""The config to use for evaluations.""",
  )


class BaseEvalService(ABC):
  """A service to run Evals for an ADK agent."""

  @abstractmethod
  async def perform_inference(
      self,
      inference_request: InferenceRequest,
  ) -> AsyncGenerator[InferenceResult, None]:
    """Yields InferenceResults obtained from the Agent as and when they become available.

    Args:
      inference_request: The request for generating inferences.
    """

  @abstractmethod
  async def evaluate(
      self,
      evaluate_request: EvaluateRequest,
  ) -> AsyncGenerator[EvalCaseResult, None]:
    """Yields an EvalCaseResult for each item as and when it becomes available.

    Args:
      evaluate_request: The request to perform metric evaluations on the
        inferences.
    """

src/google/adk/evaluation/eval_metrics.py

Lines changed: 14 additions & 0 deletions
@@ -14,16 +14,30 @@

 from __future__ import annotations

+from enum import Enum
 from typing import Optional
+from typing import Union

 from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
+from typing_extensions import TypeAlias

 from .eval_case import Invocation
 from .evaluator import EvalStatus


+class PrebuiltMetrics(Enum):
+  TOOL_TRAJECTORY_AVG_SCORE = "tool_trajectory_avg_score"
+
+  RESPONSE_EVALUATION_SCORE = "response_evaluation_score"
+
+  RESPONSE_MATCH_SCORE = "response_match_score"
+
+
+MetricName: TypeAlias = Union[str, PrebuiltMetrics]
+
+
 class EvalMetric(BaseModel):
   """A metric used to evaluate a particular aspect of an eval case."""
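
A small sketch of what the new MetricName alias permits: either a PrebuiltMetrics member or a free-form string. The helper function below is purely illustrative and not part of this change:

def metric_key(metric: MetricName) -> str:
  # Normalize both accepted forms to the plain string name.
  return metric.value if isinstance(metric, PrebuiltMetrics) else metric

metric_key(PrebuiltMetrics.RESPONSE_MATCH_SCORE)  # -> "response_match_score"
metric_key("my_custom_metric")                    # -> "my_custom_metric"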

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging

from ..errors.not_found_error import NotFoundError
from .eval_metrics import EvalMetric
from .eval_metrics import MetricName
from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator

logger = logging.getLogger("google_adk." + __name__)


class MetricEvaluatorRegistry:
  """A registry for metric Evaluators."""

  _registry: dict[str, type[Evaluator]] = {}

  def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
    """Returns an Evaluator for the given metric.

    A new instance of the Evaluator is returned.

    Args:
      eval_metric: The metric for which we need the Evaluator.

    Raises:
      NotFoundError: If there is no evaluator for the metric.
    """
    if eval_metric.metric_name not in self._registry:
      raise NotFoundError(f"{eval_metric.metric_name} not found in registry.")

    return self._registry[eval_metric.metric_name](eval_metric=eval_metric)

  def register_evaluator(
      self, metric_name: MetricName, evaluator: type[Evaluator]
  ):
    """Registers an evaluator given the metric name.

    If a mapping already exists, it is updated.
    """
    # Normalize PrebuiltMetrics members to their string value so that
    # registration and lookup (which uses EvalMetric.metric_name, a plain
    # string) agree on the key.
    key = (
        metric_name.value
        if isinstance(metric_name, PrebuiltMetrics)
        else metric_name
    )
    if key in self._registry:
      logger.info(
          "Updating Evaluator class for %s from %s to %s",
          key,
          self._registry[key],
          evaluator,
      )

    self._registry[key] = evaluator


def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
  """Returns an instance of MetricEvaluatorRegistry with standard metrics already registered in it."""
  metric_evaluator_registry = MetricEvaluatorRegistry()

  metric_evaluator_registry.register_evaluator(
      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
      evaluator=TrajectoryEvaluator,
  )
  metric_evaluator_registry.register_evaluator(
      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
      evaluator=ResponseEvaluator,
  )
  metric_evaluator_registry.register_evaluator(
      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
      evaluator=ResponseEvaluator,
  )

  return metric_evaluator_registry


DEFAULT_METRIC_EVALUATOR_REGISTRY = _get_default_metric_evaluator_registry()
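
A brief usage sketch for the default registry (imports omitted; the metric name and threshold are illustrative, and the lookup relies on the metric name matching a registered key):

eval_metric = EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)
evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric)
# evaluator is a fresh TrajectoryEvaluator constructed with eval_metric=eval_metric;
# an unknown metric name raises NotFoundError instead.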

src/google/adk/evaluation/response_evaluator.py

Lines changed: 18 additions & 1 deletion
@@ -38,7 +38,24 @@
 class ResponseEvaluator(Evaluator):
   """Runs response evaluation for agents."""

-  def __init__(self, threshold: float, metric_name: str):
+  def __init__(
+      self,
+      threshold: Optional[float] = None,
+      metric_name: Optional[str] = None,
+      eval_metric: Optional[EvalMetric] = None,
+  ):
+    if (threshold is not None and eval_metric) or (
+        metric_name is not None and eval_metric
+    ):
+      raise ValueError(
+          "Either eval_metric should be specified or both threshold and"
+          " metric_name should be specified."
+      )
+
+    if eval_metric:
+      threshold = eval_metric.threshold
+      metric_name = eval_metric.metric_name
+
     if "response_evaluation_score" == metric_name:
       self._metric_name = MetricPromptTemplateExamples.Pointwise.COHERENCE
     elif "response_match_score" == metric_name:

src/google/adk/evaluation/trajectory_evaluator.py

Lines changed: 16 additions & 2 deletions
@@ -15,7 +15,7 @@
 from __future__ import annotations

 from typing import Any
-from typing import cast
+from typing import Optional

 from google.genai import types as genai_types
 import pandas as pd
@@ -24,6 +24,7 @@
 from typing_extensions import override

 from .eval_case import Invocation
+from .eval_metrics import EvalMetric
 from .evaluation_constants import EvalConstants
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
@@ -34,7 +35,20 @@
 class TrajectoryEvaluator(Evaluator):
   """Evaluates tool use trajectories for accuracy."""

-  def __init__(self, threshold: float):
+  def __init__(
+      self,
+      threshold: Optional[float] = None,
+      eval_metric: Optional[EvalMetric] = None,
+  ):
+    if threshold is not None and eval_metric:
+      raise ValueError(
+          "Either eval_metric should be specified or threshold should be"
+          " specified."
+      )
+
+    if eval_metric:
+      threshold = eval_metric.threshold
+
     self._threshold = threshold

   @override
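
The analogous construction sketch for TrajectoryEvaluator (the threshold value is illustrative):

# Existing path: a bare threshold.
TrajectoryEvaluator(threshold=1.0)

# New path: configure from an EvalMetric; combining both arguments raises ValueError.
TrajectoryEvaluator(
    eval_metric=EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)
)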
