
Commit b0d88bf

ankursharmas authored and copybara-github committed
feat: BaseEvalService declaration and surrounding data models
Also, adds a metric registry.

PiperOrigin-RevId: 778186012
1 parent 17d6042 commit b0d88bf

6 files changed: +386, -3 lines changed

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from abc import ABC
from abc import abstractmethod
from typing import AsyncGenerator
from typing import Optional

from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field

from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_result import EvalCaseResult


class EvaluateConfig(BaseModel):
  """Contains configurations needed to run evaluations."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  eval_metrics: list[EvalMetric] = Field(
      description="""The list of metrics to be used in Eval.""",
  )


class InferenceConfig(BaseModel):
  """Contains configurations needed to run inferences."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  labels: Optional[dict[str, str]] = Field(
      default=None,
      description="""Labels with user-defined metadata to break down billed
charges.""",
  )


class InferenceRequest(BaseModel):
  """Represents a request to perform inferences for the eval cases in an eval set."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  app_name: str = Field(
      description="""The name of the app to which the eval case belongs."""
  )

  eval_set_id: str = Field(description="""Id of the eval set.""")

  eval_case_ids: Optional[list[str]] = Field(
      default=None,
      description="""Ids of the eval cases for which inferences need to be
generated.

All the eval case ids should belong to the EvalSet.

If the list of eval case ids is empty or not specified, then inferences are
generated for all the eval cases in the eval set.
""",
  )

  inference_config: InferenceConfig = Field(
      description="""The config to use for inferencing.""",
  )


class InferenceResult(BaseModel):
  """Contains inference results for a single eval case."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  app_name: str = Field(
      description="""The name of the app to which the eval case belongs."""
  )

  eval_set_id: str = Field(description="""Id of the eval set.""")

  eval_case_id: str = Field(
      description="""Id of the eval case for which inferences were generated.""",
  )

  inferences: list[Invocation] = Field(
      description="""Inferences obtained from the Agent for the eval case."""
  )

  session_id: Optional[str] = Field(
      description="""Id of the inference session."""
  )


class EvaluateRequest(BaseModel):
  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  inference_results: list[InferenceResult] = Field(
      description="""A list of inferences that need to be evaluated.""",
  )

  evaluate_config: EvaluateConfig = Field(
      description="""The config to use for evaluations.""",
  )


class BaseEvalService(ABC):
  """A service to run Evals for an ADK agent."""

  @abstractmethod
  async def perform_inference(
      self,
      inference_request: InferenceRequest,
  ) -> AsyncGenerator[InferenceResult, None]:
    """Yields InferenceResults obtained from the Agent as and when they become available.

    Args:
      inference_request: The request for generating inferences.
    """

  @abstractmethod
  async def evaluate(
      self,
      evaluate_request: EvaluateRequest,
  ) -> AsyncGenerator[EvalCaseResult, None]:
    """Yields an EvalCaseResult for each item as and when it becomes available.

    Args:
      evaluate_request: The request to perform metric evaluations on the
        inferences.
    """

src/google/adk/evaluation/eval_metrics.py

Lines changed: 14 additions & 0 deletions
@@ -14,16 +14,30 @@

 from __future__ import annotations

+from enum import Enum
 from typing import Optional
+from typing import Union

 from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
+from typing_extensions import TypeAlias

 from .eval_case import Invocation
 from .evaluator import EvalStatus


+class PrebuiltMetrics(Enum):
+  TOOL_TRAJECTORY_AVG_SCORE = "tool_trajectory_avg_score"
+
+  RESPONSE_EVALUATION_SCORE = "response_evaluation_score"
+
+  RESPONSE_MATCH_SCORE = "response_match_score"
+
+
+MetricName: TypeAlias = Union[str, PrebuiltMetrics]
+
+
 class EvalMetric(BaseModel):
   """A metric used to evaluate a particular aspect of an eval case."""
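
A small sketch of what the new MetricName alias permits: either a PrebuiltMetrics member or a free-form string. The helper function below is purely illustrative and not part of this change:

def metric_key(metric: MetricName) -> str:
  # Normalize both accepted forms to the plain string name.
  return metric.value if isinstance(metric, PrebuiltMetrics) else metric

metric_key(PrebuiltMetrics.RESPONSE_MATCH_SCORE)  # -> "response_match_score"
metric_key("my_custom_metric")                    # -> "my_custom_metric"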

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging

from ..errors.not_found_error import NotFoundError
from .eval_metrics import EvalMetric
from .eval_metrics import MetricName
from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator

logger = logging.getLogger("google_adk." + __name__)


class MetricEvaluatorRegistry:
  """A registry for metric Evaluators."""

  _registry: dict[str, type[Evaluator]] = {}

  def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
    """Returns an Evaluator for the given metric.

    A new instance of the Evaluator is returned.

    Args:
      eval_metric: The metric for which we need the Evaluator.

    Raises:
      NotFoundError: If there is no evaluator for the metric.
    """
    if eval_metric.metric_name not in self._registry:
      raise NotFoundError(f"{eval_metric.metric_name} not found in registry.")

    return self._registry[eval_metric.metric_name](eval_metric=eval_metric)

  def register_evaluator(
      self, metric_name: MetricName, evaluator: type[Evaluator]
  ):
    """Registers an evaluator given the metric name.

    If a mapping already exists, it is updated.
    """
    # Normalize PrebuiltMetrics members to their string value so that
    # registration and lookup (which uses EvalMetric.metric_name, a plain
    # string) agree on the key.
    key = (
        metric_name.value
        if isinstance(metric_name, PrebuiltMetrics)
        else metric_name
    )
    if key in self._registry:
      logger.info(
          "Updating Evaluator class for %s from %s to %s",
          key,
          self._registry[key],
          evaluator,
      )

    self._registry[key] = evaluator


def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
  """Returns an instance of MetricEvaluatorRegistry with standard metrics already registered in it."""
  metric_evaluator_registry = MetricEvaluatorRegistry()

  metric_evaluator_registry.register_evaluator(
      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
      evaluator=TrajectoryEvaluator,
  )
  metric_evaluator_registry.register_evaluator(
      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
      evaluator=ResponseEvaluator,
  )
  metric_evaluator_registry.register_evaluator(
      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
      evaluator=ResponseEvaluator,
  )

  return metric_evaluator_registry


DEFAULT_METRIC_EVALUATOR_REGISTRY = _get_default_metric_evaluator_registry()
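
A brief usage sketch for the default registry (imports omitted; the metric name and threshold are illustrative, and the lookup relies on the metric name matching a registered key):

eval_metric = EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)
evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric)
# evaluator is a fresh TrajectoryEvaluator constructed with eval_metric=eval_metric;
# an unknown metric name raises NotFoundError instead.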

src/google/adk/evaluation/response_evaluator.py

Lines changed: 18 additions & 1 deletion
@@ -38,7 +38,24 @@
 class ResponseEvaluator(Evaluator):
   """Runs response evaluation for agents."""

-  def __init__(self, threshold: float, metric_name: str):
+  def __init__(
+      self,
+      threshold: Optional[float] = None,
+      metric_name: Optional[str] = None,
+      eval_metric: Optional[EvalMetric] = None,
+  ):
+    if (threshold is not None and eval_metric) or (
+        metric_name is not None and eval_metric
+    ):
+      raise ValueError(
+          "Either eval_metric should be specified or both threshold and"
+          " metric_name should be specified."
+      )
+
+    if eval_metric:
+      threshold = eval_metric.threshold
+      metric_name = eval_metric.metric_name
+
     if "response_evaluation_score" == metric_name:
       self._metric_name = MetricPromptTemplateExamples.Pointwise.COHERENCE
     elif "response_match_score" == metric_name:

src/google/adk/evaluation/trajectory_evaluator.py

Lines changed: 16 additions & 2 deletions
@@ -15,7 +15,7 @@
 from __future__ import annotations

 from typing import Any
-from typing import cast
+from typing import Optional

 from google.genai import types as genai_types
 import pandas as pd
@@ -24,6 +24,7 @@
 from typing_extensions import override

 from .eval_case import Invocation
+from .eval_metrics import EvalMetric
 from .evaluation_constants import EvalConstants
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
@@ -34,7 +35,20 @@
 class TrajectoryEvaluator(Evaluator):
   """Evaluates tool use trajectories for accuracy."""

-  def __init__(self, threshold: float):
+  def __init__(
+      self,
+      threshold: Optional[float] = None,
+      eval_metric: Optional[EvalMetric] = None,
+  ):
+    if threshold is not None and eval_metric:
+      raise ValueError(
+          "Either eval_metric should be specified or threshold should be"
+          " specified."
+      )
+
+    if eval_metric:
+      threshold = eval_metric.threshold
+
     self._threshold = threshold

   @override
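
The analogous construction sketch for TrajectoryEvaluator (the threshold value is illustrative):

# Existing path: a bare threshold.
TrajectoryEvaluator(threshold=1.0)

# New path: configure from an EvalMetric; combining both arguments raises ValueError.
TrajectoryEvaluator(
    eval_metric=EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)
)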
