
Commit 9597a44

jcpagadora737 authored and copybara-github committed

feat: Add rouge_score library to ADK eval dependencies, and implement RougeEvaluator that computes ROUGE-1 for the "response_match_score" metric
PiperOrigin-RevId: 774949712
1 parent fa025d7 commit 9597a44

File tree

5 files changed: +301 -2 lines changed


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ eval = [
   "google-cloud-aiplatform[evaluation]>=1.87.0",
   "pandas>=2.2.3",
   "tabulate>=0.9.0",
+  "rouge-score>=0.1.2",
   # go/keep-sorted end
 ]


src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Optional

from google.genai import types as genai_types
from rouge_score import rouge_scorer
from typing_extensions import override

from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult


class RougeEvaluator(Evaluator):
  """Calculates the ROUGE-1 metric to compare responses."""

  def __init__(self, eval_metric: EvalMetric):
    self._eval_metric = eval_metric

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: list[Invocation],
  ) -> EvaluationResult:
    total_score = 0.0
    num_invocations = 0
    per_invocation_results = []
    for actual, expected in zip(actual_invocations, expected_invocations):
      reference = _get_text_from_content(expected.final_response)
      response = _get_text_from_content(actual.final_response)
      rouge_1_scores = _calculate_rouge_1_scores(response, reference)
      score = rouge_1_scores.fmeasure
      per_invocation_results.append(
          PerInvocationResult(
              actual_invocation=actual,
              expected_invocation=expected,
              score=score,
              eval_status=_get_eval_status(score, self._eval_metric.threshold),
          )
      )
      total_score += score
      num_invocations += 1

    if per_invocation_results:
      overall_score = total_score / num_invocations
      return EvaluationResult(
          overall_score=overall_score,
          overall_eval_status=_get_eval_status(
              overall_score, self._eval_metric.threshold
          ),
          per_invocation_results=per_invocation_results,
      )

    return EvaluationResult()


def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
  if content and content.parts:
    return "\n".join([part.text for part in content.parts if part.text])

  return ""


def _get_eval_status(score: float, threshold: float):
  return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED


def _calculate_rouge_1_scores(candidate: str, reference: str):
  """Calculates the ROUGE-1 score between a candidate and reference text.

  ROUGE-1 measures the overlap of unigrams (single words) between the
  candidate and reference texts. The score is broken down into:
  - Precision: The proportion of unigrams in the candidate that are also in the
    reference.
  - Recall: The proportion of unigrams in the reference that are also in the
    candidate.
  - F-measure: The harmonic mean of precision and recall.

  Args:
    candidate: The generated text to be evaluated.
    reference: The ground-truth text to compare against.

  Returns:
    The ROUGE-1 Score object (a named tuple) with precision, recall, and
    fmeasure fields.
  """
  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)

  # The score method returns a dictionary where keys are the ROUGE types
  # and values are Score objects (tuples) with precision, recall, and fmeasure.
  scores = scorer.score(reference, candidate)

  return scores["rouge1"]
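
For intuition, the numbers produced by _calculate_rouge_1_scores can be reproduced with the rouge_score package directly. The following is a minimal standalone sketch, not part of this commit, that walks through the unigram arithmetic for the candidate/reference pair used in the unit tests further below:

# Standalone sketch (not part of this commit): reproduce the ROUGE-1 arithmetic
# described in the docstring above, using the same rouge_score API.
from rouge_score import rouge_scorer

candidate = "This is a test candidate response."
reference = "This is a test reference."

# Stemmed unigrams: candidate -> {this, is, a, test, candid, respons}  (6)
#                   reference -> {this, is, a, test, refer}            (5)
# Overlap = 4, so precision = 4/6, recall = 4/5, and
# F1 = 2PR / (P + R) = 8/11.
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
score = scorer.score(reference, candidate)["rouge1"]

assert abs(score.precision - 2 / 3) < 1e-6
assert abs(score.recall - 4 / 5) < 1e-6
assert abs(score.fmeasure - 8 / 11) < 1e-6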

src/google/adk/evaluation/response_evaluator.py

Lines changed: 12 additions & 1 deletion
@@ -27,10 +27,12 @@

 from .eval_case import IntermediateData
 from .eval_case import Invocation
+from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
+from .final_response_match_v1 import RougeEvaluator


 class ResponseEvaluator(Evaluator):

@@ -40,7 +42,7 @@ def __init__(self, threshold: float, metric_name: str):
     if "response_evaluation_score" == metric_name:
       self._metric_name = MetricPromptTemplateExamples.Pointwise.COHERENCE
     elif "response_match_score" == metric_name:
-      self._metric_name = "rouge_1"
+      self._metric_name = "response_match_score"
     else:
       raise ValueError(f"`{metric_name}` is not supported.")

@@ -52,6 +54,15 @@ def evaluate_invocations(
       actual_invocations: list[Invocation],
       expected_invocations: list[Invocation],
   ) -> EvaluationResult:
+    # If the metric is response_match_score, just use the RougeEvaluator.
+    if self._metric_name == "response_match_score":
+      rouge_evaluator = RougeEvaluator(
+          EvalMetric(metric_name=self._metric_name, threshold=self._threshold)
+      )
+      return rouge_evaluator.evaluate_invocations(
+          actual_invocations, expected_invocations
+      )
+
     total_score = 0.0
     num_invocations = 0
     per_invocation_results = []
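
With this change, the "response_match_score" metric no longer maps to the "rouge_1" metric name inside ResponseEvaluator; it is computed locally by delegating to RougeEvaluator. A minimal sketch, not part of this commit and assuming only the classes shown in these diffs, of the two now-equivalent entry points:

# Sketch (not part of this commit): both entry points below compute the same
# local ROUGE-1 score for the "response_match_score" metric.
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
from google.adk.evaluation.response_evaluator import ResponseEvaluator

# Direct use of the new evaluator.
rouge = RougeEvaluator(
    EvalMetric(metric_name="response_match_score", threshold=0.8)
)

# Via ResponseEvaluator, which now delegates to RougeEvaluator for this metric.
delegating = ResponseEvaluator(
    threshold=0.8, metric_name="response_match_score"
)

# Given the same lists of actual and expected Invocation objects, the two calls
# below are expected to return the same EvaluationResult:
#   rouge.evaluate_invocations(actual_invocations, expected_invocations)
#   delegating.evaluate_invocations(actual_invocations, expected_invocations)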
tests/unittests/evaluation/test_final_response_match_v1.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
from google.genai import types as genai_types
import pytest


def _create_test_rouge_evaluator(threshold: float) -> RougeEvaluator:
  return RougeEvaluator(
      EvalMetric(metric_name="response_match_score", threshold=threshold)
  )


def _create_test_invocations(
    candidate: str, reference: str
) -> tuple[Invocation, Invocation]:
  """Returns tuple of (actual_invocation, expected_invocation)."""
  return Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text="This is a test query.")]
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text=candidate)]
      ),
  ), Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text="This is a test query.")]
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text=reference)]
      ),
  )


def test_calculate_rouge_1_scores_empty_candidate_and_reference():
  candidate = ""
  reference = ""
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == 0
  assert rouge_1_score.recall == 0
  assert rouge_1_score.fmeasure == 0


def test_calculate_rouge_1_scores_empty_candidate():
  candidate = ""
  reference = "This is a test reference."
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == 0
  assert rouge_1_score.recall == 0
  assert rouge_1_score.fmeasure == 0


def test_calculate_rouge_1_scores_empty_reference():
  candidate = "This is a test candidate response."
  reference = ""
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == 0
  assert rouge_1_score.recall == 0
  assert rouge_1_score.fmeasure == 0


def test_calculate_rouge_1_scores():
  candidate = "This is a test candidate response."
  reference = "This is a test reference."
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == pytest.approx(2 / 3)
  assert rouge_1_score.recall == pytest.approx(4 / 5)
  assert rouge_1_score.fmeasure == pytest.approx(8 / 11)


@pytest.mark.parametrize(
    "candidates, references, expected_score, expected_status",
    [
        (
            ["The quick brown fox jumps.", "hello world"],
            ["The quick brown fox jumps over the lazy dog.", "hello"],
            0.69048,  # (5/7 + 2/3) / 2
            EvalStatus.FAILED,
        ),
        (
            ["This is a test.", "Another test case."],
            ["This is a test.", "This is a different test."],
            0.625,  # (1 + 1/4) / 2
            EvalStatus.FAILED,
        ),
        (
            ["No matching words here.", "Second candidate."],
            ["Completely different text.", "Another reference."],
            0.0,  # (0 + 0) / 2
            EvalStatus.FAILED,
        ),
        (
            ["Same words", "Same words"],
            ["Same words", "Same words"],
            1.0,
            EvalStatus.PASSED,
        ),
    ],
)
def test_rouge_evaluator_multiple_invocations(
    candidates: list[str],
    references: list[str],
    expected_score: float,
    expected_status: EvalStatus,
):
  rouge_evaluator = _create_test_rouge_evaluator(threshold=0.8)
  actual_invocations = []
  expected_invocations = []
  for candidate, reference in zip(candidates, references):
    actual_invocation, expected_invocation = _create_test_invocations(
        candidate, reference
    )
    actual_invocations.append(actual_invocation)
    expected_invocations.append(expected_invocation)

  evaluation_result = rouge_evaluator.evaluate_invocations(
      actual_invocations, expected_invocations
  )
  assert evaluation_result.overall_score == pytest.approx(
      expected_score, rel=1e-3
  )
  assert evaluation_result.overall_eval_status == expected_status
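
The per-case comments in the parametrized test can be verified by hand. For example, the first case averages the ROUGE-1 F-measures of its two invocation pairs; a standalone arithmetic check, not part of this commit:

# Standalone arithmetic check for the first parametrized case above:
# "The quick brown fox jumps." vs "The quick brown fox jumps over the lazy dog."
#   overlap = 5 unigrams, precision = 5/5, recall = 5/9  -> F1 = 5/7
# "hello world" vs "hello"
#   overlap = 1 unigram, precision = 1/2, recall = 1/1   -> F1 = 2/3
expected_average = (5 / 7 + 2 / 3) / 2
assert round(expected_average, 5) == 0.69048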

tests/unittests/evaluation/test_response_evaluator.py

Lines changed: 38 additions & 1 deletion
@@ -16,7 +16,10 @@
 from unittest.mock import MagicMock
 from unittest.mock import patch

+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.response_evaluator import ResponseEvaluator
+from google.genai import types as genai_types
 import pandas as pd
 import pytest
 from vertexai.preview.evaluation import MetricPromptTemplateExamples

@@ -63,7 +66,7 @@
     "google.adk.evaluation.response_evaluator.ResponseEvaluator._perform_eval"
 )
 class TestResponseEvaluator:
-  """A class to help organize "patch" that are applicabple to all tests."""
+  """A class to help organize "patch" that are applicable to all tests."""

   def test_evaluate_none_dataset_raises_value_error(self, mock_perform_eval):
     """Test evaluate function raises ValueError for an empty list."""

@@ -77,6 +80,40 @@ def test_evaluate_empty_dataset_raises_value_error(self, mock_perform_eval):
     ResponseEvaluator.evaluate([], ["response_evaluation_score"])
     mock_perform_eval.assert_not_called()  # Ensure _perform_eval was not called

+  def test_evaluate_invocations_rouge_metric(self, mock_perform_eval):
+    """Test evaluate_invocations function for Rouge metric."""
+    actual_invocations = [
+        Invocation(
+            user_content=genai_types.Content(
+                parts=[genai_types.Part(text="This is a test query.")]
+            ),
+            final_response=genai_types.Content(
+                parts=[
+                    genai_types.Part(text="This is a test candidate response.")
+                ]
+            ),
+        )
+    ]
+    expected_invocations = [
+        Invocation(
+            user_content=genai_types.Content(
+                parts=[genai_types.Part(text="This is a test query.")]
+            ),
+            final_response=genai_types.Content(
+                parts=[genai_types.Part(text="This is a test reference.")]
+            ),
+        )
+    ]
+    evaluator = ResponseEvaluator(
+        threshold=0.8, metric_name="response_match_score"
+    )
+    evaluation_result = evaluator.evaluate_invocations(
+        actual_invocations, expected_invocations
+    )
+    assert evaluation_result.overall_score == pytest.approx(8 / 11)
+    # ROUGE-1 F1 is approx. 0.73 < 0.8 threshold, so eval status is FAILED.
+    assert evaluation_result.overall_eval_status == EvalStatus.FAILED
+
   def test_evaluate_determines_metrics_correctly_for_perform_eval(
       self, mock_perform_eval
   ):
