
Commit 9597a44

jcpagadora737 authored and copybara-github committed

feat: Add rouge_score library to ADK eval dependencies, and implement RougeEvaluator that computes ROUGE-1 for the "response_match_score" metric
PiperOrigin-RevId: 774949712
1 parent fa025d7 commit 9597a44

File tree

5 files changed: +301 -2 lines changed


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ eval = [
   "google-cloud-aiplatform[evaluation]>=1.87.0",
   "pandas>=2.2.3",
   "tabulate>=0.9.0",
+  "rouge-score>=0.1.2",
   # go/keep-sorted end
 ]


src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Optional

from google.genai import types as genai_types
from rouge_score import rouge_scorer
from typing_extensions import override

from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult


class RougeEvaluator(Evaluator):
  """Calculates the ROUGE-1 metric to compare responses."""

  def __init__(self, eval_metric: EvalMetric):
    self._eval_metric = eval_metric

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: list[Invocation],
  ) -> EvaluationResult:
    total_score = 0.0
    num_invocations = 0
    per_invocation_results = []
    for actual, expected in zip(actual_invocations, expected_invocations):
      reference = _get_text_from_content(expected.final_response)
      response = _get_text_from_content(actual.final_response)
      rouge_1_scores = _calculate_rouge_1_scores(response, reference)
      score = rouge_1_scores.fmeasure
      per_invocation_results.append(
          PerInvocationResult(
              actual_invocation=actual,
              expected_invocation=expected,
              score=score,
              eval_status=_get_eval_status(score, self._eval_metric.threshold),
          )
      )
      total_score += score
      num_invocations += 1

    if per_invocation_results:
      overall_score = total_score / num_invocations
      return EvaluationResult(
          overall_score=overall_score,
          overall_eval_status=_get_eval_status(
              overall_score, self._eval_metric.threshold
          ),
          per_invocation_results=per_invocation_results,
      )

    return EvaluationResult()


def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
  if content and content.parts:
    return "\n".join([part.text for part in content.parts if part.text])

  return ""


def _get_eval_status(score: float, threshold: float):
  return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED


def _calculate_rouge_1_scores(candidate: str, reference: str):
  """Calculates the ROUGE-1 score between a candidate and reference text.

  ROUGE-1 measures the overlap of unigrams (single words) between the
  candidate and reference texts. The score is broken down into:
  - Precision: The proportion of unigrams in the candidate that are also in the
    reference.
  - Recall: The proportion of unigrams in the reference that are also in the
    candidate.
  - F-measure: The harmonic mean of precision and recall.

  Args:
    candidate: The generated text to be evaluated.
    reference: The ground-truth text to compare against.

  Returns:
    The ROUGE-1 Score object (a named tuple) with precision, recall, and
    fmeasure fields.
  """
  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)

  # The score method returns a dictionary where keys are the ROUGE types
  # and values are Score objects (tuples) with precision, recall, and fmeasure.
  scores = scorer.score(reference, candidate)

  return scores["rouge1"]
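
For intuition, the numbers produced by _calculate_rouge_1_scores can be reproduced with the rouge_score package directly. The following is a minimal standalone sketch, not part of this commit, that walks through the unigram arithmetic for the candidate/reference pair used in the unit tests further below:

# Standalone sketch (not part of this commit): reproduce the ROUGE-1 arithmetic
# described in the docstring above, using the same rouge_score API.
from rouge_score import rouge_scorer

candidate = "This is a test candidate response."
reference = "This is a test reference."

# Stemmed unigrams: candidate -> {this, is, a, test, candid, respons}  (6)
#                   reference -> {this, is, a, test, refer}            (5)
# Overlap = 4, so precision = 4/6, recall = 4/5, and
# F1 = 2PR / (P + R) = 8/11.
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
score = scorer.score(reference, candidate)["rouge1"]

assert abs(score.precision - 2 / 3) < 1e-6
assert abs(score.recall - 4 / 5) < 1e-6
assert abs(score.fmeasure - 8 / 11) < 1e-6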

src/google/adk/evaluation/response_evaluator.py

Lines changed: 12 additions & 1 deletion
@@ -27,10 +27,12 @@

 from .eval_case import IntermediateData
 from .eval_case import Invocation
+from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
+from .final_response_match_v1 import RougeEvaluator


 class ResponseEvaluator(Evaluator):

@@ -40,7 +42,7 @@ def __init__(self, threshold: float, metric_name: str):
     if "response_evaluation_score" == metric_name:
       self._metric_name = MetricPromptTemplateExamples.Pointwise.COHERENCE
     elif "response_match_score" == metric_name:
-      self._metric_name = "rouge_1"
+      self._metric_name = "response_match_score"
     else:
       raise ValueError(f"`{metric_name}` is not supported.")

@@ -52,6 +54,15 @@ def evaluate_invocations(
       actual_invocations: list[Invocation],
       expected_invocations: list[Invocation],
   ) -> EvaluationResult:
+    # If the metric is response_match_score, just use the RougeEvaluator.
+    if self._metric_name == "response_match_score":
+      rouge_evaluator = RougeEvaluator(
+          EvalMetric(metric_name=self._metric_name, threshold=self._threshold)
+      )
+      return rouge_evaluator.evaluate_invocations(
+          actual_invocations, expected_invocations
+      )
+
     total_score = 0.0
     num_invocations = 0
     per_invocation_results = []
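
With this change, the "response_match_score" metric no longer maps to the "rouge_1" metric name inside ResponseEvaluator; it is computed locally by delegating to RougeEvaluator. A minimal sketch, not part of this commit and assuming only the classes shown in these diffs, of the two now-equivalent entry points:

# Sketch (not part of this commit): both entry points below compute the same
# local ROUGE-1 score for the "response_match_score" metric.
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
from google.adk.evaluation.response_evaluator import ResponseEvaluator

# Direct use of the new evaluator.
rouge = RougeEvaluator(
    EvalMetric(metric_name="response_match_score", threshold=0.8)
)

# Via ResponseEvaluator, which now delegates to RougeEvaluator for this metric.
delegating = ResponseEvaluator(
    threshold=0.8, metric_name="response_match_score"
)

# Given the same lists of actual and expected Invocation objects, the two calls
# below are expected to return the same EvaluationResult:
#   rouge.evaluate_invocations(actual_invocations, expected_invocations)
#   delegating.evaluate_invocations(actual_invocations, expected_invocations)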
tests/unittests/evaluation/test_final_response_match_v1.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
from google.genai import types as genai_types
import pytest


def _create_test_rouge_evaluator(threshold: float) -> RougeEvaluator:
  return RougeEvaluator(
      EvalMetric(metric_name="response_match_score", threshold=threshold)
  )


def _create_test_invocations(
    candidate: str, reference: str
) -> tuple[Invocation, Invocation]:
  """Returns tuple of (actual_invocation, expected_invocation)."""
  return Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text="This is a test query.")]
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text=candidate)]
      ),
  ), Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text="This is a test query.")]
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text=reference)]
      ),
  )


def test_calculate_rouge_1_scores_empty_candidate_and_reference():
  candidate = ""
  reference = ""
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == 0
  assert rouge_1_score.recall == 0
  assert rouge_1_score.fmeasure == 0


def test_calculate_rouge_1_scores_empty_candidate():
  candidate = ""
  reference = "This is a test reference."
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == 0
  assert rouge_1_score.recall == 0
  assert rouge_1_score.fmeasure == 0


def test_calculate_rouge_1_scores_empty_reference():
  candidate = "This is a test candidate response."
  reference = ""
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == 0
  assert rouge_1_score.recall == 0
  assert rouge_1_score.fmeasure == 0


def test_calculate_rouge_1_scores():
  candidate = "This is a test candidate response."
  reference = "This is a test reference."
  rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
  assert rouge_1_score.precision == pytest.approx(2 / 3)
  assert rouge_1_score.recall == pytest.approx(4 / 5)
  assert rouge_1_score.fmeasure == pytest.approx(8 / 11)


@pytest.mark.parametrize(
    "candidates, references, expected_score, expected_status",
    [
        (
            ["The quick brown fox jumps.", "hello world"],
            ["The quick brown fox jumps over the lazy dog.", "hello"],
            0.69048,  # (5/7 + 2/3) / 2
            EvalStatus.FAILED,
        ),
        (
            ["This is a test.", "Another test case."],
            ["This is a test.", "This is a different test."],
            0.625,  # (1 + 1/4) / 2
            EvalStatus.FAILED,
        ),
        (
            ["No matching words here.", "Second candidate."],
            ["Completely different text.", "Another reference."],
            0.0,  # (0 + 0) / 2
            EvalStatus.FAILED,
        ),
        (
            ["Same words", "Same words"],
            ["Same words", "Same words"],
            1.0,
            EvalStatus.PASSED,
        ),
    ],
)
def test_rouge_evaluator_multiple_invocations(
    candidates: list[str],
    references: list[str],
    expected_score: float,
    expected_status: EvalStatus,
):
  rouge_evaluator = _create_test_rouge_evaluator(threshold=0.8)
  actual_invocations = []
  expected_invocations = []
  for candidate, reference in zip(candidates, references):
    actual_invocation, expected_invocation = _create_test_invocations(
        candidate, reference
    )
    actual_invocations.append(actual_invocation)
    expected_invocations.append(expected_invocation)

  evaluation_result = rouge_evaluator.evaluate_invocations(
      actual_invocations, expected_invocations
  )
  assert evaluation_result.overall_score == pytest.approx(
      expected_score, rel=1e-3
  )
  assert evaluation_result.overall_eval_status == expected_status
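
The per-case comments in the parametrized test can be verified by hand. For example, the first case averages the ROUGE-1 F-measures of its two invocation pairs; a standalone arithmetic check, not part of this commit:

# Standalone arithmetic check for the first parametrized case above:
# "The quick brown fox jumps." vs "The quick brown fox jumps over the lazy dog."
#   overlap = 5 unigrams, precision = 5/5, recall = 5/9  -> F1 = 5/7
# "hello world" vs "hello"
#   overlap = 1 unigram, precision = 1/2, recall = 1/1   -> F1 = 2/3
expected_average = (5 / 7 + 2 / 3) / 2
assert round(expected_average, 5) == 0.69048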

tests/unittests/evaluation/test_response_evaluator.py

Lines changed: 38 additions & 1 deletion
@@ -16,7 +16,10 @@
 from unittest.mock import MagicMock
 from unittest.mock import patch

+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.response_evaluator import ResponseEvaluator
+from google.genai import types as genai_types
 import pandas as pd
 import pytest
 from vertexai.preview.evaluation import MetricPromptTemplateExamples

@@ -63,7 +66,7 @@
     "google.adk.evaluation.response_evaluator.ResponseEvaluator._perform_eval"
 )
 class TestResponseEvaluator:
-  """A class to help organize "patch" that are applicabple to all tests."""
+  """A class to help organize "patch" that are applicable to all tests."""

   def test_evaluate_none_dataset_raises_value_error(self, mock_perform_eval):
     """Test evaluate function raises ValueError for an empty list."""

@@ -77,6 +80,40 @@ def test_evaluate_empty_dataset_raises_value_error(self, mock_perform_eval):
     ResponseEvaluator.evaluate([], ["response_evaluation_score"])
     mock_perform_eval.assert_not_called()  # Ensure _perform_eval was not called

+  def test_evaluate_invocations_rouge_metric(self, mock_perform_eval):
+    """Test evaluate_invocations function for Rouge metric."""
+    actual_invocations = [
+        Invocation(
+            user_content=genai_types.Content(
+                parts=[genai_types.Part(text="This is a test query.")]
+            ),
+            final_response=genai_types.Content(
+                parts=[
+                    genai_types.Part(text="This is a test candidate response.")
+                ]
+            ),
+        )
+    ]
+    expected_invocations = [
+        Invocation(
+            user_content=genai_types.Content(
+                parts=[genai_types.Part(text="This is a test query.")]
+            ),
+            final_response=genai_types.Content(
+                parts=[genai_types.Part(text="This is a test reference.")]
+            ),
+        )
+    ]
+    evaluator = ResponseEvaluator(
+        threshold=0.8, metric_name="response_match_score"
+    )
+    evaluation_result = evaluator.evaluate_invocations(
+        actual_invocations, expected_invocations
+    )
+    assert evaluation_result.overall_score == pytest.approx(8 / 11)
+    # ROUGE-1 F1 is approx. 0.73 < 0.8 threshold, so eval status is FAILED.
+    assert evaluation_result.overall_eval_status == EvalStatus.FAILED
+
   def test_evaluate_determines_metrics_correctly_for_perform_eval(
       self, mock_perform_eval
   ):
