     is_valid_ocid,
     upload_local_to_os,
 )
+from ads.aqua.config.config import evaluation_service_config
 from ads.aqua.constants import (
     CONSOLE_LINK_RESOURCE_TYPE_MAPPING,
     EVALUATION_REPORT,

@@ -191,7 +192,7 @@ def create(
             enable_spec=True
         ).inference
         for container in inference_config.values():
-            if container.name == runtime.image[:runtime.image.rfind(":")]:
+            if container.name == runtime.image[: runtime.image.rfind(":")]:
                 eval_inference_configuration = (
                     container.spec.evaluation_configuration
                 )

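The only change in this hunk is formatter spacing inside the slice; behaviorally, the expression still strips the version tag from the runtime image before comparing it with the container name. A minimal standalone sketch of that slice, using a made-up image string rather than anything from the diff:

```python
# Hypothetical image reference with a version tag (illustrative only).
image = "odsc-vllm-serving:0.4.1"

# rfind(":") returns the index of the last colon, so the slice keeps
# everything before the tag; this is what gets compared to container.name.
untagged = image[: image.rfind(":")]
print(untagged)  # odsc-vllm-serving

# Note: if the string had no colon, rfind() would return -1 and the slice
# would silently drop the last character; the diff assumes a tagged image.
```
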
@@ -416,9 +417,11 @@ def create(
                 report_path=create_aqua_evaluation_details.report_path,
                 model_parameters=create_aqua_evaluation_details.model_parameters,
                 metrics=create_aqua_evaluation_details.metrics,
-                inference_configuration=eval_inference_configuration.to_filtered_dict()
-                if eval_inference_configuration
-                else {},
+                inference_configuration=(
+                    eval_inference_configuration.to_filtered_dict()
+                    if eval_inference_configuration
+                    else {}
+                ),
             )
         ).create(**kwargs)  ## TODO: decide what parameters will be needed
         logger.debug(

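Wrapping the conditional expression in parentheses is a pure formatting change; the keyword argument still receives the filtered configuration when one was found and an empty dict otherwise. A small self-contained sketch of that pattern, with a stand-in object in place of the real `container.spec.evaluation_configuration`:

```python
class EvalConfigStub:
    """Illustrative stand-in for the real evaluation configuration object."""

    def to_filtered_dict(self) -> dict:
        # The real method presumably drops unset fields; a fixed dict is
        # returned here only to show the shape of the call site.
        return {"example_key": "example_value"}


for eval_inference_configuration in (EvalConfigStub(), None):
    inference_configuration = (
        eval_inference_configuration.to_filtered_dict()
        if eval_inference_configuration
        else {}
    )
    print(inference_configuration)
# {'example_key': 'example_value'}
# {}
```
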
@@ -901,48 +904,8 @@ def get_status(self, eval_id: str) -> dict:

     def get_supported_metrics(self) -> dict:
         """Gets a list of supported metrics for evaluation."""
-        # TODO: implement it when starting to support more metrics.
         return [
-            {
-                "use_case": ["text_generation"],
-                "key": "bertscore",
-                "name": "bertscore",
-                "description": (
-                    "BERT Score is a metric for evaluating the quality of text "
-                    "generation models, such as machine translation or summarization. "
-                    "It utilizes pre-trained BERT contextual embeddings for both the "
-                    "generated and reference texts, and then calculates the cosine "
-                    "similarity between these embeddings."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "rouge",
-                "name": "rouge",
-                "description": (
-                    "ROUGE scores compare a candidate document to a collection of "
-                    "reference documents to evaluate the similarity between them. "
-                    "The metrics range from 0 to 1, with higher scores indicating "
-                    "greater similarity. ROUGE is more suitable for models that don't "
-                    "include paraphrasing and do not generate new text units that don't "
-                    "appear in the references."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "bleu",
-                "name": "bleu",
-                "description": (
-                    "BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the "
-                    "quality of text which has been machine-translated from one natural language to another. "
-                    "Quality is considered to be the correspondence between a machine's output and that of a "
-                    "human: 'the closer a machine translation is to a professional human translation, "
-                    "the better it is'."
-                ),
-                "args": {},
-            },
+            item.to_dict() for item in evaluation_service_config().ui_config.metrics
         ]

     @telemetry(entry_point="plugin=evaluation&action=load_metrics", name="aqua")

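This is the substantive change in the PR: the three hardcoded metric entries (bertscore, rouge, bleu) are replaced by whatever the evaluation service config publishes through `evaluation_service_config().ui_config.metrics`, with each item serialized via `to_dict()`. (The `-> dict` return annotation still does not match the list being returned, but that predates this change.) Below is a self-contained sketch of the pattern only; `MetricConfig` and its fields are assumptions mirroring the removed dicts, not the real classes in `ads.aqua.config.config`:

```python
from dataclasses import asdict, dataclass, field
from typing import Dict, List


@dataclass
class MetricConfig:
    """Hypothetical stand-in for one entry of ui_config.metrics."""

    use_case: List[str]
    key: str
    name: str
    description: str
    args: Dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        return asdict(self)


# Stand-in for evaluation_service_config().ui_config.metrics
metrics_from_config = [
    MetricConfig(["text_generation"], "bleu", "bleu", "Measures n-gram overlap."),
    MetricConfig(["text_generation"], "rouge", "rouge", "Measures n-gram recall."),
]

# Same list comprehension as the new get_supported_metrics() body.
supported = [item.to_dict() for item in metrics_from_config]
print(supported[0]["key"])  # bleu
```

The apparent intent is that adding or editing a metric now only requires a change on the evaluation service config side rather than a code change in this client.
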
@@ -1225,7 +1188,7 @@ def _delete_job_and_model(job, model):
                 f"Exception message: {ex}"
             )

-    def load_evaluation_config(self, eval_id):
+    def load_evaluation_config(self):
         """Loads evaluation config."""
         return {
             "model_params": {

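The `eval_id` parameter is dropped from `load_evaluation_config`; nothing in the visible body reads it, since the method returns static defaults. Callers only need to stop passing the argument. A tiny sketch of the call-site impact, with a made-up class name and config values (the real dict continues past the `"model_params"` key shown in the diff):

```python
class EvaluationAppSketch:
    """Illustrative stand-in for the class that owns load_evaluation_config."""

    def load_evaluation_config(self) -> dict:
        # Static defaults, so no per-evaluation id is needed (values made up).
        return {"model_params": {"max_tokens": 500}}


app = EvaluationAppSketch()
config = app.load_evaluation_config()  # before this PR: load_evaluation_config(eval_id)
print(config["model_params"])
```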