
Commit 993f1db

Merge branch 'ODSC-61884/global_evaluation_config' of https://github.com/oracle/accelerated-data-science into ODSC-61986/evaluation_supported_shapes
2 parents: b868f2d + bbda290

5 files changed: +41 additions, -96 deletions


ads/aqua/config/evaluation/evaluation_service_config.py
Lines changed: 11 additions & 28 deletions

@@ -10,14 +10,6 @@
 
 from ads.aqua.config.utils.serializer import Serializable
 
-# Constants
-INFERENCE_RPS = 25  # Max RPS for inferencing deployed model.
-INFERENCE_TIMEOUT = 120
-INFERENCE_MAX_THREADS = 10  # Maximum parallel threads for model inference.
-INFERENCE_RETRIES = 3
-INFERENCE_BACKOFF_FACTOR = 3
-INFERENCE_DELAY = 0
-
 
 class ModelParamsOverrides(Serializable):
     """Defines overrides for model parameters, including exclusions and additional inclusions."""

@@ -54,13 +46,6 @@ class Config:
 class InferenceParams(Serializable):
     """Contains inference-related parameters with defaults."""
 
-    inference_rps: Optional[int] = INFERENCE_RPS
-    inference_timeout: Optional[int] = INFERENCE_TIMEOUT
-    inference_max_threads: Optional[int] = INFERENCE_MAX_THREADS
-    inference_retries: Optional[int] = INFERENCE_RETRIES
-    inference_backoff_factor: Optional[float] = INFERENCE_BACKOFF_FACTOR
-    inference_delay: Optional[float] = INFERENCE_DELAY
-
     class Config:
         extra = "allow"
 
@@ -224,20 +209,18 @@ def search_shapes(
         -------
            List[ShapeConfig]: A list of shapes that match the filters.
         """
-        results = []
-        for shape in self.shapes:
-            if (
-                evaluation_container
-                and evaluation_container not in shape.filter.evaluation_container
-            ):
-                continue
+        return [
+            shape
+            for shape in self.shapes
             if (
-                evaluation_target
-                and evaluation_target not in shape.filter.evaluation_target
-            ):
-                continue
-            results.append(shape)
-        return results
+                not evaluation_container
+                or evaluation_container in shape.filter.evaluation_container
+            )
+            and (
+                not evaluation_target
+                or evaluation_target in shape.filter.evaluation_target
+            )
+        ]
 
     class Config:
         extra = "ignore"

ads/aqua/evaluation/evaluation.py
Lines changed: 1 addition & 41 deletions

@@ -930,48 +930,8 @@ def get_status(self, eval_id: str) -> dict:
 
     def get_supported_metrics(self) -> dict:
         """Gets a list of supported metrics for evaluation."""
-        # TODO: implement it when starting to support more metrics.
         return [
-            {
-                "use_case": ["text_generation"],
-                "key": "bertscore",
-                "name": "bertscore",
-                "description": (
-                    "BERT Score is a metric for evaluating the quality of text "
-                    "generation models, such as machine translation or summarization. "
-                    "It utilizes pre-trained BERT contextual embeddings for both the "
-                    "generated and reference texts, and then calculates the cosine "
-                    "similarity between these embeddings."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "rouge",
-                "name": "rouge",
-                "description": (
-                    "ROUGE scores compare a candidate document to a collection of "
-                    "reference documents to evaluate the similarity between them. "
-                    "The metrics range from 0 to 1, with higher scores indicating "
-                    "greater similarity. ROUGE is more suitable for models that don't "
-                    "include paraphrasing and do not generate new text units that don't "
-                    "appear in the references."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "bleu",
-                "name": "bleu",
-                "description": (
-                    "BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the "
-                    "quality of text which has been machine-translated from one natural language to another. "
-                    "Quality is considered to be the correspondence between a machine's output and that of a "
-                    "human: 'the closer a machine translation is to a professional human translation, "
-                    "the better it is'."
-                ),
-                "args": {},
-            },
+            item.to_dict() for item in evaluation_service_config().ui_config.metrics
        ]
 
     @telemetry(entry_point="plugin=evaluation&action=load_metrics", name="aqua")
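
With the hard-coded metric dictionaries removed, get_supported_metrics simply serializes whatever metrics the global service config advertises via evaluation_service_config(). A minimal sketch of the new data flow, built from the MetricConfig fields exercised in the updated test below; the example values are illustrative:

from ads.aqua.config.evaluation.evaluation_service_config import (
    EvaluationServiceConfig,
    MetricConfig,
    UIConfig,
)

# Construct a config the same way the unit test does.
config = EvaluationServiceConfig(
    ui_config=UIConfig(
        metrics=[
            MetricConfig(
                args={},
                description="BERT Score.",
                key="bertscore",
                name="BERT Score",
                tags=[],
                task=["text-generation"],
            )
        ]
    )
)

# get_supported_metrics() now reduces to this comprehension over the service config:
supported = [metric.to_dict() for metric in config.ui_config.metrics]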

tests/unitary/with_extras/aqua/test_data/config/evaluation_config_with_default_params.json
Lines changed: 1 addition & 8 deletions

@@ -5,14 +5,7 @@
   },
   "inference_params": {
     "containers": [],
-    "default": {
-      "inference_backoff_factor": 3,
-      "inference_delay": 0,
-      "inference_max_threads": 10,
-      "inference_retries": 3,
-      "inference_rps": 25,
-      "inference_timeout": 120
-    }
+    "default": {}
   },
   "kind": "evaluation_service_config",
   "report_params": {

tests/unitary/with_extras/aqua/test_evaluation.py
Lines changed: 28 additions & 8 deletions

@@ -24,6 +24,7 @@
 )
 from ads.aqua.config.evaluation.evaluation_service_config import (
     EvaluationServiceConfig,
+    MetricConfig,
     ModelParamsConfig,
     ShapeConfig,
     UIConfig,

@@ -302,7 +303,7 @@ class TestDataset:
             "category": null,
             "description": null,
             "key": "Hyperparameters",
-            "value": '{"model_params": {"max_tokens": 500, "top_p": 1, "top_k": 50, "temperature": 0.7, "presence_penalty": 0, "frequency_penalty": 0, "stop": [], "shape": "VM.Standard.E3.Flex", "dataset_path": "oci://mybucket@mytenancy/data.jsonl", "report_path": "oci://mybucket@mytenancy/report"}}',
+            "value": '{"model_params": {"model": "odsc-llm", "max_tokens": 500, "top_p": 1, "top_k": 50, "temperature": 0.7, "presence_penalty": 0, "frequency_penalty": 0, "stop": [], "shape": "VM.Standard.E3.Flex", "dataset_path": "oci://mybucket@mytenancy/data.jsonl", "report_path": "oci://mybucket@mytenancy/report"}}',
         },
         {
             "category": null,

@@ -506,6 +507,7 @@ def test_create_evaluation(
             "lifecycle_state": f"{evaluation_job_run.lifecycle_state}",
             "name": f"{evaluation_model.display_name}",
             "parameters": {
+                "model": "odsc-llm",
                 "dataset_path": "",
                 "frequency_penalty": 0.0,
                 "max_tokens": "",

@@ -881,17 +883,35 @@ def test_extract_job_lifecycle_details(self, input, expect_output):
         msg = self.app._extract_job_lifecycle_details(input)
         assert msg == expect_output, msg
 
-    def test_get_supported_metrics(self):
-        """Tests getting a list of supported metrics for evaluation.
-        This method currently hardcoded the return value.
+    @patch("ads.aqua.evaluation.evaluation.evaluation_service_config")
+    def test_get_supported_metrics(self, mock_evaluation_service_config):
+        """
+        Tests getting a list of supported metrics for evaluation.
         """
-        from .utils import SupportMetricsFormat as metric_schema
-        from .utils import check
 
+        test_evaluation_service_config = EvaluationServiceConfig(
+            ui_config=UIConfig(
+                metrics=[
+                    MetricConfig(
+                        **{
+                            "args": {},
+                            "description": "BERT Score.",
+                            "key": "bertscore",
+                            "name": "BERT Score",
+                            "tags": [],
+                            "task": ["text-generation"],
+                        },
+                    )
+                ]
+            )
+        )
+        mock_evaluation_service_config.return_value = test_evaluation_service_config
         response = self.app.get_supported_metrics()
         assert isinstance(response, list)
-        for metric in response:
-            assert check(metric_schema, metric)
+        assert len(response) == len(test_evaluation_service_config.ui_config.metrics)
+        assert response == [
+            item.to_dict() for item in test_evaluation_service_config.ui_config.metrics
+        ]
 
     @patch("ads.aqua.evaluation.evaluation.evaluation_service_config")
     def test_load_evaluation_config(self, mock_evaluation_service_config):

tests/unitary/with_extras/aqua/utils.py
Lines changed: 0 additions & 11 deletions

@@ -76,17 +76,6 @@ def __post_init__(self):
         )
 
 
-@dataclass
-class SupportMetricsFormat(BaseFormat):
-    """Format for supported evaluation metrics."""
-
-    use_case: list
-    key: str
-    name: str
-    description: str
-    args: dict
-
-
 def check(conf_schema, conf):
     """Check if the format of the output dictionary is correct."""
     try:
