
Commit bbda290

[AQUA][Evaluate] Externalize Supported Evaluation Metrics List to Service Config (#941)
2 parents: ea10a42 + 44814b4

File tree

3 files changed: +39 -64 lines

ads/aqua/evaluation/evaluation.py

Lines changed: 9 additions & 46 deletions
@@ -45,6 +45,7 @@
     is_valid_ocid,
     upload_local_to_os,
 )
+from ads.aqua.config.config import evaluation_service_config
 from ads.aqua.constants import (
     CONSOLE_LINK_RESOURCE_TYPE_MAPPING,
     EVALUATION_REPORT,
@@ -191,7 +192,7 @@ def create(
             enable_spec=True
         ).inference
         for container in inference_config.values():
-            if container.name == runtime.image[:runtime.image.rfind(":")]:
+            if container.name == runtime.image[: runtime.image.rfind(":")]:
                 eval_inference_configuration = (
                     container.spec.evaluation_configuration
                 )
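
The change above is formatting only (a space after the slice colon, in Black style); the expression itself strips the version tag from the container image reference before comparing it with the container family name. A minimal sketch of what the slice does, using a hypothetical image string:

    # Hypothetical image reference; the real value comes from the model runtime.
    image = "dsmc://odsc-llm-evaluate:0.1.3"
    # rfind(":") locates the last colon, so the slice keeps everything before the tag.
    assert image[: image.rfind(":")] == "dsmc://odsc-llm-evaluate"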
@@ -416,9 +417,11 @@ def create(
                 report_path=create_aqua_evaluation_details.report_path,
                 model_parameters=create_aqua_evaluation_details.model_parameters,
                 metrics=create_aqua_evaluation_details.metrics,
-                inference_configuration=eval_inference_configuration.to_filtered_dict()
-                if eval_inference_configuration
-                else {},
+                inference_configuration=(
+                    eval_inference_configuration.to_filtered_dict()
+                    if eval_inference_configuration
+                    else {}
+                ),
             )
         ).create(**kwargs)  ## TODO: decide what parameters will be needed
         logger.debug(
@@ -901,48 +904,8 @@ def get_status(self, eval_id: str) -> dict:
 
     def get_supported_metrics(self) -> dict:
         """Gets a list of supported metrics for evaluation."""
-        # TODO: implement it when starting to support more metrics.
         return [
-            {
-                "use_case": ["text_generation"],
-                "key": "bertscore",
-                "name": "bertscore",
-                "description": (
-                    "BERT Score is a metric for evaluating the quality of text "
-                    "generation models, such as machine translation or summarization. "
-                    "It utilizes pre-trained BERT contextual embeddings for both the "
-                    "generated and reference texts, and then calculates the cosine "
-                    "similarity between these embeddings."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "rouge",
-                "name": "rouge",
-                "description": (
-                    "ROUGE scores compare a candidate document to a collection of "
-                    "reference documents to evaluate the similarity between them. "
-                    "The metrics range from 0 to 1, with higher scores indicating "
-                    "greater similarity. ROUGE is more suitable for models that don't "
-                    "include paraphrasing and do not generate new text units that don't "
-                    "appear in the references."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "bleu",
-                "name": "bleu",
-                "description": (
-                    "BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the "
-                    "quality of text which has been machine-translated from one natural language to another. "
-                    "Quality is considered to be the correspondence between a machine's output and that of a "
-                    "human: 'the closer a machine translation is to a professional human translation, "
-                    "the better it is'."
-                ),
-                "args": {},
-            },
+            item.to_dict() for item in evaluation_service_config().ui_config.metrics
         ]
 
     @telemetry(entry_point="plugin=evaluation&action=load_metrics", name="aqua")
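
This is the core of the change: instead of a hardcoded list of metric dictionaries, get_supported_metrics now serializes whatever the service config exposes under ui_config.metrics. A minimal sketch of the new flow, reusing the config classes exercised by the updated test below; the metric values are illustrative, not the shipped service config:

    from ads.aqua.config.evaluation.evaluation_service_config import (
        EvaluationServiceConfig,
        MetricConfig,
        UIConfig,
    )

    # Illustrative config; in production it is loaded via evaluation_service_config().
    config = EvaluationServiceConfig(
        ui_config=UIConfig(
            metrics=[
                MetricConfig(
                    key="bertscore",
                    name="BERT Score",
                    description="BERT Score.",
                    args={},
                    tags=[],
                    task=["text-generation"],
                )
            ]
        )
    )

    # Mirrors the new body of get_supported_metrics().
    supported = [item.to_dict() for item in config.ui_config.metrics]

With this shape, adding or retiring a metric becomes a service-config update rather than a code change.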
@@ -1225,7 +1188,7 @@ def _delete_job_and_model(job, model):
                 f"Exception message: {ex}"
             )
 
-    def load_evaluation_config(self, eval_id):
+    def load_evaluation_config(self):
         """Loads evaluation config."""
         return {
             "model_params": {

tests/unitary/with_extras/aqua/test_evaluation.py

Lines changed: 30 additions & 7 deletions
@@ -22,6 +22,11 @@
     AquaMissingKeyError,
     AquaRuntimeError,
 )
+from ads.aqua.config.evaluation.evaluation_service_config import (
+    EvaluationServiceConfig,
+    MetricConfig,
+    UIConfig,
+)
 from ads.aqua.constants import EVALUATION_REPORT_JSON, EVALUATION_REPORT_MD, UNKNOWN
 from ads.aqua.evaluation import AquaEvaluationApp
 from ads.aqua.evaluation.entities import (
@@ -875,17 +880,35 @@ def test_extract_job_lifecycle_details(self, input, expect_output):
         msg = self.app._extract_job_lifecycle_details(input)
         assert msg == expect_output, msg
 
-    def test_get_supported_metrics(self):
-        """Tests getting a list of supported metrics for evaluation.
-        This method currently hardcoded the return value.
+    @patch("ads.aqua.evaluation.evaluation.evaluation_service_config")
+    def test_get_supported_metrics(self, mock_evaluation_service_config):
+        """
+        Tests getting a list of supported metrics for evaluation.
         """
-        from .utils import SupportMetricsFormat as metric_schema
-        from .utils import check
 
+        test_evaluation_service_config = EvaluationServiceConfig(
+            ui_config=UIConfig(
+                metrics=[
+                    MetricConfig(
+                        **{
+                            "args": {},
+                            "description": "BERT Score.",
+                            "key": "bertscore",
+                            "name": "BERT Score",
+                            "tags": [],
+                            "task": ["text-generation"],
+                        },
+                    )
+                ]
+            )
+        )
+        mock_evaluation_service_config.return_value = test_evaluation_service_config
         response = self.app.get_supported_metrics()
         assert isinstance(response, list)
-        for metric in response:
-            assert check(metric_schema, metric)
+        assert len(response) == len(test_evaluation_service_config.ui_config.metrics)
+        assert response == [
+            item.to_dict() for item in test_evaluation_service_config.ui_config.metrics
+        ]
 
     def test_load_evaluation_config(self):
         """Tests loading default config for evaluation.

tests/unitary/with_extras/aqua/utils.py

Lines changed: 0 additions & 11 deletions
@@ -76,17 +76,6 @@ def __post_init__(self):
         )
 
 
-@dataclass
-class SupportMetricsFormat(BaseFormat):
-    """Format for supported evaluation metrics."""
-
-    use_case: list
-    key: str
-    name: str
-    description: str
-    args: dict
-
-
 @dataclass
 class EvaluationConfigFormat(BaseFormat):
     """Evaluation config format."""
