
Commit 7a9d905 (2 parents: 3e7fd13 + 0efe6fd)

Adds supporting BLEU score metric for evaluation. (#784)

2 files changed: +44 -17 lines changed


ads/aqua/evaluation.py

Lines changed: 43 additions & 17 deletions
@@ -78,7 +78,7 @@ class EvaluationJobExitCode(Enum):
     SUCCESS = 0
     COMMON_ERROR = 1
 
-    # Configuration-related issues
+    # Configuration-related issues 10-19
     INVALID_EVALUATION_CONFIG = 10
     EVALUATION_CONFIG_NOT_PROVIDED = 11
     INVALID_OUTPUT_DIR = 12
@@ -87,7 +87,7 @@ class EvaluationJobExitCode(Enum):
     INVALID_TARGET_EVALUATION_ID = 15
     INVALID_EVALUATION_CONFIG_VALIDATION = 16
 
-    # Evaluation process issues
+    # Evaluation process issues 20-39
     OUTPUT_DIR_NOT_FOUND = 20
     INVALID_INPUT_DATASET = 21
     INPUT_DATA_NOT_FOUND = 22
@@ -100,6 +100,7 @@ class EvaluationJobExitCode(Enum):
     MODEL_INFERENCE_WRONG_RESPONSE_FORMAT = 29
     UNSUPPORTED_METRICS = 30
     METRIC_CALCULATION_FAILURE = 31
+    EVALUATION_MODEL_CATALOG_RECORD_CREATION_FAILED = 32
 
 
 EVALUATION_JOB_EXIT_CODE_MESSAGE = {
@@ -124,6 +125,11 @@ class EvaluationJobExitCode(Enum):
     EvaluationJobExitCode.MODEL_INFERENCE_WRONG_RESPONSE_FORMAT.value: "Evaluation encountered unsupported, or unexpected model output, verify the target evaluation model is compatible and produces the correct format.",
     EvaluationJobExitCode.UNSUPPORTED_METRICS.value: "None of the provided metrics are supported by the framework.",
     EvaluationJobExitCode.METRIC_CALCULATION_FAILURE.value: "All attempted metric calculations were unsuccessful. Please review the metric configurations and input data.",
+    EvaluationJobExitCode.EVALUATION_MODEL_CATALOG_RECORD_CREATION_FAILED.value: (
+        "Failed to create a Model Catalog record for the evaluation. "
+        "This could be due to missing required permissions. "
+        "Please check the log for more information."
+    ),
 }
 
 
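For orientation, not part of the diff: a minimal sketch of how the new exit code resolves to its user-facing message, assuming EvaluationJobExitCode and EVALUATION_JOB_EXIT_CODE_MESSAGE are importable from ads.aqua.evaluation; the helper below is hypothetical.

# Hypothetical usage sketch, not from this commit.
from ads.aqua.evaluation import EVALUATION_JOB_EXIT_CODE_MESSAGE, EvaluationJobExitCode

def exit_code_message(exit_code: int) -> str:
    # Fall back to a generic message for codes without a dedicated entry.
    return EVALUATION_JOB_EXIT_CODE_MESSAGE.get(
        exit_code, f"Evaluation job exited with code {exit_code}."
    )

# The new code 32 maps to the Model Catalog record creation failure message.
print(exit_code_message(EvaluationJobExitCode.EVALUATION_MODEL_CATALOG_RECORD_CREATION_FAILED.value))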
@@ -849,13 +855,17 @@ def get(self, eval_id) -> AquaEvaluationDetail:
             loggroup_id = ""
 
         loggroup_url = get_log_links(region=self.region, log_group_id=loggroup_id)
-        log_url = get_log_links(
-            region=self.region,
-            log_group_id=loggroup_id,
-            log_id=log_id,
-            compartment_id=job_run_details.compartment_id,
-            source_id=jobrun_id
-        ) if job_run_details else ""
+        log_url = (
+            get_log_links(
+                region=self.region,
+                log_group_id=loggroup_id,
+                log_id=log_id,
+                compartment_id=job_run_details.compartment_id,
+                source_id=jobrun_id,
+            )
+            if job_run_details
+            else ""
+        )
 
         log_name = None
         loggroup_name = None
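The log_url change above (repeated in get_status below) only reshapes an existing conditional expression: it is wrapped in parentheses so the call, the if, and the else each sit on their own line. A standalone illustration of the pattern, with made-up names that are not from the diff:

# Illustrative only; names and URL are placeholders, not from this commit.
job_run_details = {"compartment_id": "ocid1.compartment.example"}  # or None when no job run exists

log_url = (
    f"https://console.example.com/logs?compartment={job_run_details['compartment_id']}"
    if job_run_details
    else ""
)
print(log_url)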
@@ -931,7 +941,6 @@ def list(
         evaluations = []
         async_tasks = []
         for model in models:
-
             if model.identifier in self._eval_cache.keys():
                 logger.debug(f"Retrieving evaluation {model.identifier} from cache.")
                 evaluations.append(self._eval_cache.get(model.identifier))
@@ -1049,13 +1058,17 @@ def get_status(self, eval_id: str) -> dict:
             loggroup_id = ""
 
         loggroup_url = get_log_links(region=self.region, log_group_id=loggroup_id)
-        log_url = get_log_links(
-            region=self.region,
-            log_group_id=loggroup_id,
-            log_id=log_id,
-            compartment_id=job_run_details.compartment_id,
-            source_id=jobrun_id
-        ) if job_run_details else ""
+        log_url = (
+            get_log_links(
+                region=self.region,
+                log_group_id=loggroup_id,
+                log_id=log_id,
+                compartment_id=job_run_details.compartment_id,
+                source_id=jobrun_id,
+            )
+            if job_run_details
+            else ""
+        )
 
         return dict(
             id=eval_id,
@@ -1100,6 +1113,19 @@ def get_supported_metrics(self) -> dict:
                 ),
                 "args": {},
             },
+            {
+                "use_case": ["text_generation"],
+                "key": "bleu",
+                "name": "bleu",
+                "description": (
+                    "BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the "
+                    "quality of text which has been machine-translated from one natural language to another. "
+                    "Quality is considered to be the correspondence between a machine's output and that of a "
+                    "human: 'the closer a machine translation is to a professional human translation, "
+                    "the better it is'."
+                ),
+                "args": {},
+            },
         ]
 
     @telemetry(entry_point="plugin=evaluation&action=load_metrics", name="aqua")
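The diff registers "bleu" as a supported text_generation metric but does not show how the score is computed. For illustration only, sentence-level BLEU can be computed with NLTK; this is an assumed dependency for the example, not one introduced by this PR.

# Illustration only; the framework's actual BLEU implementation is not shown in this commit.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

references = ["the cat sits on the mat".split()]  # one or more tokenized reference texts
candidate = "the cat is on the mat".split()       # tokenized model output

score = sentence_bleu(
    references,
    candidate,
    smoothing_function=SmoothingFunction().method1,  # smoothing avoids zero scores on short texts
)
print(f"BLEU: {score:.3f}")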

ads/config.py

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@
     "AQUA_TELEMETRY_BUCKET", "service-managed-models"
 )
 AQUA_TELEMETRY_BUCKET_NS = os.environ.get("AQUA_TELEMETRY_BUCKET_NS", CONDA_BUCKET_NS)
+
 DEBUG_TELEMETRY = os.environ.get("DEBUG_TELEMETRY", None)
 AQUA_SERVICE_NAME = "aqua"
 DATA_SCIENCE_SERVICE_NAME = "data-science"