@@ -78,7 +78,7 @@ class EvaluationJobExitCode(Enum):
78
78
SUCCESS = 0
79
79
COMMON_ERROR = 1
80
80
81
- # Configuration-related issues
81
+ # Configuration-related issues 10-19
82
82
INVALID_EVALUATION_CONFIG = 10
83
83
EVALUATION_CONFIG_NOT_PROVIDED = 11
84
84
INVALID_OUTPUT_DIR = 12
@@ -87,7 +87,7 @@ class EvaluationJobExitCode(Enum):
87
87
INVALID_TARGET_EVALUATION_ID = 15
88
88
INVALID_EVALUATION_CONFIG_VALIDATION = 16
89
89
90
- # Evaluation process issues
90
+ # Evaluation process issues 20-39
91
91
OUTPUT_DIR_NOT_FOUND = 20
92
92
INVALID_INPUT_DATASET = 21
93
93
INPUT_DATA_NOT_FOUND = 22
@@ -100,6 +100,7 @@ class EvaluationJobExitCode(Enum):
100
100
MODEL_INFERENCE_WRONG_RESPONSE_FORMAT = 29
101
101
UNSUPPORTED_METRICS = 30
102
102
METRIC_CALCULATION_FAILURE = 31
103
+ EVALUATION_MODEL_CATALOG_RECORD_CREATION_FAILED = 32
103
104
104
105
105
106
EVALUATION_JOB_EXIT_CODE_MESSAGE = {
@@ -124,6 +125,11 @@ class EvaluationJobExitCode(Enum):
124
125
EvaluationJobExitCode .MODEL_INFERENCE_WRONG_RESPONSE_FORMAT .value : "Evaluation encountered unsupported, or unexpected model output, verify the target evaluation model is compatible and produces the correct format." ,
125
126
EvaluationJobExitCode .UNSUPPORTED_METRICS .value : "None of the provided metrics are supported by the framework." ,
126
127
EvaluationJobExitCode .METRIC_CALCULATION_FAILURE .value : "All attempted metric calculations were unsuccessful. Please review the metric configurations and input data." ,
128
+ EvaluationJobExitCode .EVALUATION_MODEL_CATALOG_RECORD_CREATION_FAILED .value : (
129
+ "Failed to create a Model Catalog record for the evaluation. "
130
+ "This could be due to missing required permissions. "
131
+ "Please check the log for more information."
132
+ ),
127
133
}
128
134
129
135
@@ -849,13 +855,17 @@ def get(self, eval_id) -> AquaEvaluationDetail:
849
855
loggroup_id = ""
850
856
851
857
loggroup_url = get_log_links (region = self .region , log_group_id = loggroup_id )
852
- log_url = get_log_links (
853
- region = self .region ,
854
- log_group_id = loggroup_id ,
855
- log_id = log_id ,
856
- compartment_id = job_run_details .compartment_id ,
857
- source_id = jobrun_id
858
- ) if job_run_details else ""
858
+ log_url = (
859
+ get_log_links (
860
+ region = self .region ,
861
+ log_group_id = loggroup_id ,
862
+ log_id = log_id ,
863
+ compartment_id = job_run_details .compartment_id ,
864
+ source_id = jobrun_id ,
865
+ )
866
+ if job_run_details
867
+ else ""
868
+ )
859
869
860
870
log_name = None
861
871
loggroup_name = None
@@ -931,7 +941,6 @@ def list(
931
941
evaluations = []
932
942
async_tasks = []
933
943
for model in models :
934
-
935
944
if model .identifier in self ._eval_cache .keys ():
936
945
logger .debug (f"Retrieving evaluation { model .identifier } from cache." )
937
946
evaluations .append (self ._eval_cache .get (model .identifier ))
@@ -1049,13 +1058,17 @@ def get_status(self, eval_id: str) -> dict:
1049
1058
loggroup_id = ""
1050
1059
1051
1060
loggroup_url = get_log_links (region = self .region , log_group_id = loggroup_id )
1052
- log_url = get_log_links (
1053
- region = self .region ,
1054
- log_group_id = loggroup_id ,
1055
- log_id = log_id ,
1056
- compartment_id = job_run_details .compartment_id ,
1057
- source_id = jobrun_id
1058
- ) if job_run_details else ""
1061
+ log_url = (
1062
+ get_log_links (
1063
+ region = self .region ,
1064
+ log_group_id = loggroup_id ,
1065
+ log_id = log_id ,
1066
+ compartment_id = job_run_details .compartment_id ,
1067
+ source_id = jobrun_id ,
1068
+ )
1069
+ if job_run_details
1070
+ else ""
1071
+ )
1059
1072
1060
1073
return dict (
1061
1074
id = eval_id ,
@@ -1100,6 +1113,19 @@ def get_supported_metrics(self) -> dict:
1100
1113
),
1101
1114
"args" : {},
1102
1115
},
1116
+ {
1117
+ "use_case" : ["text_generation" ],
1118
+ "key" : "bleu" ,
1119
+ "name" : "bleu" ,
1120
+ "description" : (
1121
+ "BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the "
1122
+ "quality of text which has been machine-translated from one natural language to another. "
1123
+ "Quality is considered to be the correspondence between a machine's output and that of a "
1124
+ "human: 'the closer a machine translation is to a professional human translation, "
1125
+ "the better it is'."
1126
+ ),
1127
+ "args" : {},
1128
+ },
1103
1129
]
1104
1130
1105
1131
@telemetry (entry_point = "plugin=evaluation&action=load_metrics" , name = "aqua" )
0 commit comments