     is_valid_ocid,
     upload_local_to_os,
 )
+from ads.aqua.config.config import evaluation_service_config
 from ads.aqua.constants import (
     CONSOLE_LINK_RESOURCE_TYPE_MAPPING,
     EVALUATION_REPORT,

@@ -191,7 +192,7 @@ def create(
             enable_spec=True
         ).inference
         for container in inference_config.values():
-            if container.name == runtime.image[:runtime.image.rfind(":")]:
+            if container.name == runtime.image[: runtime.image.rfind(":")]:
                 eval_inference_configuration = (
                     container.spec.evaluation_configuration
                 )

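The only change in this hunk is formatter spacing inside the slice; behaviorally, the expression still strips the version tag from the runtime image before comparing it with the container name. A minimal standalone sketch of that slice, using a made-up image string rather than anything from the diff:

```python
# Hypothetical image reference with a version tag (illustrative only).
image = "odsc-vllm-serving:0.4.1"

# rfind(":") returns the index of the last colon, so the slice keeps
# everything before the tag; this is what gets compared to container.name.
untagged = image[: image.rfind(":")]
print(untagged)  # odsc-vllm-serving

# Note: if the string had no colon, rfind() would return -1 and the slice
# would silently drop the last character; the diff assumes a tagged image.
```
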
@@ -416,9 +417,11 @@ def create(
                 report_path=create_aqua_evaluation_details.report_path,
                 model_parameters=create_aqua_evaluation_details.model_parameters,
                 metrics=create_aqua_evaluation_details.metrics,
-                inference_configuration=eval_inference_configuration.to_filtered_dict()
-                if eval_inference_configuration
-                else {},
+                inference_configuration=(
+                    eval_inference_configuration.to_filtered_dict()
+                    if eval_inference_configuration
+                    else {}
+                ),
             )
         ).create(**kwargs)  ## TODO: decide what parameters will be needed
         logger.debug(

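Wrapping the conditional expression in parentheses is a pure formatting change; the keyword argument still receives the filtered configuration when one was found and an empty dict otherwise. A small self-contained sketch of that pattern, with a stand-in object in place of the real `container.spec.evaluation_configuration`:

```python
class EvalConfigStub:
    """Illustrative stand-in for the real evaluation configuration object."""

    def to_filtered_dict(self) -> dict:
        # The real method presumably drops unset fields; a fixed dict is
        # returned here only to show the shape of the call site.
        return {"example_key": "example_value"}


for eval_inference_configuration in (EvalConfigStub(), None):
    inference_configuration = (
        eval_inference_configuration.to_filtered_dict()
        if eval_inference_configuration
        else {}
    )
    print(inference_configuration)
# {'example_key': 'example_value'}
# {}
```
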
@@ -901,48 +904,8 @@ def get_status(self, eval_id: str) -> dict:

     def get_supported_metrics(self) -> dict:
         """Gets a list of supported metrics for evaluation."""
-        # TODO: implement it when starting to support more metrics.
         return [
-            {
-                "use_case": ["text_generation"],
-                "key": "bertscore",
-                "name": "bertscore",
-                "description": (
-                    "BERT Score is a metric for evaluating the quality of text "
-                    "generation models, such as machine translation or summarization. "
-                    "It utilizes pre-trained BERT contextual embeddings for both the "
-                    "generated and reference texts, and then calculates the cosine "
-                    "similarity between these embeddings."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "rouge",
-                "name": "rouge",
-                "description": (
-                    "ROUGE scores compare a candidate document to a collection of "
-                    "reference documents to evaluate the similarity between them. "
-                    "The metrics range from 0 to 1, with higher scores indicating "
-                    "greater similarity. ROUGE is more suitable for models that don't "
-                    "include paraphrasing and do not generate new text units that don't "
-                    "appear in the references."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "bleu",
-                "name": "bleu",
-                "description": (
-                    "BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the "
-                    "quality of text which has been machine-translated from one natural language to another. "
-                    "Quality is considered to be the correspondence between a machine's output and that of a "
-                    "human: 'the closer a machine translation is to a professional human translation, "
-                    "the better it is'."
-                ),
-                "args": {},
-            },
+            item.to_dict() for item in evaluation_service_config().ui_config.metrics
         ]

     @telemetry(entry_point="plugin=evaluation&action=load_metrics", name="aqua")

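This is the substantive change in the PR: the three hardcoded metric entries (bertscore, rouge, bleu) are replaced by whatever the evaluation service config publishes through `evaluation_service_config().ui_config.metrics`, with each item serialized via `to_dict()`. (The `-> dict` return annotation still does not match the list being returned, but that predates this change.) Below is a self-contained sketch of the pattern only; `MetricConfig` and its fields are assumptions mirroring the removed dicts, not the real classes in `ads.aqua.config.config`:

```python
from dataclasses import asdict, dataclass, field
from typing import Dict, List


@dataclass
class MetricConfig:
    """Hypothetical stand-in for one entry of ui_config.metrics."""

    use_case: List[str]
    key: str
    name: str
    description: str
    args: Dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        return asdict(self)


# Stand-in for evaluation_service_config().ui_config.metrics
metrics_from_config = [
    MetricConfig(["text_generation"], "bleu", "bleu", "Measures n-gram overlap."),
    MetricConfig(["text_generation"], "rouge", "rouge", "Measures n-gram recall."),
]

# Same list comprehension as the new get_supported_metrics() body.
supported = [item.to_dict() for item in metrics_from_config]
print(supported[0]["key"])  # bleu
```

The apparent intent is that adding or editing a metric now only requires a change on the evaluation service config side rather than a code change in this client.
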
@@ -1225,7 +1188,7 @@ def _delete_job_and_model(job, model):
                 f"Exception message: {ex}"
             )

-    def load_evaluation_config(self, eval_id):
+    def load_evaluation_config(self):
         """Loads evaluation config."""
         return {
             "model_params": {

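The `eval_id` parameter is dropped from `load_evaluation_config`; nothing in the visible body reads it, since the method returns static defaults. Callers only need to stop passing the argument. A tiny sketch of the call-site impact, with a made-up class name and config values (the real dict continues past the `"model_params"` key shown in the diff):

```python
class EvaluationAppSketch:
    """Illustrative stand-in for the class that owns load_evaluation_config."""

    def load_evaluation_config(self) -> dict:
        # Static defaults, so no per-evaluation id is needed (values made up).
        return {"model_params": {"max_tokens": 500}}


app = EvaluationAppSketch()
config = app.load_evaluation_config()  # before this PR: load_evaluation_config(eval_id)
print(config["model_params"])
```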