From e98fed3365bdc4440a0d12bfbc054562a314023b Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Mon, 24 Jun 2024 10:57:07 -0400 Subject: [PATCH] Revert "Revert "Vb/model offline eval plt 1107 (#1682)" (#1687)" This reverts commit a8dc569d4df4107af75961588a6db37d8bea918d. --- libs/labelbox/src/labelbox/client.py | 179 ++++++++++-------- .../src/labelbox/schema/ontology_kind.py | 16 ++ libs/labelbox/src/labelbox/schema/project.py | 1 + libs/labelbox/tests/integration/conftest.py | 149 ++++++++++++++- .../test_chat_evaluation_ontology_project.py | 13 +- .../test_offline_chat_evaluation_project.py | 26 +++ libs/labelbox/tests/unit/test_project.py | 36 ++++ 7 files changed, 335 insertions(+), 85 deletions(-) create mode 100644 libs/labelbox/tests/integration/test_offline_chat_evaluation_project.py create mode 100644 libs/labelbox/tests/unit/test_project.py diff --git a/libs/labelbox/src/labelbox/client.py b/libs/labelbox/src/labelbox/client.py index a2fb09186..f0abd2bb5 100644 --- a/libs/labelbox/src/labelbox/client.py +++ b/libs/labelbox/src/labelbox/client.py @@ -652,7 +652,8 @@ def delete_model_config(self, id: str) -> bool: params = {"id": id} result = self.execute(query, params) if not result: - raise labelbox.exceptions.ResourceNotFoundError(Entity.ModelConfig, params) + raise labelbox.exceptions.ResourceNotFoundError( + Entity.ModelConfig, params) return result['deleteModelConfig']['success'] def create_dataset(self, @@ -740,85 +741,17 @@ def create_project(self, **kwargs) -> Project: Raises: InvalidAttributeError: If the Project type does not contain any of the attribute names given in kwargs. - """ - - auto_audit_percentage = kwargs.get("auto_audit_percentage") - auto_audit_number_of_labels = kwargs.get("auto_audit_number_of_labels") - if auto_audit_percentage is not None or auto_audit_number_of_labels is not None: - raise ValueError( - "quality_mode must be set instead of auto_audit_percentage or auto_audit_number_of_labels." - ) - - name = kwargs.get("name") - if name is None or not name.strip(): - raise ValueError("project name must be a valid string.") - - queue_mode = kwargs.get("queue_mode") - if queue_mode is QueueMode.Dataset: - raise ValueError( - "Dataset queue mode is deprecated. Please prefer Batch queue mode." - ) - elif queue_mode is QueueMode.Batch: - logger.warning( - "Passing a queue mode of batch is redundant and will soon no longer be supported." - ) - media_type = kwargs.get("media_type") - if media_type and MediaType.is_supported(media_type): - media_type_value = media_type.value - elif media_type: - raise TypeError(f"{media_type} is not a valid media type. Use" - f" any of {MediaType.get_supported_members()}" - " from MediaType. 
Example: MediaType.Image.") - else: - logger.warning( - "Creating a project without specifying media_type" - " through this method will soon no longer be supported.") - media_type_value = None - - ontology_kind = kwargs.pop("ontology_kind", None) - if ontology_kind and OntologyKind.is_supported(ontology_kind): - editor_task_type_value = EditorTaskTypeMapper.to_editor_task_type( - ontology_kind, media_type).value - elif ontology_kind: - raise OntologyKind.get_ontology_kind_validation_error(ontology_kind) - else: - editor_task_type_value = None - - quality_mode = kwargs.get("quality_mode") - if not quality_mode: - logger.info("Defaulting quality mode to Benchmark.") - - data = kwargs - data.pop("quality_mode", None) - if quality_mode is None or quality_mode is QualityMode.Benchmark: - data[ - "auto_audit_number_of_labels"] = BENCHMARK_AUTO_AUDIT_NUMBER_OF_LABELS - data["auto_audit_percentage"] = BENCHMARK_AUTO_AUDIT_PERCENTAGE - elif quality_mode is QualityMode.Consensus: - data[ - "auto_audit_number_of_labels"] = CONSENSUS_AUTO_AUDIT_NUMBER_OF_LABELS - data["auto_audit_percentage"] = CONSENSUS_AUTO_AUDIT_PERCENTAGE - else: - raise ValueError(f"{quality_mode} is not a valid quality mode.") - - params = {**data} - if media_type_value: - params["media_type"] = media_type_value - if editor_task_type_value: - params["editor_task_type"] = editor_task_type_value - - extra_params = { - Field.String("dataset_name_or_id"): - params.pop("dataset_name_or_id", None), - Field.Boolean("append_to_existing_dataset"): - params.pop("append_to_existing_dataset", None), - Field.Int("data_row_count"): - params.pop("data_row_count", None), - } - extra_params = {k: v for k, v in extra_params.items() if v is not None} - - return self._create(Entity.Project, params, extra_params) + NOTE: the following attributes are used only in chat model evaluation projects: + dataset_name_or_id, append_to_existing_dataset, data_row_count, editor_task_type + They are not used for general projects and not supported in this method + """ + # The following arguments are not supported for general projects, only for chat model evaluation projects + kwargs.pop("dataset_name_or_id", None) + kwargs.pop("append_to_existing_dataset", None) + kwargs.pop("data_row_count", None) + kwargs.pop("editor_task_type", None) + return self._create_project(**kwargs) @overload def create_model_evaluation_project(self, @@ -881,13 +814,99 @@ def create_model_evaluation_project(self, dataset_name_or_id = dataset_name kwargs["media_type"] = MediaType.Conversational - kwargs["ontology_kind"] = OntologyKind.ModelEvaluation kwargs["dataset_name_or_id"] = dataset_name_or_id kwargs["append_to_existing_dataset"] = append_to_existing_dataset kwargs["data_row_count"] = data_row_count + kwargs["editor_task_type"] = EditorTaskType.ModelChatEvaluation.value + + return self._create_project(**kwargs) + + def create_offline_model_evaluation_project(self, **kwargs) -> Project: + """ + Creates a project for offline model evaluation. 
+ Args: + **kwargs: Additional parameters to pass see the create_project method + Returns: + Project: The created project + """ + kwargs[ + "media_type"] = MediaType.Conversational # Only Conversational is supported + kwargs[ + "editor_task_type"] = EditorTaskType.OfflineModelChatEvaluation.value # Special editor task type for offline model evaluation + + # The following arguments are not supported for offline model evaluation + kwargs.pop("dataset_name_or_id", None) + kwargs.pop("append_to_existing_dataset", None) + kwargs.pop("data_row_count", None) return self.create_project(**kwargs) + def _create_project(self, **kwargs) -> Project: + auto_audit_percentage = kwargs.get("auto_audit_percentage") + auto_audit_number_of_labels = kwargs.get("auto_audit_number_of_labels") + if auto_audit_percentage is not None or auto_audit_number_of_labels is not None: + raise ValueError( + "quality_mode must be set instead of auto_audit_percentage or auto_audit_number_of_labels." + ) + + name = kwargs.get("name") + if name is None or not name.strip(): + raise ValueError("project name must be a valid string.") + + queue_mode = kwargs.get("queue_mode") + if queue_mode is QueueMode.Dataset: + raise ValueError( + "Dataset queue mode is deprecated. Please prefer Batch queue mode." + ) + elif queue_mode is QueueMode.Batch: + logger.warning( + "Passing a queue mode of batch is redundant and will soon no longer be supported." + ) + + media_type = kwargs.get("media_type") + if media_type and MediaType.is_supported(media_type): + media_type_value = media_type.value + elif media_type: + raise TypeError(f"{media_type} is not a valid media type. Use" + f" any of {MediaType.get_supported_members()}" + " from MediaType. Example: MediaType.Image.") + else: + logger.warning( + "Creating a project without specifying media_type" + " through this method will soon no longer be supported.") + media_type_value = None + + quality_mode = kwargs.get("quality_mode") + if not quality_mode: + logger.info("Defaulting quality mode to Benchmark.") + + data = kwargs + data.pop("quality_mode", None) + if quality_mode is None or quality_mode is QualityMode.Benchmark: + data[ + "auto_audit_number_of_labels"] = BENCHMARK_AUTO_AUDIT_NUMBER_OF_LABELS + data["auto_audit_percentage"] = BENCHMARK_AUTO_AUDIT_PERCENTAGE + elif quality_mode is QualityMode.Consensus: + data[ + "auto_audit_number_of_labels"] = CONSENSUS_AUTO_AUDIT_NUMBER_OF_LABELS + data["auto_audit_percentage"] = CONSENSUS_AUTO_AUDIT_PERCENTAGE + else: + raise ValueError(f"{quality_mode} is not a valid quality mode.") + + params = {**data} + if media_type_value: + params["media_type"] = media_type_value + + extra_params = { + Field.String("dataset_name_or_id"): + params.pop("dataset_name_or_id", None), + Field.Boolean("append_to_existing_dataset"): + params.pop("append_to_existing_dataset", None), + } + extra_params = {k: v for k, v in extra_params.items() if v is not None} + + return self._create(Entity.Project, params, extra_params) + def get_roles(self) -> List[Role]: """ Returns: diff --git a/libs/labelbox/src/labelbox/schema/ontology_kind.py b/libs/labelbox/src/labelbox/schema/ontology_kind.py index c50368e9c..e33e7cef3 100644 --- a/libs/labelbox/src/labelbox/schema/ontology_kind.py +++ b/libs/labelbox/src/labelbox/schema/ontology_kind.py @@ -26,12 +26,28 @@ def get_ontology_kind_validation_error(cls, ontology_kind): class EditorTaskType(Enum): ModelChatEvaluation = "MODEL_CHAT_EVALUATION" ResponseCreation = "RESPONSE_CREATION" + OfflineModelChatEvaluation = 
"OFFLINE_MODEL_CHAT_EVALUATION" Missing = None @classmethod def is_supported(cls, value): return isinstance(value, cls) + @classmethod + def _missing_(cls, name) -> 'EditorTaskType': + """Handle missing null new task types + Handle upper case names for compatibility with + the GraphQL""" + + if name is None: + return cls.Missing + + for name, member in cls.__members__.items(): + if name == name.upper(): + return member + + return cls.Missing + class EditorTaskTypeMapper: diff --git a/libs/labelbox/src/labelbox/schema/project.py b/libs/labelbox/src/labelbox/schema/project.py index aa51cdc22..a2142ebc5 100644 --- a/libs/labelbox/src/labelbox/schema/project.py +++ b/libs/labelbox/src/labelbox/schema/project.py @@ -121,6 +121,7 @@ class Project(DbObject, Updateable, Deletable): # Bind data_type and allowedMediaTYpe using the GraphQL type MediaType media_type = Field.Enum(MediaType, "media_type", "allowedMediaType") editor_task_type = Field.Enum(EditorTaskType, "editor_task_type") + data_row_count = Field.Int("data_row_count") # Relationships created_by = Relationship.ToOne("User", False, "created_by") diff --git a/libs/labelbox/tests/integration/conftest.py b/libs/labelbox/tests/integration/conftest.py index 844933388..612d98122 100644 --- a/libs/labelbox/tests/integration/conftest.py +++ b/libs/labelbox/tests/integration/conftest.py @@ -400,7 +400,10 @@ def chat_evaluation_ontology(client, rand_gen): yield ontology - client.delete_unused_ontology(ontology.uid) + try: + client.delete_unused_ontology(ontology.uid) + except Exception as e: + print(f"Failed to delete ontology {ontology.uid}: {str(e)}") @pytest.fixture @@ -416,6 +419,16 @@ def chat_evaluation_project_create_dataset(client, rand_gen): project.delete() +@pytest.fixture +def offline_chat_evaluation_project(client, rand_gen): + project_name = f"test-offline-model-evaluation-project-{rand_gen(str)}" + project = client.create_offline_model_evaluation_project(name=project_name) + + yield project + + project.delete() + + @pytest.fixture def chat_evaluation_project_append_to_dataset(client, dataset, rand_gen): project_name = f"test-model-evaluation-project-{rand_gen(str)}" @@ -429,6 +442,138 @@ def chat_evaluation_project_append_to_dataset(client, dataset, rand_gen): project.delete() +@pytest.fixture +def offline_conversational_data_row(initial_dataset): + convo_v2_row_data = { + "type": "application/vnd.labelbox.conversational.model-chat-evaluation", + "version": 2, + "actors": { + "clxhs9wk000013b6w7imiz0h8": { + "role": "human", + "metadata": { + "name": "User" + } + }, + "clxhsc6xb00013b6w1awh579j": { + "role": "model", + "metadata": { + "modelConfigId": "5a50d319-56bd-405d-87bb-4442daea0d0f" + } + }, + "clxhsc6xb00023b6wlp0768zs": { + "role": "model", + "metadata": { + "modelConfigId": "1cfc833a-2684-47df-95ac-bb7d9f9e3e1f" + } + } + }, + "messages": { + "clxhs9wk000023b6wrufora3k": { + "actorId": "clxhs9wk000013b6w7imiz0h8", + "content": [{ + "type": "text", + "content": "Hello world" + }], + "childMessageIds": ["clxhscb4z00033b6wukpvmuol"] + }, + "clxhscb4z00033b6wukpvmuol": { + "actorId": "clxhsc6xb00013b6w1awh579j", + "content": [{ + "type": + "text", + "content": + "Hello to you too! 👋 \n\nIt's great to be your guide in the digital world. What can I help you with today? 
😊 \n" + }], + "childMessageIds": ["clxhu2s0900013b6wbv0ndddd"] + }, + "clxhu2s0900013b6wbv0ndddd": { + "actorId": + "clxhs9wk000013b6w7imiz0h8", + "content": [{ + "type": "text", + "content": "Lets some some multi-turn happening" + }], + "childMessageIds": [ + "clxhu4qib00023b6wuep47b1l", "clxhu4qib00033b6wf18az01q" + ] + }, + "clxhu4qib00023b6wuep47b1l": { + "actorId": "clxhsc6xb00013b6w1awh579j", + "content": [{ + "type": + "text", + "content": + "Okay, I'm ready for some multi-turn fun! To make it interesting, how about we try building a story together? \n\n**Here's the beginning:**\n\nThe old, dusty book lay forgotten on the shelf, its leather cover cracked and faded. But as the afternoon sun slanted through the window, a single ray caught a glint of gold on the book's spine. Suddenly...\n\n**Now you tell me what happens!** What does the glint of gold turn out to be? What happens next? 🤔 \n" + }], + "childMessageIds": ["clxhu596m00043b6wvkgahcwz"] + }, + "clxhu4qib00033b6wf18az01q": { + "actorId": "clxhsc6xb00023b6wlp0768zs", + "content": [{ + "type": + "text", + "content": + "Okay, I'm game! Let's have a multi-turn conversation! But to get us started, give me a topic or a question to get us going! 😄 \n\nWhat's on your mind? \n" + }], + "childMessageIds": ["clxhu596m00043b6wvkgahcwz"] + }, + "clxhu596m00043b6wvkgahcwz": { + "actorId": + "clxhs9wk000013b6w7imiz0h8", + "content": [{ + "type": "text", + "content": "Can you generate an image?" + }], + "childMessageIds": [ + "clxhu5hqk00053b6wk21u6xi7", "clxhu5hqk00063b6whl4gofx8" + ] + }, + "clxhu5hqk00053b6wk21u6xi7": { + "actorId": "clxhsc6xb00013b6w1awh579j", + "content": [{ + "type": + "text", + "content": + "I can't generate images directly yet. I'm a text-based AI, so my superpowers are all about words! 🪄 \n\nHowever, I'd love to help you describe an image that fits our story. Tell me: \n\n* **What should the image show?** (For example: The dusty book, the glint of gold, the bookshelf, the room, etc.) \n* **What kind of feeling or atmosphere should the image have?** (Mysterious? Magical? Ominous?) \n\nThe more details you give me, the better I can describe the image! ✨ \n" + }], + "childMessageIds": [] + }, + "clxhu5hqk00063b6whl4gofx8": { + "actorId": "clxhsc6xb00023b6wlp0768zs", + "content": [{ + "type": + "text", + "content": + "I can't *actually* generate images directly. 😔 I'm primarily a text-based AI. \n\nTo help me understand what you'd like to see, tell me: \n\n* **What should be in the image?** Be specific! (e.g., \"A cat wearing a tiny hat\", \"A futuristic cityscape at sunset\")\n* **What style do you imagine?** (e.g., realistic, cartoonish, abstract)\n\nOnce you give me those details, I can try to give you a vivid description that's almost as good as seeing it! 
😊 \n" + }], + "childMessageIds": [] + } + }, + "rootMessageIds": ["clxhs9wk000023b6wrufora3k"] + } + + convo_v2_asset = { + "row_data": convo_v2_row_data, + } + data_row = initial_dataset.create_data_row(convo_v2_asset) + + return data_row + + +@pytest.fixture() +def conversation_data_row(initial_dataset, rand_gen): + data = { + "row_data": + "https://storage.googleapis.com/labelbox-developer-testing-assets/conversational_text/1000-conversations/conversation-1.json", + "global_key": + f"https://storage.googleapis.com/labelbox-developer-testing-assets/conversational_text/1000-conversations/conversation-1.json-{rand_gen(str)}", + } + convo_asset = {"row_data": data} + data_row = initial_dataset.create_data_row(convo_asset) + + return data_row + + def pytest_configure(): pytest.report = defaultdict(int) @@ -456,4 +601,4 @@ def print_perf_summary(): num_of_entries = 10 if len(sorted_dict) >= 10 else len(sorted_dict) slowest_fixtures = [(aaa, sorted_dict[aaa]) for aaa in islice(sorted_dict, num_of_entries)] - print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) \ No newline at end of file + print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) diff --git a/libs/labelbox/tests/integration/test_chat_evaluation_ontology_project.py b/libs/labelbox/tests/integration/test_chat_evaluation_ontology_project.py index 4e344cb00..6a41a7a09 100644 --- a/libs/labelbox/tests/integration/test_chat_evaluation_ontology_project.py +++ b/libs/labelbox/tests/integration/test_chat_evaluation_ontology_project.py @@ -1,13 +1,13 @@ import pytest -from labelbox import OntologyBuilder, Tool from labelbox import MediaType from labelbox.schema.ontology_kind import OntologyKind -from labelbox.schema.labeling_frontend import LabelingFrontend +from labelbox.exceptions import MalformedQueryException def test_create_chat_evaluation_ontology_project( client, chat_evaluation_ontology, - chat_evaluation_project_create_dataset): + chat_evaluation_project_create_dataset, conversation_data_row, + rand_gen): ontology = chat_evaluation_ontology # here we are essentially testing the ontology creation which is a fixture @@ -29,6 +29,13 @@ def test_create_chat_evaluation_ontology_project( assert project.labeling_frontend().name == "Editor" assert project.ontology().name == ontology.name + with pytest.raises(MalformedQueryException, + match="No valid data rows to add to project"): + project.create_batch( + rand_gen(str), + [conversation_data_row.uid], # sample of data row objects + ) + def test_create_chat_evaluation_ontology_project_existing_dataset( client, chat_evaluation_ontology, diff --git a/libs/labelbox/tests/integration/test_offline_chat_evaluation_project.py b/libs/labelbox/tests/integration/test_offline_chat_evaluation_project.py new file mode 100644 index 000000000..d27f4e95e --- /dev/null +++ b/libs/labelbox/tests/integration/test_offline_chat_evaluation_project.py @@ -0,0 +1,26 @@ +import pytest + + +def test_create_offline_chat_evaluation_project(client, rand_gen, + offline_chat_evaluation_project, + chat_evaluation_ontology, + offline_conversational_data_row, + model_config): + project = offline_chat_evaluation_project + assert project + + ontology = chat_evaluation_ontology + project.setup_editor(ontology) + + assert project.labeling_frontend().name == "Editor" + assert project.ontology().name == ontology.name + + batch = project.create_batch( + rand_gen(str), + [offline_conversational_data_row.uid], # sample of data row objects + ) + assert batch + + # Can not add a model config to an 
offline chat evaluation project, since we do not use live models + with pytest.raises(Exception): + project.add_model_config(model_config.uid) diff --git a/libs/labelbox/tests/unit/test_project.py b/libs/labelbox/tests/unit/test_project.py new file mode 100644 index 000000000..9d522e7c0 --- /dev/null +++ b/libs/labelbox/tests/unit/test_project.py @@ -0,0 +1,36 @@ +import pytest +from unittest.mock import MagicMock + +from labelbox.schema.project import Project +from labelbox.schema.ontology_kind import EditorTaskType + + +@pytest.mark.parametrize( + 'api_editor_task_type, expected_editor_task_type', + [(None, EditorTaskType.Missing), + ('MODEL_CHAT_EVALUATION', EditorTaskType.ModelChatEvaluation), + ('RESPONSE_CREATION', EditorTaskType.ResponseCreation), + ('OFFLINE_MODEL_CHAT_EVALUATION', + EditorTaskType.OfflineModelChatEvaluation), + ('NEW_TYPE', EditorTaskType.Missing)]) +def test_project_editor_task_type(api_editor_task_type, + expected_editor_task_type): + client = MagicMock() + project = Project( + client, { + "id": "test", + "name": "test", + "createdAt": "2021-06-01T00:00:00.000Z", + "updatedAt": "2021-06-01T00:00:00.000Z", + "autoAuditNumberOfLabels": 1, + "autoAuditPercentage": 100, + "dataRowCount": 1, + "description": "test", + "editorTaskType": api_editor_task_type, + "lastActivityTime": "2021-06-01T00:00:00.000Z", + "allowedMediaType": "IMAGE", + "queueMode": "BATCH", + "setupComplete": "2021-06-01T00:00:00.000Z", + }) + + assert project.editor_task_type == expected_editor_task_type
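
For reference, the offline_conversational_data_row fixture above uses the v2 model-chat-evaluation conversational payload. Below is a minimal sketch of that structure, condensed from the fixture; the actor and message IDs and the modelConfigId are placeholders, and real payloads need a complete actor and message graph like the one in the fixture.

# Minimal v2 model-chat-evaluation payload (condensed from the fixture above);
# the IDs below are placeholders, not real actor/message identifiers.
convo_v2_row_data = {
    "type": "application/vnd.labelbox.conversational.model-chat-evaluation",
    "version": 2,
    "actors": {
        "actor-human": {"role": "human", "metadata": {"name": "User"}},
        "actor-model": {"role": "model", "metadata": {"modelConfigId": "<model-config-id>"}},
    },
    "messages": {
        "msg-1": {
            "actorId": "actor-human",
            "content": [{"type": "text", "content": "Hello world"}],
            "childMessageIds": ["msg-2"],
        },
        "msg-2": {
            "actorId": "actor-model",
            "content": [{"type": "text", "content": "Hello to you too!"}],
            "childMessageIds": [],
        },
    },
    "rootMessageIds": ["msg-1"],
}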
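
A minimal usage sketch of the reinstated offline-evaluation flow, mirroring test_offline_chat_evaluation_project.py above; the API key, ontology ID, and dataset ID are placeholders, and convo_v2_row_data is the payload sketched above.

from labelbox import Client

client = Client(api_key="<api-key>")  # placeholder credentials

# Media type and editor task type are set internally by
# create_offline_model_evaluation_project; dataset_name_or_id,
# append_to_existing_dataset and data_row_count are ignored for this flow.
project = client.create_offline_model_evaluation_project(
    name="offline-chat-eval-demo")

# Attach an existing chat-evaluation ontology (created as in the
# chat_evaluation_ontology fixture; the ID is a placeholder).
ontology = client.get_ontology("<ontology-id>")
project.setup_editor(ontology)

# Upload a v2 conversational data row and batch it into the project.
dataset = client.get_dataset("<dataset-id>")  # placeholder dataset
data_row = dataset.create_data_row({"row_data": convo_v2_row_data})
project.create_batch("offline-eval-batch", [data_row.uid])

# project.add_model_config(...) is expected to fail for this project type,
# since offline chat evaluation does not use live models (see the
# integration test above).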