
[PLT-1614] Support data row / batch for live mmc projects #1856

Merged
merged 3 commits on Oct 8, 2024
Changes from all commits
6 changes: 6 additions & 0 deletions docs/labelbox/datarow_payload_templates.rst
@@ -0,0 +1,6 @@
Datarow payload templates
===============================================================================================

.. automodule:: labelbox.schema.data_row_payload_templates
:members:
:show-inheritance:
45 changes: 34 additions & 11 deletions libs/labelbox/src/labelbox/client.py
@@ -7,6 +7,7 @@
import sys
import time
import urllib.parse
import warnings
from collections import defaultdict
from datetime import datetime, timezone
from types import MappingProxyType
@@ -910,11 +911,21 @@ def create_model_evaluation_project(
) -> Project:
pass

@overload
def create_model_evaluation_project(
self,
dataset_id: Optional[str] = None,
dataset_name: Optional[str] = None,
data_row_count: int = 100,
data_row_count: Optional[int] = None,
**kwargs,
) -> Project:
pass

def create_model_evaluation_project(
self,
dataset_id: Optional[str] = None,
dataset_name: Optional[str] = None,
data_row_count: Optional[int] = None,
**kwargs,
) -> Project:
"""
@@ -940,26 +951,38 @@ def create_model_evaluation_project(
>>> client.create_model_evaluation_project(name=project_name, dataset_id="clr00u8j0j0j0", data_row_count=10)
>>> This creates a new project, adds 10 data rows to the dataset with id "clr00u8j0j0j0", and assigns a batch of those newly created data rows to the project.

>>> client.create_model_evaluation_project(name=project_name)
>>> This creates a new project with no data rows.

"""
if not dataset_id and not dataset_name:
raise ValueError(
"dataset_name or data_set_id must be present and not be an empty string."
)
if data_row_count <= 0:
raise ValueError("data_row_count must be a positive integer.")
autogenerate_data_rows = False
dataset_name_or_id = None
append_to_existing_dataset = None

if dataset_id or dataset_name:
autogenerate_data_rows = True

if dataset_id:
append_to_existing_dataset = True
dataset_name_or_id = dataset_id
else:
elif dataset_name:
append_to_existing_dataset = False
dataset_name_or_id = dataset_name

if autogenerate_data_rows:
kwargs["dataset_name_or_id"] = dataset_name_or_id
kwargs["append_to_existing_dataset"] = append_to_existing_dataset
if data_row_count is None:
data_row_count = 100
if data_row_count < 0:
raise ValueError("data_row_count must be a positive integer.")
kwargs["data_row_count"] = data_row_count
warnings.warn(
"Automatic generation of data rows of live model evaluation projects is deprecated. dataset_name_or_id, append_to_existing_dataset, data_row_count will be removed in a future version.",
DeprecationWarning,
)

kwargs["media_type"] = MediaType.Conversational
kwargs["dataset_name_or_id"] = dataset_name_or_id
kwargs["append_to_existing_dataset"] = append_to_existing_dataset
kwargs["data_row_count"] = data_row_count
kwargs["editor_task_type"] = EditorTaskType.ModelChatEvaluation.value

return self._create_project(**kwargs)
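For illustration, a minimal sketch of the two call patterns after this change; the API key, project names, and dataset name are placeholders, not values from the PR:

    from labelbox import Client

    client = Client(api_key="<YOUR_API_KEY>")

    # New in this PR: omit the dataset arguments and no data rows are
    # auto-generated; the project starts empty and is populated via batches.
    project = client.create_model_evaluation_project(name="live-mmc-project")

    # Legacy path: passing dataset_name (or dataset_id) still auto-generates
    # data rows, but now emits a DeprecationWarning.
    legacy = client.create_model_evaluation_project(
        name="legacy-project",
        dataset_name="auto-generated-dataset",
        data_row_count=10,
    )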
10 changes: 3 additions & 7 deletions libs/labelbox/src/labelbox/data/annotation_types/collection.py
@@ -1,14 +1,10 @@
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Generator, Iterable, Union, Optional
from uuid import uuid4
import warnings
from typing import Callable, Generator, Iterable, Union

from tqdm import tqdm

from labelbox.schema import ontology
from labelbox.orm.model import Entity
from ..ontology import get_classifications, get_tools
from labelbox.schema import ontology

from ..generator import PrefetchGenerator
from .label import Label

40 changes: 40 additions & 0 deletions libs/labelbox/src/labelbox/schema/data_row_payload_templates.py
@@ -0,0 +1,40 @@
from typing import Dict, List

from pydantic import BaseModel, Field

from labelbox.schema.data_row import DataRowMetadataField


class ModelEvalutationTemlateRowData(BaseModel):
type: str = Field(
default="application/vnd.labelbox.conversational.model-chat-evaluation",
frozen=True,
)
draft: bool = Field(default=True, frozen=True)
rootMessageIds: List[str] = Field(default=[])
actors: Dict = Field(default={})
version: int = Field(default=2, frozen=True)
messages: Dict = Field(default={})


class ModelEvaluationTemplate(BaseModel):
"""
Use this class to create a model evaluation data row.

Examples:
>>> data = ModelEvaluationTemplate()
>>> data.row_data.rootMessageIds = ["root1"]
>>> vector = [random.uniform(1.0, 2.0) for _ in range(embedding.dims)]
>>> data.embeddings = [...]
>>> data.metadata_fields = [...]
>>> data.attachments = [...]
>>> content = data.model_dump()
>>> task = dataset.create_data_rows([content])
"""

row_data: ModelEvalutationTemlateRowData = Field(
default=ModelEvalutationTemlateRowData()
)
attachments: List[Dict] = Field(default=[])
embeddings: List[Dict] = Field(default=[])
metadata_fields: List[DataRowMetadataField] = Field(default=[])
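For context, a short end-to-end sketch of how the new template might be used, mirroring the docstring above; the dataset name and root message ID are illustrative:

    from labelbox import Client
    from labelbox.schema.data_row_payload_templates import ModelEvaluationTemplate

    client = Client(api_key="<YOUR_API_KEY>")
    dataset = client.create_dataset(name="mmc-data-rows")

    # Build one conversational model-evaluation data row from the template.
    data = ModelEvaluationTemplate()
    data.row_data.rootMessageIds = ["root1"]

    # Serialize and upload; create_data_rows returns an asynchronous task.
    task = dataset.create_data_rows([data.model_dump()])
    task.wait_till_done()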
46 changes: 20 additions & 26 deletions libs/labelbox/src/labelbox/schema/project.py
@@ -1,11 +1,11 @@
import json
import logging
from string import Template
import time
import warnings
from collections import namedtuple
from datetime import datetime, timezone
from pathlib import Path
from string import Template
from typing import (
TYPE_CHECKING,
Any,
@@ -14,28 +14,18 @@
List,
Optional,
Tuple,
TypeVar,
Union,
overload,
)
from urllib.parse import urlparse

from labelbox.schema.labeling_service import (
LabelingService,
LabelingServiceStatus,
)
from labelbox.schema.labeling_service_dashboard import LabelingServiceDashboard
import requests

from labelbox import parser
from labelbox import utils
from labelbox.exceptions import error_message_for_unparsed_graphql_error
from labelbox.exceptions import (
InvalidQueryError,
LabelboxError,
ProcessingWaitTimeout,
ResourceConflict,
ResourceNotFoundError,
error_message_for_unparsed_graphql_error,
)
from labelbox.orm import query
from labelbox.orm.db_object import DbObject, Deletable, Updateable, experimental
@@ -46,30 +36,33 @@
from labelbox.schema.data_row import DataRow
from labelbox.schema.export_filters import (
ProjectExportFilters,
validate_datetime,
build_filters,
)
from labelbox.schema.export_params import ProjectExportParams
from labelbox.schema.export_task import ExportTask
from labelbox.schema.id_type import IdType
from labelbox.schema.identifiable import DataRowIdentifier, GlobalKey, UniqueId
from labelbox.schema.identifiables import DataRowIdentifiers, UniqueIds
from labelbox.schema.labeling_service import (
LabelingService,
LabelingServiceStatus,
)
from labelbox.schema.labeling_service_dashboard import LabelingServiceDashboard
from labelbox.schema.media_type import MediaType
from labelbox.schema.model_config import ModelConfig
from labelbox.schema.project_model_config import ProjectModelConfig
from labelbox.schema.queue_mode import QueueMode
from labelbox.schema.resource_tag import ResourceTag
from labelbox.schema.task import Task
from labelbox.schema.task_queue import TaskQueue
from labelbox.schema.ontology_kind import (
EditorTaskType,
OntologyKind,
UploadType,
)
from labelbox.schema.project_model_config import ProjectModelConfig
from labelbox.schema.project_overview import (
ProjectOverview,
ProjectOverviewDetailed,
)
from labelbox.schema.queue_mode import QueueMode
from labelbox.schema.resource_tag import ResourceTag
from labelbox.schema.task import Task
from labelbox.schema.task_queue import TaskQueue

if TYPE_CHECKING:
from labelbox import BulkImportRequest
@@ -579,7 +572,7 @@ def upsert_instructions(self, instructions_file: str) -> None:

if frontend.name != "Editor":
logger.warning(
f"This function has only been tested to work with the Editor front end. Found %s",
"This function has only been tested to work with the Editor front end. Found %s",
frontend.name,
)

@@ -788,7 +781,7 @@ def create_batch(
if self.queue_mode != QueueMode.Batch:
raise ValueError("Project must be in batch mode")

if self.is_auto_data_generation():
if (
self.is_auto_data_generation() and not self.is_chat_evaluation()
Contributor Author: we will now allow creating batches for live MMC projects.

):  # NOTE live chat evaluation projects in the SDK do not pre-generate data rows, but use batches like all other projects
raise ValueError(
"Cannot create batches for auto data generation projects"
)
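With the relaxed guard above, a live chat evaluation (MMC) project can be populated through create_batch like any other batch-mode project. A rough sketch, assuming an existing conversational data row whose ID stands in for real data:

    from labelbox import Client

    client = Client(api_key="<YOUR_API_KEY>")
    project = client.create_model_evaluation_project(name="live-mmc-project")

    # Attach existing conversational data rows via a batch; the ID below is a
    # placeholder for a real data row UID.
    batch = project.create_batch("first-batch", ["<DATA_ROW_ID>"])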
@@ -814,7 +809,7 @@

if row_count > 100_000:
raise ValueError(
f"Batch exceeds max size, break into smaller batches"
"Batch exceeds max size, break into smaller batches"
)
if not row_count:
raise ValueError("You need at least one data row in a batch")
@@ -1088,8 +1083,7 @@ def _create_batch_async(
task = self._wait_for_task(task_id)
if task.status != "COMPLETE":
raise LabelboxError(
f"Batch was not created successfully: "
+ json.dumps(task.errors)
"Batch was not created successfully: " + json.dumps(task.errors)
)

return self.client.get_batch(self.uid, batch_id)
@@ -1436,7 +1430,7 @@ def update_data_row_labeling_priority(
task = self._wait_for_task(task_id)
if task.status != "COMPLETE":
raise LabelboxError(
f"Priority was not updated successfully: "
"Priority was not updated successfully: "
+ json.dumps(task.errors)
)
return True
@@ -1629,7 +1623,7 @@ def move_data_rows_to_task_queue(self, data_row_ids, task_queue_id: str):
task = self._wait_for_task(task_id)
if task.status != "COMPLETE":
raise LabelboxError(
f"Data rows were not moved successfully: "
"Data rows were not moved successfully: "
+ json.dumps(task.errors)
)

25 changes: 21 additions & 4 deletions libs/labelbox/tests/integration/conftest.py
@@ -646,11 +646,28 @@ def chat_evaluation_ontology(client, rand_gen):


@pytest.fixture
def live_chat_evaluation_project_with_new_dataset(client, rand_gen):
def live_chat_evaluation_project(client, rand_gen):
project_name = f"test-model-evaluation-project-{rand_gen(str)}"
dataset_name = f"test-model-evaluation-dataset-{rand_gen(str)}"
project = client.create_model_evaluation_project(
name=project_name, dataset_name=dataset_name, data_row_count=1
project = client.create_model_evaluation_project(name=project_name)

yield project

project.delete()


@pytest.fixture
def live_chat_evaluation_project_with_batch(
client,
rand_gen,
live_chat_evaluation_project,
offline_conversational_data_row,
):
project_name = f"test-model-evaluation-project-{rand_gen(str)}"
project = client.create_model_evaluation_project(name=project_name)

project.create_batch(
rand_gen(str),
[offline_conversational_data_row.uid], # sample of data row objects
)

yield project
@@ -1,15 +1,12 @@
import pytest
from unittest.mock import patch

from labelbox import MediaType
from labelbox.schema.ontology_kind import OntologyKind
from labelbox.exceptions import MalformedQueryException


def test_create_chat_evaluation_ontology_project(
client,
chat_evaluation_ontology,
live_chat_evaluation_project_with_new_dataset,
live_chat_evaluation_project,
offline_conversational_data_row,
rand_gen,
):
@@ -28,36 +25,19 @@ def test_create_chat_evaluation_ontology_project(
assert classification.schema_id
assert classification.feature_schema_id

project = live_chat_evaluation_project_with_new_dataset
project = live_chat_evaluation_project
assert project.model_setup_complete is None

project.connect_ontology(ontology)

assert project.labeling_frontend().name == "Editor"
assert project.ontology().name == ontology.name

with pytest.raises(
ValueError,
match="Cannot create batches for auto data generation projects",
):
project.create_batch(
rand_gen(str),
[offline_conversational_data_row.uid], # sample of data row objects
)

with pytest.raises(
ValueError,
match="Cannot create batches for auto data generation projects",
):
with patch(
"labelbox.schema.project.MAX_SYNC_BATCH_ROW_COUNT", new=0
): # force to async
project.create_batch(
rand_gen(str),
[
offline_conversational_data_row.uid
], # sample of data row objects
)
batch = project.create_batch(
rand_gen(str),
[offline_conversational_data_row.uid], # sample of data row objects
)
assert batch


def test_create_chat_evaluation_ontology_project_existing_dataset(
2 changes: 1 addition & 1 deletion libs/labelbox/tests/integration/test_data_rows.py
@@ -405,7 +405,7 @@ def test_create_data_row_with_metadata_dict(
row_data=image_url, metadata_fields=make_metadata_fields_dict
)

assert len(list(dataset.data_rows())) == 1
assert len([dr for dr in dataset.data_rows()]) == 1
assert data_row.dataset() == dataset
assert data_row.created_by() == client.get_user()
assert data_row.organization() == client.get_organization()