diff --git a/libs/labelbox/src/labelbox/schema/dataset.py b/libs/labelbox/src/labelbox/schema/dataset.py index 081bc949a..9d02604d4 100644 --- a/libs/labelbox/src/labelbox/schema/dataset.py +++ b/libs/labelbox/src/labelbox/schema/dataset.py @@ -32,9 +32,7 @@ from labelbox.schema.task import Task, DataUpsertTask from labelbox.schema.user import User from labelbox.schema.iam_integration import IAMIntegration -from labelbox.schema.internal.data_row_upsert_item import (DataRowItemBase, - DataRowUpsertItem, - DataRowCreateItem) +from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem) import labelbox.schema.internal.data_row_uploader as data_row_uploader from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator from labelbox.schema.internal.datarow_upload_constants import ( @@ -290,9 +288,10 @@ def create_data_rows(self, string_items = [item for item in items if isinstance(item, str)] dict_items = [item for item in items if isinstance(item, dict)] dict_string_items = [] + if len(string_items) > 0: dict_string_items = self._build_from_local_paths(string_items) - specs = DataRowCreateItem.build(self.uid, + specs = DataRowUpsertItem.build(self.uid, dict_items + dict_string_items) return self._exec_upsert_data_rows(specs, file_upload_thread_count) @@ -614,7 +613,7 @@ def upsert_data_rows(self, def _exec_upsert_data_rows( self, - specs: List[DataRowItemBase], + specs: List[DataRowUpsertItem], file_upload_thread_count: int = FILE_UPLOAD_THREAD_COUNT ) -> "DataUpsertTask": diff --git a/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py b/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py index 70c74660e..41b3d9752 100644 --- a/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py +++ b/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py @@ -3,7 +3,7 @@ from typing import List from labelbox import pydantic_compat -from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem, DataRowCreateItem +from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator @@ -16,19 +16,14 @@ class UploadManifest(pydantic_compat.BaseModel): SOURCE_SDK = "SDK" -def upload_in_chunks(client, specs: List[DataRowItemBase], +def upload_in_chunks(client, specs: List[DataRowUpsertItem], file_upload_thread_count: int, max_chunk_size_bytes: int) -> UploadManifest: empty_specs = list(filter(lambda spec: spec.is_empty(), specs)) if empty_specs: ids = list(map(lambda spec: spec.id.get("value"), empty_specs)) - ids = list(filter(lambda x: x is not None and len(x) > 0, ids)) - if len(ids) > 0: - raise ValueError( - f"The following items have an empty payload: {ids}") - else: # case of create items - raise ValueError("Some items have an empty payload") + raise ValueError(f"The following items have an empty payload: {ids}") chunk_uris = DescriptorFileCreator(client).create( specs, max_chunk_size_bytes=max_chunk_size_bytes) diff --git a/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py b/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py index 12fd209fd..e2c0cb2b5 100644 --- a/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py +++ b/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py @@ -1,32 +1,23 @@ -from abc import ABC, abstractmethod - from typing import List, Tuple, Optional -from labelbox.pydantic_compat import BaseModel from labelbox.schema.identifiable import UniqueId, GlobalKey -from labelbox import pydantic_compat -from labelbox.schema.data_row import DataRow +from labelbox.pydantic_compat import BaseModel -class DataRowItemBase(ABC, pydantic_compat.BaseModel): +class DataRowUpsertItem(BaseModel): """ Base class for creating payloads for upsert operations. """ - id: dict payload: dict - @abstractmethod - def is_empty(self) -> bool: - ... - @classmethod def build( cls, dataset_id: str, items: List[dict], key_types: Optional[Tuple[type, ...]] = () - ) -> List["DataRowItemBase"]: + ) -> List["DataRowUpsertItem"]: upload_items = [] for item in items: @@ -50,9 +41,6 @@ def build( upload_items.append(cls(payload=item, id=key)) return upload_items - -class DataRowUpsertItem(DataRowItemBase): - def is_empty(self) -> bool: """ The payload is considered empty if it's actually empty or the only key is `dataset_id`. @@ -60,36 +48,3 @@ def is_empty(self) -> bool: """ return (not self.payload or len(self.payload.keys()) == 1 and "dataset_id" in self.payload) - - @classmethod - def build( - cls, - dataset_id: str, - items: List[dict], - key_types: Optional[Tuple[type, ...]] = (UniqueId, GlobalKey) - ) -> List["DataRowItemBase"]: - return super().build(dataset_id, items, (UniqueId, GlobalKey)) - - -class DataRowCreateItem(DataRowItemBase): - - def is_empty(self) -> bool: - """ - The payload is considered empty if it's actually empty or row_data is empty - or the only key is `dataset_id`. - :return: bool - """ - row_data = self.payload.get("row_data", None) or self.payload.get( - DataRow.row_data, None) - return (not self.payload or len(self.payload.keys()) == 1 and - "dataset_id" in self.payload or row_data is None or - len(row_data) == 0) - - @classmethod - def build( - cls, - dataset_id: str, - items: List[dict], - key_types: Optional[Tuple[type, ...]] = () - ) -> List["DataRowItemBase"]: - return super().build(dataset_id, items, ()) diff --git a/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py b/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py index ac59b5e14..4c7e95ee4 100644 --- a/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py +++ b/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py @@ -13,7 +13,7 @@ from labelbox.schema.embedding import EmbeddingVector from labelbox.schema.internal.datarow_upload_constants import ( FILE_UPLOAD_THREAD_COUNT) -from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem +from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -248,7 +248,7 @@ def format_legacy_conversational_data(item): return item def convert_item(data_row_item): - if isinstance(data_row_item, DataRowItemBase): + if isinstance(data_row_item, DataRowUpsertItem): item = data_row_item.payload else: item = data_row_item @@ -273,7 +273,7 @@ def convert_item(data_row_item): # Upload any local file paths item = upload_if_necessary(item) - if isinstance(data_row_item, DataRowItemBase): + if isinstance(data_row_item, DataRowUpsertItem): return {'id': data_row_item.id, 'payload': item} else: return item diff --git a/libs/labelbox/tests/unit/test_data_row_upsert_data.py b/libs/labelbox/tests/unit/test_data_row_upsert_data.py index 9c7b2b383..9f6eb1400 100644 --- a/libs/labelbox/tests/unit/test_data_row_upsert_data.py +++ b/libs/labelbox/tests/unit/test_data_row_upsert_data.py @@ -1,13 +1,7 @@ -from unittest.mock import MagicMock, patch - import pytest -from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem, - DataRowCreateItem) +from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem) from labelbox.schema.identifiable import UniqueId, GlobalKey from labelbox.schema.asset_attachment import AttachmentType -from labelbox.schema.dataset import Dataset -from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator -from labelbox.schema.data_row import DataRow @pytest.fixture @@ -32,17 +26,6 @@ def data_row_create_items(): return dataset_id, items -@pytest.fixture -def data_row_create_items_row_data_none(): - dataset_id = 'test_dataset' - items = [ - { - "row_data": None, - }, - ] - return dataset_id, items - - @pytest.fixture def data_row_update_items(): dataset_id = 'test_dataset' @@ -63,7 +46,7 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items): dataset_id, create_items = data_row_create_items dataset_id, update_items = data_row_update_items items = create_items + update_items - result = DataRowUpsertItem.build(dataset_id, items) + result = DataRowUpsertItem.build(dataset_id, items, (UniqueId, GlobalKey)) assert len(result) == len(items) for item, res in zip(items, result): assert res.payload == item @@ -71,7 +54,7 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items): def test_data_row_create_items(data_row_create_items): dataset_id, items = data_row_create_items - result = DataRowCreateItem.build(dataset_id, items) + result = DataRowUpsertItem.build(dataset_id, items) assert len(result) == len(items) for item, res in zip(items, result): assert res.payload == item @@ -80,75 +63,4 @@ def test_data_row_create_items(data_row_create_items): def test_data_row_create_items_not_updateable(data_row_update_items): dataset_id, items = data_row_update_items with pytest.raises(ValueError): - DataRowCreateItem.build(dataset_id, items) - - -def test_upsert_is_empty(): - item = DataRowUpsertItem(id={ - "id": UniqueId, - "value": UniqueId("123") - }, - payload={}) - assert item.is_empty() - - item = DataRowUpsertItem(id={ - "id": UniqueId, - "value": UniqueId("123") - }, - payload={"dataset_id": "test_dataset"}) - assert item.is_empty() - - item = DataRowUpsertItem( - id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"}) - assert not item.is_empty() - - -def test_create_is_empty(): - item = DataRowCreateItem(id={}, payload={}) - assert item.is_empty() - - item = DataRowCreateItem(id={}, payload={"dataset_id": "test_dataset"}) - assert item.is_empty() - - item = DataRowCreateItem(id={}, payload={"row_data": None}) - assert item.is_empty() - - item = DataRowCreateItem(id={}, payload={"row_data": ""}) - assert item.is_empty() - - item = DataRowCreateItem( - id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"}) - assert not item.is_empty() - - item = DataRowCreateItem( - id={}, - payload={DataRow.row_data: "http://my_site.com/photos/img_01.jpg"}) - assert not item.is_empty() - - -def test_create_row_data_none(): - items = [ - { - "row_data": None, - "global_key": "global_key1", - }, - ] - client = MagicMock() - dataset = Dataset( - client, { - "id": 'test_dataset', - "name": 'test_dataset', - "createdAt": "2021-06-01T00:00:00.000Z", - "description": "test_dataset", - "updatedAt": "2021-06-01T00:00:00.000Z", - "rowCount": 0, - }) - - with patch.object(DescriptorFileCreator, - 'create', - return_value=["http://bar.com/chunk_uri"]): - with pytest.raises(ValueError, - match="Some items have an empty payload"): - dataset.create_data_rows(items) - - client.execute.assert_not_called() + DataRowUpsertItem.build(dataset_id, items, ())