From bf3b7c823487a6714f1877080c32215cd54fa3ba Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Mon, 10 Jun 2024 16:02:41 -0700 Subject: [PATCH 1/2] Add validation for empty row_data for Dataset create_data_rows --- libs/labelbox/src/labelbox/schema/dataset.py | 9 +- .../schema/internal/data_row_uploader.py | 11 ++- .../schema/internal/data_row_upsert_item.py | 49 +++++++++- .../internal/descriptor_file_creator.py | 6 +- .../tests/unit/test_data_row_upsert_data.py | 90 ++++++++++++++++++- 5 files changed, 148 insertions(+), 17 deletions(-) diff --git a/libs/labelbox/src/labelbox/schema/dataset.py b/libs/labelbox/src/labelbox/schema/dataset.py index 9d02604d4..081bc949a 100644 --- a/libs/labelbox/src/labelbox/schema/dataset.py +++ b/libs/labelbox/src/labelbox/schema/dataset.py @@ -32,7 +32,9 @@ from labelbox.schema.task import Task, DataUpsertTask from labelbox.schema.user import User from labelbox.schema.iam_integration import IAMIntegration -from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem) +from labelbox.schema.internal.data_row_upsert_item import (DataRowItemBase, + DataRowUpsertItem, + DataRowCreateItem) import labelbox.schema.internal.data_row_uploader as data_row_uploader from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator from labelbox.schema.internal.datarow_upload_constants import ( @@ -288,10 +290,9 @@ def create_data_rows(self, string_items = [item for item in items if isinstance(item, str)] dict_items = [item for item in items if isinstance(item, dict)] dict_string_items = [] - if len(string_items) > 0: dict_string_items = self._build_from_local_paths(string_items) - specs = DataRowUpsertItem.build(self.uid, + specs = DataRowCreateItem.build(self.uid, dict_items + dict_string_items) return self._exec_upsert_data_rows(specs, file_upload_thread_count) @@ -613,7 +614,7 @@ def upsert_data_rows(self, def _exec_upsert_data_rows( self, - specs: List[DataRowUpsertItem], + specs: List[DataRowItemBase], file_upload_thread_count: int = FILE_UPLOAD_THREAD_COUNT ) -> "DataUpsertTask": diff --git a/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py b/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py index 41b3d9752..70c74660e 100644 --- a/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py +++ b/libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py @@ -3,7 +3,7 @@ from typing import List from labelbox import pydantic_compat -from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem +from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem, DataRowCreateItem from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator @@ -16,14 +16,19 @@ class UploadManifest(pydantic_compat.BaseModel): SOURCE_SDK = "SDK" -def upload_in_chunks(client, specs: List[DataRowUpsertItem], +def upload_in_chunks(client, specs: List[DataRowItemBase], file_upload_thread_count: int, max_chunk_size_bytes: int) -> UploadManifest: empty_specs = list(filter(lambda spec: spec.is_empty(), specs)) if empty_specs: ids = list(map(lambda spec: spec.id.get("value"), empty_specs)) - raise ValueError(f"The following items have an empty payload: {ids}") + ids = list(filter(lambda x: x is not None and len(x) > 0, ids)) + if len(ids) > 0: + raise ValueError( + f"The following items have an empty payload: {ids}") + else: # case of create items + raise ValueError("Some items have an empty payload") chunk_uris = DescriptorFileCreator(client).create( specs, max_chunk_size_bytes=max_chunk_size_bytes) diff --git a/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py b/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py index e2c0cb2b5..ad6f67a99 100644 --- a/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py +++ b/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py @@ -1,23 +1,31 @@ +from abc import ABC, abstractmethod + from typing import List, Tuple, Optional -from labelbox.schema.identifiable import UniqueId, GlobalKey from labelbox.pydantic_compat import BaseModel +from labelbox.schema.identifiable import UniqueId, GlobalKey +from labelbox import pydantic_compat -class DataRowUpsertItem(BaseModel): +class DataRowItemBase(ABC, pydantic_compat.BaseModel): """ Base class for creating payloads for upsert operations. """ + id: dict payload: dict + @abstractmethod + def is_empty(self) -> bool: + ... + @classmethod def build( cls, dataset_id: str, items: List[dict], key_types: Optional[Tuple[type, ...]] = () - ) -> List["DataRowUpsertItem"]: + ) -> List["DataRowItemBase"]: upload_items = [] for item in items: @@ -41,6 +49,9 @@ def build( upload_items.append(cls(payload=item, id=key)) return upload_items + +class DataRowUpsertItem(DataRowItemBase): + def is_empty(self) -> bool: """ The payload is considered empty if it's actually empty or the only key is `dataset_id`. @@ -48,3 +59,35 @@ def is_empty(self) -> bool: """ return (not self.payload or len(self.payload.keys()) == 1 and "dataset_id" in self.payload) + + @classmethod + def build( + cls, + dataset_id: str, + items: List[dict], + key_types: Optional[Tuple[type, ...]] = (UniqueId, GlobalKey) + ) -> List["DataRowItemBase"]: + return super().build(dataset_id, items, (UniqueId, GlobalKey)) + + +class DataRowCreateItem(DataRowItemBase): + + def is_empty(self) -> bool: + """ + The payload is considered empty if it's actually empty or row_data is empty + or the only key is `dataset_id`. + :return: bool + """ + row_data = self.payload.get("row_data", None) + return (not self.payload or len(self.payload.keys()) == 1 and + "dataset_id" in self.payload or row_data is None or + len(row_data) == 0) + + @classmethod + def build( + cls, + dataset_id: str, + items: List[dict], + key_types: Optional[Tuple[type, ...]] = () + ) -> List["DataRowItemBase"]: + return super().build(dataset_id, items, ()) diff --git a/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py b/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py index 4c7e95ee4..ac59b5e14 100644 --- a/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py +++ b/libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py @@ -13,7 +13,7 @@ from labelbox.schema.embedding import EmbeddingVector from labelbox.schema.internal.datarow_upload_constants import ( FILE_UPLOAD_THREAD_COUNT) -from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem +from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -248,7 +248,7 @@ def format_legacy_conversational_data(item): return item def convert_item(data_row_item): - if isinstance(data_row_item, DataRowUpsertItem): + if isinstance(data_row_item, DataRowItemBase): item = data_row_item.payload else: item = data_row_item @@ -273,7 +273,7 @@ def convert_item(data_row_item): # Upload any local file paths item = upload_if_necessary(item) - if isinstance(data_row_item, DataRowUpsertItem): + if isinstance(data_row_item, DataRowItemBase): return {'id': data_row_item.id, 'payload': item} else: return item diff --git a/libs/labelbox/tests/unit/test_data_row_upsert_data.py b/libs/labelbox/tests/unit/test_data_row_upsert_data.py index 9f6eb1400..bb1522c42 100644 --- a/libs/labelbox/tests/unit/test_data_row_upsert_data.py +++ b/libs/labelbox/tests/unit/test_data_row_upsert_data.py @@ -1,7 +1,12 @@ +from unittest.mock import MagicMock, patch + import pytest -from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem) +from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem, + DataRowCreateItem) from labelbox.schema.identifiable import UniqueId, GlobalKey from labelbox.schema.asset_attachment import AttachmentType +from labelbox.schema.dataset import Dataset +from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator @pytest.fixture @@ -26,6 +31,17 @@ def data_row_create_items(): return dataset_id, items +@pytest.fixture +def data_row_create_items_row_data_none(): + dataset_id = 'test_dataset' + items = [ + { + "row_data": None, + }, + ] + return dataset_id, items + + @pytest.fixture def data_row_update_items(): dataset_id = 'test_dataset' @@ -46,7 +62,7 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items): dataset_id, create_items = data_row_create_items dataset_id, update_items = data_row_update_items items = create_items + update_items - result = DataRowUpsertItem.build(dataset_id, items, (UniqueId, GlobalKey)) + result = DataRowUpsertItem.build(dataset_id, items) assert len(result) == len(items) for item, res in zip(items, result): assert res.payload == item @@ -54,7 +70,7 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items): def test_data_row_create_items(data_row_create_items): dataset_id, items = data_row_create_items - result = DataRowUpsertItem.build(dataset_id, items) + result = DataRowCreateItem.build(dataset_id, items) assert len(result) == len(items) for item, res in zip(items, result): assert res.payload == item @@ -63,4 +79,70 @@ def test_data_row_create_items(data_row_create_items): def test_data_row_create_items_not_updateable(data_row_update_items): dataset_id, items = data_row_update_items with pytest.raises(ValueError): - DataRowUpsertItem.build(dataset_id, items, ()) + DataRowCreateItem.build(dataset_id, items) + + +def test_upsert_is_empty(): + item = DataRowUpsertItem(id={ + "id": UniqueId, + "value": UniqueId("123") + }, + payload={}) + assert item.is_empty() + + item = DataRowUpsertItem(id={ + "id": UniqueId, + "value": UniqueId("123") + }, + payload={"dataset_id": "test_dataset"}) + assert item.is_empty() + + item = DataRowUpsertItem( + id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"}) + assert not item.is_empty() + + +def test_create_is_empty(): + item = DataRowCreateItem(id={}, payload={}) + assert item.is_empty() + + item = DataRowCreateItem(id={}, payload={"dataset_id": "test_dataset"}) + assert item.is_empty() + + item = DataRowCreateItem(id={}, payload={"row_data": None}) + assert item.is_empty() + + item = DataRowCreateItem(id={}, payload={"row_data": ""}) + assert item.is_empty() + + item = DataRowCreateItem( + id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"}) + assert not item.is_empty() + + +def test_create_row_data_none(): + items = [ + { + "row_data": None, + "global_key": "global_key1", + }, + ] + client = MagicMock() + dataset = Dataset( + client, { + "id": 'test_dataset', + "name": 'test_dataset', + "createdAt": "2021-06-01T00:00:00.000Z", + "description": "test_dataset", + "updatedAt": "2021-06-01T00:00:00.000Z", + "rowCount": 0, + }) + + with patch.object(DescriptorFileCreator, + 'create', + return_value=["http://bar.com/chunk_uri"]): + with pytest.raises(ValueError, + match="Some items have an empty payload"): + dataset.create_data_rows(items) + + client.execute.assert_not_called() From e4542043a406660bcba2e1994a41d39d6a06159f Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 11 Jun 2024 10:55:36 -0700 Subject: [PATCH 2/2] Fix tests --- .../src/labelbox/schema/internal/data_row_upsert_item.py | 4 +++- libs/labelbox/tests/unit/test_data_row_upsert_data.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py b/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py index ad6f67a99..12fd209fd 100644 --- a/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py +++ b/libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py @@ -5,6 +5,7 @@ from labelbox.pydantic_compat import BaseModel from labelbox.schema.identifiable import UniqueId, GlobalKey from labelbox import pydantic_compat +from labelbox.schema.data_row import DataRow class DataRowItemBase(ABC, pydantic_compat.BaseModel): @@ -78,7 +79,8 @@ def is_empty(self) -> bool: or the only key is `dataset_id`. :return: bool """ - row_data = self.payload.get("row_data", None) + row_data = self.payload.get("row_data", None) or self.payload.get( + DataRow.row_data, None) return (not self.payload or len(self.payload.keys()) == 1 and "dataset_id" in self.payload or row_data is None or len(row_data) == 0) diff --git a/libs/labelbox/tests/unit/test_data_row_upsert_data.py b/libs/labelbox/tests/unit/test_data_row_upsert_data.py index bb1522c42..9c7b2b383 100644 --- a/libs/labelbox/tests/unit/test_data_row_upsert_data.py +++ b/libs/labelbox/tests/unit/test_data_row_upsert_data.py @@ -7,6 +7,7 @@ from labelbox.schema.asset_attachment import AttachmentType from labelbox.schema.dataset import Dataset from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator +from labelbox.schema.data_row import DataRow @pytest.fixture @@ -119,6 +120,11 @@ def test_create_is_empty(): id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"}) assert not item.is_empty() + item = DataRowCreateItem( + id={}, + payload={DataRow.row_data: "http://my_site.com/photos/img_01.jpg"}) + assert not item.is_empty() + def test_create_row_data_none(): items = [