Revert "[PLT-1107] Add validation for empty row_data for Dataset create_data_rows (#1667)" #1670

Merged: 1 commit, Jun 12, 2024
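This reverts the #1667 refactor: DataRowItemBase and DataRowCreateItem are removed, both create and upsert flows go back to building specs through the single DataRowUpsertItem class, and the create-specific empty-row_data validation is dropped in favor of the original empty-payload check. A rough sketch of the two post-revert build shapes; the dataset ID, key values, and URLs are hypothetical, and the empty key_types default is read off the diff below:

```python
from labelbox.schema.identifiable import UniqueId, GlobalKey
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem

# Create-style items: plain dicts. The diff suggests key_types defaults to
# an empty tuple, so create-style calls simply omit it.
create_specs = DataRowUpsertItem.build(
    "test_dataset",
    [{"row_data": "http://my_site.com/photos/img_01.jpg"}])

# Upsert-style items: existing rows addressed via UniqueId or GlobalKey.
upsert_specs = DataRowUpsertItem.build(
    "test_dataset",
    [{"key": GlobalKey("gk_01"),
      "row_data": "http://my_site.com/photos/img_02.jpg"}],
    (UniqueId, GlobalKey))
```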
9 changes: 4 additions & 5 deletions libs/labelbox/src/labelbox/schema/dataset.py
@@ -32,9 +32,7 @@
 from labelbox.schema.task import Task, DataUpsertTask
 from labelbox.schema.user import User
 from labelbox.schema.iam_integration import IAMIntegration
-from labelbox.schema.internal.data_row_upsert_item import (DataRowItemBase,
-                                                           DataRowUpsertItem,
-                                                           DataRowCreateItem)
+from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
 import labelbox.schema.internal.data_row_uploader as data_row_uploader
 from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
 from labelbox.schema.internal.datarow_upload_constants import (
@@ -290,9 +288,10 @@ def create_data_rows(self,
         string_items = [item for item in items if isinstance(item, str)]
         dict_items = [item for item in items if isinstance(item, dict)]
         dict_string_items = []
+
         if len(string_items) > 0:
             dict_string_items = self._build_from_local_paths(string_items)
-        specs = DataRowCreateItem.build(self.uid,
+        specs = DataRowUpsertItem.build(self.uid,
                                         dict_items + dict_string_items)
         return self._exec_upsert_data_rows(specs, file_upload_thread_count)

@@ -614,7 +613,7 @@ def upsert_data_rows(self,

     def _exec_upsert_data_rows(
             self,
-            specs: List[DataRowItemBase],
+            specs: List[DataRowUpsertItem],
             file_upload_thread_count: int = FILE_UPLOAD_THREAD_COUNT
     ) -> "DataUpsertTask":

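A short sketch of the path-vs-dict split that create_data_rows performs above; the local path and URL are hypothetical, and _build_from_local_paths is the SDK helper referenced in the diff:

```python
# Hypothetical inputs: one local file path (str) and one hosted row (dict).
items = ["./photos/img_01.jpg",
         {"row_data": "http://my_site.com/photos/img_02.jpg"}]

# create_data_rows first partitions the two shapes,
string_items = [item for item in items if isinstance(item, str)]
dict_items = [item for item in items if isinstance(item, dict)]

# then uploads the local paths (self._build_from_local_paths in the SDK)
# and builds every resulting dict into a DataRowUpsertItem spec.
```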
libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py
@@ -3,7 +3,7 @@
 from typing import List

 from labelbox import pydantic_compat
-from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem, DataRowCreateItem
+from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem
 from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator


@@ -16,19 +16,14 @@ class UploadManifest(pydantic_compat.BaseModel):
     SOURCE_SDK = "SDK"


-def upload_in_chunks(client, specs: List[DataRowItemBase],
+def upload_in_chunks(client, specs: List[DataRowUpsertItem],
                      file_upload_thread_count: int,
                      max_chunk_size_bytes: int) -> UploadManifest:
     empty_specs = list(filter(lambda spec: spec.is_empty(), specs))

     if empty_specs:
         ids = list(map(lambda spec: spec.id.get("value"), empty_specs))
-        ids = list(filter(lambda x: x is not None and len(x) > 0, ids))
-        if len(ids) > 0:
-            raise ValueError(
-                f"The following items have an empty payload: {ids}")
-        else:  # case of create items
-            raise ValueError("Some items have an empty payload")
+        raise ValueError(f"The following items have an empty payload: {ids}")

     chunk_uris = DescriptorFileCreator(client).create(
         specs, max_chunk_size_bytes=max_chunk_size_bytes)
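A minimal sketch of the restored failure mode, assuming the surviving DataRowUpsertItem keeps the is_empty semantics visible in the next file's removed code (a payload that is empty, or whose only key is dataset_id, counts as empty); the values mirror the deleted test_upsert_is_empty cases:

```python
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem

specs = [
    DataRowUpsertItem(id={}, payload={}),
    DataRowUpsertItem(id={}, payload={"dataset_id": "test_dataset"}),
    DataRowUpsertItem(
        id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"}),
]
# The first two specs trip the check in upload_in_chunks; the third does not.
assert [spec.is_empty() for spec in specs] == [True, True, False]
# upload_in_chunks would then raise a single error listing the key values:
#   ValueError: The following items have an empty payload: [None, None]
```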
libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py
@@ -1,32 +1,23 @@
-from abc import ABC, abstractmethod
-
 from typing import List, Tuple, Optional

+from labelbox.pydantic_compat import BaseModel
 from labelbox.schema.identifiable import UniqueId, GlobalKey
-from labelbox import pydantic_compat
-from labelbox.schema.data_row import DataRow
-from labelbox.pydantic_compat import BaseModel


-class DataRowItemBase(ABC, pydantic_compat.BaseModel):
+class DataRowUpsertItem(BaseModel):
     """
     Base class for creating payloads for upsert operations.
     """

     id: dict
     payload: dict

-    @abstractmethod
-    def is_empty(self) -> bool:
-        ...
-
     @classmethod
     def build(
         cls,
         dataset_id: str,
         items: List[dict],
         key_types: Optional[Tuple[type, ...]] = ()
-    ) -> List["DataRowItemBase"]:
+    ) -> List["DataRowUpsertItem"]:
         upload_items = []

         for item in items:
@@ -50,46 +41,10 @@ def build(
             upload_items.append(cls(payload=item, id=key))
         return upload_items
-
-
-class DataRowUpsertItem(DataRowItemBase):
-
-    def is_empty(self) -> bool:
-        """
-        The payload is considered empty if it's actually empty or the only key is `dataset_id`.
-        :return: bool
-        """
-        return (not self.payload or
-                len(self.payload.keys()) == 1 and "dataset_id" in self.payload)
-
-    @classmethod
-    def build(
-        cls,
-        dataset_id: str,
-        items: List[dict],
-        key_types: Optional[Tuple[type, ...]] = (UniqueId, GlobalKey)
-    ) -> List["DataRowItemBase"]:
-        return super().build(dataset_id, items, (UniqueId, GlobalKey))
-
-
-class DataRowCreateItem(DataRowItemBase):
-
-    def is_empty(self) -> bool:
-        """
-        The payload is considered empty if it's actually empty or row_data is empty
-        or the only key is `dataset_id`.
-        :return: bool
-        """
-        row_data = self.payload.get("row_data", None) or self.payload.get(
-            DataRow.row_data, None)
-        return (not self.payload or len(self.payload.keys()) == 1 and
-                "dataset_id" in self.payload or row_data is None or
-                len(row_data) == 0)
-
-    @classmethod
-    def build(
-        cls,
-        dataset_id: str,
-        items: List[dict],
-        key_types: Optional[Tuple[type, ...]] = ()
-    ) -> List["DataRowItemBase"]:
-        return super().build(dataset_id, items, ())
libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py
@@ -13,7 +13,7 @@
 from labelbox.schema.embedding import EmbeddingVector
 from labelbox.schema.internal.datarow_upload_constants import (
     FILE_UPLOAD_THREAD_COUNT)
-from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem
+from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem

 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
@@ -248,7 +248,7 @@ def format_legacy_conversational_data(item):
         return item

     def convert_item(data_row_item):
-        if isinstance(data_row_item, DataRowItemBase):
+        if isinstance(data_row_item, DataRowUpsertItem):
             item = data_row_item.payload
         else:
             item = data_row_item
@@ -273,7 +273,7 @@ def convert_item(data_row_item):
         # Upload any local file paths
         item = upload_if_necessary(item)

-        if isinstance(data_row_item, DataRowItemBase):
+        if isinstance(data_row_item, DataRowUpsertItem):
             return {'id': data_row_item.id, 'payload': item}
         else:
             return item
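The isinstance dispatch above exists because convert_item appears to receive both wrapped specs (from the upsert path) and bare dicts (from legacy callers). A compressed sketch, with the real upload/format steps replaced by a hypothetical no-op:

```python
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem

def convert_item(data_row_item):
    # unwrap specs; bare dict items pass straight through
    if isinstance(data_row_item, DataRowUpsertItem):
        item = data_row_item.payload
    else:
        item = data_row_item

    item = dict(item)  # hypothetical stand-in for the real conversion steps

    # re-wrap specs so the descriptor file keeps the id/payload pairing
    if isinstance(data_row_item, DataRowUpsertItem):
        return {'id': data_row_item.id, 'payload': item}
    return item
```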
96 changes: 4 additions & 92 deletions libs/labelbox/tests/unit/test_data_row_upsert_data.py
@@ -1,13 +1,7 @@
-from unittest.mock import MagicMock, patch
-
 import pytest
-from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem,
-                                                           DataRowCreateItem)
+from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
 from labelbox.schema.identifiable import UniqueId, GlobalKey
 from labelbox.schema.asset_attachment import AttachmentType
-from labelbox.schema.dataset import Dataset
-from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
-from labelbox.schema.data_row import DataRow


 @pytest.fixture
@@ -32,17 +26,6 @@ def data_row_create_items():
     return dataset_id, items


-@pytest.fixture
-def data_row_create_items_row_data_none():
-    dataset_id = 'test_dataset'
-    items = [
-        {
-            "row_data": None,
-        },
-    ]
-    return dataset_id, items
-
-
 @pytest.fixture
 def data_row_update_items():
     dataset_id = 'test_dataset'
@@ -63,15 +46,15 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items):
     dataset_id, create_items = data_row_create_items
     dataset_id, update_items = data_row_update_items
     items = create_items + update_items
-    result = DataRowUpsertItem.build(dataset_id, items)
+    result = DataRowUpsertItem.build(dataset_id, items, (UniqueId, GlobalKey))
     assert len(result) == len(items)
     for item, res in zip(items, result):
         assert res.payload == item


 def test_data_row_create_items(data_row_create_items):
     dataset_id, items = data_row_create_items
-    result = DataRowCreateItem.build(dataset_id, items)
+    result = DataRowUpsertItem.build(dataset_id, items)
     assert len(result) == len(items)
     for item, res in zip(items, result):
         assert res.payload == item
@@ -80,75 +63,4 @@ def test_data_row_create_items_not_updateable(data_row_update_items):
 def test_data_row_create_items_not_updateable(data_row_update_items):
     dataset_id, items = data_row_update_items
     with pytest.raises(ValueError):
-        DataRowCreateItem.build(dataset_id, items)
-
-
-def test_upsert_is_empty():
-    item = DataRowUpsertItem(id={
-        "id": UniqueId,
-        "value": UniqueId("123")
-    },
-                             payload={})
-    assert item.is_empty()
-
-    item = DataRowUpsertItem(id={
-        "id": UniqueId,
-        "value": UniqueId("123")
-    },
-                             payload={"dataset_id": "test_dataset"})
-    assert item.is_empty()
-
-    item = DataRowUpsertItem(
-        id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"})
-    assert not item.is_empty()
-
-
-def test_create_is_empty():
-    item = DataRowCreateItem(id={}, payload={})
-    assert item.is_empty()
-
-    item = DataRowCreateItem(id={}, payload={"dataset_id": "test_dataset"})
-    assert item.is_empty()
-
-    item = DataRowCreateItem(id={}, payload={"row_data": None})
-    assert item.is_empty()
-
-    item = DataRowCreateItem(id={}, payload={"row_data": ""})
-    assert item.is_empty()
-
-    item = DataRowCreateItem(
-        id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"})
-    assert not item.is_empty()
-
-    item = DataRowCreateItem(
-        id={},
-        payload={DataRow.row_data: "http://my_site.com/photos/img_01.jpg"})
-    assert not item.is_empty()
-
-
-def test_create_row_data_none():
-    items = [
-        {
-            "row_data": None,
-            "global_key": "global_key1",
-        },
-    ]
-    client = MagicMock()
-    dataset = Dataset(
-        client, {
-            "id": 'test_dataset',
-            "name": 'test_dataset',
-            "createdAt": "2021-06-01T00:00:00.000Z",
-            "description": "test_dataset",
-            "updatedAt": "2021-06-01T00:00:00.000Z",
-            "rowCount": 0,
-        })
-
-    with patch.object(DescriptorFileCreator,
-                      'create',
-                      return_value=["http://bar.com/chunk_uri"]):
-        with pytest.raises(ValueError,
-                           match="Some items have an empty payload"):
-            dataset.create_data_rows(items)
-
-    client.execute.assert_not_called()
+        DataRowUpsertItem.build(dataset_id, items, ())
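With the fixtures above, the behavior under test reduces to the sketch below: update-style items (those carrying a UniqueId or GlobalKey key) are rejected whenever the allowed key_types tuple is empty. The key value and URL are hypothetical, and the fixture contents are assumed to follow this shape:

```python
import pytest
from labelbox.schema.identifiable import UniqueId
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem


def test_keys_rejected_for_create_style_build():
    items = [{"key": UniqueId("abc123"),
              "row_data": "http://my_site.com/photos/img_01.jpg"}]
    # An empty key_types tuple marks the call as create-style, so the
    # identifier key must raise.
    with pytest.raises(ValueError):
        DataRowUpsertItem.build("test_dataset", items, ())
```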