Skip to content

Commit 22b9a3a

Browse files
author
Val Brodsky
committed
Add validation for empty row_data for Dataset create_data_rows
1 parent d77170f commit 22b9a3a

File tree

5 files changed

+155
-22
lines changed

5 files changed

+155
-22
lines changed

libs/labelbox/src/labelbox/schema/dataset.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@
3232
from labelbox.schema.task import Task, DataUpsertTask
3333
from labelbox.schema.user import User
3434
from labelbox.schema.iam_integration import IAMIntegration
35-
from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
35+
from labelbox.schema.internal.data_row_upsert_item import (DataRowItemBase,
36+
DataRowUpsertItem,
37+
DataRowCreateItem)
3638
import labelbox.schema.internal.data_row_uploader as data_row_uploader
3739
from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
3840
from labelbox.schema.internal.datarow_upload_constants import (
@@ -288,10 +290,9 @@ def create_data_rows(self,
288290
string_items = [item for item in items if isinstance(item, str)]
289291
dict_items = [item for item in items if isinstance(item, dict)]
290292
dict_string_items = []
291-
292293
if len(string_items) > 0:
293294
dict_string_items = self._build_from_local_paths(string_items)
294-
specs = DataRowUpsertItem.build(self.uid,
295+
specs = DataRowCreateItem.build(self.uid,
295296
dict_items + dict_string_items)
296297
return self._exec_upsert_data_rows(specs, file_upload_thread_count)
297298

@@ -613,7 +614,7 @@ def upsert_data_rows(self,
613614

614615
def _exec_upsert_data_rows(
615616
self,
616-
specs: List[DataRowUpsertItem],
617+
specs: List[DataRowItemBase],
617618
file_upload_thread_count: int = FILE_UPLOAD_THREAD_COUNT
618619
) -> "DataUpsertTask":
619620

libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import List
44

55
from labelbox import pydantic_compat
6-
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem
6+
from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem, DataRowCreateItem
77
from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
88

99

@@ -16,14 +16,19 @@ class UploadManifest(pydantic_compat.BaseModel):
1616
SOURCE_SDK = "SDK"
1717

1818

19-
def upload_in_chunks(client, specs: List[DataRowUpsertItem],
19+
def upload_in_chunks(client, specs: List[DataRowItemBase],
2020
file_upload_thread_count: int,
2121
max_chunk_size_bytes: int) -> UploadManifest:
2222
empty_specs = list(filter(lambda spec: spec.is_empty(), specs))
2323

2424
if empty_specs:
2525
ids = list(map(lambda spec: spec.id.get("value"), empty_specs))
26-
raise ValueError(f"The following items have an empty payload: {ids}")
26+
ids = list(filter(lambda x: x is not None and len(x) > 0, ids))
27+
if len(ids) > 0:
28+
raise ValueError(
29+
f"The following items have an empty payload: {ids}")
30+
else: # case of create items
31+
raise ValueError("Some items have an empty payload")
2732

2833
chunk_uris = DescriptorFileCreator(client).create(
2934
specs, max_chunk_size_bytes=max_chunk_size_bytes)

libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,24 @@
1+
from abc import ABC, abstractmethod
2+
13
from typing import List, Tuple, Optional
24

3-
from labelbox.schema.identifiable import UniqueId, GlobalKey
45
from labelbox.pydantic_compat import BaseModel
6+
from labelbox.schema.identifiable import UniqueId, GlobalKey
57

68

7-
class DataRowUpsertItem(BaseModel):
8-
"""
9-
Base class for creating payloads for upsert operations.
10-
"""
11-
id: dict
12-
payload: dict
9+
class DataRowItemBase(ABC):
10+
11+
@abstractmethod
12+
def is_empty(self) -> bool:
13+
...
1314

1415
@classmethod
1516
def build(
1617
cls,
1718
dataset_id: str,
1819
items: List[dict],
1920
key_types: Optional[Tuple[type, ...]] = ()
20-
) -> List["DataRowUpsertItem"]:
21+
) -> List["DataRowItemBase"]:
2122
upload_items = []
2223

2324
for item in items:
@@ -41,10 +42,54 @@ def build(
4142
upload_items.append(cls(payload=item, id=key))
4243
return upload_items
4344

45+
46+
class DataRowUpsertItem(BaseModel, DataRowItemBase):
47+
"""
48+
Base class for creating payloads for upsert operations.
49+
"""
50+
id: dict
51+
payload: dict
52+
4453
def is_empty(self) -> bool:
4554
"""
4655
The payload is considered empty if it's actually empty or the only key is `dataset_id`.
4756
:return: bool
4857
"""
4958
return (not self.payload or
5059
len(self.payload.keys()) == 1 and "dataset_id" in self.payload)
60+
61+
@classmethod
62+
def build(
63+
cls,
64+
dataset_id: str,
65+
items: List[dict],
66+
key_types: Optional[Tuple[type, ...]] = ()
67+
) -> List["DataRowUpsertItem"]:
68+
return super().build(dataset_id, items, (UniqueId, GlobalKey))
69+
70+
71+
class DataRowCreateItem(BaseModel, DataRowItemBase):
72+
"""
73+
Base class for creating payloads for create operations.
74+
"""
75+
id: dict
76+
payload: dict
77+
78+
def is_empty(self) -> bool:
79+
"""
80+
The payload is considered empty if it's actually empty or row_data is empty
81+
or the only key is `dataset_id`.
82+
:return: bool
83+
"""
84+
row_data = self.payload.get("row_data", None)
85+
return (not self.payload or len(self.payload.keys()) == 1 and
86+
"dataset_id" in self.payload or row_data is None or
87+
len(row_data) == 0)
88+
89+
@classmethod
90+
def build(
91+
cls,
92+
dataset_id: str,
93+
items: List[dict],
94+
) -> List["DataRowUpsertItem"]:
95+
return super().build(dataset_id, items, ())

libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from labelbox.schema.embedding import EmbeddingVector
1414
from labelbox.schema.internal.datarow_upload_constants import (
1515
FILE_UPLOAD_THREAD_COUNT)
16-
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem
16+
from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem
1717

1818
from typing import TYPE_CHECKING
1919
if TYPE_CHECKING:
@@ -248,7 +248,7 @@ def format_legacy_conversational_data(item):
248248
return item
249249

250250
def convert_item(data_row_item):
251-
if isinstance(data_row_item, DataRowUpsertItem):
251+
if isinstance(data_row_item, DataRowItemBase):
252252
item = data_row_item.payload
253253
else:
254254
item = data_row_item
@@ -273,7 +273,7 @@ def convert_item(data_row_item):
273273
# Upload any local file paths
274274
item = upload_if_necessary(item)
275275

276-
if isinstance(data_row_item, DataRowUpsertItem):
276+
if isinstance(data_row_item, DataRowItemBase):
277277
return {'id': data_row_item.id, 'payload': item}
278278
else:
279279
return item

libs/labelbox/tests/unit/test_data_row_upsert_data.py

Lines changed: 86 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
1+
from unittest.mock import MagicMock, patch
2+
13
import pytest
2-
from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
4+
from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem,
5+
DataRowCreateItem)
36
from labelbox.schema.identifiable import UniqueId, GlobalKey
47
from labelbox.schema.asset_attachment import AttachmentType
8+
from labelbox.schema.dataset import Dataset
9+
from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
510

611

712
@pytest.fixture
@@ -26,6 +31,17 @@ def data_row_create_items():
2631
return dataset_id, items
2732

2833

34+
@pytest.fixture
35+
def data_row_create_items_row_data_none():
36+
dataset_id = 'test_dataset'
37+
items = [
38+
{
39+
"row_data": None,
40+
},
41+
]
42+
return dataset_id, items
43+
44+
2945
@pytest.fixture
3046
def data_row_update_items():
3147
dataset_id = 'test_dataset'
@@ -46,15 +62,15 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items):
4662
dataset_id, create_items = data_row_create_items
4763
dataset_id, update_items = data_row_update_items
4864
items = create_items + update_items
49-
result = DataRowUpsertItem.build(dataset_id, items, (UniqueId, GlobalKey))
65+
result = DataRowUpsertItem.build(dataset_id, items)
5066
assert len(result) == len(items)
5167
for item, res in zip(items, result):
5268
assert res.payload == item
5369

5470

5571
def test_data_row_create_items(data_row_create_items):
5672
dataset_id, items = data_row_create_items
57-
result = DataRowUpsertItem.build(dataset_id, items)
73+
result = DataRowCreateItem.build(dataset_id, items)
5874
assert len(result) == len(items)
5975
for item, res in zip(items, result):
6076
assert res.payload == item
@@ -63,4 +79,70 @@ def test_data_row_create_items(data_row_create_items):
6379
def test_data_row_create_items_not_updateable(data_row_update_items):
6480
dataset_id, items = data_row_update_items
6581
with pytest.raises(ValueError):
66-
DataRowUpsertItem.build(dataset_id, items, ())
82+
DataRowCreateItem.build(dataset_id, items)
83+
84+
85+
def test_upsert_is_empty():
86+
item = DataRowUpsertItem(id={
87+
"id": UniqueId,
88+
"value": UniqueId("123")
89+
},
90+
payload={})
91+
assert item.is_empty()
92+
93+
item = DataRowUpsertItem(id={
94+
"id": UniqueId,
95+
"value": UniqueId("123")
96+
},
97+
payload={"dataset_id": "test_dataset"})
98+
assert item.is_empty()
99+
100+
item = DataRowUpsertItem(
101+
id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"})
102+
assert not item.is_empty()
103+
104+
105+
def test_create_is_empty():
106+
item = DataRowCreateItem(id={}, payload={})
107+
assert item.is_empty()
108+
109+
item = DataRowCreateItem(id={}, payload={"dataset_id": "test_dataset"})
110+
assert item.is_empty()
111+
112+
item = DataRowCreateItem(id={}, payload={"row_data": None})
113+
assert item.is_empty()
114+
115+
item = DataRowCreateItem(id={}, payload={"row_data": ""})
116+
assert item.is_empty()
117+
118+
item = DataRowCreateItem(
119+
id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"})
120+
assert not item.is_empty()
121+
122+
123+
def test_create_row_data_none():
124+
items = [
125+
{
126+
"row_data": None,
127+
"global_key": "global_key1",
128+
},
129+
]
130+
client = MagicMock()
131+
dataset = Dataset(
132+
client, {
133+
"id": 'test_dataset',
134+
"name": 'test_dataset',
135+
"createdAt": "2021-06-01T00:00:00.000Z",
136+
"description": "test_dataset",
137+
"updatedAt": "2021-06-01T00:00:00.000Z",
138+
"rowCount": 0,
139+
})
140+
141+
with patch.object(DescriptorFileCreator,
142+
'create',
143+
return_value=["http://bar.com/chunk_uri"]):
144+
with pytest.raises(ValueError,
145+
match="Some items have an empty payload"):
146+
dataset.create_data_rows(items)
147+
148+
client.execute.assert_not_called()

0 commit comments

Comments
 (0)