Skip to content

Commit e66f9e9

Browse files
author
Val Brodsky
committed
Restore back "[PLT-1107] Add validation for empty row_data for Dataset create_data_rows (#1667)"
1 parent 97df6e7 commit e66f9e9

File tree

5 files changed

+156
-17
lines changed

5 files changed

+156
-17
lines changed

libs/labelbox/src/labelbox/schema/dataset.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@
3232
from labelbox.schema.task import Task, DataUpsertTask
3333
from labelbox.schema.user import User
3434
from labelbox.schema.iam_integration import IAMIntegration
35-
from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
35+
from labelbox.schema.internal.data_row_upsert_item import (DataRowItemBase,
36+
DataRowUpsertItem,
37+
DataRowCreateItem)
3638
import labelbox.schema.internal.data_row_uploader as data_row_uploader
3739
from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
3840
from labelbox.schema.internal.datarow_upload_constants import (
@@ -288,10 +290,9 @@ def create_data_rows(self,
288290
string_items = [item for item in items if isinstance(item, str)]
289291
dict_items = [item for item in items if isinstance(item, dict)]
290292
dict_string_items = []
291-
292293
if len(string_items) > 0:
293294
dict_string_items = self._build_from_local_paths(string_items)
294-
specs = DataRowUpsertItem.build(self.uid,
295+
specs = DataRowCreateItem.build(self.uid,
295296
dict_items + dict_string_items)
296297
return self._exec_upsert_data_rows(specs, file_upload_thread_count)
297298

@@ -613,7 +614,7 @@ def upsert_data_rows(self,
613614

614615
def _exec_upsert_data_rows(
615616
self,
616-
specs: List[DataRowUpsertItem],
617+
specs: List[DataRowItemBase],
617618
file_upload_thread_count: int = FILE_UPLOAD_THREAD_COUNT
618619
) -> "DataUpsertTask":
619620

libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import List
44

55
from labelbox import pydantic_compat
6-
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem
6+
from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem, DataRowCreateItem
77
from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
88

99

@@ -16,14 +16,19 @@ class UploadManifest(pydantic_compat.BaseModel):
1616
SOURCE_SDK = "SDK"
1717

1818

19-
def upload_in_chunks(client, specs: List[DataRowUpsertItem],
19+
def upload_in_chunks(client, specs: List[DataRowItemBase],
2020
file_upload_thread_count: int,
2121
max_chunk_size_bytes: int) -> UploadManifest:
2222
empty_specs = list(filter(lambda spec: spec.is_empty(), specs))
2323

2424
if empty_specs:
2525
ids = list(map(lambda spec: spec.id.get("value"), empty_specs))
26-
raise ValueError(f"The following items have an empty payload: {ids}")
26+
ids = list(filter(lambda x: x is not None and len(x) > 0, ids))
27+
if len(ids) > 0:
28+
raise ValueError(
29+
f"The following items have an empty payload: {ids}")
30+
else: # case of create items
31+
raise ValueError("Some items have an empty payload")
2732

2833
chunk_uris = DescriptorFileCreator(client).create(
2934
specs, max_chunk_size_bytes=max_chunk_size_bytes)

libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,32 @@
1+
from abc import ABC, abstractmethod
2+
13
from typing import List, Tuple, Optional
24

3-
from labelbox.schema.identifiable import UniqueId, GlobalKey
45
from labelbox.pydantic_compat import BaseModel
6+
from labelbox.schema.identifiable import UniqueId, GlobalKey
7+
from labelbox import pydantic_compat
8+
from labelbox.schema.data_row import DataRow
59

610

7-
class DataRowUpsertItem(BaseModel):
11+
class DataRowItemBase(ABC, pydantic_compat.BaseModel):
812
"""
913
Base class for creating payloads for upsert operations.
1014
"""
15+
1116
id: dict
1217
payload: dict
1318

19+
@abstractmethod
20+
def is_empty(self) -> bool:
21+
...
22+
1423
@classmethod
1524
def build(
1625
cls,
1726
dataset_id: str,
1827
items: List[dict],
1928
key_types: Optional[Tuple[type, ...]] = ()
20-
) -> List["DataRowUpsertItem"]:
29+
) -> List["DataRowItemBase"]:
2130
upload_items = []
2231

2332
for item in items:
@@ -41,10 +50,46 @@ def build(
4150
upload_items.append(cls(payload=item, id=key))
4251
return upload_items
4352

53+
54+
class DataRowUpsertItem(DataRowItemBase):
55+
4456
def is_empty(self) -> bool:
4557
"""
4658
The payload is considered empty if it's actually empty or the only key is `dataset_id`.
4759
:return: bool
4860
"""
4961
return (not self.payload or
5062
len(self.payload.keys()) == 1 and "dataset_id" in self.payload)
63+
64+
@classmethod
65+
def build(
66+
cls,
67+
dataset_id: str,
68+
items: List[dict],
69+
key_types: Optional[Tuple[type, ...]] = (UniqueId, GlobalKey)
70+
) -> List["DataRowItemBase"]:
71+
return super().build(dataset_id, items, (UniqueId, GlobalKey))
72+
73+
74+
class DataRowCreateItem(DataRowItemBase):
75+
76+
def is_empty(self) -> bool:
77+
"""
78+
The payload is considered empty if it's actually empty or row_data is empty
79+
or the only key is `dataset_id`.
80+
:return: bool
81+
"""
82+
row_data = self.payload.get("row_data", None) or self.payload.get(
83+
DataRow.row_data, None)
84+
return (not self.payload or len(self.payload.keys()) == 1 and
85+
"dataset_id" in self.payload or row_data is None or
86+
len(row_data) == 0)
87+
88+
@classmethod
89+
def build(
90+
cls,
91+
dataset_id: str,
92+
items: List[dict],
93+
key_types: Optional[Tuple[type, ...]] = ()
94+
) -> List["DataRowItemBase"]:
95+
return super().build(dataset_id, items, ())

libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from labelbox.schema.embedding import EmbeddingVector
1414
from labelbox.schema.internal.datarow_upload_constants import (
1515
FILE_UPLOAD_THREAD_COUNT)
16-
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem
16+
from labelbox.schema.internal.data_row_upsert_item import DataRowItemBase, DataRowUpsertItem
1717

1818
from typing import TYPE_CHECKING
1919
if TYPE_CHECKING:
@@ -248,7 +248,7 @@ def format_legacy_conversational_data(item):
248248
return item
249249

250250
def convert_item(data_row_item):
251-
if isinstance(data_row_item, DataRowUpsertItem):
251+
if isinstance(data_row_item, DataRowItemBase):
252252
item = data_row_item.payload
253253
else:
254254
item = data_row_item
@@ -273,7 +273,7 @@ def convert_item(data_row_item):
273273
# Upload any local file paths
274274
item = upload_if_necessary(item)
275275

276-
if isinstance(data_row_item, DataRowUpsertItem):
276+
if isinstance(data_row_item, DataRowItemBase):
277277
return {'id': data_row_item.id, 'payload': item}
278278
else:
279279
return item

libs/labelbox/tests/unit/test_data_row_upsert_data.py

Lines changed: 92 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
1+
from unittest.mock import MagicMock, patch
2+
13
import pytest
2-
from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
4+
from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem,
5+
DataRowCreateItem)
36
from labelbox.schema.identifiable import UniqueId, GlobalKey
47
from labelbox.schema.asset_attachment import AttachmentType
8+
from labelbox.schema.dataset import Dataset
9+
from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
10+
from labelbox.schema.data_row import DataRow
511

612

713
@pytest.fixture
@@ -26,6 +32,17 @@ def data_row_create_items():
2632
return dataset_id, items
2733

2834

35+
@pytest.fixture
36+
def data_row_create_items_row_data_none():
37+
dataset_id = 'test_dataset'
38+
items = [
39+
{
40+
"row_data": None,
41+
},
42+
]
43+
return dataset_id, items
44+
45+
2946
@pytest.fixture
3047
def data_row_update_items():
3148
dataset_id = 'test_dataset'
@@ -46,15 +63,15 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items):
4663
dataset_id, create_items = data_row_create_items
4764
dataset_id, update_items = data_row_update_items
4865
items = create_items + update_items
49-
result = DataRowUpsertItem.build(dataset_id, items, (UniqueId, GlobalKey))
66+
result = DataRowUpsertItem.build(dataset_id, items)
5067
assert len(result) == len(items)
5168
for item, res in zip(items, result):
5269
assert res.payload == item
5370

5471

5572
def test_data_row_create_items(data_row_create_items):
5673
dataset_id, items = data_row_create_items
57-
result = DataRowUpsertItem.build(dataset_id, items)
74+
result = DataRowCreateItem.build(dataset_id, items)
5875
assert len(result) == len(items)
5976
for item, res in zip(items, result):
6077
assert res.payload == item
@@ -63,4 +80,75 @@ def test_data_row_create_items(data_row_create_items):
6380
def test_data_row_create_items_not_updateable(data_row_update_items):
6481
dataset_id, items = data_row_update_items
6582
with pytest.raises(ValueError):
66-
DataRowUpsertItem.build(dataset_id, items, ())
83+
DataRowCreateItem.build(dataset_id, items)
84+
85+
86+
def test_upsert_is_empty():
87+
item = DataRowUpsertItem(id={
88+
"id": UniqueId,
89+
"value": UniqueId("123")
90+
},
91+
payload={})
92+
assert item.is_empty()
93+
94+
item = DataRowUpsertItem(id={
95+
"id": UniqueId,
96+
"value": UniqueId("123")
97+
},
98+
payload={"dataset_id": "test_dataset"})
99+
assert item.is_empty()
100+
101+
item = DataRowUpsertItem(
102+
id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"})
103+
assert not item.is_empty()
104+
105+
106+
def test_create_is_empty():
107+
item = DataRowCreateItem(id={}, payload={})
108+
assert item.is_empty()
109+
110+
item = DataRowCreateItem(id={}, payload={"dataset_id": "test_dataset"})
111+
assert item.is_empty()
112+
113+
item = DataRowCreateItem(id={}, payload={"row_data": None})
114+
assert item.is_empty()
115+
116+
item = DataRowCreateItem(id={}, payload={"row_data": ""})
117+
assert item.is_empty()
118+
119+
item = DataRowCreateItem(
120+
id={}, payload={"row_data": "http://my_site.com/photos/img_01.jpg"})
121+
assert not item.is_empty()
122+
123+
item = DataRowCreateItem(
124+
id={},
125+
payload={DataRow.row_data: "http://my_site.com/photos/img_01.jpg"})
126+
assert not item.is_empty()
127+
128+
129+
def test_create_row_data_none():
130+
items = [
131+
{
132+
"row_data": None,
133+
"global_key": "global_key1",
134+
},
135+
]
136+
client = MagicMock()
137+
dataset = Dataset(
138+
client, {
139+
"id": 'test_dataset',
140+
"name": 'test_dataset',
141+
"createdAt": "2021-06-01T00:00:00.000Z",
142+
"description": "test_dataset",
143+
"updatedAt": "2021-06-01T00:00:00.000Z",
144+
"rowCount": 0,
145+
})
146+
147+
with patch.object(DescriptorFileCreator,
148+
'create',
149+
return_value=["http://bar.com/chunk_uri"]):
150+
with pytest.raises(ValueError,
151+
match="Some items have an empty payload"):
152+
dataset.create_data_rows(items)
153+
154+
client.execute.assert_not_called()

0 commit comments

Comments
 (0)