
Commit b7680e8

Val Brodsky committed: PR feedback
1 parent 79d3d0c · commit b7680e8

File tree: 6 files changed, +66 −70 lines

libs/labelbox/src/labelbox/schema/dataset.py

Lines changed: 10 additions & 7 deletions
@@ -32,9 +32,7 @@
 from labelbox.schema.task import Task, DataUpsertTask
 from labelbox.schema.user import User
 from labelbox.schema.iam_integration import IAMIntegration
-from labelbox.schema.internal.data_row_create_upsert import (DataRowItemBase,
-                                                             DataRowUpsertItem,
-                                                             DataRowCreateItem)
+from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
 from labelbox.schema.internal.data_row_uploader import DataRowUploader
 from labelbox.schema.internal.datarow_upload_constants import (
     MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT, UPSERT_CHUNK_SIZE)

@@ -284,13 +282,18 @@ def create_data_rows(self,

         NOTE dicts and strings items can not be mixed in the same call. It is a responsibility of the caller to ensure that all items are of the same type.
         """
+
+        if file_upload_thread_count < 1:
+            raise ValueError(
+                "file_upload_thread_count must be a positive integer")
+
         string_items = [item for item in items if isinstance(item, str)]
         dict_items = [item for item in items if isinstance(item, dict)]
         dict_string_items = []

         if len(string_items) > 0:
             dict_string_items = self._build_from_local_paths(string_items)
-        specs = DataRowCreateItem.build(self.uid,
+        specs = DataRowUpsertItem.build(self.uid,
                                         dict_items + dict_string_items)
         return self._exec_upsert_data_rows(specs, file_upload_thread_count)

@@ -607,12 +610,12 @@ def upsert_data_rows(self,
         >>> ])
         >>> task.wait_till_done()
         """
-        specs = DataRowUpsertItem.build(self.uid, items)
+        specs = DataRowUpsertItem.build(self.uid, items, (UniqueId, GlobalKey))
         return self._exec_upsert_data_rows(specs, file_upload_thread_count)

     def _exec_upsert_data_rows(
             self,
-            specs: List[DataRowItemBase],
+            specs: List[DataRowUpsertItem],
             file_upload_thread_count: int = FILE_UPLOAD_THREAD_COUNT
     ) -> "DataUpsertTask":

@@ -622,7 +625,7 @@ def _exec_upsert_data_rows(
             file_upload_thread_count=file_upload_thread_count,
             upsert_chunk_size=UPSERT_CHUNK_SIZE)

-        data = json.dumps(manifest.to_dict()).encode("utf-8")
+        data = json.dumps(manifest.dict()).encode("utf-8")
         manifest_uri = self.client.upload_data(data,
                                                content_type="application/json",
                                                filename="manifest.json")

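A quick usage sketch of the changed `create_data_rows` behavior (the client setup, dataset name, and URLs below are placeholders, not from this commit): creation now goes through the same `DataRowUpsertItem` path as upsert, and a non-positive `file_upload_thread_count` fails fast with a `ValueError` instead of reaching the thread pool.

# Hypothetical sketch; API key, dataset name, and URLs are placeholders.
import labelbox as lb

client = lb.Client(api_key="<YOUR_API_KEY>")
dataset = client.create_dataset(name="thread-count-demo")

# Valid call: dict items only (strings and dicts can not be mixed in one call).
task = dataset.create_data_rows(
    [{"row_data": "https://example.com/photos/img_01.jpg"}],
    file_upload_thread_count=2,
)
task.wait_till_done()

# The new guard rejects a non-positive thread count up front.
try:
    dataset.create_data_rows([{"row_data": "https://example.com/x.jpg"}],
                             file_upload_thread_count=0)
except ValueError as err:
    print(err)  # file_upload_thread_count must be a positive integer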
libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py

Lines changed: 19 additions & 26 deletions
@@ -2,31 +2,24 @@
 import os
 from concurrent.futures import ThreadPoolExecutor, as_completed

-from typing import Iterable, List, Union
+from typing import Iterable, List

 from labelbox.exceptions import InvalidQueryError
 from labelbox.exceptions import InvalidAttributeError
 from labelbox.exceptions import MalformedQueryException
 from labelbox.orm.model import Entity
 from labelbox.orm.model import Field
 from labelbox.schema.embedding import EmbeddingVector
-from labelbox.schema.internal.data_row_create_upsert import DataRowItemBase
-from labelbox.schema.internal.datarow_upload_constants import MAX_DATAROW_PER_API_OPERATION
+from labelbox.pydantic_compat import BaseModel
+from labelbox.schema.internal.datarow_upload_constants import (
+    MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT)
+from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem


-class UploadManifest:
-
-    def __init__(self, source: str, item_count: int, chunk_uris: List[str]):
-        self.source = source
-        self.item_count = item_count
-        self.chunk_uris = chunk_uris
-
-    def to_dict(self):
-        return {
-            "source": self.source,
-            "item_count": self.item_count,
-            "chunk_uris": self.chunk_uris
-        }
+class UploadManifest(BaseModel):
+    source: str
+    item_count: int
+    chunk_uris: List[str]


 class DataRowUploader:

@@ -39,15 +32,15 @@ def create_descriptor_file(client,
         """
         This function is shared by `Dataset.create_data_rows`, `Dataset.create_data_rows_sync` and `Dataset.update_data_rows`.
         It is used to prepare the input file. The user defined input is validated, processed, and json stringified.
-        Finally the json data is uploaded to gcs and a uri is returned. This uri can be passed to
+        Finally the json data is uploaded to gcs and a uri is returned. This uri can be passed as a parameter to a mutation that uploads data rows

         Each element in `items` can be either a `str` or a `dict`. If
         it is a `str`, then it is interpreted as a local file path. The file
         is uploaded to Labelbox and a DataRow referencing it is created.

         If an item is a `dict`, then it could support one of the two following structures
             1. For static imagery, video, and text it should map `DataRow` field names to values.
-               At the minimum an `item` passed as a `dict` must contain a `row_data` key and value.
+               At the minimum an `items` passed as a `dict` must contain a `row_data` key and value.
                If the value for row_data is a local file path and the path exists,
                then the local file will be uploaded to labelbox.

@@ -81,7 +74,7 @@ def create_descriptor_file(client,
            a DataRow.
            ValueError: When the upload parameters are invalid
         """
-        file_upload_thread_count = 20
+        file_upload_thread_count = FILE_UPLOAD_THREAD_COUNT
         DataRow = Entity.DataRow
         AssetAttachment = Entity.AssetAttachment

@@ -192,7 +185,7 @@ def validate_keys(item):
                 raise InvalidAttributeError(DataRow, invalid_keys)
             return item

-        def formatLegacyConversationalData(item):
+        def format_legacy_conversational_data(item):
             messages = item.pop("conversationalData")
             version = item.pop("version", 1)
             type = item.pop("type", "application/vnd.labelbox.conversational")

@@ -213,7 +206,7 @@ def formatLegacyConversationalData(item):
             return item

         def convert_item(data_row_item):
-            if isinstance(data_row_item, DataRowItemBase):
+            if isinstance(data_row_item, DataRowUpsertItem):
                 item = data_row_item.payload
             else:
                 item = data_row_item

@@ -223,7 +216,7 @@ def convert_item(data_row_item):
                 return item

             if "conversationalData" in item:
-                formatLegacyConversationalData(item)
+                format_legacy_conversational_data(item)

             # Convert all payload variations into the same dict format
             item = format_row(item)

@@ -238,7 +231,7 @@ def convert_item(data_row_item):
             # Upload any local file paths
             item = upload_if_necessary(item)

-            if isinstance(data_row_item, DataRowItemBase):
+            if isinstance(data_row_item, DataRowUpsertItem):
                 return {'id': data_row_item.id, 'payload': item}
             else:
                 return item

@@ -263,7 +256,7 @@ def convert_item(data_row_item):
                                           filename="json_import.json")

     @staticmethod
-    def upload_in_chunks(client, specs: List[DataRowItemBase],
+    def upload_in_chunks(client, specs: List[DataRowUpsertItem],
                          file_upload_thread_count: int,
                          upsert_chunk_size: int) -> UploadManifest:
         empty_specs = list(filter(lambda spec: spec.is_empty(), specs))

@@ -278,9 +271,9 @@ def upload_in_chunks(client, specs: List[DataRowItemBase],
             for i in range(0, len(specs), upsert_chunk_size)
         ]

-        def _upload_chunk(_chunk):
+        def _upload_chunk(chunk):
             return DataRowUploader.create_descriptor_file(client,
-                                                          _chunk,
+                                                          chunk,
                                                           is_upsert=True)

         with ThreadPoolExecutor(file_upload_thread_count) as executor:

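The hand-rolled `UploadManifest` above becomes a pydantic model, which is why `Dataset._exec_upsert_data_rows` switched from `manifest.to_dict()` to `manifest.dict()`. A standalone sketch of the equivalence, using plain pydantic v1 in place of the SDK's `labelbox.pydantic_compat` shim (the field values are made up):

import json
from typing import List

from pydantic import BaseModel  # the SDK routes this through labelbox.pydantic_compat


class UploadManifest(BaseModel):
    source: str
    item_count: int
    chunk_uris: List[str]


# pydantic v1's .dict() produces the same mapping the removed to_dict() built.
manifest = UploadManifest(source="sdk", item_count=2,
                          chunk_uris=["gs://chunk0", "gs://chunk1"])
data = json.dumps(manifest.dict()).encode("utf-8")
print(data)
# b'{"source": "sdk", "item_count": 2, "chunk_uris": ["gs://chunk0", "gs://chunk1"]}'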
libs/labelbox/src/labelbox/schema/internal/data_row_create_upsert.py renamed to libs/labelbox/src/labelbox/schema/internal/data_row_upsert_item.py

Lines changed: 19 additions & 22 deletions
@@ -1,19 +1,17 @@
-from abc import ABC, abstractmethod
 from typing import List, Tuple, Optional

 from labelbox.schema.identifiable import UniqueId, GlobalKey
 from labelbox.pydantic_compat import BaseModel


-class DataRowItemBase(BaseModel, ABC):
+class DataRowUpsertItem(BaseModel):
     """
     Base class for creating payloads for upsert operations.
     """
     id: dict
     payload: dict

     @classmethod
-    @abstractmethod
     def build(
         cls,
         dataset_id: str,

@@ -52,25 +50,24 @@ def is_empty(self) -> bool:
             len(self.payload.keys()) == 1 and "dataset_id" in self.payload)


-class DataRowUpsertItem(DataRowItemBase):
-
-    @classmethod
-    def build(
-        cls,
-        dataset_id: str,
-        items: List[dict],
-        key_types: Optional[Tuple[type, ...]] = ()
-    ) -> List["DataRowItemBase"]:
-        return super().build(dataset_id, items, (UniqueId, GlobalKey))
+# class DataRowUpsertItem(DataRowItemBase):

+#     @classmethod
+#     def build(
+#         cls,
+#         dataset_id: str,
+#         items: List[dict],
+#         key_types: Optional[Tuple[type, ...]] = ()
+#     ) -> List["DataRowItemBase"]:
+#         return super().build(dataset_id, items, (UniqueId, GlobalKey))

-class DataRowCreateItem(DataRowItemBase):
+# class DataRowCreateItem(DataRowItemBase):

-    @classmethod
-    def build(
-        cls,
-        dataset_id: str,
-        items: List[dict],
-        key_types: Optional[Tuple[type, ...]] = ()
-    ) -> List["DataRowItemBase"]:
-        return super().build(dataset_id, items, ())
+#     @classmethod
+#     def build(
+#         cls,
+#         dataset_id: str,
+#         items: List[dict],
+#         key_types: Optional[Tuple[type, ...]] = ()
+#     ) -> List["DataRowItemBase"]:
+#         return super().build(dataset_id, items, ())

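With the subclasses collapsed into a single `DataRowUpsertItem` (left behind only as the commented-out block above), the `key_types` tuple passed to `build` now carries the create-vs-upsert distinction at each call site. The body of `build` is not part of this diff, so the sketch below only mirrors the call sites and unit tests elsewhere in the commit; the `items` literal and dataset id are illustrative:

from labelbox.schema.identifiable import UniqueId, GlobalKey
from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem

items = [{"row_data": "https://example.com/photos/img_01.jpg"}]

# Upsert path (Dataset.upsert_data_rows): items may be keyed by UniqueId/GlobalKey.
specs = DataRowUpsertItem.build("<dataset_id>", items, (UniqueId, GlobalKey))

# Create path (Dataset.create_data_rows): key_types defaults to (), so items
# addressed by an existing UniqueId/GlobalKey raise ValueError (see the unit
# tests at the end of this commit).
specs = DataRowUpsertItem.build("<dataset_id>", items)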
libs/labelbox/src/labelbox/schema/task.py

Lines changed: 3 additions & 3 deletions
@@ -233,7 +233,7 @@ class DataUpsertTask(Task):
     """
     Task class for data row upsert operations
     """
-    __max_donwload_size: Final = MAX_DATAROW_PER_API_OPERATION
+    MAX_DOWNLOAD_SIZE: Final = MAX_DATAROW_PER_API_OPERATION

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

@@ -389,7 +389,7 @@ def _results_as_list(self) -> Optional[List[Dict[str, Any]]]:
         for row in data:
             results.append(row)
             total_downloaded += 1
-            if total_downloaded >= self.__max_donwload_size:
+            if total_downloaded >= self.MAX_DOWNLOAD_SIZE:
                 break

         if len(results) == 0:

@@ -405,7 +405,7 @@ def _errors_as_list(self) -> Optional[List[Dict[str, Any]]]:
         for row in data:
             errors.append(row)
             total_downloaded += 1
-            if total_downloaded >= self.__max_donwload_size:
+            if total_downloaded >= self.MAX_DOWNLOAD_SIZE:
                 break

         if len(errors) == 0:

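Besides fixing the `donwload` typo, the rename drops Python's double-underscore name mangling, which stores such attributes as `_DataUpsertTask__max_donwload_size` and makes them awkward to reach from subclasses or tests; an uppercase `Final` attribute reads as the class-level constant it is. A minimal illustration (the limit value is a stand-in, not the SDK's `MAX_DATAROW_PER_API_OPERATION`):

from typing import Final


class DataUpsertTask:
    MAX_DOWNLOAD_SIZE: Final = 150_000  # stand-in value for illustration
    __private_limit: Final = 150_000


print(DataUpsertTask.MAX_DOWNLOAD_SIZE)  # 150000
print(DataUpsertTask._DataUpsertTask__private_limit)  # mangled name required
# print(DataUpsertTask.__private_limit)  # would raise AttributeError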
libs/labelbox/tests/integration/test_data_rows.py

Lines changed: 2 additions & 1 deletion
@@ -186,7 +186,8 @@ def test_data_row_bulk_creation(dataset, rand_gen, image_url):
         {
             "row_data": image_url
         },
-    ])
+    ],
+                                    file_upload_thread_count=2)
     task.wait_till_done()
     assert task.has_errors() is False
     assert task.status == "COMPLETE"

libs/labelbox/tests/unit/test_data_row_upsert_data.py

Lines changed: 13 additions & 11 deletions
@@ -1,6 +1,5 @@
 import pytest
-from labelbox.schema.internal.data_row_create_upsert import (DataRowUpsertItem,
-                                                             DataRowCreateItem)
+from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
 from labelbox.schema.identifiable import UniqueId, GlobalKey
 from labelbox.schema.asset_attachment import AttachmentType

@@ -13,12 +12,15 @@ def data_row_create_items():
             "row_data": "http://my_site.com/photos/img_01.jpg",
             "global_key": "global_key1",
             "external_id": "ex_id1",
-            "attachments": [
-                {"type": AttachmentType.RAW_TEXT, "name": "att1", "value": "test1"}
-            ],
-            "metadata": [
-                {"name": "tag", "value": "tag value"},
-            ]
+            "attachments": [{
+                "type": AttachmentType.RAW_TEXT,
+                "name": "att1",
+                "value": "test1"
+            }],
+            "metadata": [{
+                "name": "tag",
+                "value": "tag value"
+            },]
         },
     ]
     return dataset_id, items

@@ -44,15 +46,15 @@ def test_data_row_upsert_items(data_row_create_items, data_row_update_items):
     dataset_id, create_items = data_row_create_items
     dataset_id, update_items = data_row_update_items
     items = create_items + update_items
-    result = DataRowUpsertItem.build(dataset_id, items)
+    result = DataRowUpsertItem.build(dataset_id, items, (UniqueId, GlobalKey))
     assert len(result) == len(items)
     for item, res in zip(items, result):
         assert res.payload == item


 def test_data_row_create_items(data_row_create_items):
     dataset_id, items = data_row_create_items
-    result = DataRowCreateItem.build(dataset_id, items)
+    result = DataRowUpsertItem.build(dataset_id, items)
     assert len(result) == len(items)
     for item, res in zip(items, result):
         assert res.payload == item

@@ -61,4 +63,4 @@ def test_data_row_create_items_not_updateable(data_row_update_items):
     dataset_id, items = data_row_update_items
     with pytest.raises(ValueError):
-        DataRowCreateItem.build(dataset_id, items)
+        DataRowUpsertItem.build(dataset_id, items, ())
