
Commit 8c70057
Author: Val Brodsky

Cleanup: remove unused const UPSERT_CHUNK_SIZE, update create_data_rows_sync, remove unused MAX_DATAROW_PER_API_OPERATION

Parent: e30d1be

File tree: 7 files changed (+16 −32 lines)

libs/labelbox/src/labelbox/schema/dataset.py

Lines changed: 4 additions & 8 deletions
@@ -34,9 +34,9 @@
 from labelbox.schema.iam_integration import IAMIntegration
 from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
 from labelbox.schema.internal.data_row_uploader import DataRowUploader
+from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT, UPSERT_CHUNK_SIZE,
-    UPSERT_CHUNK_SIZE_BYTES)
+    FILE_UPLOAD_THREAD_COUNT, UPSERT_CHUNK_SIZE_BYTES)
 
 logger = logging.getLogger(__name__)
 
@@ -54,7 +54,6 @@ class Dataset(DbObject, Updateable, Deletable):
         created_by (Relationship): `ToOne` relationship to User
         organization (Relationship): `ToOne` relationship to Organization
     """
-    __upsert_chunk_size: Final = UPSERT_CHUNK_SIZE
 
     name = Field.String("name")
     description = Field.String("description")
@@ -241,10 +240,8 @@ def create_data_rows_sync(self, items) -> None:
                 f"Dataset.create_data_rows_sync() supports a max of {max_data_rows_supported} data rows."
                 " For larger imports use the async function Dataset.create_data_rows()"
             )
-        descriptor_url = DataRowUploader.create_descriptor_file(
-            self.client,
-            items,
-            max_attachments_per_data_row=max_attachments_per_data_row)
+        descriptor_url = DescriptorFileCreator(self.client).create_one(
+            items, max_attachments_per_data_row=max_attachments_per_data_row)
         dataset_param = "datasetId"
         url_param = "jsonUrl"
         query_str = """mutation AppendRowsToDatasetSyncPyApi($%s: ID!, $%s: String!){
@@ -624,7 +621,6 @@ def _exec_upsert_data_rows(
             client=self.client,
             specs=specs,
             file_upload_thread_count=file_upload_thread_count,
-            upsert_chunk_size=UPSERT_CHUNK_SIZE,
             max_chunk_size_bytes=UPSERT_CHUNK_SIZE_BYTES)
 
         data = json.dumps(manifest.dict()).encode("utf-8")
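In short, create_data_rows_sync now builds its descriptor file through the new DescriptorFileCreator class rather than the static DataRowUploader.create_descriptor_file helper. A minimal usage sketch, assuming `client` is an authenticated labelbox.Client and `items` is a small list of data-row dicts (the call shape comes from the hunk above; the concrete values are illustrative):

    from labelbox import Client
    from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator

    client = Client(api_key="...")  # assumption: an authenticated client
    items = [{"row_data": "https://example.com/image-1.jpg"}]

    # create_one() serializes `items` into a single JSON descriptor file,
    # uploads it, and returns its URL for the sync append mutation to consume.
    descriptor_url = DescriptorFileCreator(client).create_one(
        items, max_attachments_per_data_row=1)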

libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ class DataRowUploader:
 
     @staticmethod
     def upload_in_chunks(client, specs: List[DataRowUpsertItem],
-                         file_upload_thread_count: int, upsert_chunk_size: int,
+                         file_upload_thread_count: int,
                          max_chunk_size_bytes: int) -> UploadManifest:
         empty_specs = list(filter(lambda spec: spec.is_empty(), specs))
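With the count-based upsert_chunk_size parameter removed, chunking is driven purely by the byte budget. Reconstructed from the last hunk of dataset.py above (the `manifest =` assignment is inferred from the later `json.dumps(manifest.dict())` line), the call site now reads:

    manifest = DataRowUploader.upload_in_chunks(
        client=self.client,
        specs=specs,
        file_upload_thread_count=file_upload_thread_count,
        max_chunk_size_bytes=UPSERT_CHUNK_SIZE_BYTES)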

libs/labelbox/src/labelbox/schema/internal/datarow_upload_constants.py

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-MAX_DATAROW_PER_API_OPERATION = 150_000
 FILE_UPLOAD_THREAD_COUNT = 20
-UPSERT_CHUNK_SIZE = 10_000
 UPSERT_CHUNK_SIZE_BYTES = 10_000_000
 DOWNLOAD_RESULT_PAGE_SIZE = 5_000

libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py

Lines changed: 6 additions & 11 deletions
@@ -12,7 +12,7 @@
 from labelbox.orm.model import Field
 from labelbox.schema.embedding import EmbeddingVector
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT)
+    FILE_UPLOAD_THREAD_COUNT)
 from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem
 
 
@@ -291,11 +291,6 @@ def convert_item(data_row_item):
                 f"Must pass an iterable to create_data_rows. Found {type(items)}"
             )
 
-        if len(items) > MAX_DATAROW_PER_API_OPERATION:
-            raise MalformedQueryException(
-                f"Cannot create more than {MAX_DATAROW_PER_API_OPERATION} DataRows per function call."
-            )
-
         with ThreadPoolExecutor(file_upload_thread_count) as executor:
             futures = [executor.submit(convert_item, item) for item in items]
             items = [future.result() for future in as_completed(futures)]
@@ -308,15 +303,15 @@ def _chunk_down_by_bytes(
         if not items:
             return
         data = json.dumps(items)
-        chunk_size = sys.getsizeof(data)
-        if sys.getsizeof(data) <= max_chunk_size:
+        chunk_size = len(data.encode("utf-8"))
+        if chunk_size <= max_chunk_size:
             yield data
             return
 
         if len(items) == 1:
-            raise ValueError(
-                f"Item {items[0]} size exceeds max_chunk_size: {chunk_size} > {max_chunk_size}"
-            )
+            yield data
+            return
+
         half = len(items) // 2
         yield from self._chunk_down_by_bytes(items[:half], max_chunk_size)
         yield from self._chunk_down_by_bytes(items[half:], max_chunk_size)
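Two behavioral changes in this hunk are worth spelling out. First, sys.getsizeof(data) measures the in-memory footprint of the Python str object (on 64-bit CPython that includes roughly 49 bytes of object overhead for an ASCII string), not the number of bytes actually serialized and uploaded; len(data.encode("utf-8")) measures the payload itself. Second, a single item that exceeds the budget is now yielded as one oversized chunk instead of raising ValueError. A self-contained sketch of the new algorithm (the standalone function form and demo values are mine; the logic mirrors the hunk above):

    import json
    import sys
    from typing import Any, Generator, List

    def chunk_down_by_bytes(items: List[Any],
                            max_chunk_size: int) -> Generator[str, None, None]:
        """Recursively halve `items` until each JSON-serialized chunk fits
        in `max_chunk_size` bytes; a lone item that still exceeds the
        budget is yielded as-is rather than raising."""
        if not items:
            return
        data = json.dumps(items)
        chunk_size = len(data.encode("utf-8"))  # payload bytes, not getsizeof()
        if chunk_size <= max_chunk_size:
            yield data
            return
        if len(items) == 1:
            # New behavior: emit the oversized item instead of raising ValueError.
            yield data
            return
        half = len(items) // 2
        yield from chunk_down_by_bytes(items[:half], max_chunk_size)
        yield from chunk_down_by_bytes(items[half:], max_chunk_size)

    payload = json.dumps([{"row_data": "a"}])
    print(len(payload.encode("utf-8")))  # 19: the bytes that actually get uploaded
    print(sys.getsizeof(payload))        # larger: includes str object overhead

    # One oversized row is now yielded whole, matching the updated unit test below.
    print(list(chunk_down_by_bytes([{"row_data": "a"}], max_chunk_size=1)))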

libs/labelbox/src/labelbox/schema/task.py

Lines changed: 1 addition & 4 deletions
@@ -11,9 +11,7 @@
 
 from labelbox.pagination import PaginatedCollection
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION,
-    DOWNLOAD_RESULT_PAGE_SIZE,
-)
+    DOWNLOAD_RESULT_PAGE_SIZE,)
 
 if TYPE_CHECKING:
     from labelbox import User
@@ -233,7 +231,6 @@ class DataUpsertTask(Task):
     """
     Task class for data row upsert operations
     """
-    MAX_DOWNLOAD_SIZE: Final = MAX_DATAROW_PER_API_OPERATION
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
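Dropping the class attribute is a small API break for any downstream code that read it. A quick check, assuming a labelbox install that includes this commit:

    from labelbox.schema.task import DataUpsertTask

    # MAX_DOWNLOAD_SIZE was a Final class attribute aliasing
    # MAX_DATAROW_PER_API_OPERATION; after this commit it is gone.
    print(hasattr(DataUpsertTask, "MAX_DOWNLOAD_SIZE"))  # False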

libs/labelbox/tests/integration/test_dataset.py

Lines changed: 1 addition & 2 deletions
@@ -1,8 +1,7 @@
 import pytest
 import requests
 from labelbox import Dataset
-from labelbox.exceptions import ResourceNotFoundError, MalformedQueryException, InvalidQueryError
-from labelbox.schema.dataset import MAX_DATAROW_PER_API_OPERATION
+from labelbox.exceptions import ResourceNotFoundError, InvalidQueryError
 from labelbox.schema.internal.data_row_uploader import DataRowUploader
 
 
libs/labelbox/tests/unit/test_unit_descriptor_file_creator.py

Lines changed: 3 additions & 4 deletions
@@ -14,10 +14,9 @@ def test_chunk_down_by_bytes_row_too_large():
     chunk = [{"row_data": "a"}]
     max_chunk_size_bytes = 1
 
-    with pytest.raises(ValueError):
-        res = descriptor_file_creator._chunk_down_by_bytes(
-            chunk, max_chunk_size_bytes)
-        [x for x in res]
+    res = descriptor_file_creator._chunk_down_by_bytes(chunk,
+                                                       max_chunk_size_bytes)
+    assert [x for x in res] == [json.dumps([{"row_data": "a"}])]
 
 
 def test_chunk_down_by_bytes_more_chunks():
