Commit ac76e38
Author: Val Brodsky
Cleanup: remove unused const UPSERT_CHUNK_SIZE, update create_data_rows_sync, remove unused MAX_DATAROW_PER_API_OPERATION
1 parent: e30d1be

File tree: 7 files changed (+24 −37 lines)

libs/labelbox/src/labelbox/schema/dataset.py (4 additions, 8 deletions)

@@ -34,9 +34,9 @@
 from labelbox.schema.iam_integration import IAMIntegration
 from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
 from labelbox.schema.internal.data_row_uploader import DataRowUploader
+from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT, UPSERT_CHUNK_SIZE,
-    UPSERT_CHUNK_SIZE_BYTES)
+    FILE_UPLOAD_THREAD_COUNT, UPSERT_CHUNK_SIZE_BYTES)

 logger = logging.getLogger(__name__)

@@ -54,7 +54,6 @@ class Dataset(DbObject, Updateable, Deletable):
         created_by (Relationship): `ToOne` relationship to User
         organization (Relationship): `ToOne` relationship to Organization
     """
-    __upsert_chunk_size: Final = UPSERT_CHUNK_SIZE

     name = Field.String("name")
     description = Field.String("description")

@@ -241,10 +240,8 @@ def create_data_rows_sync(self, items) -> None:
                 f"Dataset.create_data_rows_sync() supports a max of {max_data_rows_supported} data rows."
                 " For larger imports use the async function Dataset.create_data_rows()"
             )
-        descriptor_url = DataRowUploader.create_descriptor_file(
-            self.client,
-            items,
-            max_attachments_per_data_row=max_attachments_per_data_row)
+        descriptor_url = DescriptorFileCreator(self.client).create_one(
+            items, max_attachments_per_data_row=max_attachments_per_data_row)
         dataset_param = "datasetId"
         url_param = "jsonUrl"
         query_str = """mutation AppendRowsToDatasetSyncPyApi($%s: ID!, $%s: String!){

@@ -624,7 +621,6 @@ def _exec_upsert_data_rows(
             client=self.client,
             specs=specs,
             file_upload_thread_count=file_upload_thread_count,
-            upsert_chunk_size=UPSERT_CHUNK_SIZE,
             max_chunk_size_bytes=UPSERT_CHUNK_SIZE_BYTES)

         data = json.dumps(manifest.dict()).encode("utf-8")
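From the SDK user's side the synchronous path is unchanged; only the internal descriptor-file creation moved from DataRowUploader.create_descriptor_file to DescriptorFileCreator(self.client).create_one. A minimal usage sketch (the API key, dataset name, and row URL are placeholders, and a reachable Labelbox workspace is assumed):

    import labelbox as lb

    client = lb.Client(api_key="<YOUR_API_KEY>")          # placeholder key
    dataset = client.create_dataset(name="cleanup-demo")  # hypothetical name
    # Small batches only: create_data_rows_sync still enforces a per-call cap
    # and points larger imports at the async create_data_rows().
    dataset.create_data_rows_sync([
        {"row_data": "https://example.com/image-1.jpg", "global_key": "image-1"},
    ])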

libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py (1 addition, 1 deletion)

@@ -17,7 +17,7 @@ class DataRowUploader:

     @staticmethod
     def upload_in_chunks(client, specs: List[DataRowUpsertItem],
-                         file_upload_thread_count: int, upsert_chunk_size: int,
+                         file_upload_thread_count: int,
                          max_chunk_size_bytes: int) -> UploadManifest:
         empty_specs = list(filter(lambda spec: spec.is_empty(), specs))
libs/labelbox/src/labelbox/schema/internal/datarow_upload_constants.py (0 additions, 2 deletions)

@@ -1,5 +1,3 @@
-MAX_DATAROW_PER_API_OPERATION = 150_000
 FILE_UPLOAD_THREAD_COUNT = 20
-UPSERT_CHUNK_SIZE = 10_000
 UPSERT_CHUNK_SIZE_BYTES = 10_000_000
 DOWNLOAD_RESULT_PAGE_SIZE = 5_000

libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py (12 additions, 14 deletions)

@@ -12,9 +12,13 @@
 from labelbox.orm.model import Field
 from labelbox.schema.embedding import EmbeddingVector
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT)
+    FILE_UPLOAD_THREAD_COUNT)
 from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem

+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from labelbox import Client
+

 class DescriptorFileCreator:
     """

@@ -291,32 +295,26 @@ def convert_item(data_row_item):
                 f"Must pass an iterable to create_data_rows. Found {type(items)}"
             )

-        if len(items) > MAX_DATAROW_PER_API_OPERATION:
-            raise MalformedQueryException(
-                f"Cannot create more than {MAX_DATAROW_PER_API_OPERATION} DataRows per function call."
-            )
-
         with ThreadPoolExecutor(file_upload_thread_count) as executor:
             futures = [executor.submit(convert_item, item) for item in items]
             items = [future.result() for future in as_completed(futures)]

         return items

-    def _chunk_down_by_bytes(
-            self, items: List[dict],
-            max_chunk_size: int) -> Generator[List[str], None, None]:
+    def _chunk_down_by_bytes(self, items: List[dict],
+                             max_chunk_size: int) -> Generator[str, None, None]:
         if not items:
             return
         data = json.dumps(items)
-        chunk_size = sys.getsizeof(data)
-        if sys.getsizeof(data) <= max_chunk_size:
+        chunk_size = len(data.encode("utf-8"))
+        if chunk_size <= max_chunk_size:
             yield data
             return

         if len(items) == 1:
-            raise ValueError(
-                f"Item {items[0]} size exceeds max_chunk_size: {chunk_size} > {max_chunk_size}"
-            )
+            yield data
+            return

         half = len(items) // 2
         yield from self._chunk_down_by_bytes(items[:half], max_chunk_size)
         yield from self._chunk_down_by_bytes(items[half:], max_chunk_size)
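Two behavioral changes land in _chunk_down_by_bytes: chunk size is now measured as the UTF-8 length of the serialized JSON rather than via sys.getsizeof, and a single item over the byte budget is yielded as its own oversized chunk instead of raising ValueError. A minimal standalone sketch of the same recursion (a free function for illustration, not the SDK method itself):

    import json
    from typing import Dict, Generator, List

    def chunk_down_by_bytes(items: List[Dict],
                            max_chunk_size: int) -> Generator[str, None, None]:
        """Yield JSON strings, halving the list until each chunk fits."""
        if not items:
            return
        data = json.dumps(items)
        # Measure the serialized payload, not the Python object's memory footprint.
        if len(data.encode("utf-8")) <= max_chunk_size:
            yield data
            return
        if len(items) == 1:
            # An indivisible item over the budget is emitted as-is.
            yield data
            return
        half = len(items) // 2
        yield from chunk_down_by_bytes(items[:half], max_chunk_size)
        yield from chunk_down_by_bytes(items[half:], max_chunk_size)

    # list(chunk_down_by_bytes([{"row_data": "a"}, {"row_data": "b"}], 20))
    # -> ['[{"row_data": "a"}]', '[{"row_data": "b"}]']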

libs/labelbox/src/labelbox/schema/task.py (1 addition, 4 deletions)

@@ -11,9 +11,7 @@

 from labelbox.pagination import PaginatedCollection
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION,
-    DOWNLOAD_RESULT_PAGE_SIZE,
-)
+    DOWNLOAD_RESULT_PAGE_SIZE,)

 if TYPE_CHECKING:
     from labelbox import User

@@ -233,7 +231,6 @@ class DataUpsertTask(Task):
     """
     Task class for data row upsert operations
     """
-    MAX_DOWNLOAD_SIZE: Final = MAX_DATAROW_PER_API_OPERATION

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

libs/labelbox/tests/integration/test_dataset.py (1 addition, 2 deletions)

@@ -1,8 +1,7 @@
 import pytest
 import requests
 from labelbox import Dataset
-from labelbox.exceptions import ResourceNotFoundError, MalformedQueryException, InvalidQueryError
-from labelbox.schema.dataset import MAX_DATAROW_PER_API_OPERATION
+from labelbox.exceptions import ResourceNotFoundError, InvalidQueryError
 from labelbox.schema.internal.data_row_uploader import DataRowUploader
libs/labelbox/tests/unit/test_unit_descriptor_file_creator.py (5 additions, 6 deletions)

@@ -14,10 +14,9 @@ def test_chunk_down_by_bytes_row_too_large():
     chunk = [{"row_data": "a"}]
     max_chunk_size_bytes = 1

-    with pytest.raises(ValueError):
-        res = descriptor_file_creator._chunk_down_by_bytes(
-            chunk, max_chunk_size_bytes)
-        [x for x in res]
+    res = descriptor_file_creator._chunk_down_by_bytes(chunk,
+                                                       max_chunk_size_bytes)
+    assert [x for x in res] == [json.dumps([{"row_data": "a"}])]


 def test_chunk_down_by_bytes_more_chunks():

@@ -26,7 +25,7 @@ def test_chunk_down_by_bytes_more_chunks():
     descriptor_file_creator = DescriptorFileCreator(client)

     chunk = [{"row_data": "a"}, {"row_data": "b"}]
-    max_chunk_size_bytes = json.dumps(chunk).__sizeof__() - 1
+    max_chunk_size_bytes = len(json.dumps(chunk).encode("utf-8")) - 1

     res = descriptor_file_creator._chunk_down_by_bytes(chunk,
                                                        max_chunk_size_bytes)

@@ -45,7 +44,7 @@ def test_chunk_down_by_bytes_one_chunk():
     descriptor_file_creator = DescriptorFileCreator(client)

     chunk = [{"row_data": "a"}, {"row_data": "b"}]
-    max_chunk_size_bytes = json.dumps(chunk).__sizeof__()
+    max_chunk_size_bytes = len(json.dumps(chunk).encode("utf-8"))

     res = descriptor_file_creator._chunk_down_by_bytes(chunk,
                                                        max_chunk_size_bytes)
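The tests also stop sizing chunks with str.__sizeof__, which (like sys.getsizeof) reports the interpreter's in-memory footprint of the string object, CPython header included, rather than the number of bytes in the serialized payload. A quick illustration (the getsizeof value is implementation-dependent):

    import json
    import sys

    payload = json.dumps([{"row_data": "a"}])
    print(len(payload.encode("utf-8")))  # 19 -- bytes actually uploaded
    print(sys.getsizeof(payload))        # ~68 on CPython 3.x: adds object overhead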
