Commit ac76e38
Author: Val Brodsky
Cleanup: remove unused const UPSERT_CHUNK_SIZE, update create_data_rows_sync, remove unused MAX_DATAROW_PER_API_OPERATION
1 parent: e30d1be

File tree: 7 files changed (+24 −37 lines)

libs/labelbox/src/labelbox/schema/dataset.py (4 additions, 8 deletions)

@@ -34,9 +34,9 @@
 from labelbox.schema.iam_integration import IAMIntegration
 from labelbox.schema.internal.data_row_upsert_item import (DataRowUpsertItem)
 from labelbox.schema.internal.data_row_uploader import DataRowUploader
+from labelbox.schema.internal.descriptor_file_creator import DescriptorFileCreator
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT, UPSERT_CHUNK_SIZE,
-    UPSERT_CHUNK_SIZE_BYTES)
+    FILE_UPLOAD_THREAD_COUNT, UPSERT_CHUNK_SIZE_BYTES)

 logger = logging.getLogger(__name__)

@@ -54,7 +54,6 @@ class Dataset(DbObject, Updateable, Deletable):
         created_by (Relationship): `ToOne` relationship to User
         organization (Relationship): `ToOne` relationship to Organization
     """
-    __upsert_chunk_size: Final = UPSERT_CHUNK_SIZE

     name = Field.String("name")
     description = Field.String("description")

@@ -241,10 +240,8 @@ def create_data_rows_sync(self, items) -> None:
                 f"Dataset.create_data_rows_sync() supports a max of {max_data_rows_supported} data rows."
                 " For larger imports use the async function Dataset.create_data_rows()"
             )
-        descriptor_url = DataRowUploader.create_descriptor_file(
-            self.client,
-            items,
-            max_attachments_per_data_row=max_attachments_per_data_row)
+        descriptor_url = DescriptorFileCreator(self.client).create_one(
+            items, max_attachments_per_data_row=max_attachments_per_data_row)
         dataset_param = "datasetId"
         url_param = "jsonUrl"
         query_str = """mutation AppendRowsToDatasetSyncPyApi($%s: ID!, $%s: String!){

@@ -624,7 +621,6 @@ def _exec_upsert_data_rows(
             client=self.client,
             specs=specs,
             file_upload_thread_count=file_upload_thread_count,
-            upsert_chunk_size=UPSERT_CHUNK_SIZE,
             max_chunk_size_bytes=UPSERT_CHUNK_SIZE_BYTES)

         data = json.dumps(manifest.dict()).encode("utf-8")
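From the SDK user's side the synchronous path is unchanged; only the internal descriptor-file creation moved from DataRowUploader.create_descriptor_file to DescriptorFileCreator(self.client).create_one. A minimal usage sketch (the API key, dataset name, and row URL are placeholders, and a reachable Labelbox workspace is assumed):

    import labelbox as lb

    client = lb.Client(api_key="<YOUR_API_KEY>")          # placeholder key
    dataset = client.create_dataset(name="cleanup-demo")  # hypothetical name
    # Small batches only: create_data_rows_sync still enforces a per-call cap
    # and points larger imports at the async create_data_rows().
    dataset.create_data_rows_sync([
        {"row_data": "https://example.com/image-1.jpg", "global_key": "image-1"},
    ])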

libs/labelbox/src/labelbox/schema/internal/data_row_uploader.py (1 addition, 1 deletion)

@@ -17,7 +17,7 @@ class DataRowUploader:

     @staticmethod
     def upload_in_chunks(client, specs: List[DataRowUpsertItem],
-                         file_upload_thread_count: int, upsert_chunk_size: int,
+                         file_upload_thread_count: int,
                          max_chunk_size_bytes: int) -> UploadManifest:
         empty_specs = list(filter(lambda spec: spec.is_empty(), specs))
libs/labelbox/src/labelbox/schema/internal/datarow_upload_constants.py (0 additions, 2 deletions)

@@ -1,5 +1,3 @@
-MAX_DATAROW_PER_API_OPERATION = 150_000
 FILE_UPLOAD_THREAD_COUNT = 20
-UPSERT_CHUNK_SIZE = 10_000
 UPSERT_CHUNK_SIZE_BYTES = 10_000_000
 DOWNLOAD_RESULT_PAGE_SIZE = 5_000

libs/labelbox/src/labelbox/schema/internal/descriptor_file_creator.py (12 additions, 14 deletions)

@@ -12,9 +12,13 @@
 from labelbox.orm.model import Field
 from labelbox.schema.embedding import EmbeddingVector
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION, FILE_UPLOAD_THREAD_COUNT)
+    FILE_UPLOAD_THREAD_COUNT)
 from labelbox.schema.internal.data_row_upsert_item import DataRowUpsertItem

+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from labelbox import Client
+

 class DescriptorFileCreator:
     """

@@ -291,32 +295,26 @@ def convert_item(data_row_item):
                 f"Must pass an iterable to create_data_rows. Found {type(items)}"
             )

-        if len(items) > MAX_DATAROW_PER_API_OPERATION:
-            raise MalformedQueryException(
-                f"Cannot create more than {MAX_DATAROW_PER_API_OPERATION} DataRows per function call."
-            )
-
         with ThreadPoolExecutor(file_upload_thread_count) as executor:
             futures = [executor.submit(convert_item, item) for item in items]
             items = [future.result() for future in as_completed(futures)]

         return items

-    def _chunk_down_by_bytes(
-            self, items: List[dict],
-            max_chunk_size: int) -> Generator[List[str], None, None]:
+    def _chunk_down_by_bytes(self, items: List[dict],
+                             max_chunk_size: int) -> Generator[str, None, None]:
         if not items:
             return
         data = json.dumps(items)
-        chunk_size = sys.getsizeof(data)
-        if sys.getsizeof(data) <= max_chunk_size:
+        chunk_size = len(data.encode("utf-8"))
+        if chunk_size <= max_chunk_size:
             yield data
             return

         if len(items) == 1:
-            raise ValueError(
-                f"Item {items[0]} size exceeds max_chunk_size: {chunk_size} > {max_chunk_size}"
-            )
+            yield data
+            return

         half = len(items) // 2
         yield from self._chunk_down_by_bytes(items[:half], max_chunk_size)
         yield from self._chunk_down_by_bytes(items[half:], max_chunk_size)
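Two behavioral changes land in _chunk_down_by_bytes: chunk size is now measured as the UTF-8 length of the serialized JSON rather than via sys.getsizeof, and a single item over the byte budget is yielded as its own oversized chunk instead of raising ValueError. A minimal standalone sketch of the same recursion (a free function for illustration, not the SDK method itself):

    import json
    from typing import Dict, Generator, List

    def chunk_down_by_bytes(items: List[Dict],
                            max_chunk_size: int) -> Generator[str, None, None]:
        """Yield JSON strings, halving the list until each chunk fits."""
        if not items:
            return
        data = json.dumps(items)
        # Measure the serialized payload, not the Python object's memory footprint.
        if len(data.encode("utf-8")) <= max_chunk_size:
            yield data
            return
        if len(items) == 1:
            # An indivisible item over the budget is emitted as-is.
            yield data
            return
        half = len(items) // 2
        yield from chunk_down_by_bytes(items[:half], max_chunk_size)
        yield from chunk_down_by_bytes(items[half:], max_chunk_size)

    # list(chunk_down_by_bytes([{"row_data": "a"}, {"row_data": "b"}], 20))
    # -> ['[{"row_data": "a"}]', '[{"row_data": "b"}]']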

libs/labelbox/src/labelbox/schema/task.py (1 addition, 4 deletions)

@@ -11,9 +11,7 @@

 from labelbox.pagination import PaginatedCollection
 from labelbox.schema.internal.datarow_upload_constants import (
-    MAX_DATAROW_PER_API_OPERATION,
-    DOWNLOAD_RESULT_PAGE_SIZE,
-)
+    DOWNLOAD_RESULT_PAGE_SIZE,)

 if TYPE_CHECKING:
     from labelbox import User

@@ -233,7 +231,6 @@ class DataUpsertTask(Task):
     """
     Task class for data row upsert operations
     """
-    MAX_DOWNLOAD_SIZE: Final = MAX_DATAROW_PER_API_OPERATION

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

libs/labelbox/tests/integration/test_dataset.py (1 addition, 2 deletions)

@@ -1,8 +1,7 @@
 import pytest
 import requests
 from labelbox import Dataset
-from labelbox.exceptions import ResourceNotFoundError, MalformedQueryException, InvalidQueryError
-from labelbox.schema.dataset import MAX_DATAROW_PER_API_OPERATION
+from labelbox.exceptions import ResourceNotFoundError, InvalidQueryError
 from labelbox.schema.internal.data_row_uploader import DataRowUploader
libs/labelbox/tests/unit/test_unit_descriptor_file_creator.py (5 additions, 6 deletions)

@@ -14,10 +14,9 @@ def test_chunk_down_by_bytes_row_too_large():
     chunk = [{"row_data": "a"}]
     max_chunk_size_bytes = 1

-    with pytest.raises(ValueError):
-        res = descriptor_file_creator._chunk_down_by_bytes(
-            chunk, max_chunk_size_bytes)
-        [x for x in res]
+    res = descriptor_file_creator._chunk_down_by_bytes(chunk,
+                                                       max_chunk_size_bytes)
+    assert [x for x in res] == [json.dumps([{"row_data": "a"}])]


 def test_chunk_down_by_bytes_more_chunks():

@@ -26,7 +25,7 @@ def test_chunk_down_by_bytes_more_chunks():
     descriptor_file_creator = DescriptorFileCreator(client)

     chunk = [{"row_data": "a"}, {"row_data": "b"}]
-    max_chunk_size_bytes = json.dumps(chunk).__sizeof__() - 1
+    max_chunk_size_bytes = len(json.dumps(chunk).encode("utf-8")) - 1

     res = descriptor_file_creator._chunk_down_by_bytes(chunk,
                                                        max_chunk_size_bytes)

@@ -45,7 +44,7 @@ def test_chunk_down_by_bytes_one_chunk():
     descriptor_file_creator = DescriptorFileCreator(client)

     chunk = [{"row_data": "a"}, {"row_data": "b"}]
-    max_chunk_size_bytes = json.dumps(chunk).__sizeof__()
+    max_chunk_size_bytes = len(json.dumps(chunk).encode("utf-8"))

     res = descriptor_file_creator._chunk_down_by_bytes(chunk,
                                                        max_chunk_size_bytes)
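The tests also stop sizing chunks with str.__sizeof__, which (like sys.getsizeof) reports the interpreter's in-memory footprint of the string object, CPython header included, rather than the number of bytes in the serialized payload. A quick illustration (the getsizeof value is implementation-dependent):

    import json
    import sys

    payload = json.dumps([{"row_data": "a"}])
    print(len(payload.encode("utf-8")))  # 19 -- bytes actually uploaded
    print(sys.getsizeof(payload))        # ~68 on CPython 3.x: adds object overhead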
