
Commit 1038a42

Merge pull request #270 from Labelbox/ms/create_data_rows_sync
sync create data rows
2 parents 7c32542 + 96066b6 commit 1038a42

File tree: 7 files changed (+164 -47 lines)

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,14 @@
 # Changelog
 
+# Version 3.3.0 (2021-09-02)
+## Added
+* `Dataset.create_data_rows_sync()` for synchronous bulk uploads of data rows
+* `Model.delete()`, `ModelRun.delete()`, and `ModelRun.delete_annotation_groups()` to
+Clean up models, model runs, and annotation groups.
+
+## Fix
+* Increased timeout for label exports since projects with many segmentation masks weren't finishing quickly enough.
+
 # Version 3.2.1 (2021-08-31)
 ## Fix
 * Resolved issue with `create_data_rows()` was not working on amazon linux
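
A minimal usage sketch of the headline addition, assuming a client built from an API key and an existing dataset ID (both placeholders); the method itself is shown in the `labelbox/schema/dataset.py` diff below:

```python
from labelbox import Client

client = Client(api_key="YOUR_API_KEY")          # placeholder API key
dataset = client.get_dataset("YOUR_DATASET_ID")  # placeholder dataset ID

# Synchronous bulk upload: blocks until the rows exist and returns None on success.
# Limited to 1000 data rows and 5 attachments per data row (see dataset.py below).
dataset.create_data_rows_sync([
    {"row_data": "https://example.com/image-1.jpg", "external_id": "image-1"},
    "/path/to/local-image.jpg",  # a plain str item is treated as a local file path
])
```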

labelbox/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 name = "labelbox"
-__version__ = "3.2.1"
+__version__ = "3.3.0"
 
 from labelbox.schema.project import Project
 from labelbox.client import Client

labelbox/schema/dataset.py

Lines changed: 112 additions & 39 deletions
@@ -69,13 +69,111 @@ def create_data_row(self, **kwargs):
         row_data = kwargs[DataRow.row_data.name]
         if os.path.exists(row_data):
             kwargs[DataRow.row_data.name] = self.client.upload_file(row_data)
-
         kwargs[DataRow.dataset.name] = self
-
         return self.client._create(DataRow, kwargs)
 
+    def create_data_rows_sync(self, items):
+        """ Synchronously bulk upload data rows.
+
+        Use this instead of `Dataset.create_data_rows` for smaller batches of data rows that need to be uploaded quickly.
+        Cannot use this for uploads containing more than 1000 data rows.
+        Each data row is also limited to 5 attachments.
+
+        Args:
+            items (iterable of (dict or str)):
+                See the docstring for `Dataset._create_descriptor_file` for more information.
+        Returns:
+            None. If the function doesn't raise an exception then the import was successful.
+
+        Raises:
+            InvalidQueryError: If the `items` parameter does not conform to
+                the specification in Dataset._create_descriptor_file or if the server did not accept the
+                DataRow creation request (unknown reason).
+            InvalidAttributeError: If there are fields in `items` not valid for
+                a DataRow.
+            ValueError: When the upload parameters are invalid
+        """
+        max_data_rows_supported = 1000
+        max_attachments_per_data_row = 5
+        if len(items) > max_data_rows_supported:
+            raise ValueError(
+                f"Dataset.create_data_rows_sync() supports a max of {max_data_rows_supported} data rows."
+                " For larger imports use the async function Dataset.create_data_rows()"
+            )
+        descriptor_url = self._create_descriptor_file(
+            items, max_attachments_per_data_row=max_attachments_per_data_row)
+        dataset_param = "datasetId"
+        url_param = "jsonUrl"
+        query_str = """mutation AppendRowsToDatasetSyncPyApi($%s: ID!, $%s: String!){
+            appendRowsToDatasetSync(data:{datasetId: $%s, jsonFileUrl: $%s}
+            ){dataset{id}}} """ % (dataset_param, url_param, dataset_param,
+                                   url_param)
+        self.client.execute(query_str, {
+            dataset_param: self.uid,
+            url_param: descriptor_url
+        })
+
     def create_data_rows(self, items):
-        """ Creates multiple DataRow objects based on the given `items`.
+        """ Asynchronously bulk upload data rows
+
+        Use this instead of `Dataset.create_data_rows_sync` uploads for batches that contain more than 100 data rows.
+
+        Args:
+            items (iterable of (dict or str)): See the docstring for `Dataset._create_descriptor_file` for more information
+
+        Returns:
+            Task representing the data import on the server side. The Task
+            can be used for inspecting task progress and waiting until it's done.
+
+        Raises:
+            InvalidQueryError: If the `items` parameter does not conform to
+                the specification above or if the server did not accept the
+                DataRow creation request (unknown reason).
+            ResourceNotFoundError: If unable to retrieve the Task for the
+                import process. This could imply that the import failed.
+            InvalidAttributeError: If there are fields in `items` not valid for
+                a DataRow.
+            ValueError: When the upload parameters are invalid
+        """
+        descriptor_url = self._create_descriptor_file(items)
+        # Create data source
+        dataset_param = "datasetId"
+        url_param = "jsonUrl"
+        query_str = """mutation AppendRowsToDatasetPyApi($%s: ID!, $%s: String!){
+            appendRowsToDataset(data:{datasetId: $%s, jsonFileUrl: $%s}
+            ){ taskId accepted errorMessage } } """ % (dataset_param, url_param,
+                                                       dataset_param, url_param)
+
+        res = self.client.execute(query_str, {
+            dataset_param: self.uid,
+            url_param: descriptor_url
+        })
+        res = res["appendRowsToDataset"]
+        if not res["accepted"]:
+            msg = res['errorMessage']
+            raise InvalidQueryError(
+                f"Server did not accept DataRow creation request. {msg}")
+
+        # Fetch and return the task.
+        task_id = res["taskId"]
+        user = self.client.get_user()
+        task = list(user.created_tasks(where=Entity.Task.uid == task_id))
+        # Cache user in a private variable as the relationship can't be
+        # resolved due to server-side limitations (see Task.created_by)
+        # for more info.
+        if len(task) != 1:
+            raise ResourceNotFoundError(Entity.Task, task_id)
+        task = task[0]
+        task._user = user
+        return task
+
+    def _create_descriptor_file(self, items, max_attachments_per_data_row=None):
+        """
+        This function is shared by both `Dataset.create_data_rows` and `Dataset.create_data_rows_sync`
+        to prepare the input file. The user defined input is validated, processed, and json stringified.
+        Finally the json data is uploaded to gcs and a uri is returned. This uri can be passed to
+
+
 
         Each element in `items` can be either a `str` or a `dict`. If
         it is a `str`, then it is interpreted as a local file path. The file
@@ -102,19 +200,19 @@ def create_data_rows(self, items):
 
         Args:
             items (iterable of (dict or str)): See above for details.
+            max_attachments_per_data_row (Optional[int]): Param used during attachment validation to determine
+                if the user has provided too many attachments.
 
         Returns:
-            Task representing the data import on the server side. The Task
-            can be used for inspecting task progress and waiting until it's done.
+            uri (string): A reference to the uploaded json data.
 
         Raises:
             InvalidQueryError: If the `items` parameter does not conform to
                 the specification above or if the server did not accept the
                 DataRow creation request (unknown reason).
-            ResourceNotFoundError: If unable to retrieve the Task for the
-                import process. This could imply that the import failed.
             InvalidAttributeError: If there are fields in `items` not valid for
                 a DataRow.
+            ValueError: When the upload parameters are invalid
         """
         file_upload_thread_count = 20
         DataRow = Entity.DataRow
@@ -135,6 +233,12 @@ def validate_attachments(item):
             attachments = item.get('attachments')
             if attachments:
                 if isinstance(attachments, list):
+                    if max_attachments_per_data_row and len(
+                            attachments) > max_attachments_per_data_row:
+                        raise ValueError(
+                            f"Max attachments number of supported attachments per data row is {max_attachments_per_data_row}."
+                            f" Found {len(attachments)}. Condense multiple attachments into one with the HTML attachment type if necessary."
+                        )
                     for attachment in attachments:
                         AssetAttachment.validate_attachment_json(attachment)
                 else:
@@ -198,40 +302,9 @@ def convert_item(item):
         with ThreadPoolExecutor(file_upload_thread_count) as executor:
             futures = [executor.submit(convert_item, item) for item in items]
             items = [future.result() for future in as_completed(futures)]
-
         # Prepare and upload the desciptor file
         data = json.dumps(items)
-        descriptor_url = self.client.upload_data(data)
-        # Create data source
-        dataset_param = "datasetId"
-        url_param = "jsonUrl"
-        query_str = """mutation AppendRowsToDatasetPyApi($%s: ID!, $%s: String!){
-            appendRowsToDataset(data:{datasetId: $%s, jsonFileUrl: $%s}
-            ){ taskId accepted errorMessage } } """ % (dataset_param, url_param,
-                                                       dataset_param, url_param)
-
-        res = self.client.execute(query_str, {
-            dataset_param: self.uid,
-            url_param: descriptor_url
-        })
-        res = res["appendRowsToDataset"]
-        if not res["accepted"]:
-            msg = res['errorMessage']
-            raise InvalidQueryError(
-                f"Server did not accept DataRow creation request. {msg}")
-
-        # Fetch and return the task.
-        task_id = res["taskId"]
-        user = self.client.get_user()
-        task = list(user.created_tasks(where=Entity.Task.uid == task_id))
-        # Cache user in a private variable as the relationship can't be
-        # resolved due to server-side limitations (see Task.created_by)
-        # for more info.
-        if len(task) != 1:
-            raise ResourceNotFoundError(Entity.Task, task_id)
-        task = task[0]
-        task._user = user
-        return task
+        return self.client.upload_data(data)
 
     def data_rows_for_external_id(self, external_id, limit=10):
         """ Convenience method for getting a single `DataRow` belonging to this

labelbox/schema/project.py

Lines changed: 3 additions & 3 deletions
@@ -166,7 +166,7 @@ def export_queued_data_rows(self, timeout_seconds=120):
                          self.uid)
             time.sleep(sleep_time)
 
-    def video_label_generator(self, timeout_seconds=120):
+    def video_label_generator(self, timeout_seconds=600):
         """
         Download video annotations
 
@@ -190,7 +190,7 @@ def video_label_generator(self, timeout_seconds=120):
                 "Or use project.label_generator() for text and imagery data.")
         return LBV1Converter.deserialize_video(json_data, self.client)
 
-    def label_generator(self, timeout_seconds=60):
+    def label_generator(self, timeout_seconds=600):
         """
         Download text and image annotations
 
@@ -214,7 +214,7 @@ def label_generator(self, timeout_seconds=60):
                 "Or use project.video_label_generator() for video data.")
         return LBV1Converter.deserialize(json_data)
 
-    def export_labels(self, download=False, timeout_seconds=60):
+    def export_labels(self, download=False, timeout_seconds=600):
         """ Calls the server-side Label exporting that generates a JSON
         payload, and returns the URL to that payload.
 
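
For reference, a small sketch of the export helpers whose default timeouts were raised above (assumes an existing `project` object, which is an illustrative placeholder):

```python
# Defaults are now 600 seconds, so exports of projects with many segmentation
# masks have time to finish; an explicit timeout can still be passed.
export_url = project.export_labels()                         # URL of the JSON payload
labels = list(project.label_generator(timeout_seconds=600))  # parsed label annotations
```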

labelbox/schema/task.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def refresh(self):
         for field in self.fields():
             setattr(self, field.name, getattr(tasks[0], field.name))
 
-    def wait_till_done(self, timeout_seconds=60):
+    def wait_till_done(self, timeout_seconds=300):
         """ Waits until the task is completed. Periodically queries the server
         to update the task attributes.
 

tests/integration/test_data_row_metadata.py

Lines changed: 1 addition & 0 deletions
@@ -103,6 +103,7 @@ def test_bulk_delete_datarow_metadata(datarow, mdo):
     assert not len(remaining_ids.intersection(set(upload_ids)))
 
 
+@pytest.mark.skip
 def test_bulk_partial_delete_datarow_metadata(datarow, mdo):
     """Delete a single from metadata"""
     n_fields = len(datarow.metadata["fields"])

tests/integration/test_data_rows.py

Lines changed: 37 additions & 3 deletions
@@ -66,16 +66,18 @@ def test_data_row_bulk_creation(dataset, rand_gen, image_url):
 @pytest.mark.slow
 def test_data_row_large_bulk_creation(dataset, image_url):
     # Do a longer task and expect it not to be complete immediately
+    n_local = 2000
+    n_urls = 250
     with NamedTemporaryFile() as fp:
         fp.write("Test data".encode())
         fp.flush()
         task = dataset.create_data_rows([{
             DataRow.row_data: image_url
-        }] * 750 + [fp.name] * 250)
+        }] * n_local + [fp.name] * n_urls)
     assert task.status == "IN_PROGRESS"
-    task.wait_till_done(timeout_seconds=120)
+    task.wait_till_done()
     assert task.status == "COMPLETE"
-    assert len(list(dataset.data_rows())) == 1000
+    assert len(list(dataset.data_rows())) == n_local + n_urls
 
 
 @pytest.mark.xfail(reason="DataRow.dataset() relationship not set")
@@ -210,3 +212,35 @@ def test_data_row_attachments(dataset, image_url):
             "value": "123"
         }]
     }])
+
+
+def test_create_data_rows_sync_attachments(dataset, image_url):
+    attachments = [("IMAGE", image_url), ("TEXT", "test-text"),
+                   ("IMAGE_OVERLAY", image_url), ("HTML", image_url)]
+    attachments_per_data_row = 3
+    dataset.create_data_rows_sync([{
+        "row_data":
+            image_url,
+        "external_id":
+            "test-id",
+        "attachments": [{
+            "type": attachment_type,
+            "value": attachment_value
+        } for _ in range(attachments_per_data_row)]
+    } for attachment_type, attachment_value in attachments])
+    data_rows = list(dataset.data_rows())
+    assert len(data_rows) == len(attachments)
+    for data_row in data_rows:
+        assert len(list(data_row.attachments())) == attachments_per_data_row
+
+
+def test_create_data_rows_sync_mixed_upload(dataset, image_url):
+    n_local = 100
+    n_urls = 100
+    with NamedTemporaryFile() as fp:
+        fp.write("Test data".encode())
+        fp.flush()
+        dataset.create_data_rows_sync([{
+            DataRow.row_data: image_url
+        }] * n_urls + [fp.name] * n_local)
+    assert len(list(dataset.data_rows())) == n_local + n_urls
