Skip to content

Commit 275c6a5

Browse files
authored
Running metadata_update with async flag (#338)
1 parent b6b8432 commit 275c6a5

File tree

3 files changed

+47
-8
lines changed

3 files changed

+47
-8
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1919

2020
### Added
2121
- client.slices to list all of a user's slices independent of dataset
22+
- Added optional parameter `asynchronous: bool` to `Dataset.update_item_metadata` and `Dataset.update_scene_metadata`,
23+
allowing the update to run as a background job when set to `True`
2224

2325
### Fixed
2426
- Validate unit test listing and evaluation history listing. Now uses new bulk fetch endpoints for faster listing.
2527

28+
2629
## [0.14.13](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.13) - 2022-08-10
2730

2831
### Fixed
@@ -37,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
3740
### Fixed
3841
- Change `{Dataset,Slice}.items_and_annotation_generator` to work with improved paginate endpoint
3942

43+
4044
## [0.14.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.11) - 2022-07-20
4145

4246
### Fixed

nucleus/dataset.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,14 +1723,17 @@ def _upload_items(
17231723
local_file_upload_concurrency=local_file_upload_concurrency,
17241724
)
17251725

1726-
def update_scene_metadata(self, mapping: Dict[str, dict]):
1726+
def update_scene_metadata(
1727+
self, mapping: Dict[str, dict], asynchronous: bool = False
1728+
):
17271729
"""
17281730
Update (merge) scene metadata for each reference_id given in the mapping.
17291731
The backend will join the specified mapping metadata to the existing metadata.
17301732
If there is a key-collision, the value given in the mapping will take precedence.
17311733
17321734
Args:
17331735
mapping: key-value pair of <reference_id>: <metadata>
1736+
asynchronous: if True, run the update as a background job
17341737
17351738
Examples:
17361739
>>> mapping = {"scene_ref_1": {"new_key": "foo"}, "scene_ref_2": {"some_value": 123}}
@@ -1740,11 +1743,17 @@ def update_scene_metadata(self, mapping: Dict[str, dict]):
17401743
A dictionary outlining success or failures.
17411744
"""
17421745
mm = MetadataManager(
1743-
self.id, self._client, mapping, ExportMetadataType.SCENES
1746+
self.id,
1747+
self._client,
1748+
mapping,
1749+
ExportMetadataType.SCENES,
1750+
asynchronous,
17441751
)
17451752
return mm.update()
17461753

1747-
def update_item_metadata(self, mapping: Dict[str, dict]):
1754+
def update_item_metadata(
1755+
self, mapping: Dict[str, dict], asynchronous: bool = False
1756+
):
17481757
"""
17491758
Update (merge) dataset item metadata for each reference_id given in the mapping.
17501759
The backend will join the specified mapping metadata to the existing metadata.
@@ -1755,6 +1764,7 @@ def update_item_metadata(self, mapping: Dict[str, dict]):
17551764
17561765
Args:
17571766
mapping: key-value pair of <reference_id>: <metadata>
1767+
asynchronous: if True, run the update as a background job
17581768
17591769
Examples:
17601770
>>> mapping = {"item_ref_1": {"new_key": "foo"}, "item_ref_2": {"some_value": 123, "camera_params": {...}}}
@@ -1764,7 +1774,11 @@ def update_item_metadata(self, mapping: Dict[str, dict]):
17641774
A dictionary outlining success or failures.
17651775
"""
17661776
mm = MetadataManager(
1767-
self.id, self._client, mapping, ExportMetadataType.DATASET_ITEMS
1777+
self.id,
1778+
self._client,
1779+
mapping,
1780+
ExportMetadataType.DATASET_ITEMS,
1781+
asynchronous,
17681782
)
17691783
return mm.update()
17701784

nucleus/metadata_manager.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from .camera_params import CameraParams
55
from .constants import CAMERA_PARAMS_KEY
6+
from .job import AsyncJob
67

78
if TYPE_CHECKING:
89
from . import NucleusClient
@@ -26,11 +27,19 @@ def __init__(
2627
client: "NucleusClient",
2728
raw_mappings: Dict[str, dict],
2829
level: ExportMetadataType,
30+
asynchronous: bool,
2931
):
3032
self.dataset_id = dataset_id
3133
self._client = client
3234
self.raw_mappings = raw_mappings
3335
self.level = level
36+
self.asynchronous = asynchronous
37+
38+
if len(self.raw_mappings) > 500 and not self.asynchronous:
39+
raise Exception(
40+
"Number of items to update is too large to perform it synchronously. "
41+
"Consider running the metadata_update with `asynchronous=True`, to avoid timeouts."
42+
)
3443

3544
self._payload = self._format_mappings()
3645

@@ -55,7 +64,19 @@ def _format_mappings(self):
5564

5665
def update(self):
5766
payload = {"metadata": self._payload, "level": self.level.value}
58-
resp = self._client.make_request(
59-
payload=payload, route=f"dataset/{self.dataset_id}/metadata"
60-
)
61-
return resp
67+
is_async = int(self.asynchronous)
68+
try:
69+
resp = self._client.make_request(
70+
payload=payload,
71+
route=f"dataset/{self.dataset_id}/metadata?async={is_async}",
72+
)
73+
if self.asynchronous:
74+
return AsyncJob.from_json(resp, self._client)
75+
return resp
76+
except Exception as e: # pylint: disable=W0703
77+
print(
78+
"Failed to complete the request. If a timeout occurred, consider running the "
79+
"metadata_update with `asynchronous=True`."
80+
)
81+
print(f"Request failed with:\n\n{e}")
82+
return None

0 commit comments

Comments
 (0)