
Commit 389275c

Endpoint to fetch annotations grouped by scenes (#439)

* initial
* tune page size
* formatting
* Add docstring, parameterize page_size

1 parent 629b899 commit 389275c

File tree

3 files changed: +57 −1 lines changed


CHANGELOG.md

Lines changed: 13 additions & 0 deletions

````diff
@@ -5,6 +5,19 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.17.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.6) - 2024-07-03
+
+### Added
+- Method for downloading all annotations grouped by `scene` and `track_reference_id`.
+
+Example usage:
+
+```python
+dataset = client.get_dataset("ds_...")
+for scene in dataset.scene_and_annotation_generator():
+    #...
+```
+
 ## [0.17.5](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.5) - 2024-04-15
 
 ### Added
````
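Each item the new generator yields nests annotations by track ID and then by frame (per the docstring added in `nucleus/dataset.py`). A minimal sketch of post-processing one yielded scene into flat per-frame records; the `flatten_scene` helper and the sample payload are illustrative assumptions, not part of the client:

```python
from typing import Any, Dict, List


def flatten_scene(scene: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten one scene payload (shaped as documented for
    scene_and_annotation_generator) into per-frame records."""
    records = []
    for track_id, track in scene.get("annotations", {}).items():
        for frame in track["frames"]:
            records.append({
                "file_location": scene["file_location"],
                "track_id": track_id,
                "label": track["label"],
                "key": frame["key"],
                # Bounding box as (left, top, width, height).
                "box": (frame["left"], frame["top"],
                        frame["width"], frame["height"]),
            })
    return records


# Hypothetical payload matching the documented structure.
sample_scene = {
    "file_location": "s3://bucket/scene_0001",
    "metadata": {},
    "annotations": {
        "track_1": {
            "label": "car",
            "name": "car_front",
            "frames": [
                {"left": 10, "top": 20, "width": 50, "height": 30,
                 "key": "frame_0", "metadata": {}},
                {"left": 12, "top": 21, "width": 50, "height": 30,
                 "key": "frame_1", "metadata": {}},
            ],
        }
    },
}

rows = flatten_scene(sample_scene)
```

One record per track per frame makes the export easy to load into a dataframe or write out as CSV.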

nucleus/dataset.py

Lines changed: 43 additions & 0 deletions

```diff
@@ -1449,6 +1449,49 @@ def items_and_annotations(
         )
         return convert_export_payload(api_payload[EXPORTED_ROWS])
 
+    def scene_and_annotation_generator(self, page_size=10):
+        """Provides a generator of all DatasetItems and Annotations in the dataset grouped by scene.
+
+
+        Returns:
+            Generator where each element is a nested dict (representing a JSON) structured in the following way:
+
+            Iterable[{
+                "file_location": str,
+                "metadata": Dict[str, Any],
+                "annotations": {
+                    "{trackId}": {
+                        "label": str,
+                        "name": str,
+                        "frames": List[{
+                            "left": int,
+                            "top": int,
+                            "width": int,
+                            "height": int,
+                            "key": str,  # frame key
+                            "metadata": Dict[str, Any]
+                        }]
+                    }
+                }
+            }]
+
+        This is similar to how the Scale API returns task data
+        """
+
+        if page_size > 30:
+            raise ValueError("Page size must be less than or equal to 30")
+
+        endpoint_name = "exportForTrainingByScene"
+        json_generator = paginate_generator(
+            client=self._client,
+            endpoint=f"dataset/{self.id}/{endpoint_name}",
+            result_key=EXPORT_FOR_TRAINING_KEY,
+            page_size=page_size,
+        )
+
+        for data in json_generator:
+            yield data
+
     def items_and_annotation_generator(
         self,
         query: Optional[str] = None,
```
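The new method delegates paging to `paginate_generator` and caps `page_size` at 30. As a rough illustration of the cursor-style pagination such a helper typically implements (the `fetch_page` callable, the `cursor` key, and the fake endpoint below are assumptions for the sketch, not the actual Nucleus endpoint contract):

```python
from typing import Any, Callable, Dict, Iterator


def paginate(
    fetch_page: Callable[[str, int], Dict[str, Any]],
    result_key: str,
    page_size: int = 10,
) -> Iterator[Any]:
    """Yield items one at a time, requesting pages until the
    server stops returning a continuation cursor."""
    if page_size > 30:
        raise ValueError("Page size must be less than or equal to 30")
    cursor = ""
    while True:
        page = fetch_page(cursor, page_size)
        for item in page[result_key]:
            yield item
        cursor = page.get("cursor")
        if not cursor:  # no more pages
            break


# Fake two-page endpoint standing in for the real API call.
def fake_fetch(cursor: str, page_size: int) -> Dict[str, Any]:
    if cursor == "":
        return {"scenes": [{"id": 1}, {"id": 2}], "cursor": "p2"}
    return {"scenes": [{"id": 3}], "cursor": None}


scenes = list(paginate(fake_fetch, result_key="scenes", page_size=2))
```

Because the scenes are yielded lazily, callers can start processing the first page while later pages are still being fetched, which matters for large exports.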

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.17.5"
+version = "0.17.6"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
```

0 commit comments
