Skip to content

Commit 0ddc3f5

Browse files
authored
paginate export methods (#341)
* paginate * docstring * bump semver and changelog * lint
1 parent e18f313 commit 0ddc3f5

File tree

6 files changed

+65
-14
lines changed

6 files changed

+65
-14
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.14.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.12) - 2022-08-05
9+
10+
### Added
11+
- Added auto-paginated `Slice.export_predictions_generator`
12+
### Fixed
13+
- Change `{Dataset,Slice}.items_and_annotation_generator` to work with improved paginate endpoint
14+
815
## [0.14.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.11) - 2022-07-20
916

1017
### Fixed

nucleus/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
AUTOTAG_SCORE_THRESHOLD = "score_threshold"
2929
EXPORTED_ROWS = "exportedRows"
3030
EXPORTED_SCALE_TASK_INFO_ROWS = "exportedScaleTaskInfoRows"
31+
EXPORT_FOR_TRAINING_KEY = "data"
3132
CAMERA_MODEL_KEY = "camera_model"
3233
CAMERA_PARAMS_KEY = "camera_params"
3334
CLASS_PDF_KEY = "class_pdf"

nucleus/dataset.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
DEFAULT_ANNOTATION_UPDATE_MODE,
3434
EMBEDDING_DIMENSION_KEY,
3535
EMBEDDINGS_URL_KEY,
36+
EXPORT_FOR_TRAINING_KEY,
3637
EXPORTED_ROWS,
3738
FRAME_RATE_KEY,
3839
ITEMS_KEY,
@@ -1250,8 +1251,15 @@ def items_and_annotation_generator(
12501251
}
12511252
}]
12521253
"""
1253-
for item in self.items_generator():
1254-
yield self.refloc(reference_id=item.reference_id)
1254+
json_generator = paginate_generator(
1255+
client=self._client,
1256+
endpoint=f"dataset/{self.id}/exportForTrainingPage",
1257+
result_key=EXPORT_FOR_TRAINING_KEY,
1258+
page_size=100000,
1259+
)
1260+
for data in json_generator:
1261+
for ia in convert_export_payload([data], has_predictions=False):
1262+
yield ia
12551263

12561264
def export_embeddings(
12571265
self,

nucleus/slice.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@
44
import requests
55

66
from nucleus.annotation import Annotation
7-
from nucleus.constants import EXPORTED_ROWS, ITEMS_KEY
7+
from nucleus.constants import EXPORT_FOR_TRAINING_KEY, EXPORTED_ROWS, ITEMS_KEY
88
from nucleus.dataset_item import DatasetItem
99
from nucleus.errors import NucleusAPIError
1010
from nucleus.job import AsyncJob
1111
from nucleus.utils import (
1212
KeyErrorDict,
1313
convert_export_payload,
14-
format_dataset_item_response,
1514
format_scale_task_info_response,
1615
paginate_generator,
1716
)
@@ -203,13 +202,15 @@ def items_and_annotation_generator(
203202
}
204203
}]
205204
"""
206-
for item in self.items_generator():
207-
yield format_dataset_item_response(
208-
self._client.dataitem_ref_id(
209-
dataset_id=self.dataset_id,
210-
reference_id=item.reference_id,
211-
)
212-
)
205+
json_generator = paginate_generator(
206+
client=self._client,
207+
endpoint=f"slice/{self.id}/exportForTrainingPage",
208+
result_key=EXPORT_FOR_TRAINING_KEY,
209+
page_size=100000,
210+
)
211+
for data in json_generator:
212+
for ia in convert_export_payload([data], has_predictions=False):
213+
yield ia
213214

214215
def items_and_annotations(
215216
self,
@@ -256,7 +257,7 @@ def export_predictions(
256257
257258
List[{
258259
"item": DatasetItem,
259-
"predicions": {
260+
"predictions": {
260261
"box": List[BoxAnnotation],
261262
"polygon": List[PolygonAnnotation],
262263
"cuboid": List[CuboidAnnotation],
@@ -272,6 +273,40 @@ def export_predictions(
272273
)
273274
return convert_export_payload(api_payload[EXPORTED_ROWS], True)
274275

276+
def export_predictions_generator(
277+
self, model
278+
) -> Iterable[Dict[str, Union[DatasetItem, Dict[str, List[Annotation]]]]]:
279+
"""Provides a list of all DatasetItems and Predictions in the Slice for the given Model.
280+
281+
Parameters:
282+
model (Model): the nucleus model objects representing the model for which to export predictions.
283+
284+
Returns:
285+
Iterable where each element is a dict containing the DatasetItem
286+
and all of its associated Predictions, grouped by type (e.g. box).
287+
::
288+
289+
List[{
290+
"item": DatasetItem,
291+
"predictions": {
292+
"box": List[BoxAnnotation],
293+
"polygon": List[PolygonAnnotation],
294+
"cuboid": List[CuboidAnnotation],
295+
"segmentation": List[SegmentationAnnotation],
296+
"category": List[CategoryAnnotation],
297+
}
298+
}]
299+
"""
300+
json_generator = paginate_generator(
301+
client=self._client,
302+
endpoint=f"slice/{self.id}/{model.id}/exportForTrainingPage",
303+
result_key=EXPORT_FOR_TRAINING_KEY,
304+
page_size=100000,
305+
)
306+
for data in json_generator:
307+
for ip in convert_export_payload([data], has_predictions=True):
308+
yield ip
309+
275310
def export_scale_task_info(self):
276311
"""Fetches info for all linked Scale tasks of items/scenes in the slice.
277312

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ exclude = '''
2121

2222
[tool.poetry]
2323
name = "scale-nucleus"
24-
version = "0.14.11"
24+
version = "0.14.12"
2525
description = "The official Python client library for Nucleus, the Data Platform for AI"
2626
license = "MIT"
2727
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -533,7 +533,7 @@ def sort_labelmap(segmentation_annotation):
533533
assert row[ITEM_KEY] == ds_items[0]
534534
assert row[ANNOTATIONS_KEY][BOX_TYPE][0] == box_annotation
535535
assert sort_labelmap(
536-
row[ANNOTATIONS_KEY][SEGMENTATION_TYPE][0]
536+
row[ANNOTATIONS_KEY][SEGMENTATION_TYPE]
537537
) == sort_labelmap(clear_fields(segmentation_annotation))
538538
assert row[ANNOTATIONS_KEY][POLYGON_TYPE][0] == polygon_annotation
539539
assert row[ANNOTATIONS_KEY][CATEGORY_TYPE][0] == category_annotation

0 commit comments

Comments
 (0)