fix items_and_annotation_generator (#336)

drakejwong · web-flow · commit 3537ba8060d7 · 2022-07-20T11:40:57.000-07:00
* fix items_and_annotation_generator

* bump semver and changelog

* add inttests

* refactor slice async export inttest
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
+## [0.14.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.10) - 2022-07-20
+
+### Added
+- `Dataset.items_and_annotation_generator()`
+
+### Fixed
+- `Slice.items_and_annotation_generator()` bug
+
 ## [0.14.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.9) - 2022-07-14
 
 ### Fixed
 - NoneType errors in Validate
 
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
@@ -707,7 +707,7 @@ def _append_video_scenes(
         return response
 
     def iloc(self, i: int) -> dict:
-        """Retrieves dataset item by absolute numerical index.
+        """Retrieves dataset item and associated annotations by absolute numerical index.
 
         Parameters:
             i: Absolute numerical index of the dataset item within the dataset.
@@ -735,7 +735,7 @@ def iloc(self, i: int) -> dict:
 
     @sanitize_string_args
     def refloc(self, reference_id: str) -> dict:
-        """Retrieves a dataset item by reference ID.
+        """Retrieves a dataset item and associated annotations by reference ID.
 
         Parameters:
             reference_id: User-defined reference ID of the dataset item.
@@ -762,7 +762,7 @@ def refloc(self, reference_id: str) -> dict:
         return format_dataset_item_response(response)
 
     def loc(self, dataset_item_id: str) -> dict:
-        """Retrieves a dataset item by Nucleus-generated ID.
+        """Retrieves a dataset item and associated annotations by Nucleus-generated ID.
 
         Parameters:
             dataset_item_id: Nucleus-generated dataset item ID (starts with ``di_``).
@@ -1178,9 +1178,9 @@ def items_and_annotations(
                         "cuboid": Optional[List[CuboidAnnotation]],
                         "line": Optional[List[LineAnnotation]],
                         "polygon": Optional[List[PolygonAnnotation]],
-                        "keypoints": Optional[List[KeypointsAnnotation]],
                         "segmentation": Optional[List[SegmentationAnnotation]],
                         "category": Optional[List[CategoryAnnotation]],
+                        "keypoints": Optional[List[KeypointsAnnotation]],
                     }
                 }]
         """
@@ -1191,6 +1191,32 @@ def items_and_annotations(
         )
         return convert_export_payload(api_payload[EXPORTED_ROWS])
 
+    def items_and_annotation_generator(
+        self,
+    ) -> Iterable[Dict[str, Union[DatasetItem, Dict[str, List[Annotation]]]]]:
+        """Provides a generator of all DatasetItems and Annotations in the dataset.
+
+        Returns:
+            Generator where each element is a dict containing the DatasetItem
+            and all of its associated Annotations, grouped by type.
+            ::
+
+                Iterable[{
+                    "item": DatasetItem,
+                    "annotations": {
+                        "box": List[BoxAnnotation],
+                        "polygon": List[PolygonAnnotation],
+                        "cuboid": List[CuboidAnnotation],
+                        "line": List[LineAnnotation],
+                        "segmentation": List[SegmentationAnnotation],
+                        "category": List[CategoryAnnotation],
+                        "keypoints": List[KeypointsAnnotation],
+                    }
+                }]
+        """
+        for item in self.items_generator():
+            yield self.refloc(reference_id=item.reference_id)
+
     def export_embeddings(
         self,
     ) -> List[Dict[str, Union[str, List[float]]]]:
diff --git a/nucleus/slice.py b/nucleus/slice.py
@@ -185,7 +185,7 @@ def items_and_annotation_generator(
 
         Returns:
             Generator where each element is a dict containing the DatasetItem
-            and all of its associated Annotations, grouped by type.
+            and all of its associated Annotations, grouped by type (e.g. box).
             ::
 
                 Iterable[{
@@ -194,16 +194,18 @@ def items_and_annotation_generator(
                         "box": List[BoxAnnotation],
                         "polygon": List[PolygonAnnotation],
                         "cuboid": List[CuboidAnnotation],
+                        "line": List[LineAnnotation],
                         "segmentation": List[SegmentationAnnotation],
                         "category": List[CategoryAnnotation],
+                        "keypoints": List[KeypointsAnnotation],
                     }
                 }]
         """
-        for item_metadata in self.items:
+        for item in self.items_generator():
             yield format_dataset_item_response(
-                self._client.dataitem_loc(
+                self._client.dataitem_ref_id(
                     dataset_id=self.dataset_id,
-                    dataset_item_id=item_metadata["id"],
+                    reference_id=item.reference_id,
                 )
             )
 
@@ -223,8 +225,10 @@ def items_and_annotations(
                         "box": List[BoxAnnotation],
                         "polygon": List[PolygonAnnotation],
                         "cuboid": List[CuboidAnnotation],
+                        "line": List[LineAnnotation],
                         "segmentation": List[SegmentationAnnotation],
                         "category": List[CategoryAnnotation],
+                        "keypoints": List[KeypointsAnnotation],
                     }
                 }]
         """
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,7 @@ exclude = '''
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.14.9"
+version = "0.14.10"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license =  "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -528,6 +528,24 @@ def sort_labelmap(segmentation_annotation):
         == multicategory_annotation
     )
 
+    # test generator-based export (items fetched lazily, one row at a time)
+    for row in dataset.items_and_annotation_generator():
+        assert row[ITEM_KEY] == ds_items[0]
+        assert row[ANNOTATIONS_KEY][BOX_TYPE][0] == box_annotation
+        assert sort_labelmap(
+            row[ANNOTATIONS_KEY][SEGMENTATION_TYPE][0]
+        ) == sort_labelmap(clear_fields(segmentation_annotation))
+        assert row[ANNOTATIONS_KEY][POLYGON_TYPE][0] == polygon_annotation
+        assert row[ANNOTATIONS_KEY][CATEGORY_TYPE][0] == category_annotation
+        row[ANNOTATIONS_KEY][MULTICATEGORY_TYPE][0].labels = set(
+            row[ANNOTATIONS_KEY][MULTICATEGORY_TYPE][0].labels
+        )
+        multicategory_annotation.labels = set(multicategory_annotation.labels)
+        assert (
+            row[ANNOTATIONS_KEY][MULTICATEGORY_TYPE][0]
+            == multicategory_annotation
+        )
+
 
 def test_dataset_item_metadata_update(dataset):
     items = make_dataset_items()
diff --git a/tests/test_slice.py b/tests/test_slice.py
@@ -100,8 +100,15 @@ def get_expected_item(reference_id):
             if item.reference_id == reference_id:
                 return item
 
-    exported = slc.items_and_annotations()
-    for row in exported:
+    for row in slc.items_and_annotations():
+        reference_id = row[ITEM_KEY].reference_id
+        assert row[ITEM_KEY] == get_expected_item(reference_id)
+        assert row[ANNOTATIONS_KEY][BOX_TYPE][
+            0
+        ] == get_expected_box_annotation(reference_id)
+
+    # test generator-based export (items fetched lazily, one row at a time)
+    for row in slc.items_and_annotation_generator():
         reference_id = row[ITEM_KEY].reference_id
         assert row[ITEM_KEY] == get_expected_item(reference_id)
         assert row[ANNOTATIONS_KEY][BOX_TYPE][