Skip to content

Commit 830e225

Browse files
authored
Update results returned by slice.items() (#368)
1 parent adda9a9 commit 830e225

File tree

8 files changed

+97
-23
lines changed

8 files changed

+97
-23
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.14.25](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.25) - 2022-10-20
9+
10+
### Updated
11+
- Items of a slice can be retrieved by the Slice property `.items`
12+
- The type of items returned from `.items` is based on the slice `type`:
13+
- `slice.type == 'dataset_item'` => list of `DatasetItem` objects
14+
- `slice.type == 'object'` => list of `Annotation`/`Prediction` objects
15+
- `slice.type == 'scene'` => list of `Scene` objects
16+
817
## [0.14.24](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.24) - 2022-10-19
918

1019
### Fixed

nucleus/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
DATASET_ID_KEY = "dataset_id"
3939
DATASET_IS_SCENE_KEY = "is_scene"
4040
DATASET_ITEM_ID_KEY = "dataset_item_id"
41+
DATASET_ITEMS_KEY = "dataset_items"
4142
DATASET_LENGTH_KEY = "length"
4243
DATASET_MODEL_RUNS_KEY = "model_run_ids"
4344
DATASET_NAME_KEY = "name"

nucleus/dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
BACKFILL_JOB_KEY,
3232
DATASET_ID_KEY,
3333
DATASET_IS_SCENE_KEY,
34+
DATASET_ITEMS_KEY,
3435
DEFAULT_ANNOTATION_UPDATE_MODE,
3536
EMBEDDING_DIMENSION_KEY,
3637
EMBEDDINGS_URL_KEY,
@@ -212,7 +213,8 @@ def items(self) -> List[DatasetItem]:
212213
if e.status_code == 503:
213214
e.message += "\nThe server timed out while trying to load your items. Please try iterating over dataset.items_generator() instead."
214215
raise e
215-
dataset_item_jsons = response.get("dataset_items", None)
216+
dataset_item_jsons = response.get(DATASET_ITEMS_KEY, None)
217+
216218
return [
217219
DatasetItem.from_json(item_json)
218220
for item_json in dataset_item_jsons

nucleus/scene.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ class Scene(ABC):
124124
reference_id: str
125125
frames: List[Frame] = field(default_factory=list)
126126
metadata: Optional[dict] = field(default_factory=dict)
127+
skip_validate: Optional[bool] = False
127128

128129
def __post_init__(self):
129130
self.sensors = set(
@@ -133,7 +134,8 @@ def __post_init__(self):
133134
if self.metadata is None:
134135
self.metadata = {}
135136

136-
self.validate()
137+
if not self.skip_validate:
138+
self.validate()
137139

138140
def __eq__(self, other):
139141
return all(
@@ -310,14 +312,15 @@ def validate_frames_dict(self):
310312
), "frames must be 0-indexed and continuous (no missing frames)"
311313

312314
@classmethod
313-
def from_json(cls, payload: dict):
315+
def from_json(cls, payload: dict, skip_validate: Optional[bool] = False):
314316
"""Instantiates scene object from schematized JSON dict payload."""
315317
frames_payload = payload.get(FRAMES_KEY, [])
316318
frames = [Frame.from_json(frame) for frame in frames_payload]
317319
return cls(
318320
reference_id=payload[REFERENCE_ID_KEY],
319321
frames=frames,
320322
metadata=payload.get(METADATA_KEY, {}),
323+
skip_validate=skip_validate,
321324
)
322325

323326
def to_payload(self) -> dict:

nucleus/slice.py

Lines changed: 70 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from nucleus.dataset_item import DatasetItem
1212
from nucleus.errors import NucleusAPIError
1313
from nucleus.job import AsyncJob
14+
from nucleus.prediction import from_json as prediction_from_json
15+
from nucleus.scene import Scene
1416
from nucleus.utils import (
1517
KeyErrorDict,
1618
convert_export_payload,
@@ -113,6 +115,7 @@ def __init__(self, slice_id: str, client):
113115
self._dataset_id = None
114116
self._created_at = None
115117
self._pending_job_count = None
118+
self._type = None
116119

117120
def __repr__(self):
118121
return f"Slice(slice_id='{self.id}', name={self._name}, dataset_id={self._dataset_id})"
@@ -182,6 +185,13 @@ def dataset_id(self):
182185
self._dataset_id = self.info()["dataset_id"]
183186
return self._dataset_id
184187

188+
@property
189+
def type(self):
190+
"""The type of the Slice."""
191+
if self._type is None:
192+
self._type = self.info()["type"]
193+
return self._type
194+
185195
def items_generator(self, page_size=100000):
186196
"""Generator yielding all dataset items in the dataset.
187197
@@ -209,28 +219,69 @@ def items_generator(self, page_size=100000):
209219
for item_json in json_generator:
210220
yield DatasetItem.from_json(item_json)
211221

222+
def dataset_items(self):
223+
"""Fetch all DatasetItems contained in the Slice.
224+
225+
We recommend using :meth:`Slice.items_generator` if the Slice has more than 200k items.
226+
227+
Returns: list of DatasetItem objects
228+
229+
"""
230+
try:
231+
response = self._client.make_request(
232+
{}, f"slice/{self.id}", requests_command=requests.get
233+
)
234+
except NucleusAPIError as e:
235+
if e.status_code == 503:
236+
e.message += "/n Your request timed out while trying to get all the items in the slice. Please try slice.items_generator() instead."
237+
raise e
238+
239+
dataset_item_jsons = response.get(ITEMS_KEY, [])
240+
return [
241+
DatasetItem.from_json(dataset_item_json)
242+
for dataset_item_json in dataset_item_jsons
243+
]
244+
212245
@property
213246
def items(self):
214-
"""All DatasetItems contained in the Slice.
247+
"""Fetch all items belonging to this slice, the type of items returned depends on the type of the slice.
248+
The type of the slice can be one of { dataset_item, object, scene }.
215249
216-
We recommend using :meth:`Slice.items_generator` if the Slice has more than 200k items.
217250
251+
Returns: List of DatasetItems for a `dataset_item` slice,
252+
list of Annotations/Predictions for an `object` slice,
253+
or a list of Scenes for a `scene` slice.
218254
"""
219255
try:
220-
dataset_item_jsons = self._client.make_request(
256+
response = self._client.make_request(
221257
{}, f"slice/{self.id}", requests_command=requests.get
222-
)[
223-
"dataset_items"
224-
] # Unfortunately, we didn't use a standard value here, so not using a constant for the key
225-
return [
226-
DatasetItem.from_json(dataset_item_json)
227-
for dataset_item_json in dataset_item_jsons
228-
]
258+
)
229259
except NucleusAPIError as e:
230260
if e.status_code == 503:
231261
e.message += "/n Your request timed out while trying to get all the items in the slice. Please try slice.items_generator() instead."
232262
raise e
233263

264+
items = response.get(ITEMS_KEY, [])
265+
266+
formatted_items = []
267+
for item in items:
268+
item_id_prefix = item["id"].split("_")[0]
269+
if item_id_prefix == "di":
270+
formatted_items.append(DatasetItem.from_json(item))
271+
elif item_id_prefix == "ann":
272+
formatted_items.append(Annotation.from_json(item))
273+
elif item_id_prefix == "pred":
274+
formatted_items.append(prediction_from_json(item))
275+
elif item_id_prefix == "scn":
276+
# here we skip validate since no frames for the scene is fetched
277+
formatted_items.append(
278+
Scene.from_json(item, skip_validate=True)
279+
)
280+
else:
281+
raise ValueError("Unknown prefix", item_id_prefix)
282+
283+
return formatted_items
284+
234285
def info(self) -> dict:
235286
"""Retrieves the name, slice_id, and dataset_id of the Slice.
236287
@@ -251,6 +302,11 @@ def info(self) -> dict:
251302
{}, f"slice/{self.id}/info", requests_command=requests.get
252303
)
253304
info.update(res)
305+
self._name = info["name"]
306+
self._dataset_id = info["dataset_id"]
307+
self._created_at = info["created_at"]
308+
self._pending_job_count = info["pending_job_count"]
309+
self._type = info["type"]
254310
return info
255311

256312
def append(
@@ -552,7 +608,10 @@ def check_annotations_are_in_slice(
552608
for annotation in annotations
553609
if annotation.reference_id is not None
554610
}.difference(
555-
{item_metadata["ref_id"] for item_metadata in slice_to_check.items}
611+
{
612+
item_metadata["ref_id"]
613+
for item_metadata in slice_to_check.dataset_items()
614+
}
556615
)
557616
if reference_ids_not_found_in_slice:
558617
annotations_are_in_slice = False

nucleus/validate/scenario_test.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from typing import List, Optional
99

1010
from ..connection import Connection
11-
from ..constants import NAME_KEY, SLICE_ID_KEY
11+
from ..constants import DATASET_ITEMS_KEY, NAME_KEY, SLICE_ID_KEY
1212
from ..dataset_item import DatasetItem
1313
from .constants import (
1414
EVAL_FUNCTION_ID_KEY,
@@ -27,8 +27,6 @@
2727
from .scenario_test_evaluation import ScenarioTestEvaluation
2828
from .scenario_test_metric import ScenarioTestMetric
2929

30-
DATASET_ITEMS_KEY = "dataset_items"
31-
3230

3331
@dataclass
3432
class ScenarioTest:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ exclude = '''
2121

2222
[tool.poetry]
2323
name = "scale-nucleus"
24-
version = "0.14.24"
24+
version = "0.14.25"
2525
description = "The official Python client library for Nucleus, the Data Platform for AI"
2626
license = "MIT"
2727
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_slice.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_slice_create_and_delete_and_list(dataset: Dataset):
5353
assert slc.name == TEST_SLICE_NAME
5454
assert slc.dataset_id == dataset.id
5555

56-
assert {item.reference_id for item in slc.items} == {
56+
assert {item.reference_id for item in slc.dataset_items()} == {
5757
item.reference_id for item in ds_items[:2]
5858
}
5959

@@ -122,7 +122,7 @@ def test_slice_create_and_prediction_export(dataset, slc, model):
122122

123123
assert response
124124

125-
slice_reference_ids = [item.reference_id for item in slc.items]
125+
slice_reference_ids = [item.reference_id for item in slc.dataset_items()]
126126

127127
def get_expected_box_prediction(reference_id):
128128
for prediction in predictions:
@@ -156,7 +156,7 @@ def test_slice_append(dataset):
156156

157157
# Insert duplicate first item
158158
slc.append(reference_ids=[item.reference_id for item in ds_items[:3]])
159-
slice_items = slc.items
159+
slice_items = slc.dataset_items()
160160

161161
assert len(slice_items) == 3
162162

@@ -176,7 +176,7 @@ def test_slice_send_to_labeling(dataset):
176176
reference_ids=[ds_items[0].reference_id, ds_items[1].reference_id],
177177
)
178178

179-
items = slc.items
179+
items = slc.dataset_items()
180180
assert len(items) == 2
181181

182182
response = slc.send_to_labeling(TEST_PROJECT_ID)
@@ -210,7 +210,9 @@ def test_slice_dataset_item_iterator(dataset):
210210
name=TEST_SLICE_NAME + get_uuid(),
211211
reference_ids=[item.reference_id for item in all_items[:1]],
212212
)
213-
expected_items = {item.reference_id: item for item in test_slice.items}
213+
expected_items = {
214+
item.reference_id: item for item in test_slice.dataset_items()
215+
}
214216
actual_items = {
215217
item.reference_id: item
216218
for item in test_slice.items_generator(page_size=1)

0 commit comments

Comments
 (0)