Commit bfde0d0

pfmark, jean-lucas, and sasha-scale authored
Creating specific datasets for scenes (#165)
* can create datasets with scenes
* get scene info about dataset
* adapting tests for dataset creation
* black
* adding constants file
* more info on is_scene attribute
* Update nucleus/dataset.py (Co-authored-by: Jean Lucas <jeanlpf@hotmail.com>)
* updated minor version
* minor version update
* linter
* assert that dataset support scenes before appending
* ensure frames for frame datasets, scenes for scene dataset
* adding scene info to dataset info printout
* deprecation warning + docstring update
* update tests for scenes
* test for illegal items upload
* move scene check within actual append methods
* Update deprecation warning (Co-authored-by: Sasha Harrison <70984140+sasha-scale@users.noreply.github.com>)
* Update version number (Co-authored-by: Sasha Harrison <70984140+sasha-scale@users.noreply.github.com>)
* black
* linting
* import order
* fix tests

Co-authored-by: Jean Lucas <jeanlpf@hotmail.com>
Co-authored-by: Sasha Harrison <70984140+sasha-scale@users.noreply.github.com>
Co-authored-by: Sasha Harrison <sasha.harrison@scale.com>
1 parent 7335bdc commit bfde0d0
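
In short, `is_scene` now fixes a dataset's type at creation time, and both upload paths enforce it. A minimal sketch of the resulting behavior; the API key, dataset names, image URL, and reference IDs below are placeholders:

    import nucleus
    from nucleus import DatasetItem

    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")  # placeholder key

    # An item dataset (is_scene=False) accepts DatasetItems as before.
    item_dataset = client.create_dataset("my-item-dataset", is_scene=False)
    item_dataset.append(
        [DatasetItem(image_location="https://example.com/img.jpg", reference_id="img-1")]
    )

    # A scene dataset (is_scene=True) rejects single dataset items with an Exception.
    scene_dataset = client.create_dataset("my-scene-dataset", is_scene=True)
    try:
        scene_dataset.append(
            [DatasetItem(image_location="https://example.com/img.jpg", reference_id="img-2")]
        )
    except Exception as err:
        print(err)  # suggests creating a dataset with is_scene=False instead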

File tree

6 files changed: +143 −24 lines changed


nucleus/__init__.py

Lines changed: 18 additions & 0 deletions

@@ -35,6 +35,8 @@
 ]

 import os
+import time
+import warnings
 from typing import Dict, List, Optional, Sequence, Union

 import pkg_resources
@@ -63,6 +65,7 @@
     ANNOTATIONS_PROCESSED_KEY,
     AUTOTAGS_KEY,
     DATASET_ID_KEY,
+    DATASET_IS_SCENE_KEY,
     DEFAULT_NETWORK_TIMEOUT_SEC,
     EMBEDDING_DIMENSION_KEY,
     EMBEDDINGS_URL_KEY,
@@ -333,14 +336,21 @@ def create_dataset_from_project(
     def create_dataset(
         self,
         name: str,
+        is_scene: bool = False,
         item_metadata_schema: Optional[Dict] = None,
         annotation_metadata_schema: Optional[Dict] = None,
     ) -> Dataset:
         """
         Creates a new, empty dataset.

+        Make sure that the dataset is created for the data type you would like
+        to support, i.e. that `is_scene` is set correctly.
+
         Parameters:
             name: A human-readable name for the dataset.
+            is_scene: Boolean specifying the dataset type. This value is immutable.
+                `False` will allow users to upload :class:`DatasetItems<DatasetItem>`.
+                `True` will allow users to upload :class:`Scenes<LidarScene>`.
             item_metadata_schema: Dict defining item-level metadata schema. See below.
             annotation_metadata_schema: Dict defining annotation-level metadata schema.

@@ -358,9 +368,17 @@ def create_dataset(
         Returns:
             :class:`Dataset`: The newly created Nucleus dataset as an object.
         """
+        warnings.warn(
+            "Calling create_dataset('dataset_name', ...) without the is_scene parameter "
+            "will be deprecated soon in favor of providing is_scene explicitly. "
+            "Please create a dataset with either create_dataset('dataset_name', is_scene=False, ...) "
+            "to upload DatasetItems or create_dataset('dataset_name', is_scene=True, ...) "
+            "to upload LidarScenes.",
+            DeprecationWarning,
+        )
         response = self.make_request(
             {
                 NAME_KEY: name,
+                DATASET_IS_SCENE_KEY: is_scene,
                 ANNOTATION_METADATA_SCHEMA_KEY: annotation_metadata_schema,
                 ITEM_METADATA_SCHEMA_KEY: item_metadata_schema,
             },
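
Note that as written, the warning is emitted on every `create_dataset` call, even when `is_scene` is passed explicitly. A quick sketch for observing it, assuming `client` is an authenticated `NucleusClient`:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # DeprecationWarning is ignored by default
        client.create_dataset("example-dataset", is_scene=False)

    # The warning is recorded even though is_scene was provided.
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)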

nucleus/constants.py

Lines changed: 1 addition & 0 deletions

@@ -29,6 +29,7 @@
 CX_KEY = "cx"
 CY_KEY = "cy"
 DATASET_ID_KEY = "dataset_id"
+DATASET_IS_SCENE_KEY = "is_scene"
 DATASET_ITEM_ID_KEY = "dataset_item_id"
 DATASET_LENGTH_KEY = "length"
 DATASET_MODEL_RUNS_KEY = "model_run_ids"
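
This single new constant keys the `is_scene` flag on the wire: `create_dataset` sends it in the creation payload, and the new `Dataset.is_scene` property reads it back from the `dataset/<id>/is_scene` endpoint. A hedged sketch of the payload shapes this implies; values are illustrative, and the other key strings are inferred from the existing `*_KEY` constant names rather than shown in this diff:

    # Illustrative payloads only; the exact server contract is not shown here.
    create_payload = {
        "name": "my-scene-dataset",          # NAME_KEY
        "is_scene": True,                    # DATASET_IS_SCENE_KEY (new)
        "annotation_metadata_schema": None,  # ANNOTATION_METADATA_SCHEMA_KEY
        "item_metadata_schema": None,        # ITEM_METADATA_SCHEMA_KEY
    }

    # Response shape assumed by the new Dataset.is_scene property.
    is_scene_response = {"is_scene": True}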

nucleus/dataset.py

Lines changed: 43 additions & 4 deletions

@@ -35,6 +35,7 @@
     AUTOTAG_SCORE_THRESHOLD,
     BACKFILL_JOB_KEY,
     DATASET_ID_KEY,
+    DATASET_IS_SCENE_KEY,
     DEFAULT_ANNOTATION_UPDATE_MODE,
     EXPORTED_ROWS,
     KEEP_HISTORY_KEY,
@@ -77,6 +78,8 @@ class Dataset:
     with metadata to your dataset, annotate it with ground truth, and upload
     model predictions to evaluate and compare model performance on your data.

+    Make sure the dataset is set up to support the required data type (see the code sample below).
+
     Datasets cannot be instantiated directly and instead must be created via API
     endpoint using :meth:`NucleusClient.create_dataset`, or in the dashboard.

@@ -86,8 +89,11 @@ class Dataset:

        client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)

-        # Create new dataset
-        dataset = client.create_dataset(YOUR_DATASET_NAME)
+        # Create new dataset supporting DatasetItems
+        dataset = client.create_dataset(YOUR_DATASET_NAME, is_scene=False)
+
+        # OR create new dataset supporting LidarScenes
+        dataset = client.create_dataset(YOUR_DATASET_NAME, is_scene=True)

         # Or, retrieve existing dataset by ID
         # This ID can be fetched using client.list_datasets() or from a dashboard URL
@@ -102,9 +108,9 @@ def __init__(self, dataset_id, client, name=None):

     def __repr__(self):
         if os.environ.get("NUCLEUS_DEBUG", None):
-            return f"Dataset(name='{self.name}, dataset_id='{self.id}', client={self._client})"
+            return f"Dataset(name='{self.name}, dataset_id='{self.id}', is_scene='{self.is_scene}', client={self._client})"
         else:
-            return f"Dataset(name='{self.name}, dataset_id='{self.id}')"
+            return f"Dataset(name='{self.name}, dataset_id='{self.id}', is_scene='{self.is_scene}')"

     def __eq__(self, other):
         if self.id == other.id:
@@ -121,6 +127,14 @@ def name(self) -> str:
         )["name"]
         return self._name

+    @property
+    def is_scene(self) -> bool:
+        """Whether the dataset can contain scenes or not."""
+        response = self._client.make_request(
+            {}, f"dataset/{self.id}/is_scene", requests.get
+        )[DATASET_IS_SCENE_KEY]
+        return response
+
     @property
     def model_runs(self) -> Dict[Any, Any]:
         """List of all model runs associated with the Dataset."""
@@ -382,6 +396,14 @@ def append(
     ) -> Union[Dict[Any, Any], AsyncJob, UploadResponse]:
         """Appends items or scenes to a dataset.

+        Attention:
+        You will only be able to add :class:`DatasetItems<DatasetItem>` to a
+        dataset supporting :class:`DatasetItems<DatasetItem>`. Likewise, you will
+        only be able to add :class:`Scenes<LidarScene>` to a dataset supporting
+        :class:`Scenes<LidarScene>`.
+        A :class:`DatasetItem` dataset can be created with the is_scene flag set to False.
+        A :class:`LidarScene` dataset can be created with the is_scene flag set to True.
+
         ::

             import nucleus
@@ -480,6 +502,7 @@ def append(
             assert (
                 asynchronous
             ), "In order to avoid timeouts, you must set asynchronous=True when uploading scenes."
+
             return self.append_scenes(scenes, update, asynchronous)

         check_for_duplicate_reference_ids(dataset_items)
@@ -517,6 +540,14 @@ def append_scenes(
         asynchronous: Optional[bool] = False,
     ) -> Union[dict, AsyncJob]:
         # TODO: make private in favor of Dataset.append invocation
+        if not self.is_scene:
+            raise Exception(
+                "Your dataset is not a scene dataset but only supports single dataset items. "
+                "In order to be able to add scenes, please create another dataset with "
+                "client.create_dataset(<dataset_name>, is_scene=True) or add the scenes to "
+                "an existing scene dataset."
+            )
+
         for scene in scenes:
             scene.validate()

@@ -1288,5 +1319,13 @@ def _upload_items(
         Returns:
             UploadResponse
         """
+        if self.is_scene:
+            raise Exception(
+                "Your dataset is a scene dataset and does not support the upload of single dataset items. "
+                "In order to be able to add dataset items, please create another dataset with "
+                "client.create_dataset(<dataset_name>, is_scene=False) or add the dataset items to "
+                "an existing dataset supporting dataset items."
+            )
+
         populator = DatasetItemUploader(self.id, self._client)
         return populator.upload(dataset_items, batch_size, update)
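
Since the dataset type is immutable and enforced on both upload paths, callers can branch on the new property before uploading. A minimal sketch, assuming `client` is a `NucleusClient`, `scenes` and `items` are prepared `LidarScene` and `DatasetItem` lists, and the dataset ID is hypothetical:

    ds = client.get_dataset("ds_hypothetical_id")

    if ds.is_scene:
        # Scene datasets only accept scenes, and scene uploads must be asynchronous.
        job = ds.append(scenes, asynchronous=True)
        job.sleep_until_complete()
    else:
        # Item datasets only accept DatasetItems.
        response = ds.append(items)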

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.4.0"
+version = "0.4.1"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_dataset.py

Lines changed: 42 additions & 2 deletions

@@ -52,7 +52,7 @@

 @pytest.fixture()
 def dataset(CLIENT):
-    ds = CLIENT.create_dataset(TEST_DATASET_NAME)
+    ds = CLIENT.create_dataset(TEST_DATASET_NAME, is_scene=False)

     response = ds.add_taxonomy(
         "[Pytest] Category Taxonomy 1",
@@ -75,6 +75,11 @@ def dataset(CLIENT):
     assert response == {"message": "Beginning dataset deletion..."}


+@pytest.fixture()
+def dataset_scene(CLIENT):
+    return CLIENT.create_dataset(TEST_DATASET_NAME, is_scene=True)
+
+
 def make_dataset_items():
     ds_items_with_metadata = []
     for i, url in enumerate(TEST_IMG_URLS):
@@ -97,11 +102,28 @@ def make_dataset_items():
     return ds_items_with_metadata


-def test_dataset_create_and_delete(CLIENT):
+def test_dataset_create_and_delete_no_scene(CLIENT):
     # Creation
     ds = CLIENT.create_dataset(TEST_DATASET_NAME)
     assert isinstance(ds, Dataset)
     assert ds.name == TEST_DATASET_NAME
+    assert not ds.is_scene
+    assert ds.model_runs == []
+    assert ds.slices == []
+    assert ds.size == 0
+    assert ds.items == []
+
+    # Deletion
+    response = CLIENT.delete_dataset(ds.id)
+    assert response == {"message": "Beginning dataset deletion..."}
+
+
+def test_dataset_create_and_delete_scene(CLIENT):
+    # Creation
+    ds = CLIENT.create_dataset(name=TEST_DATASET_NAME, is_scene=True)
+    assert isinstance(ds, Dataset)
+    assert ds.name == TEST_DATASET_NAME
+    assert ds.is_scene
     assert ds.model_runs == []
     assert ds.slices == []
     assert ds.size == 0
@@ -195,6 +217,24 @@ def check_is_expected_response(response):
     check_is_expected_response(response)


+def test_scene_dataset_append(dataset_scene):
+    # Plain image upload
+    ds_items_plain = []
+    for i, url in enumerate(TEST_IMG_URLS):
+        # Upload only the first item to Scale; keep the rest in privacy mode
+        upload_to_scale = i == 0
+        ds_items_plain.append(
+            DatasetItem(
+                image_location=url,
+                upload_to_scale=upload_to_scale,
+                reference_id=url.split("/")[-1] + "_plain",
+            )
+        )
+
+    with pytest.raises(Exception):
+        dataset_scene.append(ds_items_plain)
+
+
 def test_dataset_name_access(CLIENT, dataset):
     assert dataset.name == TEST_DATASET_NAME

tests/test_scene.py

Lines changed: 38 additions & 17 deletions

@@ -32,8 +32,17 @@


 @pytest.fixture()
-def dataset(CLIENT):
-    ds = CLIENT.create_dataset(TEST_DATASET_3D_NAME)
+def dataset_scene(CLIENT):
+    ds = CLIENT.create_dataset(TEST_DATASET_3D_NAME, is_scene=True)
+    yield ds
+
+    response = CLIENT.delete_dataset(ds.id)
+    assert response == {"message": "Beginning dataset deletion..."}
+
+
+@pytest.fixture()
+def dataset_item(CLIENT):
+    ds = CLIENT.create_dataset(TEST_DATASET_3D_NAME, is_scene=False)
     yield ds

     response = CLIENT.delete_dataset(ds.id)
@@ -246,38 +255,38 @@ def test_scene_add_frame():


 @pytest.mark.skip("Deactivated sync upload for scenes")
-def test_scene_upload_sync(dataset):
+def test_scene_upload_sync(dataset_scene):
     payload = TEST_LIDAR_SCENES
     scenes = [
         LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
     ]
     update = payload[UPDATE_KEY]

-    response = dataset.append(scenes, update=update)
+    response = dataset_scene.append(scenes, update=update)

-    first_scene = dataset.get_scene(scenes[0].reference_id)
+    first_scene = dataset_scene.get_scene(scenes[0].reference_id)

     assert first_scene == scenes[0]
     first_scene_modified = copy.deepcopy(first_scene)
     first_scene_modified.reference_id = "WRONG!"
     assert first_scene_modified != scenes[0]

-    assert response["dataset_id"] == dataset.id
+    assert response["dataset_id"] == dataset_scene.id
     assert response["new_scenes"] == len(scenes)


 @pytest.mark.skip("Deactivated sync upload for scenes")
 @pytest.mark.integration
-def test_scene_and_cuboid_upload_sync(dataset):
+def test_scene_and_cuboid_upload_sync(dataset_scene):
     payload = TEST_LIDAR_SCENES
     scenes = [
         LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
     ]
     update = payload[UPDATE_KEY]

-    response = dataset.append(scenes, update=update)
+    response = dataset_scene.append(scenes, update=update)

-    assert response["dataset_id"] == dataset.id
+    assert response["dataset_id"] == dataset_scene.id
     assert response["new_scenes"] == len(scenes)

     lidar_item_ref = payload[SCENES_KEY][0][FRAMES_KEY][0]["lidar"][
@@ -286,30 +295,30 @@ def test_scene_and_cuboid_upload_sync(dataset):
     TEST_CUBOID_ANNOTATIONS[0][REFERENCE_ID_KEY] = lidar_item_ref

     annotations = [CuboidAnnotation.from_json(TEST_CUBOID_ANNOTATIONS[0])]
-    response = dataset.annotate(annotations)
+    response = dataset_scene.annotate(annotations)

-    assert response["dataset_id"] == dataset.id
+    assert response["dataset_id"] == dataset_scene.id
     assert response["annotations_processed"] == len(annotations)
     assert response["annotations_ignored"] == 0

-    response_annotations = dataset.refloc(lidar_item_ref)[ANNOTATIONS_KEY][
-        "cuboid"
-    ]
+    response_annotations = dataset_scene.refloc(lidar_item_ref)[
+        ANNOTATIONS_KEY
+    ]["cuboid"]
     assert len(response_annotations) == 1
     assert_cuboid_annotation_matches_dict(
         response_annotations[0], TEST_CUBOID_ANNOTATIONS[0]
     )


 @pytest.mark.integration
-def test_scene_upload_async(dataset):
+def test_scene_upload_async(dataset_scene):
     payload = TEST_LIDAR_SCENES
     scenes = [
         LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
     ]
     update = payload[UPDATE_KEY]

-    job = dataset.append(scenes, update=update, asynchronous=True)
+    job = dataset_scene.append(scenes, update=update, asynchronous=True)
     job.sleep_until_complete()
     status = job.status()

@@ -319,7 +328,7 @@ def test_scene_upload_async(dataset):
         "message": {
             "scene_upload_progress": {
                 "errors": [],
-                "dataset_id": dataset.id,
+                "dataset_id": dataset_scene.id,
                 "new_scenes": len(scenes),
                 "ignored_scenes": 0,
                 "scenes_errored": 0,
@@ -330,3 +339,15 @@ def test_scene_upload_async(dataset):
         "completed_steps": 1,
         "total_steps": 1,
     }
+
+
+@pytest.mark.integration
+def test_scene_upload_async_item_dataset(dataset_item):
+    payload = TEST_LIDAR_SCENES
+    scenes = [
+        LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
+    ]
+    update = payload[UPDATE_KEY]
+
+    with pytest.raises(Exception):
+        dataset_item.append(scenes, update=update, asynchronous=True)