Commit bfde0d0

pfmark, jean-lucas, and sasha-scale authored
Creating specific datasets for scenes (#165)
* can create datasets with scenes
* get scene info about dataset
* adapting tests for dataset creation
* black
* adding constants file
* more info on is_scene attribute
* Update nucleus/dataset.py (Co-authored-by: Jean Lucas <jeanlpf@hotmail.com>)
* updated minor version
* minor version update
* linter
* assert that dataset support scenes before appending
* ensure frames for frame datasets, scenes for scene dataset
* adding scene info to dataset info printout
* deprecation warning + docstring update
* update tests for scenes
* test for illegal items upload
* move scene check within actual append methods
* Update deprecation warning (Co-authored-by: Sasha Harrison <70984140+sasha-scale@users.noreply.github.com>)
* Update version number (Co-authored-by: Sasha Harrison <70984140+sasha-scale@users.noreply.github.com>)
* black
* linting
* import order
* fix tests

Co-authored-by: Jean Lucas <jeanlpf@hotmail.com>
Co-authored-by: Sasha Harrison <70984140+sasha-scale@users.noreply.github.com>
Co-authored-by: Sasha Harrison <sasha.harrison@scale.com>
1 parent 7335bdc commit bfde0d0
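
In short, `is_scene` now fixes a dataset's type at creation time, and both upload paths enforce it. A minimal sketch of the resulting behavior; the API key, dataset names, image URL, and reference IDs below are placeholders:

    import nucleus
    from nucleus import DatasetItem

    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")  # placeholder key

    # An item dataset (is_scene=False) accepts DatasetItems as before.
    item_dataset = client.create_dataset("my-item-dataset", is_scene=False)
    item_dataset.append(
        [DatasetItem(image_location="https://example.com/img.jpg", reference_id="img-1")]
    )

    # A scene dataset (is_scene=True) rejects single dataset items with an Exception.
    scene_dataset = client.create_dataset("my-scene-dataset", is_scene=True)
    try:
        scene_dataset.append(
            [DatasetItem(image_location="https://example.com/img.jpg", reference_id="img-2")]
        )
    except Exception as err:
        print(err)  # suggests creating a dataset with is_scene=False instead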

File tree

6 files changed: +143 −24 lines changed


nucleus/__init__.py

Lines changed: 18 additions & 0 deletions

@@ -35,6 +35,8 @@
 ]

 import os
+import time
+import warnings
 from typing import Dict, List, Optional, Sequence, Union

 import pkg_resources
@@ -63,6 +65,7 @@
     ANNOTATIONS_PROCESSED_KEY,
     AUTOTAGS_KEY,
     DATASET_ID_KEY,
+    DATASET_IS_SCENE_KEY,
     DEFAULT_NETWORK_TIMEOUT_SEC,
     EMBEDDING_DIMENSION_KEY,
     EMBEDDINGS_URL_KEY,
@@ -333,14 +336,21 @@ def create_dataset_from_project(
     def create_dataset(
         self,
         name: str,
+        is_scene: bool = False,
         item_metadata_schema: Optional[Dict] = None,
         annotation_metadata_schema: Optional[Dict] = None,
     ) -> Dataset:
         """
         Creates a new, empty dataset.

+        Make sure that the dataset is created for the data type you would like
+        to support, i.e. that `is_scene` is set correctly.
+
         Parameters:
             name: A human-readable name for the dataset.
+            is_scene: Boolean specifying the dataset type. This value is immutable.
+                `False` will allow users to upload :class:`DatasetItems<DatasetItem>`.
+                `True` will allow users to upload :class:`Scenes<LidarScene>`.
             item_metadata_schema: Dict defining item-level metadata schema. See below.
             annotation_metadata_schema: Dict defining annotation-level metadata schema.

@@ -358,9 +368,17 @@ def create_dataset(
         Returns:
             :class:`Dataset`: The newly created Nucleus dataset as an object.
         """
+        warnings.warn(
+            "Calling create_dataset('dataset_name', ...) without the is_scene parameter "
+            "will be deprecated soon in favor of providing is_scene explicitly. "
+            "Please create a dataset with either create_dataset('dataset_name', is_scene=False, ...) "
+            "to upload DatasetItems or create_dataset('dataset_name', is_scene=True, ...) "
+            "to upload LidarScenes.",
+            DeprecationWarning,
+        )
         response = self.make_request(
             {
                 NAME_KEY: name,
+                DATASET_IS_SCENE_KEY: is_scene,
                 ANNOTATION_METADATA_SCHEMA_KEY: annotation_metadata_schema,
                 ITEM_METADATA_SCHEMA_KEY: item_metadata_schema,
             },
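
Note that as written, the warning is emitted on every `create_dataset` call, even when `is_scene` is passed explicitly. A quick sketch for observing it, assuming `client` is an authenticated `NucleusClient`:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # DeprecationWarning is ignored by default
        client.create_dataset("example-dataset", is_scene=False)

    # The warning is recorded even though is_scene was provided.
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)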

nucleus/constants.py

Lines changed: 1 addition & 0 deletions

@@ -29,6 +29,7 @@
 CX_KEY = "cx"
 CY_KEY = "cy"
 DATASET_ID_KEY = "dataset_id"
+DATASET_IS_SCENE_KEY = "is_scene"
 DATASET_ITEM_ID_KEY = "dataset_item_id"
 DATASET_LENGTH_KEY = "length"
 DATASET_MODEL_RUNS_KEY = "model_run_ids"
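
This single new constant keys the `is_scene` flag on the wire: `create_dataset` sends it in the creation payload, and the new `Dataset.is_scene` property reads it back from the `dataset/<id>/is_scene` endpoint. A hedged sketch of the payload shapes this implies; values are illustrative, and the other key strings are inferred from the existing `*_KEY` constant names rather than shown in this diff:

    # Illustrative payloads only; the exact server contract is not shown here.
    create_payload = {
        "name": "my-scene-dataset",          # NAME_KEY
        "is_scene": True,                    # DATASET_IS_SCENE_KEY (new)
        "annotation_metadata_schema": None,  # ANNOTATION_METADATA_SCHEMA_KEY
        "item_metadata_schema": None,        # ITEM_METADATA_SCHEMA_KEY
    }

    # Response shape assumed by the new Dataset.is_scene property.
    is_scene_response = {"is_scene": True}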

nucleus/dataset.py

Lines changed: 43 additions & 4 deletions

@@ -35,6 +35,7 @@
     AUTOTAG_SCORE_THRESHOLD,
     BACKFILL_JOB_KEY,
     DATASET_ID_KEY,
+    DATASET_IS_SCENE_KEY,
     DEFAULT_ANNOTATION_UPDATE_MODE,
     EXPORTED_ROWS,
     KEEP_HISTORY_KEY,
@@ -77,6 +78,8 @@ class Dataset:
     with metadata to your dataset, annotate it with ground truth, and upload
     model predictions to evaluate and compare model performance on your data.

+    Make sure the dataset is set up to support the required data type (see the code sample below).
+
     Datasets cannot be instantiated directly and instead must be created via API
     endpoint using :meth:`NucleusClient.create_dataset`, or in the dashboard.

@@ -86,8 +89,11 @@ class Dataset:

        client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)

-        # Create new dataset
-        dataset = client.create_dataset(YOUR_DATASET_NAME)
+        # Create new dataset supporting DatasetItems
+        dataset = client.create_dataset(YOUR_DATASET_NAME, is_scene=False)
+
+        # OR create new dataset supporting LidarScenes
+        dataset = client.create_dataset(YOUR_DATASET_NAME, is_scene=True)

         # Or, retrieve existing dataset by ID
         # This ID can be fetched using client.list_datasets() or from a dashboard URL
@@ -102,9 +108,9 @@ def __init__(self, dataset_id, client, name=None):

     def __repr__(self):
         if os.environ.get("NUCLEUS_DEBUG", None):
-            return f"Dataset(name='{self.name}, dataset_id='{self.id}', client={self._client})"
+            return f"Dataset(name='{self.name}, dataset_id='{self.id}', is_scene='{self.is_scene}', client={self._client})"
         else:
-            return f"Dataset(name='{self.name}, dataset_id='{self.id}')"
+            return f"Dataset(name='{self.name}, dataset_id='{self.id}', is_scene='{self.is_scene}')"

     def __eq__(self, other):
         if self.id == other.id:
@@ -121,6 +127,14 @@ def name(self) -> str:
         )["name"]
         return self._name

+    @property
+    def is_scene(self) -> bool:
+        """Whether the dataset can contain scenes or not."""
+        response = self._client.make_request(
+            {}, f"dataset/{self.id}/is_scene", requests.get
+        )[DATASET_IS_SCENE_KEY]
+        return response
+
     @property
     def model_runs(self) -> Dict[Any, Any]:
         """List of all model runs associated with the Dataset."""
@@ -382,6 +396,14 @@ def append(
     ) -> Union[Dict[Any, Any], AsyncJob, UploadResponse]:
         """Appends items or scenes to a dataset.

+        Attention:
+        You will only be able to add :class:`DatasetItems<DatasetItem>` to a
+        dataset supporting :class:`DatasetItems<DatasetItem>`. Likewise, you will
+        only be able to add :class:`Scenes<LidarScene>` to a dataset supporting
+        :class:`Scenes<LidarScene>`.
+        A :class:`DatasetItem` dataset can be created with the is_scene flag set to False.
+        A :class:`LidarScene` dataset can be created with the is_scene flag set to True.
+
         ::

             import nucleus
@@ -480,6 +502,7 @@ def append(
             assert (
                 asynchronous
             ), "In order to avoid timeouts, you must set asynchronous=True when uploading scenes."
+
             return self.append_scenes(scenes, update, asynchronous)

         check_for_duplicate_reference_ids(dataset_items)
@@ -517,6 +540,14 @@ def append_scenes(
         asynchronous: Optional[bool] = False,
     ) -> Union[dict, AsyncJob]:
         # TODO: make private in favor of Dataset.append invocation
+        if not self.is_scene:
+            raise Exception(
+                "Your dataset is not a scene dataset but only supports single dataset items. "
+                "In order to be able to add scenes, please create another dataset with "
+                "client.create_dataset(<dataset_name>, is_scene=True) or add the scenes to "
+                "an existing scene dataset."
+            )
+
         for scene in scenes:
             scene.validate()

@@ -1288,5 +1319,13 @@ def _upload_items(
         Returns:
             UploadResponse
         """
+        if self.is_scene:
+            raise Exception(
+                "Your dataset is a scene dataset and does not support the upload of single dataset items. "
+                "In order to be able to add dataset items, please create another dataset with "
+                "client.create_dataset(<dataset_name>, is_scene=False) or add the dataset items to "
+                "an existing dataset supporting dataset items."
+            )
+
         populator = DatasetItemUploader(self.id, self._client)
         return populator.upload(dataset_items, batch_size, update)
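
Since the dataset type is immutable and enforced on both upload paths, callers can branch on the new property before uploading. A minimal sketch, assuming `client` is a `NucleusClient`, `scenes` and `items` are prepared `LidarScene` and `DatasetItem` lists, and the dataset ID is hypothetical:

    ds = client.get_dataset("ds_hypothetical_id")

    if ds.is_scene:
        # Scene datasets only accept scenes, and scene uploads must be asynchronous.
        job = ds.append(scenes, asynchronous=True)
        job.sleep_until_complete()
    else:
        # Item datasets only accept DatasetItems.
        response = ds.append(items)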

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.4.0"
+version = "0.4.1"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_dataset.py

Lines changed: 42 additions & 2 deletions

@@ -52,7 +52,7 @@

 @pytest.fixture()
 def dataset(CLIENT):
-    ds = CLIENT.create_dataset(TEST_DATASET_NAME)
+    ds = CLIENT.create_dataset(TEST_DATASET_NAME, is_scene=False)

     response = ds.add_taxonomy(
         "[Pytest] Category Taxonomy 1",
@@ -75,6 +75,11 @@ def dataset(CLIENT):
     assert response == {"message": "Beginning dataset deletion..."}


+@pytest.fixture()
+def dataset_scene(CLIENT):
+    return CLIENT.create_dataset(TEST_DATASET_NAME, is_scene=True)
+
+
 def make_dataset_items():
     ds_items_with_metadata = []
     for i, url in enumerate(TEST_IMG_URLS):
@@ -97,11 +102,28 @@ def make_dataset_items():
     return ds_items_with_metadata


-def test_dataset_create_and_delete(CLIENT):
+def test_dataset_create_and_delete_no_scene(CLIENT):
     # Creation
     ds = CLIENT.create_dataset(TEST_DATASET_NAME)
     assert isinstance(ds, Dataset)
     assert ds.name == TEST_DATASET_NAME
+    assert not ds.is_scene
+    assert ds.model_runs == []
+    assert ds.slices == []
+    assert ds.size == 0
+    assert ds.items == []
+
+    # Deletion
+    response = CLIENT.delete_dataset(ds.id)
+    assert response == {"message": "Beginning dataset deletion..."}
+
+
+def test_dataset_create_and_delete_scene(CLIENT):
+    # Creation
+    ds = CLIENT.create_dataset(name=TEST_DATASET_NAME, is_scene=True)
+    assert isinstance(ds, Dataset)
+    assert ds.name == TEST_DATASET_NAME
+    assert ds.is_scene
     assert ds.model_runs == []
     assert ds.slices == []
     assert ds.size == 0
@@ -195,6 +217,24 @@ def check_is_expected_response(response):
     check_is_expected_response(response)


+def test_scene_dataset_append(dataset_scene):
+    # Plain image upload
+    ds_items_plain = []
+    for i, url in enumerate(TEST_IMG_URLS):
+        # Upload only the first item to Scale; keep the rest in privacy mode
+        upload_to_scale = i == 0
+        ds_items_plain.append(
+            DatasetItem(
+                image_location=url,
+                upload_to_scale=upload_to_scale,
+                reference_id=url.split("/")[-1] + "_plain",
+            )
+        )
+
+    with pytest.raises(Exception):
+        dataset_scene.append(ds_items_plain)
+
+
 def test_dataset_name_access(CLIENT, dataset):
     assert dataset.name == TEST_DATASET_NAME

tests/test_scene.py

Lines changed: 38 additions & 17 deletions

@@ -32,8 +32,17 @@


 @pytest.fixture()
-def dataset(CLIENT):
-    ds = CLIENT.create_dataset(TEST_DATASET_3D_NAME)
+def dataset_scene(CLIENT):
+    ds = CLIENT.create_dataset(TEST_DATASET_3D_NAME, is_scene=True)
+    yield ds
+
+    response = CLIENT.delete_dataset(ds.id)
+    assert response == {"message": "Beginning dataset deletion..."}
+
+
+@pytest.fixture()
+def dataset_item(CLIENT):
+    ds = CLIENT.create_dataset(TEST_DATASET_3D_NAME, is_scene=False)
     yield ds

     response = CLIENT.delete_dataset(ds.id)
@@ -246,38 +255,38 @@ def test_scene_add_frame():


 @pytest.mark.skip("Deactivated sync upload for scenes")
-def test_scene_upload_sync(dataset):
+def test_scene_upload_sync(dataset_scene):
     payload = TEST_LIDAR_SCENES
     scenes = [
         LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
     ]
     update = payload[UPDATE_KEY]

-    response = dataset.append(scenes, update=update)
+    response = dataset_scene.append(scenes, update=update)

-    first_scene = dataset.get_scene(scenes[0].reference_id)
+    first_scene = dataset_scene.get_scene(scenes[0].reference_id)

     assert first_scene == scenes[0]
     first_scene_modified = copy.deepcopy(first_scene)
     first_scene_modified.reference_id = "WRONG!"
     assert first_scene_modified != scenes[0]

-    assert response["dataset_id"] == dataset.id
+    assert response["dataset_id"] == dataset_scene.id
     assert response["new_scenes"] == len(scenes)


 @pytest.mark.skip("Deactivated sync upload for scenes")
 @pytest.mark.integration
-def test_scene_and_cuboid_upload_sync(dataset):
+def test_scene_and_cuboid_upload_sync(dataset_scene):
     payload = TEST_LIDAR_SCENES
     scenes = [
         LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
     ]
     update = payload[UPDATE_KEY]

-    response = dataset.append(scenes, update=update)
+    response = dataset_scene.append(scenes, update=update)

-    assert response["dataset_id"] == dataset.id
+    assert response["dataset_id"] == dataset_scene.id
     assert response["new_scenes"] == len(scenes)

     lidar_item_ref = payload[SCENES_KEY][0][FRAMES_KEY][0]["lidar"][
@@ -286,30 +295,30 @@ def test_scene_and_cuboid_upload_sync(dataset):
     TEST_CUBOID_ANNOTATIONS[0][REFERENCE_ID_KEY] = lidar_item_ref

     annotations = [CuboidAnnotation.from_json(TEST_CUBOID_ANNOTATIONS[0])]
-    response = dataset.annotate(annotations)
+    response = dataset_scene.annotate(annotations)

-    assert response["dataset_id"] == dataset.id
+    assert response["dataset_id"] == dataset_scene.id
     assert response["annotations_processed"] == len(annotations)
     assert response["annotations_ignored"] == 0

-    response_annotations = dataset.refloc(lidar_item_ref)[ANNOTATIONS_KEY][
-        "cuboid"
-    ]
+    response_annotations = dataset_scene.refloc(lidar_item_ref)[
+        ANNOTATIONS_KEY
+    ]["cuboid"]
     assert len(response_annotations) == 1
     assert_cuboid_annotation_matches_dict(
         response_annotations[0], TEST_CUBOID_ANNOTATIONS[0]
     )


 @pytest.mark.integration
-def test_scene_upload_async(dataset):
+def test_scene_upload_async(dataset_scene):
     payload = TEST_LIDAR_SCENES
     scenes = [
         LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
     ]
     update = payload[UPDATE_KEY]

-    job = dataset.append(scenes, update=update, asynchronous=True)
+    job = dataset_scene.append(scenes, update=update, asynchronous=True)
     job.sleep_until_complete()
     status = job.status()

@@ -319,7 +328,7 @@ def test_scene_upload_async(dataset):
         "message": {
             "scene_upload_progress": {
                 "errors": [],
-                "dataset_id": dataset.id,
+                "dataset_id": dataset_scene.id,
                 "new_scenes": len(scenes),
                 "ignored_scenes": 0,
                 "scenes_errored": 0,
@@ -330,3 +339,15 @@ def test_scene_upload_async(dataset):
         "completed_steps": 1,
         "total_steps": 1,
     }
+
+
+@pytest.mark.integration
+def test_scene_upload_async_item_dataset(dataset_item):
+    payload = TEST_LIDAR_SCENES
+    scenes = [
+        LidarScene.from_json(scene_json) for scene_json in payload[SCENES_KEY]
+    ]
+    update = payload[UPDATE_KEY]
+
+    with pytest.raises(Exception):
+        dataset_item.append(scenes, update=update, asynchronous=True)