Skip to content

Commit a4652dc

Browse files
authored
Check for duplicate Reference IDs on Frame and Scene creation (#295)
1 parent e99dee2 commit a4652dc

File tree

7 files changed

+38
-19
lines changed

7 files changed

+38
-19
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,12 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.10.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.10.7) - 2022-05-09
9+
10+
### Fixed
11+
- Add checks for duplicate reference IDs
12+
13+
814
## [0.10.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.10.6) - 2022-05-06
915

1016
### Added

nucleus/dataset.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -554,6 +554,9 @@ def append(
554554
]
555555
lidar_scenes = [item for item in items if isinstance(item, LidarScene)]
556556
video_scenes = [item for item in items if isinstance(item, VideoScene)]
557+
558+
check_for_duplicate_reference_ids(dataset_items)
559+
557560
if dataset_items and (lidar_scenes or video_scenes):
558561
raise Exception(
559562
"You must append either DatasetItems or Scenes to the dataset."
@@ -573,8 +576,6 @@ def append(
573576
video_scenes, update, asynchronous
574577
)
575578

576-
check_for_duplicate_reference_ids(dataset_items)
577-
578579
if len(dataset_items) > WARN_FOR_LARGE_UPLOAD and not asynchronous:
579580
print(
580581
"Tip: for large uploads, get faster performance by importing your data "

nucleus/dataset_item.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -195,13 +195,17 @@ def check_all_paths_remote(dataset_items: Sequence[DatasetItem]):
195195
def check_for_duplicate_reference_ids(dataset_items: Sequence[DatasetItem]):
196196
ref_ids = []
197197
for dataset_item in dataset_items:
198-
if dataset_item.reference_id is not None:
199-
ref_ids.append(dataset_item.reference_id)
198+
if dataset_item.reference_id is None:
199+
raise ValueError(
200+
f"Reference ID cannot be None. Encountered DatasetItem with no reference ID:\n{dataset_item}"
201+
)
202+
ref_ids.append(dataset_item.reference_id)
200203
if len(ref_ids) != len(set(ref_ids)):
201204
duplicates = {
202205
f"{key}": f"Count: {value}"
203206
for key, value in Counter(ref_ids).items()
207+
if value > 1
204208
}
205209
raise ValueError(
206-
f"Duplicate reference ids found among dataset_items: {duplicates}"
210+
f"Duplicate reference IDs found among dataset_items: {duplicates}"
207211
)

nucleus/scene.py

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -18,13 +18,17 @@
1818
)
1919

2020
from .annotation import is_local_path
21-
from .dataset_item import DatasetItem, DatasetItemType
21+
from .dataset_item import (
22+
DatasetItem,
23+
DatasetItemType,
24+
check_for_duplicate_reference_ids,
25+
)
2226

2327

2428
class Frame:
25-
"""Collection of sensor data pertaining to a single timestep.
29+
"""Collection of sensor data pertaining to a single time step.
2630
27-
For 3D data, each Frame hosues a sensor-to-data mapping and must have exactly
31+
For 3D data, each Frame houses a sensor-to-data mapping and must have exactly
2832
one pointcloud with any number of camera images.
2933
3034
Parameters:
@@ -37,16 +41,15 @@ class Frame:
3741
"""
3842

3943
def __init__(self, **kwargs):
40-
self.items = {}
44+
self.items: Dict[str, DatasetItem] = {}
4145
for key, value in kwargs.items():
42-
self.items[key] = value
43-
44-
def __post_init__(self):
45-
for key, value in self.items.items():
4646
assert isinstance(key, str), "All keys must be names of sensors"
4747
assert isinstance(
4848
value, DatasetItem
49-
), "All values must be DatasetItems"
49+
), f"All values must be DatasetItems, instead got type {type(value)}"
50+
self.items[key] = value
51+
52+
check_for_duplicate_reference_ids(list(self.items.values()))
5053

5154
def __repr__(self) -> str:
5255
return f"Frame(items={self.items})"
@@ -129,6 +132,8 @@ def __post_init__(self):
129132
if self.metadata is None:
130133
self.metadata = {}
131134

135+
self.validate()
136+
132137
def __eq__(self, other):
133138
return all(
134139
[
@@ -151,10 +156,14 @@ def num_sensors(self) -> int:
151156
def validate(self):
152157
# TODO: make private
153158
assert self.length > 0, "Must have at least 1 frame in a scene"
159+
all_items = []
154160
for frame in self.frames_dict.values():
155161
assert isinstance(
156162
frame, Frame
157163
), "Each frame in a scene must be a Frame object"
164+
all_items.extend(frame.get_items())
165+
166+
check_for_duplicate_reference_ids(all_items)
158167

159168
def add_item(
160169
self, index: int, sensor_name: str, item: DatasetItem

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ exclude = '''
2121

2222
[tool.poetry]
2323
name = "scale-nucleus"
24-
version = "0.10.6"
24+
version = "0.10.7"
2525
description = "The official Python client library for Nucleus, the Data Platform for AI"
2626
license = "MIT"
2727
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -352,7 +352,7 @@ def test_raises_error_for_duplicate():
352352
)
353353
assert (
354354
str(error.value)
355-
== "Duplicate reference ids found among dataset_items:"
355+
== "Duplicate reference IDs found among dataset_items:"
356356
" {'duplicate': 'Count: 2'}"
357357
)
358358

tests/test_scene.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -161,11 +161,10 @@ def test_scene_property_methods():
161161

162162
def test_scene_add_item():
163163
scene_ref_id = "scene_1"
164-
scene = LidarScene(scene_ref_id)
164+
firstFrame = Frame(lidar=TEST_LIDAR_ITEMS[0])
165+
scene = LidarScene(scene_ref_id, frames=[firstFrame])
165166
scene.add_item(0, "camera", TEST_DATASET_ITEMS[0])
166-
scene.add_item(0, "lidar", TEST_LIDAR_ITEMS[0])
167167
scene.add_item(1, "lidar", TEST_LIDAR_ITEMS[1])
168-
169168
assert set(scene.get_sensors()) == set(["camera", "lidar"])
170169
assert scene.get_item(1, "lidar") == TEST_LIDAR_ITEMS[1]
171170
assert scene.get_items_from_sensor("lidar") == [

0 commit comments

Comments
 (0)