
Commit b8df106

Author: Claire Pajot (committed)

Merge branch 'master' into add_classification_type_to_groundtruth

2 parents 6ff2492 + 019817a

File tree

11 files changed: +218 −42 lines changed


nucleus/constants.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
 )
 ANNOTATION_UPDATE_KEY = "update"
 AUTOTAGS_KEY = "autotags"
+AUTOTAG_SCORE_THRESHOLD = "score_threshold"
 EXPORTED_ROWS = "exportedRows"
 CAMERA_PARAMS_KEY = "camera_params"
 CLASS_PDF_KEY = "class_pdf"

nucleus/dataset.py

Lines changed: 58 additions & 8 deletions
@@ -24,6 +24,7 @@
     NAME_KEY,
     REFERENCE_IDS_KEY,
     REQUEST_ID_KEY,
+    AUTOTAG_SCORE_THRESHOLD,
     UPDATE_KEY,
 )
 from .dataset_item import (

@@ -87,21 +88,55 @@ def items(self) -> List[DatasetItem]:
         return self._client.get_dataset_items(self.id)

     @sanitize_string_args
-    def autotag_scores(self, autotag_name, for_scores_greater_than=0):
-        """Export the autotag scores above a threshold, largest scores first.
+    def autotag_items(self, autotag_name, for_scores_greater_than=0):
+        """For a given Autotag of this dataset, export its tagged items with scores above a threshold, largest scores first.

-        If you have pandas installed, you can create a pandas dataframe using
+        :return: dictionary of the form
+            {
+                'autotagItems': {
+                    ref_id: str,
+                    score: float,
+                    model_prediction_annotation_id: str | None
+                    ground_truth_annotation_id: str | None,
+                }[],
+                'autotag': {
+                    id: str,
+                    name: str,
+                    status: 'started' | 'completed',
+                    autotag_level: 'Image' | 'Object'
+                }
+            }
+        See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-items for more details on the return types.
+        """
+        response = self._client.make_request(
+            payload={AUTOTAG_SCORE_THRESHOLD: for_scores_greater_than},
+            route=f"dataset/{self.id}/autotag/{autotag_name}/taggedItems",
+            requests_command=requests.get,
+        )
+        return response

-            pandas.Dataframe(dataset.autotag_scores(autotag_name))
+    def autotag_training_items(self, autotag_name):
+        """For a given Autotag of this dataset, export its training items. These are user selected positives during refinement.

         :return: dictionary of the form
-            {'ref_ids': List[str],
-             'datset_item_ids': List[str],
-             'score': List[float]}
+            {
+                'autotagPositiveTrainingItems': {
+                    ref_id: str,
+                    model_prediction_annotation_id: str | None,
+                    ground_truth_annotation_id: str | None,
+                }[],
+                'autotag': {
+                    id: str,
+                    name: str,
+                    status: 'started' | 'completed',
+                    autotag_level: 'Image' | 'Object'
+                }
+            }
+        See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-training-items for more details on the return types.
         """
         response = self._client.make_request(
             payload={},
-            route=f"autotag/{self.id}/{autotag_name}/{for_scores_greater_than}",
+            route=f"dataset/{self.id}/autotag/{autotag_name}/trainingItems",
             requests_command=requests.get,
         )
         return response

@@ -349,6 +384,21 @@ def loc(self, dataset_item_id: str) -> dict:
         response = self._client.dataitem_loc(self.id, dataset_item_id)
         return format_dataset_item_response(response)

+    def ground_truth_loc(self, reference_id: str, annotation_id: str):
+        """
+        Returns info for single ground truth Annotation by its id.
+        :param reference_id: User specified id for the dataset item the ground truth is attached to
+        :param annotation_id: User specified, or auto-generated id for the annotation
+        :return:
+            BoxAnnotation | PolygonAnnotation | CuboidAnnotation
+        """
+        response = self._client.make_request(
+            {},
+            f"dataset/{self.id}/groundTruth/loc/{reference_id}/{annotation_id}",
+            requests.get,
+        )
+        return Annotation.from_json(response)
+
     def create_slice(
         self,
         name: str,
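For reference, a minimal usage sketch of the new Dataset methods above. The API key, dataset id, autotag name, and item/annotation ids are hypothetical placeholders, and the calls assume a dataset that already has a completed Autotag and uploaded ground truth:

    import nucleus

    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")   # hypothetical key
    dataset = client.get_dataset("ds_c0ffee")              # hypothetical dataset id

    # Export items tagged by an Autotag, keeping only scores above 0.5.
    tagged = dataset.autotag_items("my_autotag", for_scores_greater_than=0.5)
    for item in tagged["autotagItems"]:
        print(item["ref_id"], item["score"])

    # Export the user-selected positive training items for the same Autotag.
    training = dataset.autotag_training_items("my_autotag")

    # Fetch a single ground truth annotation by reference id + annotation id.
    gt = dataset.ground_truth_loc("image_1", "annotation_1")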

nucleus/job.py

Lines changed: 1 addition & 1 deletion
@@ -40,13 +40,13 @@ def errors(self) -> List[str]:
     def sleep_until_complete(self, verbose_std_out=True):
         while 1:
             status = self.status()
-
             time.sleep(JOB_POLLING_INTERVAL)

             if verbose_std_out:
                 print(f"Status at {time.ctime()}: {status}")
             if status["status"] == "Running":
                 continue
+
             break

         final_status = status
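For context, a sketch of how this polling loop is typically driven from calling code; the dataset, items, and the JobError import path are assumptions taken from the usage in scripts/load_test.py further down:

    job = dataset.append(items, update=True, asynchronous=True)  # async job handle
    try:
        # Blocks, polling job.status() every JOB_POLLING_INTERVAL seconds.
        job.sleep_until_complete(verbose_std_out=True)
    except JobError:
        print(job.errors())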

nucleus/model_run.py

Lines changed: 18 additions & 0 deletions
@@ -19,6 +19,7 @@
     CuboidPrediction,
     PolygonPrediction,
     SegmentationPrediction,
+    from_json,
 )


@@ -160,6 +161,23 @@ def loc(self, dataset_item_id: str):
         )
         return self._format_prediction_response(response)

+    def prediction_loc(self, reference_id: str, annotation_id: str):
+        """
+        Returns info for single Prediction by its reference id and annotation id.
+        :param reference_id: the user specified id for the image
+        :param annotation_id: the user specified id for the prediction, or if one was not provided, the Scale internally generated id for the prediction
+        :return:
+            BoxPrediction | PolygonPrediction | CuboidPrediction
+        """
+
+        response = self._client.make_request(
+            {},
+            f"modelRun/{self.model_run_id}/prediction/loc/{reference_id}/{annotation_id}",
+            requests.get,
+        )
+
+        return from_json(response)
+
     def ungrouped_export(self):
         json_response = self._client.make_request(
             payload={},
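A short hedged sketch of calling the new prediction_loc method; the model run handle and the ids are placeholders:

    # model_run: an existing nucleus ModelRun handle, obtained elsewhere.
    prediction = model_run.prediction_loc("image_1", "prediction_1")

    # Depending on the stored prediction's type, this returns a BoxPrediction,
    # PolygonPrediction, CuboidPrediction, or SegmentationPrediction instance,
    # dispatched by the from_json helper in nucleus/prediction.py.
    print(type(prediction), prediction.to_payload())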

nucleus/prediction.py

Lines changed: 15 additions & 0 deletions
@@ -10,10 +10,14 @@
 )
 from .constants import (
     ANNOTATION_ID_KEY,
+    BOX_TYPE,
+    CUBOID_TYPE,
+    POLYGON_TYPE,
     REFERENCE_ID_KEY,
     METADATA_KEY,
     GEOMETRY_KEY,
     LABEL_KEY,
+    TYPE_KEY,
     X_KEY,
     Y_KEY,
     WIDTH_KEY,

@@ -29,6 +33,17 @@
 )


+def from_json(payload: dict):
+    if payload.get(TYPE_KEY, None) == BOX_TYPE:
+        return BoxPrediction.from_json(payload)
+    elif payload.get(TYPE_KEY, None) == POLYGON_TYPE:
+        return PolygonPrediction.from_json(payload)
+    elif payload.get(TYPE_KEY, None) == CUBOID_TYPE:
+        return CuboidPrediction.from_json(payload)
+    else:
+        return SegmentationPrediction.from_json(payload)
+
+
 class SegmentationPrediction(SegmentationAnnotation):
     # No need to define init or to_payload methods because
     # we default to functions defined in the parent class
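To illustrate the dispatch above, a hedged example payload; the literal key and type strings ("type", "box", "geometry", ...) are assumed values of the imported constants and of the usual Nucleus payload shape, not confirmed by this diff:

    payload = {
        "label": "car",
        "type": "box",                 # assumed value of TYPE_KEY / BOX_TYPE
        "geometry": {"x": 10, "y": 20, "width": 30, "height": 40},
        "reference_id": "image_1",
    }

    prediction = from_json(payload)
    # payload["type"] == "box", so this returns a BoxPrediction; unknown or
    # missing types fall through to SegmentationPrediction.from_json.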

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.19"
+version = "0.1.22"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

scripts/load_test.py

Lines changed: 65 additions & 22 deletions
@@ -4,6 +4,8 @@
 import nucleus
 import os

+from itertools import zip_longest
+
 import time


@@ -21,6 +23,8 @@
     "API Key to use. Defaults to NUCLEUS_PYTEST_API_KEY environment variable",
 )

+flags.DEFINE_integer("job_parallelism", 8, "Amount of concurrent jobs to use.")
+
 # Dataset upload flags
 flags.DEFINE_enum(
     "create_or_reuse_dataset",

@@ -35,12 +39,12 @@
 )
 flags.DEFINE_integer(
     "num_dataset_items",
-    100000,
+    10000000,
     "Number of dataset items to create if creating a dataset",
     lower_bound=0,
 )
 flags.DEFINE_bool(
-    "cleanup_dataset", True, "Whether to delete the dataset after the test."
+    "cleanup_dataset", False, "Whether to delete the dataset after the test."
 )

 # Annotation upload flags

@@ -54,11 +58,21 @@
 # Prediction upload flags
 flags.DEFINE_integer(
     "num_predictions_per_dataset_item",
-    0,
+    1,
     "Number of annotations per dataset item",
     lower_bound=0,
 )

+TIMINGS = {}
+
+
+def chunk(iterable, chunk_size, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * chunk_size
+
+    for chunk_iterable in zip_longest(*args, fillvalue=fillvalue):
+        yield filter(lambda x: x is not None, chunk_iterable)
+

 def client():
     return nucleus.NucleusClient(api_key=FLAGS.api_key)

@@ -126,15 +140,23 @@ def create_or_get_dataset():
         dataset = client().create_dataset("Privacy Mode Load Test Dataset")
         print("Starting dataset item upload")
         tic = time.time()
-        job = dataset.append(
-            dataset_item_generator(), update=True, asynchronous=True
-        )
-        try:
-            job.sleep_until_complete(False)
-        except JobError:
-            print(job.errors())
+        chunk_size = FLAGS.num_dataset_items // FLAGS.job_parallelism
+        jobs = []
+        for dataset_item_chunk in chunk(dataset_item_generator(), chunk_size):
+            jobs.append(
+                dataset.append(
+                    dataset_item_chunk, update=True, asynchronous=True
+                )
+            )
+
+        for job in jobs:
+            try:
+                job.sleep_until_complete(False)
+            except JobError:
+                print(job.errors())
         toc = time.time()
         print("Finished dataset item upload: %s" % (toc - tic))
+        TIMINGS[f"Dataset Item Upload {FLAGS.num_dataset_items}"] = toc - tic
     else:
         print(f"Reusing dataset {FLAGS.dataset_id}")
         dataset = client().get_dataset(FLAGS.dataset_id)

@@ -144,15 +166,26 @@ def create_or_get_dataset():

 def upload_annotations(dataset: Dataset):
     print("Starting annotation upload")
     tic = time.time()
-    job = dataset.annotate(
-        list(annotation_generator()), update=False, asynchronous=True
+    jobs = []
+    num_annotations = (
+        FLAGS.num_dataset_items * FLAGS.num_annotations_per_dataset_item
     )
-    try:
-        job.sleep_until_complete(False)
-    except JobError:
-        print(job.errors())
+    chunk_size = num_annotations // FLAGS.job_parallelism
+    for annotation_chunk in chunk(annotation_generator(), chunk_size):
+        jobs.append(
+            dataset.annotate(
+                list(annotation_chunk), update=False, asynchronous=True
+            )
+        )
+
+    for job in jobs:
+        try:
+            job.sleep_until_complete(False)
+        except JobError:
+            print(job.errors())
     toc = time.time()
     print("Finished annotation upload: %s" % (toc - tic))
+    TIMINGS[f"Annotation Upload {num_annotations}"] = toc - tic


 def upload_predictions(dataset: Dataset):

@@ -167,16 +200,24 @@ def upload_predictions(dataset: Dataset):

     print("Starting prediction upload")

-    job = run.predict(
-        list(prediction_generator()), update=True, asynchronous=True
+    num_predictions = (
+        FLAGS.num_dataset_items * FLAGS.num_predictions_per_dataset_item
     )
+    chunk_size = num_predictions // FLAGS.job_parallelism
+    jobs = []
+    for prediction_chunk in chunk(prediction_generator(), chunk_size):
+        jobs.append(
+            run.predict(list(prediction_chunk), update=True, asynchronous=True)
+        )

-    try:
-        job.sleep_until_complete(False)
-    except JobError:
-        print(job.errors())
+    for job in jobs:
+        try:
+            job.sleep_until_complete(False)
+        except JobError:
+            print(job.errors())
     toc = time.time()
     print("Finished prediction upload: %s" % (toc - tic))
+    TIMINGS[f"Prediction Upload {num_predictions}"] = toc - tic


 def main(unused_argv):

@@ -194,6 +235,8 @@ def main(unused_argv):
     if FLAGS.cleanup_dataset and FLAGS.create_or_reuse_dataset == "create":
         client().delete_dataset(dataset.id)

+    print(TIMINGS)
+

 if __name__ == "__main__":
     app.run(main)
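The chunk helper added above batches a generator into fixed-size groups so the uploads can be spread across job_parallelism asynchronous jobs. A standalone sketch of the same pattern, with the helper copied here so the example runs on its own:

    from itertools import zip_longest

    def chunk(iterable, chunk_size, fillvalue=None):
        "Collect data into fixed-length chunks or blocks"
        args = [iter(iterable)] * chunk_size
        for chunk_iterable in zip_longest(*args, fillvalue=fillvalue):
            yield filter(lambda x: x is not None, chunk_iterable)

    # 10 items in chunks of 4: the final chunk is padded with None, then filtered.
    for batch in chunk(range(10), 4):
        print(list(batch))
    # -> [0, 1, 2, 3]
    # -> [4, 5, 6, 7]
    # -> [8, 9]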

tests/test_annotation.py

Lines changed: 5 additions & 0 deletions
@@ -79,6 +79,11 @@ def test_box_gt_upload(dataset):
     assert response["annotations_ignored"] == 0

     response = dataset.refloc(annotation.reference_id)["annotations"]["box"]
+    single_annotation_response = dataset.ground_truth_loc(
+        annotation.reference_id, annotation.annotation_id
+    )
+
+    assert response[0] == single_annotation_response
     assert len(response) == 1
     response_annotation = response[0]
     assert_box_annotation_matches_dict(
