Skip to content

Commit 2e64ccf

Browse files
authored
Merge pull request #41 from scaleapi/metadata-schema
Metadata schema
2 parents 24158e8 + 10fc1fd commit 2e64ccf

File tree

5 files changed

+90
-17
lines changed

5 files changed

+90
-17
lines changed

nucleus/__init__.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@
116116
NAME_KEY,
117117
ANNOTATIONS_KEY,
118118
AUTOTAGS_KEY,
119+
ANNOTATION_METADATA_SCHEMA_KEY,
120+
ITEM_METADATA_SCHEMA_KEY,
121+
FORCE_KEY,
119122
)
120123
from .model import Model
121124
from .errors import (
@@ -150,7 +153,16 @@ def list_models(self) -> List[Model]:
150153
"""
151154
model_objects = self._make_request({}, "models/", requests.get)
152155

153-
return [Model(model["id"], model["name"], model["ref_id"], model["metadata"], self) for model in model_objects["models"]]
156+
return [
157+
Model(
158+
model["id"],
159+
model["name"],
160+
model["ref_id"],
161+
model["metadata"],
162+
self,
163+
)
164+
for model in model_objects["models"]
165+
]
154166

155167
def list_datasets(self) -> Dict[str, Union[str, List[str]]]:
156168
"""
@@ -229,15 +241,28 @@ def create_dataset_from_project(
229241
response = self._make_request(payload, "dataset/create_from_project")
230242
return Dataset(response[DATASET_ID_KEY], self)
231243

232-
def create_dataset(self, name: str) -> Dataset:
244+
def create_dataset(
245+
self,
246+
name: str,
247+
item_metadata_schema: Optional[Dict] = None,
248+
annotation_metadata_schema: Optional[Dict] = None,
249+
) -> Dataset:
233250
"""
234-
Creates a new dataset based on payload params:
235-
name -- A human-readable name of the dataset.
251+
Creates a new dataset:
236252
Returns a response with internal id and name for a new dataset.
237-
:param payload: { "name": str }
253+
:param name -- A human-readable name of the dataset.
254+
:param item_metadata_schema -- optional dictionary to define item metadata schema
255+
:param annotation_metadata_schema -- optional dictionary to define annotation metadata schema
238256
:return: new Dataset object
239257
"""
240-
response = self._make_request({NAME_KEY: name}, "dataset/create")
258+
response = self._make_request(
259+
{
260+
NAME_KEY: name,
261+
ANNOTATION_METADATA_SCHEMA_KEY: annotation_metadata_schema,
262+
ITEM_METADATA_SCHEMA_KEY: item_metadata_schema,
263+
},
264+
"dataset/create",
265+
)
241266
return Dataset(response[DATASET_ID_KEY], self)
242267

243268
def delete_dataset(self, dataset_id: str) -> dict:
@@ -325,16 +350,16 @@ def populate_dataset(
325350
async_responses: List[Any] = []
326351

327352
for batch in tqdm_local_batches:
328-
payload = construct_append_payload(batch)
353+
payload = construct_append_payload(batch, force)
329354
responses = self._process_append_requests_local(
330-
dataset_id, payload
355+
dataset_id, payload, force
331356
)
332357
async_responses.extend(responses)
333358

334359
for batch in tqdm_remote_batches:
335-
payload = construct_append_payload(batch)
360+
payload = construct_append_payload(batch, force)
336361
responses = self._process_append_requests(
337-
dataset_id, payload, batch_size, batch_size
362+
dataset_id, payload, force, batch_size, batch_size
338363
)
339364
async_responses.extend(responses)
340365

@@ -411,7 +436,6 @@ def close_files(request_items):
411436
# don't forget to close all open files
412437
for p in request_payloads:
413438
close_files(p)
414-
# [close_files(p) for p in request_payloads]
415439

416440
# response object will be None if an error occurred
417441
async_responses = [
@@ -428,6 +452,7 @@ def _process_append_requests(
428452
self,
429453
dataset_id: str,
430454
payload: dict,
455+
update: bool,
431456
batch_size: int = 20,
432457
size: int = 10,
433458
):
@@ -446,7 +471,7 @@ def exception_handler(request, exception):
446471
items = payload[ITEMS_KEY]
447472
payloads = [
448473
# batch_size images per request
449-
{ITEMS_KEY: items[i : i + batch_size]}
474+
{ITEMS_KEY: items[i : i + batch_size], FORCE_KEY: update}
450475
for i in range(0, len(items), batch_size)
451476
]
452477

@@ -479,7 +504,7 @@ def annotate_dataset(
479504
Union[BoxAnnotation, PolygonAnnotation, SegmentationAnnotation]
480505
],
481506
update: bool,
482-
batch_size: int = 100,
507+
batch_size: int = 5000,
483508
):
484509
"""
485510
Uploads ground truth annotations for a given dataset.
@@ -1009,7 +1034,7 @@ def _make_grequest(
10091034

10101035
def _make_request_raw(
10111036
self, payload: dict, route: str, requests_command=requests.post
1012-
) -> dict:
1037+
):
10131038
"""
10141039
Makes a request to Nucleus endpoint. This method returns the raw
10151040
requests.Response object which is useful for unit testing.
@@ -1046,7 +1071,7 @@ def _make_request(
10461071
"""
10471072
response = self._make_request_raw(payload, route, requests_command)
10481073

1049-
if response.status_code not in SUCCESS_STATUS_CODES:
1074+
if getattr(response, "status_code") not in SUCCESS_STATUS_CODES:
10501075
logger.warning(response)
10511076

10521077
return (

nucleus/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
DATASET_MODEL_RUNS_KEY = "model_run_ids"
3434
DATASET_SLICES_KEY = "slice_ids"
3535
DATASET_LENGTH_KEY = "length"
36-
FORCE_KEY = "force"
36+
FORCE_KEY = "update"
3737
METADATA_KEY = "metadata"
3838
NAME_KEY = "name"
3939
LABEL_KEY = "label"
@@ -51,6 +51,8 @@
5151
ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE)
5252
GEOMETRY_KEY = "geometry"
5353
AUTOTAGS_KEY = "autotags"
54+
ANNOTATION_METADATA_SCHEMA_KEY = "annotation_metadata_schema"
55+
ITEM_METADATA_SCHEMA_KEY = "item_metadata_schema"
5456
MASK_URL_KEY = "mask_url"
5557
INDEX_KEY = "index"
5658
SEGMENTATIONS_KEY = "segmentations"

nucleus/dataset.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def create_model_run(
7575
reference_id: Optional[str] = None,
7676
model_id: Optional[str] = None,
7777
metadata: Optional[Dict[str, Any]] = None,
78+
annotation_metadata_schema: Optional[Dict] = None,
7879
):
7980
"""
8081
:param name: A name for the model run.
@@ -83,6 +84,8 @@ def create_model_run(
8384
:param model_id: The internally-controlled identifier of the model.
8485
The 'reference_id' field should be empty if this field is populated,
8586
:param metadata: An arbitrary metadata blob for the current run.
87+
:param annotation_metadata_schema: A dictionary that defines schema for annotations.
88+
:param segmentation_metadata_schema: A dictionary that defines schema for segmentation.
8689
8790
:return:
8891
{
@@ -91,7 +94,11 @@ def create_model_run(
9194
}
9295
"""
9396
payload = construct_model_run_creation_payload(
94-
name, reference_id, model_id, metadata
97+
name,
98+
reference_id,
99+
model_id,
100+
metadata,
101+
annotation_metadata_schema,
95102
)
96103
return self._client.create_model_run(self.id, payload)
97104

nucleus/payload_constructor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
ITEMS_KEY,
2020
FORCE_KEY,
2121
MODEL_ID_KEY,
22+
ANNOTATION_METADATA_SCHEMA_KEY,
2223
SEGMENTATIONS_KEY,
2324
)
2425

@@ -87,6 +88,7 @@ def construct_model_run_creation_payload(
8788
reference_id: Optional[str],
8889
model_id: Optional[str],
8990
metadata: Optional[Dict],
91+
annotation_metadata_schema: Optional[Dict] = None,
9092
) -> dict:
9193
payload = {
9294
NAME_KEY: name,
@@ -100,4 +102,5 @@ def construct_model_run_creation_payload(
100102
NAME_KEY: name,
101103
REFERENCE_ID_KEY: reference_id,
102104
METADATA_KEY: metadata if metadata else {},
105+
ANNOTATION_METADATA_SCHEMA_KEY: annotation_metadata_schema,
103106
}

nucleus/utils.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from typing import List, Union, Dict
2+
3+
from .dataset_item import DatasetItem
4+
from .prediction import BoxPrediction, PolygonPrediction
5+
6+
7+
def _get_all_field_values(metadata_list: List[dict], key: str):
8+
return {metadata[key] for metadata in metadata_list if key in metadata}
9+
10+
11+
def suggest_metadata_schema(
    data: Union[
        List[DatasetItem], List[BoxPrediction], List[PolygonPrediction]
    ]
):
    """Suggest a metadata schema by inspecting the metadata attached to ``data``.

    For each metadata field observed across the items, the suggested entry is:

    * ``{"type": "number"}`` when every observed value is an ``int`` or
      ``float`` (note: ``bool`` passes this check, being an ``int`` subclass);
    * ``{"type": "category", "choices": [...]}`` when there are at most 50
      distinct values;
    * ``{"type": "text"}`` otherwise.

    Items whose ``metadata`` attribute is ``None`` are ignored.
    """
    records: List[dict] = [
        item.metadata for item in data if item.metadata is not None
    ]

    # Every field name seen in any metadata dict.
    field_names = {name for record in records for name in record}

    # Map each field to the set of distinct values observed for it.
    observed: Dict[str, set] = {
        name: _get_all_field_values(records, name) for name in field_names
    }

    suggestion = {}
    for name, values in observed.items():
        if all(isinstance(value, (float, int)) for value in values):
            suggestion[name] = {"type": "number"}
        elif len(values) <= 50:
            # Note: choices order follows set iteration order, so it is not
            # guaranteed to be stable across runs.
            suggestion[name] = {"type": "category", "choices": list(values)}
        else:
            suggestion[name] = {"type": "text"}
    return suggestion

0 commit comments

Comments
 (0)