Commit 37afc9f

Merge pull request #67 from scaleapi/da/async_upload
Async item ingest
2 parents: 41e93ca + 854462d

16 files changed: +317 additions, -124 deletions
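For orientation, here is a minimal usage sketch of the new asynchronous ingest path, based on the nucleus/dataset.py changes below. The API key, dataset id, and image URLs are placeholders, and get_dataset is assumed to be the existing client entry point; only the asynchronous flag and the AsyncJob return value come from this commit.

import nucleus
from nucleus import DatasetItem  # re-exported by nucleus/__init__.py

client = nucleus.NucleusClient("YOUR_API_KEY")    # placeholder credentials
dataset = client.get_dataset("YOUR_DATASET_ID")   # assumed existing dataset

items = [
    DatasetItem(image_location="s3://bucket/img_0.jpg", reference_id="img_0"),
    DatasetItem(image_location="s3://bucket/img_1.jpg", reference_id="img_1"),
]

# New in this commit: asynchronous=True serializes the items to a presigned URL
# and returns an AsyncJob handle instead of blocking until every item is
# processed. All image paths must be remote for this path.
job = dataset.append(items, update=True, asynchronous=True)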

.circleci/config.yml

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ jobs:
             pip install --upgrade pip
             pip install poetry
             poetry install
+
       - run:
           name: Black Formatting Check  # Only validation, without re-formatting
           command: |

nucleus/__init__.py

Lines changed: 7 additions & 5 deletions

@@ -70,7 +70,7 @@
 # pylint: disable=C0302
 from requests.packages.urllib3.util.retry import Retry
 
-from .constants import REFERENCE_IDS_KEY, DATASET_ITEM_IDS_KEY
+from .constants import REFERENCE_IDS_KEY, DATASET_ITEM_IDS_KEY, UPDATE_KEY
 from .dataset import Dataset
 from .dataset_item import DatasetItem
 from .annotation import (
@@ -123,7 +123,6 @@
     AUTOTAGS_KEY,
     ANNOTATION_METADATA_SCHEMA_KEY,
     ITEM_METADATA_SCHEMA_KEY,
-    FORCE_KEY,
     EMBEDDINGS_URL_KEY,
 )
 from .model import Model
@@ -151,11 +150,14 @@ def __init__(
         self,
         api_key: str,
         use_notebook: bool = False,
-        endpoint=NUCLEUS_ENDPOINT,
+        endpoint: str = None,
     ):
         self.api_key = api_key
         self.tqdm_bar = tqdm.tqdm
-        self.endpoint = endpoint
+        if endpoint is None:
+            self.endpoint = os.environ.get(
+                "NUCLEUS_ENDPOINT", NUCLEUS_ENDPOINT
+            )
         self._use_notebook = use_notebook
         if use_notebook:
             self.tqdm_bar = tqdm_notebook.tqdm
@@ -497,7 +499,7 @@ def exception_handler(request, exception):
        items = payload[ITEMS_KEY]
        payloads = [
            # batch_size images per request
-            {ITEMS_KEY: items[i : i + batch_size], FORCE_KEY: update}
+            {ITEMS_KEY: items[i : i + batch_size], UPDATE_KEY: update}
            for i in range(0, len(items), batch_size)
        ]
 
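The constructor change above lets the client endpoint be overridden through the environment. Below is a small standalone sketch of that fallback; resolve_endpoint is illustrative and not a library function, and the hunk above only shows the None branch of the assignment.

import os

NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"  # default from constants.py

def resolve_endpoint(endpoint: str = None) -> str:
    # Fallback order as in the new __init__: explicit argument, then the
    # NUCLEUS_ENDPOINT environment variable, then the library default.
    if endpoint is None:
        return os.environ.get("NUCLEUS_ENDPOINT", NUCLEUS_ENDPOINT)
    return endpoint

# e.g. export NUCLEUS_ENDPOINT=http://localhost:3000/v1/nucleus to point the
# client at a staging deployment without touching code.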

nucleus/constants.py

Lines changed: 54 additions & 52 deletions

@@ -1,63 +1,65 @@
-NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
-DEFAULT_NETWORK_TIMEOUT_SEC = 120
-ITEMS_KEY = "items"
-ITEM_KEY = "item"
-REFERENCE_ID_KEY = "reference_id"
-REFERENCE_IDS_KEY = "reference_ids"
-DATASET_ID_KEY = "dataset_id"
-IMAGE_KEY = "image"
-IMAGE_URL_KEY = "image_url"
-NEW_ITEMS = "new_items"
-UPDATED_ITEMS = "updated_items"
-IGNORED_ITEMS = "ignored_items"
-ERROR_ITEMS = "upload_errors"
-ERROR_PAYLOAD = "error_payload"
-ERROR_CODES = "error_codes"
+ANNOTATIONS_IGNORED_KEY = "annotations_ignored"
 ANNOTATIONS_KEY = "annotations"
-ANNOTATION_ID_KEY = "annotation_id"
 ANNOTATIONS_PROCESSED_KEY = "annotations_processed"
-ANNOTATIONS_IGNORED_KEY = "annotations_ignored"
-PREDICTIONS_PROCESSED_KEY = "predictions_processed"
-PREDICTIONS_IGNORED_KEY = "predictions_ignored"
+ANNOTATION_ID_KEY = "annotation_id"
+ANNOTATION_METADATA_SCHEMA_KEY = "annotation_metadata_schema"
+BOX_TYPE = "box"
+POLYGON_TYPE = "polygon"
+SEGMENTATION_TYPE = "segmentation"
+ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE)
 ANNOTATION_UPDATE_KEY = "update"
-DEFAULT_ANNOTATION_UPDATE_MODE = False
-STATUS_CODE_KEY = "status_code"
-STATUS_KEY = "status"
-SUCCESS_STATUS_CODES = [200, 201, 202]
-ERRORS_KEY = "errors"
-MODEL_RUN_ID_KEY = "model_run_id"
-MODEL_ID_KEY = "model_id"
-DATASET_ITEM_ID_KEY = "dataset_item_id"
-ITEM_ID_KEY = "item_id"
+AUTOTAGS_KEY = "autotags"
+
+CONFIDENCE_KEY = "confidence"
+DATASET_ID_KEY = "dataset_id"
 DATASET_ITEM_IDS_KEY = "dataset_item_ids"
-SLICE_ID_KEY = "slice_id"
-DATASET_NAME_KEY = "name"
+DATASET_ITEM_ID_KEY = "dataset_item_id"
+DATASET_LENGTH_KEY = "length"
 DATASET_MODEL_RUNS_KEY = "model_run_ids"
+DATASET_NAME_KEY = "name"
 DATASET_SLICES_KEY = "slice_ids"
-DATASET_LENGTH_KEY = "length"
-FORCE_KEY = "update"
+DEFAULT_ANNOTATION_UPDATE_MODE = False
+DEFAULT_NETWORK_TIMEOUT_SEC = 120
+EMBEDDINGS_URL_KEY = "embeddings_url"
+ERRORS_KEY = "errors"
+ERROR_CODES = "error_codes"
+ERROR_ITEMS = "upload_errors"
+ERROR_PAYLOAD = "error_payload"
+GEOMETRY_KEY = "geometry"
+HEIGHT_KEY = "height"
+IGNORED_ITEMS = "ignored_items"
+IMAGE_KEY = "image"
+IMAGE_URL_KEY = "image_url"
+INDEX_KEY = "index"
+ITEMS_KEY = "items"
+ITEM_ID_KEY = "item_id"
+ITEM_KEY = "item"
+ITEM_METADATA_SCHEMA_KEY = "item_metadata_schema"
+JOB_ID_KEY = "job_id"
+LABEL_KEY = "label"
+MASK_URL_KEY = "mask_url"
+MESSAGE_KEY = "message"
 METADATA_KEY = "metadata"
+MODEL_ID_KEY = "model_id"
+MODEL_RUN_ID_KEY = "model_run_id"
 NAME_KEY = "name"
-LABEL_KEY = "label"
-CONFIDENCE_KEY = "confidence"
+NEW_ITEMS = "new_items"
+NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
 ORIGINAL_IMAGE_URL_KEY = "original_image_url"
-X_KEY = "x"
-Y_KEY = "y"
-WIDTH_KEY = "width"
-HEIGHT_KEY = "height"
+PREDICTIONS_IGNORED_KEY = "predictions_ignored"
+PREDICTIONS_PROCESSED_KEY = "predictions_processed"
+REFERENCE_IDS_KEY = "reference_ids"
+REFERENCE_ID_KEY = "reference_id"
+REQUEST_ID_KEY = "requestId"
+SEGMENTATIONS_KEY = "segmentations"
+SLICE_ID_KEY = "slice_id"
+STATUS_CODE_KEY = "status_code"
+STATUS_KEY = "status"
+SUCCESS_STATUS_CODES = [200, 201, 202]
 TYPE_KEY = "type"
+UPDATED_ITEMS = "updated_items"
+UPDATE_KEY = "update"
 VERTICES_KEY = "vertices"
-BOX_TYPE = "box"
-POLYGON_TYPE = "polygon"
-SEGMENTATION_TYPE = "segmentation"
-ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE)
-GEOMETRY_KEY = "geometry"
-AUTOTAGS_KEY = "autotags"
-ANNOTATION_METADATA_SCHEMA_KEY = "annotation_metadata_schema"
-ITEM_METADATA_SCHEMA_KEY = "item_metadata_schema"
-MASK_URL_KEY = "mask_url"
-INDEX_KEY = "index"
-SEGMENTATIONS_KEY = "segmentations"
-EMBEDDINGS_URL_KEY = "embeddings_url"
-JOB_ID_KEY = "job_id"
-MESSAGE_KEY = "message"
+WIDTH_KEY = "width"
+X_KEY = "x"
+Y_KEY = "y"

nucleus/dataset.py

Lines changed: 43 additions & 21 deletions

@@ -1,9 +1,13 @@
-from collections import Counter
-from typing import List, Dict, Any, Optional
+import uuid
+from typing import Any, Dict, List, Optional, Union
 
 import requests
 
-from nucleus.utils import format_dataset_item_response
+from nucleus.job import AsyncJob
+from nucleus.utils import (
+    format_dataset_item_response,
+    serialize_and_write_to_presigned_url,
+)
 
 from .annotation import Annotation
 from .constants import (
@@ -15,8 +19,14 @@
     DEFAULT_ANNOTATION_UPDATE_MODE,
     NAME_KEY,
     REFERENCE_IDS_KEY,
+    REQUEST_ID_KEY,
+    UPDATE_KEY,
+)
+from .dataset_item import (
+    DatasetItem,
+    check_all_paths_remote,
+    check_for_duplicate_reference_ids,
 )
-from .dataset_item import DatasetItem
 from .payload_constructor import construct_model_run_creation_payload
 
 
@@ -27,7 +37,11 @@ class Dataset:
     compare model performance on you data.
     """
 
-    def __init__(self, dataset_id: str, client):
+    def __init__(
+        self,
+        dataset_id: str,
+        client: "NucleusClient",  # type:ignore # noqa: F821
+    ):
         self.id = dataset_id
         self._client = client
 
@@ -161,16 +175,18 @@ def ingest_tasks(self, task_ids: dict):
     def append(
         self,
         dataset_items: List[DatasetItem],
-        force: Optional[bool] = False,
+        update: Optional[bool] = False,
         batch_size: Optional[int] = 20,
-    ) -> dict:
+        asynchronous=False,
+    ) -> Union[dict, AsyncJob]:
         """
         Appends images with metadata (dataset items) to the dataset. Overwrites images on collision if forced.
 
         Parameters:
         :param dataset_items: items to upload
-        :param force: if True overwrites images on collision
+        :param update: if True overwrites images and metadata on collision
         :param batch_size: batch parameter for long uploads
+        :param aynchronous: if True, return a job object representing asynchronous ingestion job.
         :return:
         {
            'dataset_id': str,
@@ -179,23 +195,29 @@ def append(
            'ignored_items': int,
        }
        """
-        ref_ids = []
-        for dataset_item in dataset_items:
-            if dataset_item.reference_id is not None:
-                ref_ids.append(dataset_item.reference_id)
-        if len(ref_ids) != len(set(ref_ids)):
-            duplicates = {
-                f"{key}": f"Count: {value}"
-                for key, value in Counter(ref_ids).items()
-            }
-            raise ValueError(
-                "Duplicate reference ids found among dataset_items: %s"
-                % duplicates
+        check_for_duplicate_reference_ids(dataset_items)
+
+        if asynchronous:
+            check_all_paths_remote(dataset_items)
+            request_id = uuid.uuid4().hex
+            response = self._client.make_request(
+                payload={},
+                route=f"dataset/{self.id}/signedUrl/{request_id}",
+                requests_command=requests.get,
+            )
+            serialize_and_write_to_presigned_url(
+                dataset_items, response["signed_url"]
             )
+            response = self._client.make_request(
+                payload={REQUEST_ID_KEY: request_id, UPDATE_KEY: update},
+                route=f"dataset/{self.id}/append?async=1",
+            )
+            return AsyncJob(response["job_id"], self._client)
+
         return self._client.populate_dataset(
             self.id,
             dataset_items,
-            force=force,
+            force=update,
             batch_size=batch_size,
         )
 
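From the caller's side, the reworked append signature means existing code that passed force= now passes update=, and the return type depends on the new flag. A hedged illustration, reusing the client, dataset, and items from the sketch near the top of this page; AsyncJob is imported from nucleus.job as in the diff above.

from nucleus.job import AsyncJob

# Synchronous path (default): batched upload as before, using the renamed
# update= keyword in place of force=; returns the summary dict documented in
# the docstring.
result = dataset.append(items, update=True, batch_size=20)
assert isinstance(result, dict)

# Asynchronous path: requires every image_location to be remote; the items are
# written to a presigned URL and ingested server-side.
job = dataset.append(items, asynchronous=True)
assert isinstance(job, AsyncJob)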

nucleus/dataset_item.py

Lines changed: 35 additions & 9 deletions

@@ -1,7 +1,9 @@
+from collections import Counter
 import json
 import os.path
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Sequence
+from urllib.parse import urlparse
 
 from .constants import (
     DATASET_ITEM_ID_KEY,
@@ -21,8 +23,7 @@ class DatasetItem:
     metadata: Optional[dict] = None
 
     def __post_init__(self):
-        self.image_url = self.image_location
-        self.local = self._is_local_path(self.image_location)
+        self.local = is_local_path(self.image_location)
 
     @classmethod
     def from_json(cls, payload: dict):
@@ -36,16 +37,12 @@ def from_json(cls, payload: dict):
             metadata=payload.get(METADATA_KEY, {}),
         )
 
-    def _is_local_path(self, path: str) -> bool:
-        path_components = [comp.lower() for comp in path.split("/")]
-        return path_components[0] not in {"https:", "http:", "s3:", "gs:"}
-
     def local_file_exists(self):
-        return os.path.isfile(self.image_url)
+        return os.path.isfile(self.image_location)
 
     def to_payload(self) -> dict:
         payload = {
-            IMAGE_URL_KEY: self.image_url,
+            IMAGE_URL_KEY: self.image_location,
             METADATA_KEY: self.metadata or {},
         }
         if self.reference_id:
@@ -56,3 +53,32 @@ def to_payload(self) -> dict:
 
     def to_json(self) -> str:
         return json.dumps(self.to_payload())
+
+
+def is_local_path(path: str) -> bool:
+    return urlparse(path).scheme not in {"https", "http", "s3", "gs"}
+
+
+def check_all_paths_remote(dataset_items: Sequence[DatasetItem]):
+    for item in dataset_items:
+        if is_local_path(item.image_location):
+            raise ValueError(
+                f"All paths must be remote, but {item.image_location} is either "
+                "local, or a remote URL type that is not supported."
+            )
+
+
+def check_for_duplicate_reference_ids(dataset_items: Sequence[DatasetItem]):
+    ref_ids = []
+    for dataset_item in dataset_items:
+        if dataset_item.reference_id is not None:
+            ref_ids.append(dataset_item.reference_id)
+    if len(ref_ids) != len(set(ref_ids)):
+        duplicates = {
+            f"{key}": f"Count: {value}"
+            for key, value in Counter(ref_ids).items()
+        }
+        raise ValueError(
+            "Duplicate reference ids found among dataset_items: %s"
            % duplicates
        )
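The module-level helpers added above back the remote-only requirement of the asynchronous path. A quick, hypothetical check of their behavior, importing from nucleus.dataset_item as the diff does; the example item values are placeholders.

from nucleus.dataset_item import (
    DatasetItem,
    check_all_paths_remote,
    is_local_path,
)

# urlparse-based scheme check: only https, http, s3, and gs count as remote.
assert not is_local_path("s3://bucket/cat.jpg")
assert not is_local_path("https://example.com/cat.jpg")
assert is_local_path("/home/user/cat.jpg")

# check_all_paths_remote raises when any item would need a local file upload,
# which the asynchronous ingest does not support.
mixed = [
    DatasetItem(image_location="gs://bucket/ok.png", reference_id="ok"),
    DatasetItem(image_location="./local.png", reference_id="local"),
]
try:
    check_all_paths_remote(mixed)
except ValueError as err:
    print(err)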
