Skip to content

Commit caaf0f6

Browse files
authored
Merge pull request #85 from scaleapi/jihan/custom-efs
Nucleus - custom indexing improvements
2 parents 906060f + d90cb57 commit caaf0f6

File tree

11 files changed

+34
-12
lines changed

11 files changed

+34
-12
lines changed

nucleus/__init__.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
DATASET_ITEM_IDS_KEY,
8383
DEFAULT_NETWORK_TIMEOUT_SEC,
8484
EMBEDDINGS_URL_KEY,
85+
EMBEDDING_DIMENSION_KEY,
8586
ERROR_ITEMS,
8687
ERROR_PAYLOAD,
8788
ERRORS_KEY,
@@ -1101,9 +1102,23 @@ def delete_model(self, model_id: str) -> dict:
11011102
)
11021103
return response
11031104

1104-
def create_custom_index(self, dataset_id: str, embeddings_url: str):
1105+
def create_custom_index(
1106+
self, dataset_id: str, embeddings_urls: list, embedding_dim: int
1107+
):
1108+
"""
1109+
Creates a custom index for a given dataset, which will then be used
1110+
for autotag and similarity search.
1111+
1112+
:param
1113+
dataset_id: id of dataset that the custom index is being added to.
1114+
embeddings_urls: list of URLs, each pointing to a JSON file that maps dataset_item_id -> embedding vector.
1115+
embedding_dim: the dimension of the embedding vectors; it must be the same for every embedding vector in the index.
1116+
"""
11051117
return self.make_request(
1106-
{EMBEDDINGS_URL_KEY: embeddings_url},
1118+
{
1119+
EMBEDDINGS_URL_KEY: embeddings_urls,
1120+
EMBEDDING_DIMENSION_KEY: embedding_dim,
1121+
},
11071122
f"indexing/{dataset_id}",
11081123
requests_command=requests.post,
11091124
)

nucleus/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
DEFAULT_ANNOTATION_UPDATE_MODE = False
2525
DEFAULT_NETWORK_TIMEOUT_SEC = 120
2626
DIMENSIONS_KEY = "dimensions"
27-
EMBEDDINGS_URL_KEY = "embeddings_url"
27+
EMBEDDINGS_URL_KEY = "embeddings_urls"
28+
EMBEDDING_DIMENSION_KEY = "embedding_dimension"
2829
ERRORS_KEY = "errors"
2930
ERROR_CODES = "error_codes"
3031
ERROR_ITEMS = "upload_errors"

nucleus/dataset.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,12 @@ def delete_item(self, item_id: str = None, reference_id: str = None):
329329
def list_autotags(self):
330330
return self._client.list_autotags(self.id)
331331

332-
def create_custom_index(self, embeddings_url: str):
333-
return self._client.create_custom_index(self.id, embeddings_url)
332+
def create_custom_index(self, embeddings_urls: list, embedding_dim: int):
333+
return self._client.create_custom_index(
334+
self.id,
335+
embeddings_urls,
336+
embedding_dim,
337+
)
334338

335339
def delete_custom_index(self):
336340
return self._client.delete_custom_index(self.id)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ exclude = '''
2121

2222
[tool.poetry]
2323
name = "scale-nucleus"
24-
version = "0.1.13"
24+
version = "0.1.14"
2525
description = "The official Python client library for Nucleus, the Data Platform for AI"
2626
license = "MIT"
2727
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/helpers.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@
1414

1515

1616
TEST_IMG_URLS = [
17-
"https://homepages.cae.wisc.edu/~ece533/images/airplane.png",
18-
"https://homepages.cae.wisc.edu/~ece533/images/arctichare.png",
19-
"https://homepages.cae.wisc.edu/~ece533/images/baboon.png",
20-
"https://homepages.cae.wisc.edu/~ece533/images/barbara.png",
21-
"https://homepages.cae.wisc.edu/~ece533/images/cat.png",
17+
"https://d3jkudlc7u70kh.cloudfront.net/airplane-fact.jpg",
18+
"https://www.activewild.com/wp-content/uploads/2016/01/Arctic-Hare-Facts-For-Kids-fb.jpg",
19+
"https://thumbs-prod.si-cdn.com/Gd4SBf-ePT2OK6HDdW6Q3uOh5c8=/fit-in/1600x0/https://public-media.si-cdn.com/filer/23/a9/23a9a987-df8e-4d22-ac96-93f9091f9b8f/media_baboons_07_-_photo_by_catherine_markham.jpg",
20+
"https://prod-images.tcm.com/Master-Profile-Images/BarbaraStanwyck.jpg",
21+
"https://thumbs-prod.si-cdn.com/nnJARGtKrLypH4y3Vov2zGTG4xw=/fit-in/1600x0/filters:focal(554x699:555x700)/https://public-media.si-cdn.com/filer/a4/04/a404c799-7118-459a-8de4-89e4a44b124f/img_1317.jpg",
2222
]
2323

2424
TEST_DATASET_ITEMS = [

tests/test_indexing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ def dataset(CLIENT):
3939

4040
def test_index_integration(dataset):
4141
signed_embeddings_url = TEST_INDEX_EMBEDDINGS_FILE
42-
create_response = dataset.create_custom_index(signed_embeddings_url)
42+
create_response = dataset.create_custom_index(
43+
[signed_embeddings_url], embedding_dim=3
44+
)
4345
assert JOB_ID_KEY in create_response
4446
assert MESSAGE_KEY in create_response
4547
job_id = create_response[JOB_ID_KEY]

tests/testdata/airplane.jpeg

45.5 KB
Loading

tests/testdata/arctichare.jpeg

98.5 KB
Loading

tests/testdata/baboon.jpeg

231 KB
Loading

tests/testdata/barbara.jpeg

515 KB
Loading

0 commit comments

Comments
 (0)