Skip to content

Commit 22aea22

Browse files
add list_query_items method (#327)
* add list_query_items method
* fix test
* rm unused import
* rename
* bump semver and changelog
* fixes for api changes
* bump semver
* lint

Co-authored-by: Jean Lucas <jean.ferreira@scale.com>
1 parent e103fbd commit 22aea22

File tree

6 files changed

+63
-12
lines changed

6 files changed

+63
-12
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.14.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.7) - 2022-07-07
9+
10+
### Added
11+
- Support running structured queries and retrieving item results via API
12+
813
## [0.14.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.6) - 2022-07-07
914

1015
### Fixed

nucleus/constants.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@
8282
KEYPOINTS_KEY = "keypoints"
8383
KEYPOINTS_NAMES_KEY = "names"
8484
KEYPOINTS_SKELETON_KEY = "skeleton"
85-
LAST_PAGE = "lastPage"
8685
LABEL_KEY = "label"
8786
LABELS_KEY = "labels"
8887
MASK_URL_KEY = "mask_url"
@@ -98,8 +97,9 @@
9897
NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
9998
NUM_SENSORS_KEY = "num_sensors"
10099
ORIGINAL_IMAGE_URL_KEY = "original_image_url"
101-
PAGE_SIZE = "pageSize"
102-
PAGE_TOKEN = "pageToken"
100+
PAGE_SIZE_KEY = "pageSize"
101+
PAGE_TOKEN_KEY = "pageToken"
102+
NEXT_TOKEN_KEY = "nextPageToken"
103103
P1_KEY = "p1"
104104
P2_KEY = "p2"
105105
POINTCLOUD_KEY = "pointcloud"

nucleus/dataset.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,3 +1680,23 @@ def update_item_metadata(self, mapping: Dict[str, dict]):
16801680
self.id, self._client, mapping, ExportMetadataType.DATASET_ITEMS
16811681
)
16821682
return mm.update()
1683+
1684+
def query_items(self, query: str) -> Iterable[DatasetItem]:
1685+
"""
1686+
Fetches all DatasetItems that pertain to a given structured query.
1687+
1688+
Args:
1689+
query: Structured query compatible with the `Nucleus query language <https://nucleus.scale.com/docs/query-language-reference>`_.
1690+
1691+
Returns:
1692+
A list of DatasetItem query results.
1693+
"""
1694+
json_generator = paginate_generator(
1695+
client=self._client,
1696+
endpoint=f"dataset/{self.id}/queryItemsPage",
1697+
result_key=ITEMS_KEY,
1698+
page_size=10000, # max ES page size
1699+
query=query,
1700+
)
1701+
for item_json in json_generator:
1702+
yield DatasetItem.from_json(item_json)

nucleus/utils.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,12 @@
3131
EXPORTED_SCALE_TASK_INFO_ROWS,
3232
ITEM_KEY,
3333
KEYPOINTS_TYPE,
34-
LAST_PAGE,
3534
LINE_TYPE,
3635
MAX_PAYLOAD_SIZE,
3736
MULTICATEGORY_TYPE,
38-
PAGE_SIZE,
39-
PAGE_TOKEN,
37+
NEXT_TOKEN_KEY,
38+
PAGE_SIZE_KEY,
39+
PAGE_TOKEN_KEY,
4040
POLYGON_TYPE,
4141
PREDICTIONS_KEY,
4242
REFERENCE_ID_KEY,
@@ -362,20 +362,26 @@ def paginate_generator(
362362
endpoint: str,
363363
result_key: str,
364364
page_size: int = 100000,
365+
**kwargs,
365366
):
366-
last_page = False
367-
page_token = None
368-
while not last_page:
367+
next_token = None
368+
while True:
369369
try:
370370
response = client.make_request(
371-
{PAGE_TOKEN: page_token, PAGE_SIZE: page_size},
371+
{
372+
PAGE_TOKEN_KEY: next_token,
373+
PAGE_SIZE_KEY: page_size,
374+
**kwargs,
375+
},
372376
endpoint,
373377
requests.post,
374378
)
375379
except NucleusAPIError as e:
376380
if e.status_code == 503:
377381
e.message += f"/n Your request timed out while trying to get a page size of {page_size}. Try lowering the page_size."
378382
raise e
379-
page_token, last_page = response[PAGE_TOKEN], response[LAST_PAGE]
383+
next_token = response[NEXT_TOKEN_KEY]
380384
for json_value in response[result_key]:
381385
yield json_value
386+
if not next_token:
387+
break

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ exclude = '''
2121

2222
[tool.poetry]
2323
name = "scale-nucleus"
24-
version = "0.14.6"
24+
version = "0.14.7"
2525
description = "The official Python client library for Nucleus, the Data Platform for AI"
2626
license = "MIT"
2727
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_dataset.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
SEGMENTATION_TYPE,
2727
UPDATED_ITEMS,
2828
)
29+
from nucleus.errors import NucleusAPIError
2930
from nucleus.job import AsyncJob, JobError
3031

3132
from .helpers import (
@@ -582,3 +583,22 @@ def test_dataset_get_object_indexing_status(CLIENT):
582583
assert round(resp["percent_indexed"], 2) == round(
583584
resp["object_count"] / resp["embedding_count"], 2
584585
)
586+
587+
588+
@pytest.mark.integration
589+
def test_query(CLIENT):
590+
dataset = Dataset(DATASET_WITH_EMBEDDINGS, CLIENT)
591+
expected_items = {
592+
ia["item"].reference_id: ia["item"]
593+
for ia in dataset.items_and_annotations()
594+
if len(ia["annotations"]["box"]) > 6 # assume only box annotations
595+
}
596+
queried_items = [i for i in dataset.query_items("annotations.count > 6")]
597+
598+
assert len(queried_items) == len(expected_items)
599+
for qi in queried_items:
600+
assert qi == expected_items[qi.reference_id]
601+
602+
with pytest.raises(NucleusAPIError):
603+
for qi in dataset.query_items("annotations.count bad syntax"):
604+
print(qi) # unreachable, just need to yield an item from generator

0 commit comments

Comments (0)