Skip to content

Commit 22aea22

Browse files
add list_query_items method (#327)
* add list_query_items method
* fix test
* rm unused import
* rename
* bump semver and changelog
* fixes for api changes
* bump semver
* lint

Co-authored-by: Jean Lucas <jean.ferreira@scale.com>
1 parent e103fbd commit 22aea22

File tree

6 files changed

+63
-12
lines changed

6 files changed

+63
-12
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.14.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.7) - 2022-07-07
9+
10+
### Added
11+
- Support running structured queries and retrieving item results via API
12+
813
## [0.14.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.6) - 2022-07-07
914

1015
### Fixed

nucleus/constants.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@
8282
KEYPOINTS_KEY = "keypoints"
8383
KEYPOINTS_NAMES_KEY = "names"
8484
KEYPOINTS_SKELETON_KEY = "skeleton"
85-
LAST_PAGE = "lastPage"
8685
LABEL_KEY = "label"
8786
LABELS_KEY = "labels"
8887
MASK_URL_KEY = "mask_url"
@@ -98,8 +97,9 @@
9897
NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
9998
NUM_SENSORS_KEY = "num_sensors"
10099
ORIGINAL_IMAGE_URL_KEY = "original_image_url"
101-
PAGE_SIZE = "pageSize"
102-
PAGE_TOKEN = "pageToken"
100+
PAGE_SIZE_KEY = "pageSize"
101+
PAGE_TOKEN_KEY = "pageToken"
102+
NEXT_TOKEN_KEY = "nextPageToken"
103103
P1_KEY = "p1"
104104
P2_KEY = "p2"
105105
POINTCLOUD_KEY = "pointcloud"

nucleus/dataset.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,3 +1680,23 @@ def update_item_metadata(self, mapping: Dict[str, dict]):
16801680
self.id, self._client, mapping, ExportMetadataType.DATASET_ITEMS
16811681
)
16821682
return mm.update()
1683+
1684+
def query_items(self, query: str) -> Iterable[DatasetItem]:
1685+
"""
1686+
Fetches all DatasetItems that pertain to a given structured query.
1687+
1688+
Args:
1689+
query: Structured query compatible with the `Nucleus query language <https://nucleus.scale.com/docs/query-language-reference>`_.
1690+
1691+
Returns:
1692+
A list of DatasetItem query results.
1693+
"""
1694+
json_generator = paginate_generator(
1695+
client=self._client,
1696+
endpoint=f"dataset/{self.id}/queryItemsPage",
1697+
result_key=ITEMS_KEY,
1698+
page_size=10000, # max ES page size
1699+
query=query,
1700+
)
1701+
for item_json in json_generator:
1702+
yield DatasetItem.from_json(item_json)

nucleus/utils.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,12 @@
3131
EXPORTED_SCALE_TASK_INFO_ROWS,
3232
ITEM_KEY,
3333
KEYPOINTS_TYPE,
34-
LAST_PAGE,
3534
LINE_TYPE,
3635
MAX_PAYLOAD_SIZE,
3736
MULTICATEGORY_TYPE,
38-
PAGE_SIZE,
39-
PAGE_TOKEN,
37+
NEXT_TOKEN_KEY,
38+
PAGE_SIZE_KEY,
39+
PAGE_TOKEN_KEY,
4040
POLYGON_TYPE,
4141
PREDICTIONS_KEY,
4242
REFERENCE_ID_KEY,
@@ -362,20 +362,26 @@ def paginate_generator(
362362
endpoint: str,
363363
result_key: str,
364364
page_size: int = 100000,
365+
**kwargs,
365366
):
366-
last_page = False
367-
page_token = None
368-
while not last_page:
367+
next_token = None
368+
while True:
369369
try:
370370
response = client.make_request(
371-
{PAGE_TOKEN: page_token, PAGE_SIZE: page_size},
371+
{
372+
PAGE_TOKEN_KEY: next_token,
373+
PAGE_SIZE_KEY: page_size,
374+
**kwargs,
375+
},
372376
endpoint,
373377
requests.post,
374378
)
375379
except NucleusAPIError as e:
376380
if e.status_code == 503:
377381
e.message += f"/n Your request timed out while trying to get a page size of {page_size}. Try lowering the page_size."
378382
raise e
379-
page_token, last_page = response[PAGE_TOKEN], response[LAST_PAGE]
383+
next_token = response[NEXT_TOKEN_KEY]
380384
for json_value in response[result_key]:
381385
yield json_value
386+
if not next_token:
387+
break

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ exclude = '''
2121

2222
[tool.poetry]
2323
name = "scale-nucleus"
24-
version = "0.14.6"
24+
version = "0.14.7"
2525
description = "The official Python client library for Nucleus, the Data Platform for AI"
2626
license = "MIT"
2727
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_dataset.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
SEGMENTATION_TYPE,
2727
UPDATED_ITEMS,
2828
)
29+
from nucleus.errors import NucleusAPIError
2930
from nucleus.job import AsyncJob, JobError
3031

3132
from .helpers import (
@@ -582,3 +583,22 @@ def test_dataset_get_object_indexing_status(CLIENT):
582583
assert round(resp["percent_indexed"], 2) == round(
583584
resp["object_count"] / resp["embedding_count"], 2
584585
)
586+
587+
588+
@pytest.mark.integration
589+
def test_query(CLIENT):
590+
dataset = Dataset(DATASET_WITH_EMBEDDINGS, CLIENT)
591+
expected_items = {
592+
ia["item"].reference_id: ia["item"]
593+
for ia in dataset.items_and_annotations()
594+
if len(ia["annotations"]["box"]) > 6 # assume only box annotations
595+
}
596+
queried_items = [i for i in dataset.query_items("annotations.count > 6")]
597+
598+
assert len(queried_items) == len(expected_items)
599+
for qi in queried_items:
600+
assert qi == expected_items[qi.reference_id]
601+
602+
with pytest.raises(NucleusAPIError):
603+
for qi in dataset.query_items("annotations.count bad syntax"):
604+
print(qi) # unreachable, just need to yield an item from generator

0 commit comments

Comments (0)