
Commit 0464cae

Merge pull request #136 from scaleapi/jihan/object-indexing
Add object indexing support
2 parents: fd633b5 + d994e24

File tree

3 files changed, +39 -3 lines changed


nucleus/__init__.py

Lines changed: 30 additions & 2 deletions
@@ -323,7 +323,7 @@ def populate_dataset(
         self,
         dataset_id: str,
         dataset_items: List[DatasetItem],
-        batch_size: int = 100,
+        batch_size: int = 30,
         update: bool = False,
     ):
         """
@@ -1180,7 +1180,8 @@ def set_continuous_indexing(self, dataset_id: str, enable: bool = True):
     def create_image_index(self, dataset_id: str):
         """
         Starts generating embeddings for images that don't have embeddings in a given dataset. These embeddings will
-        be used for autotag and similarity search. This endpoint is currently only enabled for enterprise customers.
+        be used for autotag and similarity search. This endpoint is limited to generating embeddings for 2 million
+        images at a time. This endpoint is also currently only enabled for enterprise customers.
         Please reach out to nucleus@scale.com if you wish to learn more.

         :param
@@ -1192,6 +1193,33 @@ def create_image_index(self, dataset_id: str):
             requests_command=requests.post,
         )

+    def create_object_index(
+        self, dataset_id: str, model_run_id: str, gt_only: bool
+    ):
+        """
+        Starts generating embeddings for objects that don't have embeddings in a given dataset. These embeddings will
+        be used for autotag and similarity search. This endpoint only supports indexing objects sourced from the predictions
+        of a single model run or the ground truth annotations of a dataset.
+
+        This endpoint is limited to generating embeddings for 3 million objects at a time. This endpoint is also currently
+        only enabled for enterprise customers. Please reach out to nucleus@scale.com if you wish to learn more.
+
+        :param
+            dataset_id: id of dataset for generating embeddings on.
+            model_run_id: id of the model run for generating embeddings on. Mutually exclusive with gt_only
+            gt_only: Whether we are generating embeddings on the ground truth objects in a dataset. Mutually exclusive with model_run_id
+        """
+        payload: Dict[str, Union[str, bool]] = {}
+        if model_run_id:
+            payload["model_run_id"] = model_run_id
+        elif gt_only:
+            payload["ingest_gt_only"] = True
+        return self.make_request(
+            payload,
+            f"indexing/{dataset_id}/internal/object",
+            requests_command=requests.post,
+        )
+
     def make_request(
         self, payload: dict, route: str, requests_command=requests.post
     ) -> dict:
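For reference, a minimal usage sketch of the new client-level method added above. It assumes the client is constructed as nucleus.NucleusClient(API_KEY), which comes from the rest of the library rather than this diff, and uses placeholder dataset and model run ids; only one of model_run_id / gt_only should be supplied, matching the mutually exclusive payload logic in the method body.

```python
# Sketch only: placeholder API key and ids; NucleusClient construction is
# assumed from the existing library surface, not from this diff.
import nucleus

client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")

# Index objects sourced from a single model run's predictions
# (sends {"model_run_id": ...} to indexing/{dataset_id}/internal/object).
client.create_object_index(
    dataset_id="ds_sample_dataset_id",        # hypothetical id
    model_run_id="run_sample_model_run_id",   # hypothetical id
    gt_only=False,
)

# Or, mutually exclusively, index the dataset's ground truth annotations
# (sends {"ingest_gt_only": True}).
client.create_object_index(
    dataset_id="ds_sample_dataset_id",
    model_run_id=None,
    gt_only=True,
)
```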

nucleus/dataset.py

Lines changed: 8 additions & 0 deletions
@@ -453,6 +453,14 @@ def create_image_index(self):
         response = self._client.create_image_index(self.id)
         return AsyncJob.from_json(response, self._client)

+    def create_object_index(
+        self, model_run_id: str = None, gt_only: bool = None
+    ):
+        response = self._client.create_object_index(
+            self.id, model_run_id, gt_only
+        )
+        return AsyncJob.from_json(response, self._client)
+
     def add_taxonomy(
         self,
         taxonomy_name: str,
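A corresponding sketch at the Dataset level, assuming client.get_dataset(...) returns a Dataset object as elsewhere in this client (not part of this diff); the ids are placeholders. Per the diff, the wrapper forwards to the client's create_object_index and wraps the response in an AsyncJob.

```python
# Sketch only: get_dataset usage is assumed from the existing client surface;
# ids are placeholders.
import nucleus

client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
dataset = client.get_dataset("ds_sample_dataset_id")  # hypothetical id

# Kick off object indexing over the dataset's ground truth annotations only.
job = dataset.create_object_index(gt_only=True)

# `job` is an AsyncJob (per the diff above), which can be polled for
# completion using the client's async-job helpers.
```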

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.24"
+version = "0.1.25"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
