Merge pull request #121 from scaleapi/jihan/autotag-api

jihan-yin · web-flow · commit 83bab964edc2 · 2021-09-16T14:37:05.000-07:00
Support for new autotag exporting apis
diff --git a/nucleus/constants.py b/nucleus/constants.py
@@ -13,6 +13,7 @@
 ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE, CUBOID_TYPE)
 ANNOTATION_UPDATE_KEY = "update"
 AUTOTAGS_KEY = "autotags"
+AUTOTAG_SCORE_THRESHOLD = "score_threshold"
 EXPORTED_ROWS = "exportedRows"
 CAMERA_PARAMS_KEY = "camera_params"
 CLASS_PDF_KEY = "class_pdf"
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
@@ -24,6 +24,7 @@
     NAME_KEY,
     REFERENCE_IDS_KEY,
     REQUEST_ID_KEY,
+    AUTOTAG_SCORE_THRESHOLD,
     UPDATE_KEY,
 )
 from .dataset_item import (
@@ -87,21 +88,55 @@ def items(self) -> List[DatasetItem]:
         return self._client.get_dataset_items(self.id)
 
     @sanitize_string_args
-    def autotag_scores(self, autotag_name, for_scores_greater_than=0):
-        """Export the autotag scores above a threshold, largest scores first.
+    def autotag_items(self, autotag_name, for_scores_greater_than=0):
+        """For a given Autotag of this dataset, export its tagged items with scores above a threshold, largest scores first.
 
-        If you have pandas installed, you can create a pandas dataframe using
+        :return: dictionary of the form
+            {
+                'autotagItems': {
+                    ref_id: str,
+                    score: float,
+                    model_prediction_id: str | None,
+                    ground_truth_annotation_id: str | None,
+                }[],
+                'autotag': {
+                    id: str,
+                    name: str,
+                    status: 'started' | 'completed',
+                    autotag_level: 'Image' | 'Object'
+                }
+            }
+        See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-items for more details on the return types.
+        """
+        response = self._client.make_request(
+            payload={AUTOTAG_SCORE_THRESHOLD: for_scores_greater_than},
+            route=f"autotag/dataset/{self.id}/autotag/{autotag_name}/taggedItems",
+            requests_command=requests.get,
+        )
+        return response
 
-        pandas.Dataframe(dataset.autotag_scores(autotag_name))
+    def autotag_training_items(self, autotag_name):
+        """For a given Autotag of this dataset, export its training items. These are user selected positives during refinement.
 
         :return: dictionary of the form
-            {'ref_ids': List[str],
-             'datset_item_ids': List[str],
-             'score': List[float]}
+            {
+                'autotagPositiveTrainingItems': {
+                    ref_id: str,
+                    model_prediction_id: str | None,
+                    ground_truth_annotation_id: str | None,
+                }[],
+                'autotag': {
+                    id: str,
+                    name: str,
+                    status: 'started' | 'completed',
+                    autotag_level: 'Image' | 'Object'
+                }
+            }
+        See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-training-items for more details on the return types.
         """
         response = self._client.make_request(
             payload={},
-            route=f"autotag/{self.id}/{autotag_name}/{for_scores_greater_than}",
+            route=f"autotag/dataset/{self.id}/autotag/{autotag_name}/trainingItems",
             requests_command=requests.get,
         )
         return response
diff --git a/nucleus/dataset_item.py b/nucleus/dataset_item.py
@@ -193,6 +193,5 @@ def check_for_duplicate_reference_ids(dataset_items: Sequence[DatasetItem]):
             for key, value in Counter(ref_ids).items()
         }
         raise ValueError(
-            "Duplicate reference ids found among dataset_items: %s"
-            % duplicates
+            f"Duplicate reference ids found among dataset_items: {duplicates}"
         )
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,7 @@ exclude = '''
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.19"
+version = "0.1.20"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license =  "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -329,7 +329,7 @@ def test_raises_error_for_duplicate():
     )
 
 
-def test_dataset_export_autotag_scores(CLIENT):
+def test_dataset_export_autotag_tagged_items(CLIENT):
     # This test can only run for the test user who has an indexed dataset.
     # TODO: if/when we can create autotags via api, create one instead.
     if NUCLEUS_PYTEST_USER_ID in CLIENT.api_key:
@@ -342,11 +342,51 @@ def test_dataset_export_autotag_scores(CLIENT):
             in str(api_error.value)
         )
 
-        scores = dataset.autotag_scores(autotag_name="PytestTestTag")
+        items = dataset.autotag_items(autotag_name="PytestTestTag")
 
-        for column in ["dataset_item_ids", "ref_ids", "scores"]:
-            assert column in scores
-            assert len(scores[column]) > 0
+        assert "autotagItems" in items
+        assert "autotag" in items
+
+        autotagItems = items["autotagItems"]
+        autotag = items["autotag"]
+
+        assert len(autotagItems) > 0
+        for item in autotagItems:
+            for column in ["ref_id", "score"]:
+                assert column in item
+
+        for column in ["id", "name", "status", "autotag_level"]:
+            assert column in autotag
+
+
+def test_dataset_export_autotag_training_items(CLIENT):
+    # This test can only run for the test user who has an indexed dataset.
+    # TODO: if/when we can create autotags via api, create one instead.
+    if NUCLEUS_PYTEST_USER_ID in CLIENT.api_key:
+        dataset = CLIENT.get_dataset(DATASET_WITH_AUTOTAG)
+
+        with pytest.raises(NucleusAPIError) as api_error:
+            dataset.autotag_training_items(autotag_name="NONSENSE_GARBAGE")
+        assert (
+            f"The autotag NONSENSE_GARBAGE was not found in dataset {DATASET_WITH_AUTOTAG}"
+            in str(api_error.value)
+        )
+
+        items = dataset.autotag_training_items(autotag_name="PytestTestTag")
+
+        assert "autotagPositiveTrainingItems" in items
+        assert "autotag" in items
+
+        autotagTrainingItems = items["autotagPositiveTrainingItems"]
+        autotag = items["autotag"]
+
+        assert len(autotagTrainingItems) > 0
+        for item in autotagTrainingItems:
+            for column in ["ref_id"]:
+                assert column in item
+
+        for column in ["id", "name", "status", "autotag_level"]:
+            assert column in autotag
 
 
 @pytest.mark.integration

Original file line number	Diff line number	Diff line change
`@@ -193,6 +193,5 @@ def check_for_duplicate_reference_ids(dataset_items: Sequence[DatasetItem]):`
`193`	`193`	`for key, value in Counter(ref_ids).items()`
`194`	`194`	`}`
`195`	`195`	`raise ValueError(`
`196`		`- "Duplicate reference ids found among dataset_items: %s"`
`197`		`- % duplicates`
	`196`	`+ f"Duplicate reference ids found among dataset_items: {duplicates}"`
`198`	`197`	`)`