
Commit b8df106

Author: Claire Pajot (committed)

Merge branch 'master' into add_classification_type_to_groundtruth

2 parents 6ff2492 + 019817a

File tree

11 files changed: +218 −42 lines changed


nucleus/constants.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
 )
 ANNOTATION_UPDATE_KEY = "update"
 AUTOTAGS_KEY = "autotags"
+AUTOTAG_SCORE_THRESHOLD = "score_threshold"
 EXPORTED_ROWS = "exportedRows"
 CAMERA_PARAMS_KEY = "camera_params"
 CLASS_PDF_KEY = "class_pdf"

nucleus/dataset.py

Lines changed: 58 additions & 8 deletions
@@ -24,6 +24,7 @@
     NAME_KEY,
     REFERENCE_IDS_KEY,
     REQUEST_ID_KEY,
+    AUTOTAG_SCORE_THRESHOLD,
     UPDATE_KEY,
 )
 from .dataset_item import (

@@ -87,21 +88,55 @@ def items(self) -> List[DatasetItem]:
         return self._client.get_dataset_items(self.id)

     @sanitize_string_args
-    def autotag_scores(self, autotag_name, for_scores_greater_than=0):
-        """Export the autotag scores above a threshold, largest scores first.
+    def autotag_items(self, autotag_name, for_scores_greater_than=0):
+        """For a given Autotag of this dataset, export its tagged items with scores above a threshold, largest scores first.

-        If you have pandas installed, you can create a pandas dataframe using
+        :return: dictionary of the form
+            {
+                'autotagItems': {
+                    ref_id: str,
+                    score: float,
+                    model_prediction_annotation_id: str | None
+                    ground_truth_annotation_id: str | None,
+                }[],
+                'autotag': {
+                    id: str,
+                    name: str,
+                    status: 'started' | 'completed',
+                    autotag_level: 'Image' | 'Object'
+                }
+            }
+        See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-items for more details on the return types.
+        """
+        response = self._client.make_request(
+            payload={AUTOTAG_SCORE_THRESHOLD: for_scores_greater_than},
+            route=f"dataset/{self.id}/autotag/{autotag_name}/taggedItems",
+            requests_command=requests.get,
+        )
+        return response

-            pandas.Dataframe(dataset.autotag_scores(autotag_name))
+    def autotag_training_items(self, autotag_name):
+        """For a given Autotag of this dataset, export its training items. These are user selected positives during refinement.

         :return: dictionary of the form
-            {'ref_ids': List[str],
-             'datset_item_ids': List[str],
-             'score': List[float]}
+            {
+                'autotagPositiveTrainingItems': {
+                    ref_id: str,
+                    model_prediction_annotation_id: str | None,
+                    ground_truth_annotation_id: str | None,
+                }[],
+                'autotag': {
+                    id: str,
+                    name: str,
+                    status: 'started' | 'completed',
+                    autotag_level: 'Image' | 'Object'
+                }
+            }
+        See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-training-items for more details on the return types.
         """
         response = self._client.make_request(
             payload={},
-            route=f"autotag/{self.id}/{autotag_name}/{for_scores_greater_than}",
+            route=f"dataset/{self.id}/autotag/{autotag_name}/trainingItems",
             requests_command=requests.get,
         )
         return response

@@ -349,6 +384,21 @@ def loc(self, dataset_item_id: str) -> dict:
         response = self._client.dataitem_loc(self.id, dataset_item_id)
         return format_dataset_item_response(response)

+    def ground_truth_loc(self, reference_id: str, annotation_id: str):
+        """
+        Returns info for single ground truth Annotation by its id.
+        :param reference_id: User specified id for the dataset item the ground truth is attached to
+        :param annotation_id: User specified, or auto-generated id for the annotation
+        :return:
+            BoxAnnotation | PolygonAnnotation | CuboidAnnotation
+        """
+        response = self._client.make_request(
+            {},
+            f"dataset/{self.id}/groundTruth/loc/{reference_id}/{annotation_id}",
+            requests.get,
+        )
+        return Annotation.from_json(response)
+
     def create_slice(
         self,
         name: str,
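For reference, a minimal usage sketch of the new Dataset methods above. The API key, dataset id, autotag name, and item/annotation ids are hypothetical placeholders, and the calls assume a dataset that already has a completed Autotag and uploaded ground truth:

    import nucleus

    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")   # hypothetical key
    dataset = client.get_dataset("ds_c0ffee")              # hypothetical dataset id

    # Export items tagged by an Autotag, keeping only scores above 0.5.
    tagged = dataset.autotag_items("my_autotag", for_scores_greater_than=0.5)
    for item in tagged["autotagItems"]:
        print(item["ref_id"], item["score"])

    # Export the user-selected positive training items for the same Autotag.
    training = dataset.autotag_training_items("my_autotag")

    # Fetch a single ground truth annotation by reference id + annotation id.
    gt = dataset.ground_truth_loc("image_1", "annotation_1")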

nucleus/job.py

Lines changed: 1 addition & 1 deletion
@@ -40,13 +40,13 @@ def errors(self) -> List[str]:
     def sleep_until_complete(self, verbose_std_out=True):
         while 1:
             status = self.status()
-
             time.sleep(JOB_POLLING_INTERVAL)

             if verbose_std_out:
                 print(f"Status at {time.ctime()}: {status}")
             if status["status"] == "Running":
                 continue
+
             break

         final_status = status
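For context, a sketch of how this polling loop is typically driven from calling code; the dataset, items, and the JobError import path are assumptions taken from the usage in scripts/load_test.py further down:

    job = dataset.append(items, update=True, asynchronous=True)  # async job handle
    try:
        # Blocks, polling job.status() every JOB_POLLING_INTERVAL seconds.
        job.sleep_until_complete(verbose_std_out=True)
    except JobError:
        print(job.errors())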

nucleus/model_run.py

Lines changed: 18 additions & 0 deletions
@@ -19,6 +19,7 @@
     CuboidPrediction,
     PolygonPrediction,
     SegmentationPrediction,
+    from_json,
 )


@@ -160,6 +161,23 @@ def loc(self, dataset_item_id: str):
         )
         return self._format_prediction_response(response)

+    def prediction_loc(self, reference_id: str, annotation_id: str):
+        """
+        Returns info for single Prediction by its reference id and annotation id.
+        :param reference_id: the user specified id for the image
+        :param annotation_id: the user specified id for the prediction, or if one was not provided, the Scale internally generated id for the prediction
+        :return:
+            BoxPrediction | PolygonPrediction | CuboidPrediction
+        """
+
+        response = self._client.make_request(
+            {},
+            f"modelRun/{self.model_run_id}/prediction/loc/{reference_id}/{annotation_id}",
+            requests.get,
+        )
+
+        return from_json(response)
+
     def ungrouped_export(self):
         json_response = self._client.make_request(
             payload={},
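A short hedged sketch of calling the new prediction_loc method; the model run handle and the ids are placeholders:

    # model_run: an existing nucleus ModelRun handle, obtained elsewhere.
    prediction = model_run.prediction_loc("image_1", "prediction_1")

    # Depending on the stored prediction's type, this returns a BoxPrediction,
    # PolygonPrediction, CuboidPrediction, or SegmentationPrediction instance,
    # dispatched by the from_json helper in nucleus/prediction.py.
    print(type(prediction), prediction.to_payload())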

nucleus/prediction.py

Lines changed: 15 additions & 0 deletions
@@ -10,10 +10,14 @@
 )
 from .constants import (
     ANNOTATION_ID_KEY,
+    BOX_TYPE,
+    CUBOID_TYPE,
+    POLYGON_TYPE,
     REFERENCE_ID_KEY,
     METADATA_KEY,
     GEOMETRY_KEY,
     LABEL_KEY,
+    TYPE_KEY,
     X_KEY,
     Y_KEY,
     WIDTH_KEY,

@@ -29,6 +33,17 @@
 )


+def from_json(payload: dict):
+    if payload.get(TYPE_KEY, None) == BOX_TYPE:
+        return BoxPrediction.from_json(payload)
+    elif payload.get(TYPE_KEY, None) == POLYGON_TYPE:
+        return PolygonPrediction.from_json(payload)
+    elif payload.get(TYPE_KEY, None) == CUBOID_TYPE:
+        return CuboidPrediction.from_json(payload)
+    else:
+        return SegmentationPrediction.from_json(payload)
+
+
 class SegmentationPrediction(SegmentationAnnotation):
     # No need to define init or to_payload methods because
     # we default to functions defined in the parent class
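To illustrate the dispatch above, a hedged example payload; the literal key and type strings ("type", "box", "geometry", ...) are assumed values of the imported constants and of the usual Nucleus payload shape, not confirmed by this diff:

    payload = {
        "label": "car",
        "type": "box",                 # assumed value of TYPE_KEY / BOX_TYPE
        "geometry": {"x": 10, "y": 20, "width": 30, "height": 40},
        "reference_id": "image_1",
    }

    prediction = from_json(payload)
    # payload["type"] == "box", so this returns a BoxPrediction; unknown or
    # missing types fall through to SegmentationPrediction.from_json.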

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.19"
+version = "0.1.22"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

scripts/load_test.py

Lines changed: 65 additions & 22 deletions
@@ -4,6 +4,8 @@
 import nucleus
 import os

+from itertools import zip_longest
+
 import time


@@ -21,6 +23,8 @@
     "API Key to use. Defaults to NUCLEUS_PYTEST_API_KEY environment variable",
 )

+flags.DEFINE_integer("job_parallelism", 8, "Amount of concurrent jobs to use.")
+
 # Dataset upload flags
 flags.DEFINE_enum(
     "create_or_reuse_dataset",

@@ -35,12 +39,12 @@
 )
 flags.DEFINE_integer(
     "num_dataset_items",
-    100000,
+    10000000,
     "Number of dataset items to create if creating a dataset",
     lower_bound=0,
 )
 flags.DEFINE_bool(
-    "cleanup_dataset", True, "Whether to delete the dataset after the test."
+    "cleanup_dataset", False, "Whether to delete the dataset after the test."
 )

 # Annotation upload flags

@@ -54,11 +58,21 @@
 # Prediction upload flags
 flags.DEFINE_integer(
     "num_predictions_per_dataset_item",
-    0,
+    1,
     "Number of annotations per dataset item",
     lower_bound=0,
 )

+TIMINGS = {}
+
+
+def chunk(iterable, chunk_size, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * chunk_size
+
+    for chunk_iterable in zip_longest(*args, fillvalue=fillvalue):
+        yield filter(lambda x: x is not None, chunk_iterable)
+

 def client():
     return nucleus.NucleusClient(api_key=FLAGS.api_key)

@@ -126,15 +140,23 @@ def create_or_get_dataset():
         dataset = client().create_dataset("Privacy Mode Load Test Dataset")
         print("Starting dataset item upload")
         tic = time.time()
-        job = dataset.append(
-            dataset_item_generator(), update=True, asynchronous=True
-        )
-        try:
-            job.sleep_until_complete(False)
-        except JobError:
-            print(job.errors())
+        chunk_size = FLAGS.num_dataset_items // FLAGS.job_parallelism
+        jobs = []
+        for dataset_item_chunk in chunk(dataset_item_generator(), chunk_size):
+            jobs.append(
+                dataset.append(
+                    dataset_item_chunk, update=True, asynchronous=True
+                )
+            )
+
+        for job in jobs:
+            try:
+                job.sleep_until_complete(False)
+            except JobError:
+                print(job.errors())
         toc = time.time()
         print("Finished dataset item upload: %s" % (toc - tic))
+        TIMINGS[f"Dataset Item Upload {FLAGS.num_dataset_items}"] = toc - tic
     else:
         print(f"Reusing dataset {FLAGS.dataset_id}")
         dataset = client().get_dataset(FLAGS.dataset_id)

@@ -144,15 +166,26 @@ def create_or_get_dataset():

 def upload_annotations(dataset: Dataset):
     print("Starting annotation upload")
     tic = time.time()
-    job = dataset.annotate(
-        list(annotation_generator()), update=False, asynchronous=True
+    jobs = []
+    num_annotations = (
+        FLAGS.num_dataset_items * FLAGS.num_annotations_per_dataset_item
     )
-    try:
-        job.sleep_until_complete(False)
-    except JobError:
-        print(job.errors())
+    chunk_size = num_annotations // FLAGS.job_parallelism
+    for annotation_chunk in chunk(annotation_generator(), chunk_size):
+        jobs.append(
+            dataset.annotate(
+                list(annotation_chunk), update=False, asynchronous=True
+            )
+        )
+
+    for job in jobs:
+        try:
+            job.sleep_until_complete(False)
+        except JobError:
+            print(job.errors())
     toc = time.time()
     print("Finished annotation upload: %s" % (toc - tic))
+    TIMINGS[f"Annotation Upload {num_annotations}"] = toc - tic


 def upload_predictions(dataset: Dataset):

@@ -167,16 +200,24 @@ def upload_predictions(dataset: Dataset):

     print("Starting prediction upload")

-    job = run.predict(
-        list(prediction_generator()), update=True, asynchronous=True
+    num_predictions = (
+        FLAGS.num_dataset_items * FLAGS.num_predictions_per_dataset_item
     )
+    chunk_size = num_predictions // FLAGS.job_parallelism
+    jobs = []
+    for prediction_chunk in chunk(prediction_generator(), chunk_size):
+        jobs.append(
+            run.predict(list(prediction_chunk), update=True, asynchronous=True)
+        )

-    try:
-        job.sleep_until_complete(False)
-    except JobError:
-        print(job.errors())
+    for job in jobs:
+        try:
+            job.sleep_until_complete(False)
+        except JobError:
+            print(job.errors())
     toc = time.time()
     print("Finished prediction upload: %s" % (toc - tic))
+    TIMINGS[f"Prediction Upload {num_predictions}"] = toc - tic


 def main(unused_argv):

@@ -194,6 +235,8 @@ def main(unused_argv):
     if FLAGS.cleanup_dataset and FLAGS.create_or_reuse_dataset == "create":
         client().delete_dataset(dataset.id)

+    print(TIMINGS)
+

 if __name__ == "__main__":
     app.run(main)
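The chunk helper added above batches a generator into fixed-size groups so the uploads can be spread across job_parallelism asynchronous jobs. A standalone sketch of the same pattern, with the helper copied here so the example runs on its own:

    from itertools import zip_longest

    def chunk(iterable, chunk_size, fillvalue=None):
        "Collect data into fixed-length chunks or blocks"
        args = [iter(iterable)] * chunk_size
        for chunk_iterable in zip_longest(*args, fillvalue=fillvalue):
            yield filter(lambda x: x is not None, chunk_iterable)

    # 10 items in chunks of 4: the final chunk is padded with None, then filtered.
    for batch in chunk(range(10), 4):
        print(list(batch))
    # -> [0, 1, 2, 3]
    # -> [4, 5, 6, 7]
    # -> [8, 9]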

tests/test_annotation.py

Lines changed: 5 additions & 0 deletions
@@ -79,6 +79,11 @@ def test_box_gt_upload(dataset):
     assert response["annotations_ignored"] == 0

     response = dataset.refloc(annotation.reference_id)["annotations"]["box"]
+    single_annotation_response = dataset.ground_truth_loc(
+        annotation.reference_id, annotation.annotation_id
+    )
+
+    assert response[0] == single_annotation_response
     assert len(response) == 1
     response_annotation = response[0]
     assert_box_annotation_matches_dict(
