Commit 05173ab

add check for duplicate ref_id, ann_id (#296)
1 parent 523ca50 commit 05173ab

9 files changed: +84 -22 lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,12 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.10.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.10.8) - 2022-05-10
+
+### Fixed
+- Add checks for duplicate (`reference_id`, `annotation_id`) when uploading Annotations or Predictions
+
+
 ## [0.10.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.10.7) - 2022-05-09
 
 ### Fixed
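
For context on the changelog entry, a minimal sketch of the new behavior from the caller's side; the BoxAnnotation fields and values below are illustrative, not taken from this commit:

# Illustrative sketch of the 0.10.8 behavior; field values are made up.
from nucleus import BoxAnnotation

ann = BoxAnnotation(
    label="car",
    x=0,
    y=0,
    width=10,
    height=10,
    reference_id="image_1",
    annotation_id="ann_1",
)

# Two annotations sharing the same (reference_id, annotation_id) pair in one
# upload now raise DuplicateIDError instead of being accepted silently:
# dataset.annotate(annotations=[ann, ann])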

nucleus/annotation_uploader.py

Lines changed: 22 additions & 0 deletions
@@ -1,4 +1,5 @@
 import json
+from collections import Counter
 from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence
 
 from nucleus.annotation import Annotation, SegmentationAnnotation
@@ -8,6 +9,7 @@
     make_many_form_data_requests_concurrently,
 )
 from nucleus.constants import MASK_TYPE, SERIALIZED_REQUEST_KEY
+from nucleus.errors import DuplicateIDError
 from nucleus.payload_constructor import (
     construct_annotation_payload,
     construct_segmentation_payload,
@@ -208,6 +210,26 @@ def fn():
 
         return fn
 
+    @staticmethod
+    def check_for_duplicate_ids(annotations: Iterable[Annotation]):
+        """Do not allow annotations to have the same (annotation_id, reference_id) tuple"""
+
+        # Some annotations, like CategoryAnnotation, have no annotation_id attribute, so duplicates are allowed for them.
+        tuple_ids = [
+            (ann.reference_id, ann.annotation_id)  # type: ignore
+            for ann in annotations
+            if hasattr(ann, "annotation_id")
+        ]
+        tuple_count = Counter(tuple_ids)
+        duplicates = {key for key, value in tuple_count.items() if value > 1}
+        if len(duplicates) > 0:
+            raise DuplicateIDError(
+                f"Duplicate annotations with the same (reference_id, annotation_id) properties found.\n"
+                f"Duplicates: {duplicates}\n"
+                f"To fix this, avoid duplicate annotations, or specify a different annotation_id attribute "
+                f"for the failing items."
+            )
+
 
 class PredictionUploader(AnnotationUploader):
     def __init__(
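
To see the Counter logic above in isolation, here is a self-contained sketch runnable outside the client; FakeAnnotation is a hypothetical stand-in for the client's annotation classes:

# Self-contained sketch of the duplicate check above; FakeAnnotation is a
# hypothetical stand-in for the client's Annotation classes.
from collections import Counter
from typing import Iterable, NamedTuple, Set, Tuple


class FakeAnnotation(NamedTuple):
    reference_id: str
    annotation_id: str


def find_duplicates(annotations: Iterable[FakeAnnotation]) -> Set[Tuple[str, str]]:
    # Count each (reference_id, annotation_id) pair and keep those seen more than once.
    counts = Counter(
        (ann.reference_id, ann.annotation_id)
        for ann in annotations
        if hasattr(ann, "annotation_id")
    )
    return {key for key, value in counts.items() if value > 1}


anns = [
    FakeAnnotation("img_1", "a"),
    FakeAnnotation("img_1", "a"),
    FakeAnnotation("img_2", "a"),
]
print(find_duplicates(anns))  # {('img_1', 'a')}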

nucleus/dataset.py

Lines changed: 21 additions & 16 deletions
@@ -389,6 +389,9 @@ def annotate(
 
             Otherwise, returns an :class:`AsyncJob` object.
         """
+        uploader = AnnotationUploader(dataset_id=self.id, client=self._client)
+        uploader.check_for_duplicate_ids(annotations)
+
         if asynchronous:
             check_all_mask_paths_remote(annotations)
             request_id = serialize_and_write_to_presigned_url(
@@ -399,7 +402,7 @@
                 route=f"dataset/{self.id}/annotate?async=1",
             )
             return AsyncJob.from_json(response, self._client)
-        uploader = AnnotationUploader(dataset_id=self.id, client=self._client)
+
         return uploader.upload(
             annotations=annotations,
             update=update,
@@ -1405,6 +1408,14 @@ def upload_predictions(
                 "predictions_ignored": int,
             }
         """
+        uploader = PredictionUploader(
+            model_run_id=None,
+            dataset_id=self.id,
+            model_id=model.id,
+            client=self._client,
+        )
+        uploader.check_for_duplicate_ids(predictions)
+
         if asynchronous:
             check_all_mask_paths_remote(predictions)
 
@@ -1416,21 +1427,15 @@
                 route=f"dataset/{self.id}/model/{model.id}/uploadPredictions?async=1",
             )
             return AsyncJob.from_json(response, self._client)
-        else:
-            uploader = PredictionUploader(
-                model_run_id=None,
-                dataset_id=self.id,
-                model_id=model.id,
-                client=self._client,
-            )
-            return uploader.upload(
-                annotations=predictions,
-                batch_size=batch_size,
-                update=update,
-                remote_files_per_upload_request=remote_files_per_upload_request,
-                local_files_per_upload_request=local_files_per_upload_request,
-                local_file_upload_concurrency=local_file_upload_concurrency,
-            )
+
+        return uploader.upload(
+            annotations=predictions,
+            batch_size=batch_size,
+            update=update,
+            remote_files_per_upload_request=remote_files_per_upload_request,
+            local_files_per_upload_request=local_files_per_upload_request,
+            local_file_upload_concurrency=local_file_upload_concurrency,
+        )
 
     def predictions_iloc(self, model, index):
         """Fetches all predictions of a dataset item by its absolute index.

nucleus/errors.py

Lines changed: 6 additions & 0 deletions
@@ -72,3 +72,9 @@ def __init__(
     ):
         self.message = message
         super().__init__(self.message)
+
+
+class DuplicateIDError(Exception):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
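
DuplicateIDError subclasses Exception and keeps the formatted message on .message, so callers can catch it and surface the offending pairs. A small usage sketch, assuming scale-nucleus 0.10.8 or later is installed:

from nucleus.errors import DuplicateIDError

# The error stores the formatted message on .message as well as in the
# Exception args, so either can be shown to the user.
try:
    raise DuplicateIDError("Duplicates: {('image_1', 'ann_1')}")
except DuplicateIDError as err:
    print(err.message)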

nucleus/model_run.py

Lines changed: 6 additions & 3 deletions
@@ -154,6 +154,11 @@ def predict(
                 "predictions_ignored": int,
             }
         """
+        uploader = PredictionUploader(
+            model_run_id=self.model_run_id, client=self._client
+        )
+        uploader.check_for_duplicate_ids(annotations)
+
         if asynchronous:
             check_all_mask_paths_remote(annotations)
 
@@ -165,9 +170,7 @@
                 route=f"modelRun/{self.model_run_id}/predict?async=1",
             )
             return AsyncJob.from_json(response, self._client)
-        uploader = PredictionUploader(
-            model_run_id=self.model_run_id, client=self._client
-        )
+
         return uploader.upload(
             annotations=annotations,
             update=update,

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ exclude = '''
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.10.7"
+version = "0.10.8"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/test_annotation.py

Lines changed: 8 additions & 0 deletions
@@ -15,6 +15,7 @@
     SegmentationAnnotation,
 )
 from nucleus.constants import ERROR_PAYLOAD
+from nucleus.errors import DuplicateIDError
 from nucleus.job import AsyncJob, JobError
 
 from .helpers import (
@@ -813,3 +814,10 @@ def test_box_gt_upload_embedding_async(CLIENT, dataset):
     status = job.status()
     assert status["job_id"] == job.job_id
     assert status["status"] == "Running"
+
+
+def test_annotation_duplicate_ids_fail(dataset):
+    box_ann = BoxAnnotation(**TEST_BOX_ANNOTATIONS[0])
+    annotations = [box_ann, box_ann]
+    with pytest.raises(DuplicateIDError):
+        dataset.annotate(annotations=annotations)

tests/test_dataset.py

Lines changed: 2 additions & 1 deletion
@@ -314,7 +314,8 @@ def test_dataset_append_async_with_local_path(dataset: Dataset):
     dataset.append(ds_items, asynchronous=True)
 
 
-@pytest.mark.integration
+# TODO(Jean): Fix and remove skip, this is a flaky test
+@pytest.mark.skip(reason="Flaky test")
 def test_dataset_append_async_with_1_bad_url(dataset: Dataset):
     ds_items = make_dataset_items()
     ds_items[0].image_location = "https://looks.ok.but.is.not.accessible"

tests/test_prediction.py

Lines changed: 12 additions & 1 deletion
@@ -1,4 +1,3 @@
-import os
 import time
 
 import pytest
@@ -16,6 +15,7 @@
     SegmentationPrediction,
 )
 from nucleus.constants import ERROR_PAYLOAD
+from nucleus.errors import DuplicateIDError
 from nucleus.job import AsyncJob, JobError
 
 from .helpers import (
@@ -724,3 +724,14 @@ def test_box_pred_upload_embedding_async(CLIENT, model_run):
     status = job.status()
     assert status["job_id"] == job.job_id
     assert status["status"] == "Running"
+
+
+def test_prediction_duplicate_ids_fail(dataset, model, model_run):
+    box_pred = BoxPrediction(**TEST_BOX_PREDICTIONS_EMBEDDINGS[0])
+    predictions = [box_pred, box_pred]
+
+    with pytest.raises(DuplicateIDError):
+        dataset.upload_predictions(model, predictions=predictions)
+
+    with pytest.raises(DuplicateIDError):
+        model_run.predict(annotations=predictions)
