
Commit 64c7bd8

gatli and phil-scale authored
Speed up ModelCI tests and add annotations as required by the backend (#181)
* Speed up Model CI tests and add an evaluation test with missing predictions
* Fix create_annotations bug
* Add TODO to remove test
* Add annotation dependency to unit_test
* Update message

Co-authored-by: phil-scale <phil.chen@scale.com>
1 parent c5162e3 commit 64c7bd8

6 files changed: +181 additions, -52 deletions


nucleus/__init__.py

Lines changed: 13 additions & 4 deletions
@@ -35,7 +35,6 @@
 ]
 
 import os
-import time
 import warnings
 from typing import Dict, List, Optional, Sequence, Union
 
@@ -102,6 +101,7 @@
     DatasetItemRetrievalError,
     ModelCreationError,
     ModelRunCreationError,
+    NoAPIKey,
     NotFoundError,
     NucleusAPIError,
 )
@@ -150,11 +150,11 @@ class NucleusClient:
 
     def __init__(
         self,
-        api_key: str,
+        api_key: Optional[str] = None,
         use_notebook: bool = False,
         endpoint: str = None,
     ):
-        self.api_key = api_key
+        self.api_key = self._set_api_key(api_key)
         self.tqdm_bar = tqdm.tqdm
         if endpoint is None:
             self.endpoint = os.environ.get(
@@ -166,7 +166,6 @@ def __init__(
         if use_notebook:
             self.tqdm_bar = tqdm_notebook.tqdm
         self._connection = Connection(self.api_key, self.endpoint)
-
         self.modelci = ModelCI(self.api_key, self.endpoint)
 
     def __repr__(self):
@@ -936,3 +935,13 @@ def handle_bad_response(
         self._connection.handle_bad_response(
             endpoint, requests_command, requests_response, aiohttp_response
         )
+
+    def _set_api_key(self, api_key):
+        """Fetch API key from environment variable NUCLEUS_API_KEY if not set"""
+        api_key = (
+            api_key if api_key else os.environ.get("NUCLEUS_API_KEY", None)
+        )
+        if api_key is None:
+            raise NoAPIKey()
+
+        return api_key
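
With this change the api_key argument is optional: _set_api_key falls back to the NUCLEUS_API_KEY environment variable and raises NoAPIKey when neither is provided. A minimal sketch of client construction after this commit (the key string is a placeholder, not a real key):

import os

from nucleus import NucleusClient

# Assumption for illustration: a placeholder key is exported in the environment,
# e.g. `export NUCLEUS_API_KEY=...` in the shell before starting Python.
os.environ["NUCLEUS_API_KEY"] = "placeholder-api-key"

client = NucleusClient()  # key resolved from NUCLEUS_API_KEY via _set_api_key()
explicit_client = NucleusClient(api_key="placeholder-api-key")  # still supported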

nucleus/errors.py

Lines changed: 9 additions & 0 deletions
@@ -62,3 +62,12 @@ def __init__(
         message += "\n This likely indicates temporary downtime of the API, please try again in a minute or two"
 
         super().__init__(message)
+
+
+class NoAPIKey(Exception):
+    def __init__(
+        self,
+        message="You need to pass an API key to the NucleusClient or set the environment variable NUCLEUS_API_KEY",
+    ):
+        self.message = message
+        super().__init__(self.message)
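
The new NoAPIKey error makes the failure mode explicit when no key can be found. A small sketch of how a caller might surface it, assuming the class is imported from nucleus.errors as added above:

from nucleus import NucleusClient
from nucleus.errors import NoAPIKey

try:
    client = NucleusClient()  # no api_key argument and NUCLEUS_API_KEY unset
except NoAPIKey as err:
    print(err.message)  # tells the user to pass a key or set NUCLEUS_API_KEY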

tests/modelci/conftest.py

Lines changed: 57 additions & 25 deletions
@@ -2,11 +2,48 @@
 
 import pytest
 
-from tests.helpers import TEST_MODEL_NAME, TEST_SLICE_NAME, get_uuid
+from nucleus import BoxAnnotation
+from tests.helpers import (
+    TEST_BOX_ANNOTATIONS,
+    TEST_MODEL_NAME,
+    TEST_SLICE_NAME,
+    get_uuid,
+)
+from tests.modelci.helpers import create_box_annotations, create_predictions
 from tests.test_dataset import make_dataset_items
 
 
-@pytest.fixture()
+@pytest.fixture(scope="module")
+def modelci_dataset(CLIENT):
+    """SHOULD NOT BE MUTATED IN TESTS. This dataset lives for the whole test module scope."""
+    ds = CLIENT.create_dataset("[Test Model CI] Dataset", is_scene=False)
+    yield ds
+
+    CLIENT.delete_dataset(ds.id)
+
+
+@pytest.fixture(scope="module")
+def dataset_items(modelci_dataset):
+    items = make_dataset_items()
+    modelci_dataset.append(items)
+    yield items
+
+
+@pytest.fixture(scope="module")
+def slice_items(dataset_items):
+    yield dataset_items[:2]
+
+
+@pytest.fixture(scope="module")
+def test_slice(modelci_dataset, slice_items):
+    slc = modelci_dataset.create_slice(
+        name=TEST_SLICE_NAME,
+        reference_ids=[item.reference_id for item in slice_items],
+    )
+    yield slc
+
+
+@pytest.fixture(scope="module")
 def model(CLIENT):
     model_reference = "model_" + str(time.time())
     model = CLIENT.create_model(TEST_MODEL_NAME, model_reference)
@@ -15,34 +52,29 @@ def model(CLIENT):
     CLIENT.delete_model(model.id)
 
 
-@pytest.fixture()
-def unit_test(CLIENT, dataset):
-    items = make_dataset_items()
-    dataset.append(items)
+@pytest.fixture(scope="module")
+def annotations(modelci_dataset, slice_items):
+    annotations = create_box_annotations(modelci_dataset, slice_items)
+    yield annotations
+
+
+@pytest.fixture(scope="module")
+def predictions(model, modelci_dataset, annotations):
+    predictions = create_predictions(modelci_dataset, model, annotations)
+    yield predictions
+
+
+@pytest.fixture(scope="module")
+@pytest.mark.usefixtures(
+    "annotations"
+)  # Unit test needs to have annotations in the slice
+def unit_test(CLIENT, test_slice):
     test_name = "unit_test_" + get_uuid()  # use uuid to make unique
-    slc = dataset.create_slice(
-        name=TEST_SLICE_NAME,
-        reference_ids=[items[0].reference_id],
-    )
     unit_test = CLIENT.modelci.create_unit_test(
         name=test_name,
-        slice_id=slc.id,
+        slice_id=test_slice.id,
         evaluation_criteria=[CLIENT.modelci.eval_functions.bbox_recall > 0.5],
     )
     yield unit_test
 
     CLIENT.modelci.delete_unit_test(unit_test.id)
-
-
-@pytest.fixture()
-def test_slice(CLIENT, dataset):
-    items = make_dataset_items()
-    dataset.append(items)
-    slice_name = TEST_SLICE_NAME + f"_{get_uuid()}"
-    slc = dataset.create_slice(
-        name=slice_name,
-        reference_ids=[items[0].reference_id],
-    )
-    yield slc
-
-    CLIENT.delete_slice(slc.id)
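
These module-scoped fixtures are where most of the speed-up comes from: the dataset, slice, annotations, model, and predictions are created once per test module instead of once per test, and the dataset, model, and unit test clean up after themselves in teardown. A hypothetical test using the shared fixtures (illustration only, not part of this commit) could look like:

import pytest

@pytest.mark.integration
def test_example_with_shared_fixtures(CLIENT, unit_test, predictions):
    # unit_test is built on the module-scoped slice that already has annotations;
    # predictions were uploaded once for the module-scoped model.
    assert unit_test.id is not None
    assert len(predictions) > 0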

tests/modelci/helpers.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+from typing import List
+
+from nucleus import BoxAnnotation, BoxPrediction, Dataset, DatasetItem, Model
+
+
+def create_box_annotations(
+    dataset: Dataset, dataset_items: List[DatasetItem]
+) -> List[BoxAnnotation]:
+    annotations = [
+        BoxAnnotation(
+            label=f"[Pytest] Box Annotation {ds_item.reference_id}",
+            x=50 + i * 10,
+            y=60 + i * 10,
+            width=70 + i * 10,
+            height=80 + i * 10,
+            reference_id=ds_item.reference_id,
+            annotation_id=f"[Pytest] Box Annotation Annotation Id{i}",
+        )
+        for i, ds_item in enumerate(dataset_items)
+    ]
+    dataset.annotate(annotations)
+    return annotations
+
+
+def create_predictions(
+    dataset: Dataset, model: Model, annotations: List[BoxAnnotation]
+) -> List[BoxPrediction]:
+    predictions = [
+        BoxPrediction(
+            label=ann.label,
+            x=ann.x,
+            y=ann.y,
+            width=ann.width,
+            height=ann.height,
+            reference_id=ann.reference_id,
+            confidence=0.1 * i,
+        )
+        for i, ann in enumerate(annotations)
+    ]
+    dataset.upload_predictions(model, predictions)
+    return predictions
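
These helpers create one deterministic box annotation per dataset item and mirror each annotation as a prediction with a distinct confidence, so evaluation results are predictable. A hedged usage sketch, assuming dataset, model, and dataset_items (a list of DatasetItem objects already appended to the dataset) exist:

from tests.modelci.helpers import create_box_annotations, create_predictions

annotations = create_box_annotations(dataset, dataset_items)  # uploads via dataset.annotate()
predictions = create_predictions(dataset, model, annotations)  # uploads via dataset.upload_predictions()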

tests/modelci/test_unit_test.py

Lines changed: 3 additions & 12 deletions
@@ -5,10 +5,8 @@
 from tests.helpers import (
     EVAL_FUNCTION_COMPARISON,
     EVAL_FUNCTION_THRESHOLD,
-    TEST_SLICE_NAME,
     get_uuid,
 )
-from tests.test_dataset import make_dataset_items
 
 
 def test_unit_test_metric_creation(CLIENT, unit_test):
@@ -42,23 +40,16 @@ def test_list_unit_test(CLIENT, test_slice):
     CLIENT.modelci.delete_unit_test(unit_test.id)
 
 
-def test_unit_test_items(CLIENT, dataset):
-    # create some dataset_items for the unit test to reference
-    items = make_dataset_items()
-    dataset.append(items)
+def test_unit_test_items(CLIENT, test_slice, slice_items, annotations):
     test_name = "unit_test_" + get_uuid()  # use uuid to make unique
-    slc = dataset.create_slice(
-        name=TEST_SLICE_NAME,
-        reference_ids=[item.reference_id for item in items],
-    )
 
     unit_test = CLIENT.modelci.create_unit_test(
         name=test_name,
-        slice_id=slc.id,
+        slice_id=test_slice.id,
         evaluation_criteria=[CLIENT.modelci.eval_functions.bbox_iou() > 0.5],
     )
 
-    expected_items_locations = [item.image_location for item in items]
+    expected_items_locations = [item.image_location for item in slice_items]
     actual_items_locations = [
         item.image_location for item in unit_test.get_items()
     ]

tests/modelci/test_unit_test_evaluation.py

Lines changed: 58 additions & 11 deletions
@@ -1,26 +1,73 @@
 import pytest
 
-from nucleus import BoxAnnotation, BoxPrediction
 from nucleus.job import AsyncJob
 from nucleus.modelci.unit_test_evaluation import (
     UnitTestEvaluation,
     UnitTestItemEvaluation,
 )
-from tests.helpers import (
-    EVAL_FUNCTION_THRESHOLD,
-    TEST_BOX_ANNOTATIONS,
-    TEST_BOX_PREDICTIONS,
-)
+from tests.helpers import EVAL_FUNCTION_THRESHOLD, get_uuid
+from tests.modelci.helpers import create_predictions
 
 
 @pytest.mark.integration
-def test_unit_test_evaluation(CLIENT, dataset, model, unit_test):
-    annotations = [BoxAnnotation(**TEST_BOX_ANNOTATIONS[0])]
-    dataset.annotate(annotations=annotations)
-    predictions = [BoxPrediction(**TEST_BOX_PREDICTIONS[0])]
-    dataset.upload_predictions(model, predictions)
+def test_unit_test_evaluation(
+    CLIENT, modelci_dataset, model, unit_test, annotations, predictions
+):
+    iou = CLIENT.modelci.eval_functions.bbox_iou
+    # NOTE: Another criterion is defined in the unit_test fixture
+    unit_test.add_criterion(iou() > EVAL_FUNCTION_THRESHOLD)
+
+    job: AsyncJob = CLIENT.modelci.evaluate_model_on_unit_tests(
+        model.id, [unit_test.name]
+    )
+    job.sleep_until_complete()
 
+    criteria = unit_test.get_criteria()
+    evaluations = unit_test.get_eval_history()
+    assert isinstance(evaluations, list)
+    assert len(evaluations) == len(criteria)
+    assert all(
+        isinstance(evaluation, UnitTestEvaluation)
+        for evaluation in evaluations
+    )
+    assert all(
+        evaluation.unit_test_id == unit_test.id for evaluation in evaluations
+    )
+    assert all(evaluation.model_id == model.id for evaluation in evaluations)
+
+    unit_test_slice = CLIENT.get_slice(unit_test.slice_id)
+    item_evaluations = evaluations[0].item_evals
+    assert isinstance(item_evaluations, list)
+    assert len(item_evaluations) == len(
+        unit_test_slice.items_and_annotations()
+    )
+    assert isinstance(item_evaluations[0], UnitTestItemEvaluation)
+    assert all(
+        eval.evaluation_id == evaluations[0].id for eval in item_evaluations
+    )
+    assert all(eval.unit_test_id == unit_test.id for eval in item_evaluations)
+
+
+@pytest.mark.integration
+@pytest.mark.xfail(
+    reason="Missing predictions is currently treated as failure in evaluation."
+)
+@pytest.mark.skip
+def test_unit_test_evaluation_no_prediction_for_last_item(
+    # TODO(gunnar): Remove this slow integration test after this is confirmed and tested on the evaluation side.
+    # There's no reason to do unit testing for evaluation here.
+    CLIENT,
+    modelci_dataset,
+    unit_test,
+    annotations,
+):
+    uuid = get_uuid()
+    model = CLIENT.create_model(
+        f"[Model CI Test] {uuid}", reference_id=f"model_ci_{uuid}"
+    )
+    create_predictions(modelci_dataset, model, annotations[:-1])
     iou = CLIENT.modelci.eval_functions.bbox_iou
+    # NOTE: Another criterion is defined in the unit_test fixture
     unit_test.add_criterion(iou() > EVAL_FUNCTION_THRESHOLD)
 
     job: AsyncJob = CLIENT.modelci.evaluate_model_on_unit_tests(
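
Taken together, the updated tests exercise the Model CI evaluation flow end to end. A condensed sketch of that flow, using only calls that appear in this diff (my_slice and my_model stand in for objects created elsewhere):

unit_test = CLIENT.modelci.create_unit_test(
    name="my_unit_test",
    slice_id=my_slice.id,
    evaluation_criteria=[CLIENT.modelci.eval_functions.bbox_recall > 0.5],
)
unit_test.add_criterion(CLIENT.modelci.eval_functions.bbox_iou() > 0.5)

job = CLIENT.modelci.evaluate_model_on_unit_tests(my_model.id, [unit_test.name])
job.sleep_until_complete()

evaluations = unit_test.get_eval_history()  # the tests above assert one evaluation per criterion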
