
Commit 64ada34

Merge pull request #71 from scaleapi/da/async_annotations
Da/async annotations
2 parents b664115 + 690cb9b commit 64ada34

File tree: 6 files changed, +156 −17 lines

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ jobs:
       name: Pytest Test Cases
       command: | # Run test suite, uses NUCLEUS_TEST_API_KEY env variable
         mkdir test_results
-        poetry run coverage run --include=nucleus/* -m pytest --junitxml=test_results/junit.xml
+        poetry run coverage run --include=nucleus/* -m pytest -s -v --junitxml=test_results/junit.xml
         poetry run coverage report
         poetry run coverage html

README.md

Lines changed: 45 additions & 8 deletions
@@ -6,15 +6,13 @@ Aggregate metrics in ML are not good enough. To improve production ML, you need

 Scale Nucleus helps you:

-* Visualize your data
-* Curate interesting slices within your dataset
-* Review and manage annotations
-* Measure and debug your model performance
+- Visualize your data
+- Curate interesting slices within your dataset
+- Review and manage annotations
+- Measure and debug your model performance

 Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios.

-
-
 ## Installation

 `$ pip install scale-nucleus`
@@ -26,65 +24,83 @@ The client abstractions serves to authenticate the user and act as the gateway
 for users to interact with their datasets, models, and model runs.

 ### Create a client object
+
 ```python
 import nucleus
 client = nucleus.NucleusClient("YOUR_API_KEY_HERE")
 ```

 ### Create Dataset
+
 ```python
 dataset = client.create_dataset("My Dataset")
 ```

 ### List Datasets
+
 ```python
 datasets = client.list_datasets()
 ```

 ### Delete a Dataset
+
 By specifying target dataset id.
 A response code of 200 indicates successful deletion.
+
 ```python
 client.delete_dataset("YOUR_DATASET_ID")
 ```

 ### Append Items to a Dataset
+
 You can append both local images and images from the web. Simply specify the location and Nucleus will automatically infer if it's remote or a local file.
+
 ```python
 dataset_item_1 = DatasetItem(image_location="./1.jpeg", reference_id="1", metadata={"key": "value"})
 dataset_item_2 = DatasetItem(image_location="s3://srikanth-nucleus/9-1.jpg", reference_id="2", metadata={"key": "value"})
 ```

 The append function expects a list of `DatasetItem` objects to upload, like this:
+
 ```python
 response = dataset.append([dataset_item_1, dataset_item_2])
 ```

 ### Get Dataset Info
+
 Tells us the dataset name, number of dataset items, model_runs, and slice_ids.
+
 ```python
 dataset.info
 ```

 ### Access Dataset Items
+
 There are three methods to access individual Dataset Items:

 (1) Dataset Items are accessible by reference id
+
 ```python
 item = dataset.refloc("my_img_001.png")
 ```
+
 (2) Dataset Items are accessible by index
+
 ```python
 item = dataset.iloc(0)
 ```
+
 (3) Dataset Items are accessible by the dataset_item_id assigned internally
+
 ```python
 item = dataset.loc("dataset_item_id")
 ```

 ### Add Annotations
+
 Upload groundtruth annotations for the items in your dataset.
 Box2DAnnotation has same format as https://dashboard.scale.com/nucleus/docs/api#add-ground-truth
+
 ```python
 annotation_1 = BoxAnnotation(reference_id="1", label="label", x=0, y=0, width=10, height=10, annotation_id="ann_1", metadata={})
 annotation_2 = BoxAnnotation(reference_id="2", label="label", x=0, y=0, width=10, height=10, annotation_id="ann_2", metadata={})
@@ -94,6 +110,7 @@ response = dataset.annotate([annotation_1, annotation_2])
 For particularly large payloads, please reference the accompanying scripts in **references**

 ### Add Model
+
 The model abstraction is intended to represent a unique architecture.
 Models are independent of any dataset.

@@ -102,10 +119,12 @@ model = client.add_model(name="My Model", reference_id="newest-cnn-its-new", met
 ```

 ### Upload Predictions to ModelRun
+
 This method populates the model_run object with predictions. `ModelRun` objects need to reference a `Dataset` that has been created.
 Returns the associated model_id, human-readable name of the run, status, and user specified metadata.
 Takes a list of Box2DPredictions within the payload, where Box2DPrediction
 is formulated as in https://dashboard.scale.com/nucleus/docs/api#upload-model-outputs
+
 ```python
 prediction_1 = BoxPrediction(reference_id="1", label="label", x=0, y=0, width=10, height=10, annotation_id="pred_1", confidence=0.9)
 prediction_2 = BoxPrediction(reference_id="2", label="label", x=0, y=0, width=10, height=10, annotation_id="pred_2", confidence=0.2)
@@ -114,39 +133,51 @@ model_run = model.create_run(name="My Model Run", metadata={"timestamp": "121012
 ```

 ### Commit ModelRun
+
 The commit action indicates that the user is finished uploading predictions associated
-with this model run. Committing a model run kicks off Nucleus internal processes
+with this model run. Committing a model run kicks off Nucleus internal processes
 to calculate performance metrics like IoU. After being committed, a ModelRun object becomes immutable.
+
 ```python
 model_run.commit()
 ```

 ### Get ModelRun Info
+
 Returns the associated model_id, human-readable name of the run, status, and user specified metadata.
+
 ```python
 model_run.info
 ```

 ### Accessing ModelRun Predictions
+
 You can access the modelRun predictions for an individual dataset_item through three methods:

 (1) user specified reference_id
+
 ```python
 model_run.refloc("my_img_001.png")
 ```
+
 (2) Index
+
 ```python
 model_run.iloc(0)
 ```
+
 (3) Internally maintained dataset_item_id
+
 ```python
 model_run.loc("dataset_item_id")
 ```

 ### Delete ModelRun
+
 Delete a model run using the target model_run_id.

 A response code of 200 indicates successful deletion.
+
 ```python
 client.delete_model_run("model_run_id")
 ```
@@ -163,14 +194,20 @@ poetry install
 ```

 Please install the pre-commit hooks by running the following command:
+
 ```python
 poetry run pre-commit install
 ```

 **Best practices for testing:**
 (1). Please run pytest from the root directory of the repo, i.e.
+
 ```
-poetry pytest tests/test_dataset.py
+poetry run pytest tests/test_dataset.py
 ```

+(2) To skip slow integration tests that have to wait for an async job to start.
+
+```
+poetry run pytest -m "not integration"
+```

nucleus/dataset.py

Lines changed: 17 additions & 2 deletions
@@ -8,14 +8,15 @@
     serialize_and_write_to_presigned_url,
 )

-from .annotation import Annotation
+from .annotation import Annotation, check_all_annotation_paths_remote
 from .constants import (
     DATASET_ITEM_IDS_KEY,
     DATASET_LENGTH_KEY,
     DATASET_MODEL_RUNS_KEY,
     DATASET_NAME_KEY,
     DATASET_SLICES_KEY,
     DEFAULT_ANNOTATION_UPDATE_MODE,
+    JOB_ID_KEY,
     NAME_KEY,
     REFERENCE_IDS_KEY,
     REQUEST_ID_KEY,
@@ -143,7 +144,8 @@ def annotate(
         annotations: List[Annotation],
         update: Optional[bool] = DEFAULT_ANNOTATION_UPDATE_MODE,
         batch_size: int = 5000,
-    ) -> dict:
+        asynchronous: bool = False,
+    ) -> Union[Dict[str, Any], AsyncJob]:
         """
         Uploads ground truth annotations for a given dataset.
         :param annotations: ground truth annotations for a given dataset to upload
@@ -156,6 +158,19 @@ def annotate(
             "ignored_items": int,
         }
         """
+        if asynchronous:
+            check_all_annotation_paths_remote(annotations)
+
+            request_id = serialize_and_write_to_presigned_url(
+                annotations, self.id, self._client
+            )
+            response = self._client.make_request(
+                payload={REQUEST_ID_KEY: request_id, UPDATE_KEY: update},
+                route=f"dataset/{self.id}/annotate?async=1",
+            )
+
+            return AsyncJob(response[JOB_ID_KEY], self._client)
+
         return self._client.annotate_dataset(
             self.id, annotations, update=update, batch_size=batch_size
         )
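
Taken together with the tests below, the new flag lets `Dataset.annotate` return an `AsyncJob` handle instead of a blocking response. A minimal usage sketch, assuming the dataset already contains an item with reference_id "1" and that all annotation payloads are remote (asynchronous uploads reject local paths via `check_all_annotation_paths_remote`):

```python
import nucleus
from nucleus.annotation import BoxAnnotation

client = nucleus.NucleusClient("YOUR_API_KEY_HERE")
dataset = client.create_dataset("My Dataset")
# ... append items with reference_id "1" first, as in the README section above ...

bbox = BoxAnnotation(reference_id="1", label="label", x=0, y=0,
                     width=10, height=10, annotation_id="ann_1", metadata={})

# asynchronous=True serializes the annotations to a presigned URL and returns
# an AsyncJob rather than the usual response dict.
job = dataset.annotate(annotations=[bbox], asynchronous=True)
job.sleep_until_complete()  # poll until the server-side job finishes
print(job.status())         # job_id, status, and per-upload counts
print(job.errors())         # per-item errors, if any
```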

pyproject.toml

Lines changed: 7 additions & 2 deletions
@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.5"
+version = "0.1.16"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
@@ -35,7 +35,7 @@ packages = [{include="nucleus"}]
 python = "^3.6.2"
 grequests = "^0.6.0"
 requests = "^2.25.1"
-tqdm = "^4.60.0"
+tqdm = "^4.41.0"
 dataclasses = { version = "^0.7", python = "^3.6.1, <3.7" }

 [tool.poetry.dev-dependencies]
@@ -48,6 +48,11 @@ mypy = "^0.812"
 coverage = "^5.5"
 pre-commit = "^2.12.1"

+[tool.pytest.ini_options]
+markers = [
+    "integration: marks tests as slow (deselect with '-m \"not integration\"')",
+]
+

 [build-system]
 requires = ["poetry-core>=1.0.0"]

tests/test_dataset.py

Lines changed: 83 additions & 1 deletion
@@ -1,8 +1,16 @@
-from nucleus.job import JobError
+from nucleus.annotation import (
+    BoxAnnotation,
+    PolygonAnnotation,
+    SegmentationAnnotation,
+)
+from nucleus.job import AsyncJob, JobError
 import pytest
 import os

 from .helpers import (
+    TEST_BOX_ANNOTATIONS,
+    TEST_POLYGON_ANNOTATIONS,
+    TEST_SEGMENTATION_ANNOTATIONS,
     TEST_SLICE_NAME,
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
@@ -136,6 +144,7 @@ def test_dataset_append_local(CLIENT, dataset):
     assert ERROR_PAYLOAD not in resp_json


+@pytest.mark.integration
 def test_dataset_append_async(dataset: Dataset):
     job = dataset.append(make_dataset_items(), asynchronous=True)
     job.sleep_until_complete()
@@ -165,6 +174,7 @@ def test_dataset_append_async_with_local_path(dataset: Dataset):
         dataset.append(ds_items, asynchronous=True)


+@pytest.mark.integration
 def test_dataset_append_async_with_1_bad_url(dataset: Dataset):
     ds_items = make_dataset_items()
     ds_items[0].image_location = "https://looks.ok.but.is.not.accessible"
@@ -238,3 +248,75 @@ def test_dataset_export_autotag_scores(CLIENT):
     for column in ["dataset_item_ids", "ref_ids", "scores"]:
         assert column in scores
         assert len(scores[column]) > 0
+
+
+@pytest.mark.integration
+def test_annotate_async(dataset: Dataset):
+    dataset.append(make_dataset_items())
+    semseg = SegmentationAnnotation.from_json(TEST_SEGMENTATION_ANNOTATIONS[0])
+    polygon = PolygonAnnotation(**TEST_POLYGON_ANNOTATIONS[0])
+    bbox = BoxAnnotation(**TEST_BOX_ANNOTATIONS[0])
+
+    job: AsyncJob = dataset.annotate(
+        annotations=[semseg, polygon, bbox],
+        asynchronous=True,
+    )
+    job.sleep_until_complete()
+    assert job.status() == {
+        "job_id": job.id,
+        "status": "Completed",
+        "message": {
+            "annotation_upload": {
+                "epoch": 1,
+                "total": 2,
+                "errored": 0,
+                "ignored": 0,
+                "datasetId": dataset.id,
+                "processed": 2,
+            },
+            "segmentation_upload": {
+                "errors": [],
+                "ignored": 0,
+                "n_errors": 0,
+                "processed": 1,
+            },
+        },
+    }
+
+
+@pytest.mark.integration
+def test_annotate_async_with_error(dataset: Dataset):
+    dataset.append(make_dataset_items())
+    semseg = SegmentationAnnotation.from_json(TEST_SEGMENTATION_ANNOTATIONS[0])
+    polygon = PolygonAnnotation(**TEST_POLYGON_ANNOTATIONS[0])
+    bbox = BoxAnnotation(**TEST_BOX_ANNOTATIONS[0])
+    bbox.reference_id = "fake_garbage"
+
+    job: AsyncJob = dataset.annotate(
+        annotations=[semseg, polygon, bbox],
+        asynchronous=True,
+    )
+    job.sleep_until_complete()
+
+    assert job.status() == {
+        "job_id": job.id,
+        "status": "Completed",
+        "message": {
+            "annotation_upload": {
+                "epoch": 1,
+                "total": 2,
+                "errored": 1,
+                "ignored": 0,
+                "datasetId": dataset.id,
+                "processed": 1,
+            },
+            "segmentation_upload": {
+                "errors": [],
+                "ignored": 0,
+                "n_errors": 0,
+                "processed": 1,
+            },
+        },
+    }
+
+    assert "Item with id fake_garbage doesn" in str(job.errors())
