
Commit fbc7208

Author: Diego Ardila
Merge branch 'master' into da-export-embeddings
2 parents 7d8037f + a5ef70f

File tree

9 files changed: +113 −9 lines


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -134,3 +134,6 @@ dmypy.json
 
 # Poetry lockfile (no need for deploys, best practice is to not check this in)
 poetry.lock
+
+# vscode
+.vscode/

nucleus/autocurate.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import datetime
+import requests
+from nucleus.constants import (
+    JOB_CREATION_TIME_KEY,
+    JOB_LAST_KNOWN_STATUS_KEY,
+    JOB_TYPE_KEY,
+)
+from nucleus.job import AsyncJob
+
+
+def entropy(name, model_runs, client):
+    assert (
+        len({model_run.dataset_id for model_run in model_runs}) == 1
+    ), f"Model runs have conflicting dataset ids: {model_runs}"
+    model_run_ids = [model_run.model_run_id for model_run in model_runs]
+    dataset_id = model_runs[0].dataset_id
+    response = client.make_request(
+        payload={"modelRunIds": model_run_ids},
+        route=f"autocurate/{dataset_id}/single_model_entropy/{name}",
+        requests_command=requests.post,
+    )
+    # TODO: the response should already have the below three fields populated
+    response[JOB_LAST_KNOWN_STATUS_KEY] = "Started"
+    response[JOB_TYPE_KEY] = "autocurateEntropy"
+    response[JOB_CREATION_TIME_KEY] = (
+        datetime.datetime.now().isoformat("T", "milliseconds") + "Z"
+    )
+    job = AsyncJob.from_json(response, client)
+    return job
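For context, a minimal usage sketch of the new helper; the API key, IDs, and job name are placeholders, and the ModelRun is constructed directly using the signature shown in the nucleus/model_run.py diff below:

import nucleus
from nucleus import autocurate
from nucleus.model_run import ModelRun

# Placeholder credentials and IDs -- substitute real values.
client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
run = ModelRun("run_abc123", "ds_xyz789", client)

# Kick off an async entropy-based autocurate job, then block until it finishes.
job = autocurate.entropy("My Autocurate Job", [run], client)
job.sleep_until_complete()
print(job.job_last_known_status)  # "Completed" on success, per the integration test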

nucleus/constants.py

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@
 TYPE_KEY = "type"
 UPDATED_ITEMS = "updated_items"
 UPDATE_KEY = "update"
+UPLOAD_TO_SCALE_KEY = "upload_to_scale"
 URL_KEY = "url"
 VERTICES_KEY = "vertices"
 WIDTH_KEY = "width"

nucleus/dataset_item.py

Lines changed: 8 additions & 0 deletions
@@ -10,6 +10,7 @@
     IMAGE_URL_KEY,
     METADATA_KEY,
     ORIGINAL_IMAGE_URL_KEY,
+    UPLOAD_TO_SCALE_KEY,
     REFERENCE_ID_KEY,
     TYPE_KEY,
     URL_KEY,
@@ -92,12 +93,17 @@ class DatasetItem:  # pylint: disable=R0902
     reference_id: Optional[str] = None
     metadata: Optional[dict] = None
     pointcloud_location: Optional[str] = None
+    upload_to_scale: Optional[bool] = True
 
     def __post_init__(self):
         assert self.reference_id is not None, "reference_id is required."
         assert bool(self.image_location) != bool(
             self.pointcloud_location
         ), "Must specify exactly one of the image_location, pointcloud_location parameters"
+        if self.pointcloud_location and not self.upload_to_scale:
+            raise NotImplementedError(
+                "Skipping upload to Scale is not currently implemented for pointclouds."
+            )
         self.local = (
             is_local_path(self.image_location) if self.image_location else None
         )
@@ -133,6 +139,7 @@ def from_json(cls, payload: dict, is_scene=False):
             image_location=image_url,
             reference_id=payload.get(REFERENCE_ID_KEY, None),
             metadata=payload.get(METADATA_KEY, {}),
+            upload_to_scale=payload.get(UPLOAD_TO_SCALE_KEY, None),
         )
 
     def local_file_exists(self):
@@ -158,6 +165,7 @@ def to_payload(self, is_scene=False) -> dict:
             self.image_location
         ), "Must specify image_location for DatasetItems not in a LidarScene"
         payload[IMAGE_URL_KEY] = self.image_location
+        payload[UPLOAD_TO_SCALE_KEY] = self.upload_to_scale
 
         return payload
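A short sketch of the new flag in use; the URL and reference ID are made up:

from nucleus import DatasetItem

# Privacy mode: register the image by URL but skip uploading the bytes to Scale.
item = DatasetItem(
    image_location="https://example.com/img_0001.jpg",  # placeholder URL
    reference_id="img_0001",
    upload_to_scale=False,
)

payload = item.to_payload()
assert payload["upload_to_scale"] is False  # serialized under UPLOAD_TO_SCALE_KEY

Note that per __post_init__ above, combining pointcloud_location with upload_to_scale=False raises NotImplementedError.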

nucleus/model_run.py

Lines changed: 3 additions & 3 deletions
@@ -31,10 +31,10 @@ class ModelRun:
     def __init__(self, model_run_id: str, dataset_id: str, client):
         self.model_run_id = model_run_id
         self._client = client
-        self._dataset_id = dataset_id
+        self.dataset_id = dataset_id
 
     def __repr__(self):
-        return f"ModelRun(model_run_id='{self.model_run_id}', dataset_id='{self._dataset_id}', client={self._client})"
+        return f"ModelRun(model_run_id='{self.model_run_id}', dataset_id='{self.dataset_id}', client={self._client})"
 
     def __eq__(self, other):
         if self.model_run_id == other.model_run_id:
@@ -115,7 +115,7 @@ def predict(
         check_all_mask_paths_remote(annotations)
 
         request_id = serialize_and_write_to_presigned_url(
-            annotations, self._dataset_id, self._client
+            annotations, self.dataset_id, self._client
         )
         response = self._client.make_request(
             payload={REQUEST_ID_KEY: request_id, UPDATE_KEY: update},
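Note: promoting the private _dataset_id to a public dataset_id attribute is what allows the new autocurate.entropy helper to read model_run.dataset_id when verifying that all passed model runs belong to the same dataset.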

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ flake8 = "^3.9.1"
 mypy = "^0.812"
 coverage = "^5.5"
 pre-commit = "^2.12.1"
+jupyterlab = "^3.1.10"
 
 [tool.pytest.ini_options]
 markers = [

tests/test_autocurate.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+from nucleus.prediction import BoxPrediction
+from nucleus.job import AsyncJob
+from nucleus import autocurate, DatasetItem
+import time
+from nucleus.constants import ERROR_PAYLOAD
+from tests.helpers import (
+    TEST_BOX_PREDICTIONS,
+    TEST_DATASET_NAME,
+    TEST_IMG_URLS,
+    TEST_MODEL_NAME,
+    TEST_MODEL_RUN,
+    reference_id_from_url,
+)
+import pytest
+
+
+@pytest.fixture()
+def model_run(CLIENT):
+    ds = CLIENT.create_dataset(TEST_DATASET_NAME)
+    ds_items = []
+    for url in TEST_IMG_URLS[:2]:
+        ds_items.append(
+            DatasetItem(
+                image_location=url,
+                reference_id=reference_id_from_url(url),
+            )
+        )
+
+    response = ds.append(ds_items)
+
+    assert ERROR_PAYLOAD not in response.json()
+
+    model = CLIENT.add_model(
+        name=TEST_MODEL_NAME, reference_id="model_" + str(time.time())
+    )
+
+    run = model.create_run(name=TEST_MODEL_RUN, dataset=ds, predictions=[])
+    prediction = BoxPrediction(**TEST_BOX_PREDICTIONS[1])
+    run.predict(annotations=[prediction])
+
+    yield run
+
+    response = CLIENT.delete_dataset(ds.id)
+    assert response == {"message": "Beginning dataset deletion..."}
+    response = CLIENT.delete_model(model.id)
+    assert response == {}
+
+
+@pytest.mark.integration
+def test_autocurate_integration(model_run, CLIENT):
+    job = autocurate.entropy(
+        "Test Autocurate Integration", [model_run], CLIENT
+    )
+    job.sleep_until_complete()
+    assert job.job_last_known_status == "Completed"

tests/test_dataset.py

Lines changed: 11 additions & 4 deletions
@@ -1,5 +1,7 @@
 import copy
 import math
+from nucleus.model import Model
+from nucleus.prediction import BoxPrediction
 import os
 
 import pytest
@@ -176,12 +178,17 @@ def check_is_expected_response(response):
 
     # Plain image upload
     ds_items_plain = []
-    for url in TEST_IMG_URLS:
+    for i, url in enumerate(TEST_IMG_URLS):
+        # Upload only the first item to Scale; the rest stay in privacy mode
+        upload_to_scale = i == 0
         ds_items_plain.append(
             DatasetItem(
-                image_location=url, reference_id=url.split("/")[-1] + "_plain"
+                image_location=url,
+                upload_to_scale=upload_to_scale,
+                reference_id=url.split("/")[-1] + "_plain",
             )
         )
+
     response = dataset.append(ds_items_plain)
     check_is_expected_response(response)
@@ -289,8 +296,8 @@ def test_dataset_append_async_with_1_bad_url(dataset: Dataset):
             "started_image_processing": f"Dataset: {dataset.id}, Job: {job.job_id}",
         },
         "job_progress": "1.00",
-        "completed_steps": 1,
-        "total_steps": 1,
+        "completed_steps": 4,
+        "total_steps": 4,
     }
     # The error is fairly detailed and subject to change. What's important is we surface which URLs failed.
     assert (

tests/test_prediction.py

Lines changed: 2 additions & 2 deletions
@@ -298,7 +298,7 @@ def test_mixed_pred_upload_async(model_run: ModelRun):
             "total": 2,
             "errored": 0,
             "ignored": 0,
-            "datasetId": model_run._dataset_id,
+            "datasetId": model_run.dataset_id,
             "processed": 2,
         },
         "segmentation_upload": {
@@ -339,7 +339,7 @@ def test_mixed_pred_upload_async_with_error(model_run: ModelRun):
             "total": 2,
             "errored": 1,
             "ignored": 0,
-            "datasetId": model_run._dataset_id,
+            "datasetId": model_run.dataset_id,
             "processed": 1,
         },
         "segmentation_upload": {
