
Commit 2028e55

Author: Anthony Krivonos

[Validate] Track-level metrics upload (#375)

* [Validate] Track-level metrics upload
* Refactor upload_external_evaluation_results
* Export EntityLevel
* Version bump
* Fix test and pr fixes

1 parent c9f309a commit 2028e55

File tree

14 files changed: +202 −51 lines changed


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,11 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.14.30](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.30) - 2022-11-29
+
+### Added
+- Support for uploading track-level metrics to external evaluation functions using track_ref_ids
+
 ## [0.14.29](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.29) - 2022-11-22

 ### Added
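
The changelog entry above is the core of this commit: EvaluationResult rows can now reference tracks, and upload_external_evaluation_results infers the track level from them. A minimal usage sketch, not taken from the commit: the scenario-test lookup, the positional arguments to upload_external_evaluation_results, and all ids are assumptions for illustration.

import nucleus
from nucleus.validate.data_transfer_objects.scenario_test_evaluations import (
    EvaluationResult,
)

client = nucleus.NucleusClient("YOUR_API_KEY")  # hypothetical API key
scenario_test = client.validate.scenario_tests[0]  # assumes at least one scenario test exists

# One result per track; exactly one of track_ref_id / item_ref_id / scene_ref_id may be set.
results = [
    EvaluationResult(track_ref_id="track_0", score=0.87, weight=1.0),
    EvaluationResult(track_ref_id="track_1", score=0.42, weight=2.0),
]

scenario_test.upload_external_evaluation_results(
    "my_external_eval_fn",  # name of a previously created external eval function (assumed)
    results,
    "model_abc123",         # hypothetical model id
)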

nucleus/__init__.py

Lines changed: 6 additions & 6 deletions
@@ -178,7 +178,7 @@ def __init__(
             import tqdm.notebook as tqdm_notebook

             self.tqdm_bar = tqdm_notebook.tqdm
-        self._connection = Connection(self.api_key, self.endpoint)
+        self.connection = Connection(self.api_key, self.endpoint)
         self.validate = Validate(self.api_key, self.endpoint)

     def __repr__(self):
@@ -1014,16 +1014,16 @@ def create_object_index(
         )

     def delete(self, route: str):
-        return self._connection.delete(route)
+        return self.connection.delete(route)

     def get(self, route: str):
-        return self._connection.get(route)
+        return self.connection.get(route)

     def post(self, payload: dict, route: str):
-        return self._connection.post(payload, route)
+        return self.connection.post(payload, route)

     def put(self, payload: dict, route: str):
-        return self._connection.put(payload, route)
+        return self.connection.put(payload, route)

     # TODO: Fix return type, can be a list as well. Brings on a lot of mypy errors ...
     def make_request(
@@ -1054,7 +1054,7 @@ def make_request(
                 "Received defined payload with GET request! Will ignore payload"
             )
             payload = None
-        return self._connection.make_request(payload, route, requests_command, return_raw_response)  # type: ignore
+        return self.connection.make_request(payload, route, requests_command, return_raw_response)  # type: ignore

     def _set_api_key(self, api_key):
         """Fetch API key from environment variable NUCLEUS_API_KEY if not set"""

nucleus/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -1863,7 +1863,7 @@ def tracks(self) -> List[Track]:
         tracks_list = [
             Track.from_json(
                 payload=track,
-                client=self._client,
+                connection=self._client.connection,
             )
             for track in response[TRACKS_KEY]
         ]

nucleus/scene.py

Lines changed: 8 additions & 2 deletions
@@ -330,7 +330,10 @@ def from_json(
         frames = [Frame.from_json(frame) for frame in frames_payload]
         tracks_payload = payload.get(TRACKS_KEY, [])
         tracks = (
-            [Track.from_json(track, client) for track in tracks_payload]
+            [
+                Track.from_json(track, connection=client.connection)
+                for track in tracks_payload
+            ]
             if client
             else []
         )
@@ -680,7 +683,10 @@ def from_json(
         items = [DatasetItem.from_json(item) for item in items_payload]
         tracks_payload = payload.get(TRACKS_KEY, [])
         tracks = (
-            [Track.from_json(track, client) for track in tracks_payload]
+            [
+                Track.from_json(track, connection=client.connection)
+                for track in tracks_payload
+            ]
             if client
             else []
         )

nucleus/track.py

Lines changed: 5 additions & 5 deletions
@@ -12,7 +12,7 @@
 )

 if TYPE_CHECKING:
-    from . import NucleusClient
+    from . import Connection


 @dataclass # pylint: disable=R0902
@@ -25,7 +25,7 @@ class Track: # pylint: disable=R0902
         metadata: Arbitrary key/value dictionary of info to attach to this track.
     """

-    _client: "NucleusClient"
+    _connection: "Connection"
     dataset_id: str
     reference_id: str
     metadata: Optional[dict] = None
@@ -41,10 +41,10 @@ def __eq__(self, other):
         )

     @classmethod
-    def from_json(cls, payload: dict, client: "NucleusClient"):
+    def from_json(cls, payload: dict, connection: "Connection"):
         """Instantiates track object from schematized JSON dict payload."""
         return cls(
-            _client=client,
+            _connection=connection,
             reference_id=str(payload[REFERENCE_ID_KEY]),
             dataset_id=str(payload[DATASET_ID_KEY]),
             metadata=payload.get(METADATA_KEY, None),
@@ -79,7 +79,7 @@ def update(
             entire metadata object will be overwritten. Otherwise, only the keys in metadata will be overwritten.
         """

-        self._client.make_request(
+        self._connection.make_request(
             payload={
                 REFERENCE_ID_KEY: self.reference_id,
                 METADATA_KEY: metadata,
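
With this signature change, a Track is constructed from a payload plus a bare Connection rather than a full NucleusClient. A hedged sketch of the call site; the literal payload key names are inferred from the constants referenced above and are not confirmed by this diff.

import nucleus
from nucleus.track import Track

client = nucleus.NucleusClient("YOUR_API_KEY")  # hypothetical API key

# Hypothetical payload shaped like a track returned by the API; the real key
# strings come from REFERENCE_ID_KEY / DATASET_ID_KEY / METADATA_KEY.
track_payload = {
    "reference_id": "track_0",
    "dataset_id": "ds_sample_id",
    "metadata": {"label": "pedestrian"},
}

track = Track.from_json(payload=track_payload, connection=client.connection)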

nucleus/validate/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 ]

 from .client import Validate
-from .constants import ThresholdComparison
+from .constants import EntityLevel, ThresholdComparison
 from .data_transfer_objects.eval_function import (
     EvalFunctionEntry,
     EvaluationCriterion,

nucleus/validate/client.py

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ def create_external_eval_function(

         Args:
             name: unique name of evaluation function
-            level: level at which the eval function is run, defaults to "item"
+            level: level at which the eval function is run, defaults to EntityLevel.ITEM.

         Raises:
            - NucleusAPIError if the creation of the function fails on the server side

nucleus/validate/constants.py

Lines changed: 8 additions & 1 deletion
@@ -23,7 +23,14 @@ class ThresholdComparison(str, Enum):


 class EntityLevel(str, Enum):
-    """Level for evaluation functions and unit tests."""
+    """
+    Data level at which evaluation functions produce outputs.
+    For instance, when comparing results across dataset items, use
+    `EntityLevel.ITEM`. For scenes, use `EntityLevel.SCENE`. Finally,
+    when comparing results between tracks within a single scene or a
+    holistic item datset, use `EntityLevel.TRACK`.
+    """

+    TRACK = "track"
     ITEM = "item"
     SCENE = "scene"

nucleus/validate/data_transfer_objects/scenario_test_evaluations.py

Lines changed: 9 additions & 9 deletions
@@ -6,6 +6,7 @@


 class EvaluationResult(ImmutableModel):
+    track_ref_id: Optional[str] = None
     item_ref_id: Optional[str] = None
     scene_ref_id: Optional[str] = None
     score: float = 0
@@ -15,16 +16,15 @@ class EvaluationResult(ImmutableModel):
     def is_item_or_scene_provided(
         cls, values
     ): # pylint: disable=no-self-argument
-        if (
-            values.get("item_ref_id") is None
-            and values.get("scene_ref_id") is None
-        ) or (
-            (
-                values.get("item_ref_id") is not None
-                and values.get("scene_ref_id") is not None
+        ref_ids = [
+            values.get("track_ref_id", None),
+            values.get("item_ref_id", None),
+            values.get("scene_ref_id", None),
+        ]
+        if len([ref_id for ref_id in ref_ids if ref_id is not None]) != 1:
+            raise ValueError(
+                "Must provide exactly one of track_ref_id, item_ref_id, or scene_ref_id"
             )
-        ):
-            raise ValueError("Must provide either item_ref_id or scene_ref_id")
         return values

     @validator("score", "weight")

nucleus/validate/scenario_test.py

Lines changed: 48 additions & 15 deletions
@@ -8,9 +8,16 @@
 from typing import List, Optional, Union

 from ..connection import Connection
-from ..constants import DATASET_ITEMS_KEY, NAME_KEY, SCENES_KEY, SLICE_ID_KEY
+from ..constants import (
+    DATASET_ITEMS_KEY,
+    NAME_KEY,
+    SCENES_KEY,
+    SLICE_ID_KEY,
+    TRACKS_KEY,
+)
 from ..dataset_item import DatasetItem
 from ..scene import Scene
+from ..track import Track
 from .constants import (
     EVAL_FUNCTION_ID_KEY,
     SCENARIO_TEST_ID_KEY,
@@ -166,8 +173,8 @@ def get_eval_history(self) -> List[ScenarioTestEvaluation]:

     def get_items(
         self, level: EntityLevel = EntityLevel.ITEM
-    ) -> Union[List[DatasetItem], List[Scene]]:
-        """Gets items within a scenario test at a given level, returning a list of DatasetItem or Scene objects.
+    ) -> Union[List[Track], List[DatasetItem], List[Scene]]:
+        """Gets items within a scenario test at a given level, returning a list of Track, DatasetItem, or Scene objects.

         Args:
             level: :class:`EntityLevel`
@@ -178,14 +185,22 @@ def get_items(
         response = self.connection.get(
             f"validate/scenario_test/{self.id}/items",
         )
+        if level == EntityLevel.TRACK:
+            return [
+                Track.from_json(track, connection=self.connection)
+                for track in response.get(TRACKS_KEY, [])
+            ]
         if level == EntityLevel.SCENE:
             return [
                 Scene.from_json(scene, skip_validate=True)
-                for scene in response[SCENES_KEY]
+                for scene in response.get(SCENES_KEY, [])
             ]
-        return [
-            DatasetItem.from_json(item) for item in response[DATASET_ITEMS_KEY]
-        ]
+        if level == EntityLevel.ITEM:
+            return [
+                DatasetItem.from_json(item)
+                for item in response.get(DATASET_ITEMS_KEY, [])
+            ]
+        raise ValueError(f"Invalid entity level: {level}")

     def set_baseline_model(self, model_id: str):
         """Sets a new baseline model for the ScenarioTest. In order to be eligible to be a baseline,
@@ -222,23 +237,41 @@ def upload_external_evaluation_results(
             len(results) > 0
         ), "Submitting evaluation requires at least one result."

-        level = EntityLevel.ITEM
+        level: Optional[EntityLevel] = None
         metric_per_ref_id = {}
         weight_per_ref_id = {}
         aggregate_weighted_sum = 0.0
         aggregate_weight = 0.0

+        # Ensures reults at only one EntityLevel are provided, otherwise throwing a ValueError
+        def ensure_level_consistency_or_raise(
+            cur_level: Optional[EntityLevel], new_level: EntityLevel
+        ):
+            if level is not None and level != new_level:
+                raise ValueError(
+                    f"All evaluation results must only pertain to one level. Received {cur_level} then {new_level}"
+                )
+
         # aggregation based on https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
         for r in results:
-            # Ensure results are uploaded ONLY for items or ONLY for scenes
+            # Ensure results are uploaded ONLY for ONE OF tracks, items, and scenes
+            if r.track_ref_id is not None:
+                ensure_level_consistency_or_raise(level, EntityLevel.TRACK)
+                level = EntityLevel.TRACK
+            if r.item_ref_id is not None:
+                ensure_level_consistency_or_raise(level, EntityLevel.ITEM)
+                level = EntityLevel.ITEM
             if r.scene_ref_id is not None:
+                ensure_level_consistency_or_raise(level, EntityLevel.SCENE)
                 level = EntityLevel.SCENE
-            if r.item_ref_id is not None and level == EntityLevel.SCENE:
-                raise ValueError(
-                    "All evaluation results must either pertain to a scene_ref_id or an item_ref_id, not both."
-                )
             ref_id = (
-                r.item_ref_id if level == EntityLevel.ITEM else r.scene_ref_id
+                r.track_ref_id
+                if level == EntityLevel.TRACK
+                else (
+                    r.item_ref_id
+                    if level == EntityLevel.ITEM
+                    else r.scene_ref_id
+                )
             )

             # Aggregate scores and weights
@@ -255,7 +288,7 @@ def upload_external_evaluation_results(
             "overall_metric": aggregate_weighted_sum / aggregate_weight,
             "model_id": model_id,
             "slice_id": self.slice_id,
-            "level": level.value,
+            "level": level.value if level else None,
         }
         response = self.connection.post(
             payload,
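
The upload path above infers a single EntityLevel from the results and reports a weighted arithmetic mean as the overall metric. A standalone sketch of that aggregation (not the library code itself), following the aggregate_weighted_sum / aggregate_weight bookkeeping in the diff:

from typing import List, Tuple


def weighted_overall_metric(results: List[Tuple[float, float]]) -> float:
    """Weighted arithmetic mean over (score, weight) pairs."""
    aggregate_weighted_sum = 0.0
    aggregate_weight = 0.0
    for score, weight in results:
        aggregate_weighted_sum += score * weight
        aggregate_weight += weight
    return aggregate_weighted_sum / aggregate_weight


# Two track-level results with unequal weights: (0.8*1.0 + 0.4*3.0) / 4.0 == 0.5
print(weighted_overall_metric([(0.8, 1.0), (0.4, 3.0)]))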

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.14.29"
+version = "0.14.30"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/helpers.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@
 DATASET_WITH_EMBEDDINGS = "ds_c8jwdhy4y4f0078hzceg"
 NUCLEUS_PYTEST_USER_ID = "60ad648c85db770026e9bf77"

+EVAL_FUNCTION_NAME = "eval_fn"
 EVAL_FUNCTION_THRESHOLD = 0.5
 EVAL_FUNCTION_COMPARISON = ThresholdComparison.GREATER_THAN_EQUAL_TO

tests/test_track.py

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+import time
 from copy import deepcopy

 import pytest
@@ -69,7 +70,7 @@ def test_create_mp_with_tracks(CLIENT, dataset_scene):
     expected_track_reference_ids = [
         ann["track_reference_id"] for ann in TEST_SCENE_BOX_PREDS_WITH_TRACK
     ]
-    model_reference = "model_test_create_mp_with_tracks"
+    model_reference = "model_" + str(time.time())
     model = CLIENT.create_model(TEST_MODEL_NAME, model_reference)

     # Act
