
Commit b9c1a24

[Validate] Provide endpoint for uploading custom evaluation results (#306)
1 parent d72072f

6 files changed: 81 additions & 4 deletions

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Allow users to create placeholder evaluation functions for Scenario Tests in Validate
+- Allow users to create external evaluation functions for Scenario Tests in Validate.
+- Allow users to upload external evaluation results calculated on the client side.
 
 
 ## [0.11.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.11.2) - 2022-05-20

nucleus/validate/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
     GetEvalFunctions,
 )
 from .data_transfer_objects.scenario_test import CreateScenarioTestRequest
+from .data_transfer_objects.scenario_test_evaluations import EvaluationResult
 from .errors import CreateScenarioTestError
 from .eval_functions.available_eval_functions import AvailableEvalFunctions
 from .scenario_test import ScenarioTest

nucleus/validate/client.py

Lines changed: 9 additions & 0 deletions
@@ -85,6 +85,15 @@ def create_scenario_test(
                 "evaluation_functions=[client.validate.eval_functions.bbox_iou()]"
             )
 
+        external_fns = [
+            f.eval_func_entry.is_external_function
+            for f in evaluation_functions
+        ]
+        if any(external_fns):
+            assert all(
+                external_fns
+            ), "Cannot create scenario tests with mixed placeholder and non-placeholder evaluation functions"
+
         response = self.connection.post(
             CreateScenarioTestRequest(
                 name=name,
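
A rough usage sketch of the guard added above, assuming an authenticated Validate client; the external function handle and slice ID below are hypothetical placeholders, not names from this commit:

# Hypothetical names: "my_external_fn" and the slice ID are placeholders.
external_fn = client.validate.eval_functions.my_external_fn  # an external eval function
scenario_test = client.validate.create_scenario_test(
    name="external-only-test",
    slice_id="slc_...",
    evaluation_functions=[external_fn()],
)

# Mixing an external function with a built-in one such as bbox_iou() in the same
# list now trips the assertion above instead of creating an inconsistent test.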

nucleus/validate/data_transfer_objects/scenario_test_evaluations.py

Lines changed: 16 additions & 0 deletions
@@ -1,5 +1,7 @@
 from typing import List
 
+from pydantic import validator
+
 from nucleus.pydantic_base import ImmutableModel
 
 
@@ -9,3 +11,17 @@ class EvalDetail(ImmutableModel):
 
 class GetEvalHistory(ImmutableModel):
     evaluations: List[EvalDetail]
+
+
+class EvaluationResult(ImmutableModel):
+    item_ref_id: str
+    score: float
+    weight: float = 1
+
+    @validator("score", "weight")
+    def is_normalized(cls, v):  # pylint: disable=no-self-argument
+        if 0 <= v <= 1:
+            return v
+        raise ValueError(
+            f"Expected evaluation score and weights to be normalized between 0 and 1, but got: {v}"
+        )
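
A minimal sketch of how the new EvaluationResult model behaves; the values are illustrative, and the top-level import path follows the __init__.py change in this commit:

from nucleus.validate import EvaluationResult

# Scores and weights must lie in [0, 1]; the validator rejects anything else.
ok = EvaluationResult(item_ref_id="item_0", score=0.73, weight=0.5)

try:
    EvaluationResult(item_ref_id="item_1", score=1.2)  # score out of range
except ValueError as err:  # pydantic wraps validator failures in ValidationError, a ValueError subclass
    print(err)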

nucleus/validate/eval_functions/available_eval_functions.py

Lines changed: 1 addition & 1 deletion
@@ -1303,7 +1303,7 @@ def __repr__(self):
         return (
             f"<AvailableEvaluationFunctions: public: {functions_lower} "
             f"private: {list(self._custom_to_function.keys())} "
-            f"external: {list(self._external_to_function.keys())} "
+            f"external: {list(self._external_to_function.keys())}"
         )
 
     @property

nucleus/validate/scenario_test.py

Lines changed: 52 additions & 2 deletions
@@ -18,9 +18,15 @@
     THRESHOLD_KEY,
     ThresholdComparison,
 )
-from .data_transfer_objects.scenario_test_evaluations import GetEvalHistory
+from .data_transfer_objects.scenario_test_evaluations import (
+    EvaluationResult,
+    GetEvalHistory,
+)
 from .data_transfer_objects.scenario_test_metric import AddScenarioTestFunction
-from .eval_functions.available_eval_functions import EvalFunction
+from .eval_functions.available_eval_functions import (
+    EvalFunction,
+    ExternalEvalFunction,
+)
 from .scenario_test_evaluation import ScenarioTestEvaluation
 from .scenario_test_metric import ScenarioTestMetric

@@ -83,9 +89,13 @@ def add_eval_function(
         Args:
             eval_function: :class:`EvalFunction`
 
+        Raises:
+            NucleusAPIError: By adding this function, the scenario test mixes external with non-external functions which is not permitted.
+
         Returns:
             The created ScenarioTestMetric object.
         """
+
         response = self.connection.post(
             AddScenarioTestFunction(
                 scenario_test_name=self.name,

@@ -174,3 +184,43 @@ def set_baseline_model(self, model_id: str):
         )
         self.baseline_model_id = response.get("baseline_model_id")
         return self.baseline_model_id
+
+    def upload_external_evaluation_results(
+        self,
+        eval_fn: ExternalEvalFunction,
+        results: List[EvaluationResult],
+        model_id: str,
+    ):
+        assert (
+            eval_fn.eval_func_entry.is_external_function
+        ), "Submitting evaluation results is only available for external functions."
+
+        assert (
+            len(results) > 0
+        ), "Submitting evaluation requires at least one result."
+
+        metric_per_ref_id = {}
+        weight_per_ref_id = {}
+        aggregate_weighted_sum = 0.0
+        aggregate_weight = 0.0
+        # aggregation based on https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
+        for r in results:
+            metric_per_ref_id[r.item_ref_id] = r.score
+            weight_per_ref_id[r.item_ref_id] = r.weight
+            aggregate_weighted_sum += r.score * r.weight
+            aggregate_weight += r.weight
+
+        payload = {
+            "unit_test_id": self.id,
+            "eval_function_id": eval_fn.id,
+            "result_per_ref_id": metric_per_ref_id,
+            "weight_per_ref_id": weight_per_ref_id,
+            "overall_metric": aggregate_weighted_sum / aggregate_weight,
+            "model_id": model_id,
+            "slice_id": self.slice_id,
+        }
+        response = self.connection.post(
+            payload,
+            "validate/scenario_test/upload_results",
+        )
+        return response
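
A hedged end-to-end sketch of the new upload path; the reference IDs and model ID are placeholders, and how the scenario test and its external function are obtained is left abstract. The overall metric sent to the server is the weighted mean sum(score * weight) / sum(weight) computed in the method above:

from nucleus.validate import EvaluationResult

# Assume `scenario_test` is a ScenarioTest whose evaluation function is the
# external function `external_fn` (both obtained via the Validate client).
results = [
    EvaluationResult(item_ref_id="item_0", score=0.9, weight=1),
    EvaluationResult(item_ref_id="item_1", score=0.5, weight=0.25),
]
response = scenario_test.upload_external_evaluation_results(
    eval_fn=external_fn,
    results=results,
    model_id="model_...",  # placeholder model ID
)
# overall_metric posted to the endpoint: (0.9*1 + 0.5*0.25) / (1 + 0.25) = 0.82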
