Commit c7353f0

[Validate] Add Categorization F1 metrics (#202)
1 parent a651911 commit c7353f0

File tree

12 files changed: +502 -60 lines changed

CHANGELOG.md

Lines changed: 11 additions & 1 deletion

@@ -4,7 +4,17 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.6.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.0) - 2021-01-11
+## [0.6.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.3) - 2021-02-08
+
+### Added
+- Add categorization f1 score to metrics
+
+## [0.6.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.1) - 2021-02-08
+
+### Added
+- Adapt scipy and click dependencies to allow Google COLAB usage without update
+
+## [0.6.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.0) - 2021-02-07
 
 ### Added
 - Nucleus CLI interface `nu`. Installation instructions are in the `README.md`.

nucleus/__init__.py

Lines changed: 10 additions & 8 deletions

@@ -357,7 +357,7 @@ def create_dataset_from_project(
     def create_dataset(
         self,
         name: str,
-        is_scene: bool = False,
+        is_scene: Optional[bool] = None,
         item_metadata_schema: Optional[Dict] = None,
         annotation_metadata_schema: Optional[Dict] = None,
     ) -> Dataset:
@@ -389,13 +389,15 @@ def create_dataset(
         Returns:
             :class:`Dataset`: The newly created Nucleus dataset as an object.
         """
-        warnings.warn(
-            "The default create_dataset('dataset_name', ...) method without the is_scene parameter will be deprecated soon in favor of providing the is_scene parameter explicitly. "
-            "Please make sure to create a dataset with either create_dataset('dataset_name', is_scene=False, ...) to upload "
-            "DatasetItems or create_dataset('dataset_name', is_scene=True, ...) to upload "
-            "LidarScenes.",
-            DeprecationWarning,
-        )
+        if is_scene is None:
+            warnings.warn(
+                "The default create_dataset('dataset_name', ...) method without the is_scene parameter will be "
+                "deprecated soon in favor of providing the is_scene parameter explicitly. "
+                "Please make sure to create a dataset with either create_dataset('dataset_name', is_scene=False, ...) "
+                "to upload DatasetItems or create_dataset('dataset_name', is_scene=True, ...) to upload LidarScenes.",
+                DeprecationWarning,
+            )
+            is_scene = False
         response = self.make_request(
             {
                 NAME_KEY: name,
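
With this change, omitting `is_scene` still defaults to a DatasetItem dataset, but the DeprecationWarning now fires only in that case. A minimal usage sketch with the parameter passed explicitly; the API key and dataset names are placeholders, and it assumes the client is constructed via the package's NucleusClient entry point:

    import nucleus

    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")  # placeholder key

    # Passing is_scene explicitly avoids the deprecation warning added here.
    item_dataset = client.create_dataset("my-image-dataset", is_scene=False)  # for DatasetItems
    scene_dataset = client.create_dataset("my-lidar-dataset", is_scene=True)  # for LidarScenes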

nucleus/metrics/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -1,4 +1,5 @@
-from .base import Metric, MetricResult
+from .base import Metric, ScalarResult
+from .categorization_metrics import CategorizationF1
 from .polygon_metrics import (
     PolygonAveragePrecision,
     PolygonIOU,
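
With these exports updated, the new metric and result types resolve from the package root; a trivial import sketch:

    from nucleus.metrics import CategorizationF1, Metric, ScalarResult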

nucleus/metrics/base.py

Lines changed: 33 additions & 5 deletions

@@ -1,15 +1,19 @@
 import sys
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Iterable
+from typing import Iterable, List
 
 from nucleus.annotation import AnnotationList
 from nucleus.prediction import PredictionList
 
 
+class MetricResult(ABC):
+    """Base MetricResult class"""
+
+
 @dataclass
-class MetricResult:
-    """A Metric Result contains the value of an evaluation, as well as its weight.
+class ScalarResult(MetricResult):
+    """A scalar result contains the value of an evaluation, as well as its weight.
     The weight is useful when aggregating metrics where each dataset item may hold a
     different relative weight. For example, when calculating precision over a dataset,
     the denominator of the precision is the number of annotations, and therefore the weight
@@ -24,13 +28,13 @@ class MetricResult:
     weight: float = 1.0
 
     @staticmethod
-    def aggregate(results: Iterable["MetricResult"]) -> "MetricResult":
+    def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult":
         """Aggregates results using a weighted average."""
         results = list(filter(lambda x: x.weight != 0, results))
         total_weight = sum([result.weight for result in results])
         total_value = sum([result.value * result.weight for result in results])
         value = total_value / max(total_weight, sys.float_info.epsilon)
-        return MetricResult(value, total_weight)
+        return ScalarResult(value, total_weight)
 
 
 class Metric(ABC):
@@ -87,3 +91,27 @@ def __call__(
         self, annotations: AnnotationList, predictions: PredictionList
     ) -> MetricResult:
         """A metric must override this method and return a metric result, given annotations and predictions."""
+
+    @abstractmethod
+    def aggregate_score(self, results: List[MetricResult]) -> ScalarResult:
+        """A metric must define how to aggregate results from single items to a single ScalarResult.
+
+        E.g. to calculate an R2 score with sklearn you could define a custom result class ::
+
+            class R2Result(MetricResult):
+                y_true: float
+                y_pred: float
+
+
+        And then define an aggregate_score ::
+
+            def aggregate_score(self, results: List[MetricResult]) -> ScalarResult:
+                y_trues = []
+                y_preds = []
+                for result in results:
+                    y_trues.append(result.y_true)
+                    y_preds.append(result.y_pred)
+                r2_score = sklearn.metrics.r2_score(y_trues, y_preds)
+                return ScalarResult(r2_score)
+
+        """
nucleus/metrics/categorization_metrics.py

Lines changed: 199 additions & 0 deletions

@@ -0,0 +1,199 @@
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import List, Set, Tuple, Union
+
+from sklearn.metrics import f1_score
+
+from nucleus.annotation import AnnotationList, CategoryAnnotation
+from nucleus.metrics.base import Metric, MetricResult, ScalarResult
+from nucleus.metrics.filters import confidence_filter
+from nucleus.prediction import CategoryPrediction, PredictionList
+
+F1_METHODS = {"micro", "macro", "samples", "weighted", "binary"}
+
+
+def to_taxonomy_labels(
+    anns_or_preds: Union[List[CategoryAnnotation], List[CategoryPrediction]]
+) -> Set[str]:
+    """Transforms annotation or prediction lists to taxonomy labels by joining them with a separator (->)"""
+    labels = set()
+    for item in anns_or_preds:
+        taxonomy_label = (
+            f"{item.taxonomy_name}->{item.label}"
+            if item.taxonomy_name
+            else item.label
+        )
+        labels.add(taxonomy_label)
+    return labels
+
+
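
Since to_taxonomy_labels only reads the label and taxonomy_name attributes, its behaviour can be illustrated with a minimal stand-in object; FakeCategory below is hypothetical and not part of the client:

    from dataclasses import dataclass
    from typing import Optional

    from nucleus.metrics.categorization_metrics import to_taxonomy_labels

    @dataclass
    class FakeCategory:  # hypothetical stand-in exposing the two attributes the function reads
        label: str
        taxonomy_name: Optional[str] = None

    labels = to_taxonomy_labels(
        [FakeCategory("sedan", "vehicle_type"), FakeCategory("daytime")]
    )
    print(labels)  # {"vehicle_type->sedan", "daytime"}
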
+@dataclass
+class CategorizationResult(MetricResult):
+    annotations: List[CategoryAnnotation]
+    predictions: List[CategoryPrediction]
+
+    @property
+    def value(self):
+        annotation_labels = to_taxonomy_labels(self.annotations)
+        prediction_labels = to_taxonomy_labels(self.predictions)
+
+        # TODO: Change task.py interface such that we can return label matching
+        # NOTE: Returning 1 if all taxonomy labels match else 0
+        value = f1_score(
+            annotation_labels, prediction_labels, average=self.f1_method
+        )
+        return value
+
+
+class CategorizationMetric(Metric):
+    """Abstract class for metrics related to Categorization
+
+    The CategorizationMetric class automatically filters incoming annotations and
+    predictions for only categorization annotations. It also filters
+    predictions whose confidence is less than the provided confidence_threshold.
+    """
+
+    def __init__(
+        self,
+        confidence_threshold: float = 0.0,
+    ):
+        """Initializes CategorizationMetric abstract object.
+
+        Args:
+            confidence_threshold: minimum confidence threshold for predictions to be taken into account for evaluation. Must be in [0, 1]. Default 0.0
+        """
+        assert 0 <= confidence_threshold <= 1
+        self.confidence_threshold = confidence_threshold
+
+    @abstractmethod
+    def eval(
+        self,
+        annotations: List[
+            CategoryAnnotation
+        ],  # TODO(gunnar): List to conform with other APIs or single instance?
+        predictions: List[CategoryPrediction],
+    ) -> CategorizationResult:
+        # Main evaluation function that subclasses must override.
+        # TODO(gunnar): Allow passing multiple predictions and selecting highest confidence? Allows us to show next
+        # contender. Are top-5 scores something that we care about?
+        # TODO(gunnar): How do we handle multi-head classification?
+        pass
+
+    @abstractmethod
+    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+        pass
+
+    def __call__(
+        self, annotations: AnnotationList, predictions: PredictionList
+    ) -> CategorizationResult:
+        if self.confidence_threshold > 0:
+            predictions = confidence_filter(
+                predictions, self.confidence_threshold
+            )
+
+        cat_annotations, cat_predictions = self._filter_common_taxonomies(
+            annotations.category_annotations, predictions.category_predictions
+        )
+
+        result = self.eval(
+            cat_annotations,
+            cat_predictions,
+        )
+        return result
+
+    def _filter_common_taxonomies(
+        self,
+        annotations: List[CategoryAnnotation],
+        predictions: List[CategoryPrediction],
+    ) -> Tuple[List[CategoryAnnotation], List[CategoryPrediction]]:
+        annotated_taxonomies = {ann.taxonomy_name for ann in annotations}
+        matching_predictions, matching_taxonomies = self._filter_in_taxonomies(
+            predictions, annotated_taxonomies
+        )
+        matching_annotations, _ = self._filter_in_taxonomies(
+            annotations, matching_taxonomies
+        )
+
+        return matching_annotations, matching_predictions  # type: ignore
+
+    def _filter_in_taxonomies(
+        self,
+        anns_or_preds: Union[
+            List[CategoryAnnotation], List[CategoryPrediction]
+        ],
+        filter_on_taxonomies: Set[Union[None, str]],
+    ) -> Tuple[
+        Union[List[CategoryAnnotation], List[CategoryPrediction]],
+        Set[Union[None, str]],
+    ]:
+        matching_predictions = []
+        matching_taxonomies = set()
+        for pred in anns_or_preds:
+            if pred.taxonomy_name in filter_on_taxonomies:
+                matching_predictions.append(pred)
+                matching_taxonomies.add(pred.taxonomy_name)
+        return matching_predictions, matching_taxonomies
+
+
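
The two _filter_* helpers above restrict evaluation to taxonomies present on both sides: predictions are first narrowed to the annotated taxonomies, and annotations are then narrowed to the taxonomies that survived on the prediction side. A rough sketch of the effect, reusing the hypothetical FakeCategory stand-in from the earlier example and the concrete CategorizationF1 subclass defined just below (the private method is called directly here only for illustration):

    from nucleus.metrics import CategorizationF1

    metric = CategorizationF1()
    annotations = [
        FakeCategory("sedan", "vehicle_type"),
        FakeCategory("night", "time_of_day"),
    ]
    predictions = [
        FakeCategory("suv", "vehicle_type"),
        FakeCategory("red", "color"),
    ]

    anns, preds = metric._filter_common_taxonomies(annotations, predictions)
    # Only "vehicle_type" appears in both lists, so:
    # anns  == [FakeCategory("sedan", "vehicle_type")]
    # preds == [FakeCategory("suv", "vehicle_type")]
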
+class CategorizationF1(CategorizationMetric):
+    """Evaluation method that matches categories and returns a CategorizationResult that aggregates to the F1 score"""
+
+    def __init__(
+        self, confidence_threshold: float = 0.0, f1_method: str = "macro"
+    ):
+        """
+        Args:
+            confidence_threshold: minimum confidence threshold for predictions to be taken into account for evaluation. Must be in [0, 1]. Default 0.0
+            f1_method: {'micro', 'macro', 'samples', 'weighted', 'binary'}, \
+                default='macro'
+                This parameter is required for multiclass/multilabel targets and
+                determines the type of averaging performed on the data:
+
+                ``'binary'``:
+                    Only report results for the class specified by ``pos_label``.
+                    This is applicable only if targets (``y_{true,pred}``) are binary.
+                ``'micro'``:
+                    Calculate metrics globally by counting the total true positives,
+                    false negatives and false positives.
+                ``'macro'``:
+                    Calculate metrics for each label, and find their unweighted
+                    mean. This does not take label imbalance into account.
+                ``'weighted'``:
+                    Calculate metrics for each label, and find their average weighted
+                    by support (the number of true instances for each label). This
+                    alters 'macro' to account for label imbalance; it can result in an
+                    F-score that is not between precision and recall.
+                ``'samples'``:
+                    Calculate metrics for each instance, and find their average (only
+                    meaningful for multilabel classification where this differs from
+                    :func:`accuracy_score`).
+        """
+        super().__init__(confidence_threshold)
+        assert (
+            f1_method in F1_METHODS
+        ), f"Invalid f1_method {f1_method}, expected one of {F1_METHODS}"
+        self.f1_method = f1_method
+
+    def eval(
+        self,
+        annotations: List[CategoryAnnotation],
+        predictions: List[CategoryPrediction],
+    ) -> CategorizationResult:
+        """
+        Notes: This eval function is a bit unusual: it only matches annotations to predictions, and
+        the actual metric computation happens in the aggregate step, since the F1 score only makes sense on a collection.
+        """
+
+        return CategorizationResult(
+            annotations=annotations, predictions=predictions
+        )
+
+    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+        gt = []
+        predicted = []
+        for result in results:
+            gt.extend(list(to_taxonomy_labels(result.annotations)))
+            predicted.extend(list(to_taxonomy_labels(result.predictions)))
+        value = f1_score(gt, predicted, average=self.f1_method)
+        return ScalarResult(value)
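
Putting the pieces together: a CategorizationMetric is normally invoked once per dataset item via __call__ with an AnnotationList and a PredictionList, and the per-item results are then reduced with aggregate_score. The hedged sketch below calls eval and aggregate_score directly, again using the hypothetical FakeCategory stand-in instead of real CategoryAnnotation/CategoryPrediction objects, since only the label and taxonomy_name attributes are read at this point:

    from nucleus.metrics import CategorizationF1

    metric = CategorizationF1(confidence_threshold=0.5, f1_method="macro")

    # One (annotations, predictions) pair per dataset item.
    per_item_results = [
        metric.eval(
            [FakeCategory("sedan", "vehicle_type")],
            [FakeCategory("sedan", "vehicle_type")],
        ),
        metric.eval(
            [FakeCategory("suv", "vehicle_type")],
            [FakeCategory("sedan", "vehicle_type")],
        ),
    ]

    score = metric.aggregate_score(per_item_results)
    print(score.value)  # macro-averaged F1 over the collected taxonomy labels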
