scaleapi
diff --git a/‎CHANGELOG.md
Lines changed: 11 additions & 1 deletion b/‎CHANGELOG.md
Lines changed: 11 additions & 1 deletion
diff --git a/‎nucleus/__init__.py
Lines changed: 10 additions & 8 deletions b/‎nucleus/__init__.py
Lines changed: 10 additions & 8 deletions
diff --git a/‎nucleus/metrics/__init__.py
Lines changed: 2 additions & 1 deletion b/‎nucleus/metrics/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎nucleus/metrics/base.py
Lines changed: 33 additions & 5 deletions b/‎nucleus/metrics/base.py
Lines changed: 33 additions & 5 deletions
diff --git a/‎nucleus/metrics/categorization_metrics.py
Lines changed: 199 additions & 0 deletions b/‎nucleus/metrics/categorization_metrics.py
Lines changed: 199 additions & 0 deletions
@@ -4,7 +4,17 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.6.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.0) - 2021-01-11
+## [0.6.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.3) - 2021-02-08
+
+### Added
+- Add categorization f1 score to metrics
+ 
+## [0.6.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.1) - 2021-02-08
+
+### Added
+- Adapt scipy and click dependencies to allow Google COLAB usage without update
+ 
+## [0.6.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.6.0) - 2021-02-07
 
 ### Added
 - Nucleus CLI interface `nu`. Installation instructions are in the `README.md`.
 
@@ -357,7 +357,7 @@ def create_dataset_from_project(
     def create_dataset(
         self,
         name: str,
-        is_scene: bool = False,
+        is_scene: Optional[bool] = None,
         item_metadata_schema: Optional[Dict] = None,
         annotation_metadata_schema: Optional[Dict] = None,
     ) -> Dataset:
@@ -389,13 +389,15 @@ def create_dataset(
         Returns:
             :class:`Dataset`: The newly created Nucleus dataset as an object.
         """
-        warnings.warn(
-            "The default create_dataset('dataset_name', ...) method without the is_scene parameter will be deprecated soon in favor of providing the is_scene parameter explicitly. "
-            "Please make sure to create a dataset with either create_dataset('dataset_name', is_scene=False, ...) to upload "
-            "DatasetItems or create_dataset('dataset_name', is_scene=True, ...) to upload "
-            "LidarScenes.",
-            DeprecationWarning,
-        )
+        if is_scene is None:
+            warnings.warn(
+                "The default create_dataset('dataset_name', ...) method without the is_scene parameter will be "
+                "deprecated soon in favor of providing the is_scene parameter explicitly. "
+                "Please make sure to create a dataset with either create_dataset('dataset_name', is_scene=False, ...) "
+                "to upload DatasetItems or create_dataset('dataset_name', is_scene=True, ...) to upload LidarScenes.",
+                DeprecationWarning,
+            )
+            is_scene = False
         response = self.make_request(
             {
                 NAME_KEY: name,
 
@@ -1,4 +1,5 @@
-from .base import Metric, MetricResult
+from .base import Metric, ScalarResult
+from .categorization_metrics import CategorizationF1
 from .polygon_metrics import (
     PolygonAveragePrecision,
     PolygonIOU,
 
@@ -1,15 +1,19 @@
 import sys
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Iterable
+from typing import Iterable, List
 
 from nucleus.annotation import AnnotationList
 from nucleus.prediction import PredictionList
 
 
+class MetricResult(ABC):
+    """Base MetricResult class.
+
+    Common parent of the result objects a metric returns from ``__call__``
+    and later receives as a list in ``aggregate_score``. It declares no
+    fields or methods itself; concrete results such as ``ScalarResult``
+    add their own data.
+    """
+
+
 @dataclass
-class MetricResult:
-    """A Metric Result contains the value of an evaluation, as well as its weight.
+class ScalarResult(MetricResult):
+    """A scalar result contains the value of an evaluation, as well as its weight.
     The weight is useful when aggregating metrics where each dataset item may hold a
     different relative weight. For example, when calculating precision over a dataset,
     the denominator of the precision is the number of annotations, and therefore the weight
@@ -24,13 +28,13 @@ class MetricResult:
     weight: float = 1.0
 
     @staticmethod
-    def aggregate(results: Iterable["MetricResult"]) -> "MetricResult":
+    def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult":
         """Aggregates results using a weighted average."""
         results = list(filter(lambda x: x.weight != 0, results))
         total_weight = sum([result.weight for result in results])
         total_value = sum([result.value * result.weight for result in results])
         value = total_value / max(total_weight, sys.float_info.epsilon)
-        return MetricResult(value, total_weight)
+        return ScalarResult(value, total_weight)
 
 
 class Metric(ABC):
@@ -87,3 +91,27 @@ def __call__(
         self, annotations: AnnotationList, predictions: PredictionList
     ) -> MetricResult:
         """A metric must override this method and return a metric result, given annotations and predictions."""
+
+    @abstractmethod
+    def aggregate_score(self, results: List[MetricResult]) -> ScalarResult:
+        """A metric must define how to aggregate results from single items to a single ScalarResult.
+
+        E.g. to calculate a R2 score with sklearn you could define a custom metric class ::
+
+            @dataclass
+            class R2Result(MetricResult):
+                y_true: float
+                y_pred: float
+
+
+        And then define an aggregate_score ::
+
+            def aggregate_score(self, results: List[MetricResult]) -> ScalarResult:
+                y_trues = []
+                y_preds = []
+                for result in results:
+                    y_trues.append(result.y_true)
+                    y_preds.append(result.y_pred)
+                r2_score = sklearn.metrics.r2_score(y_trues, y_preds)
+                return ScalarResult(r2_score)
+
+        Args:
+            results: The per-item results to combine into one score.
+
+        Returns:
+            A single ScalarResult summarising all ``results``.
+        """
@@ -0,0 +1,199 @@
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import List, Set, Tuple, Union
+
+from sklearn.metrics import f1_score
+
+from nucleus.annotation import AnnotationList, CategoryAnnotation
+from nucleus.metrics.base import Metric, MetricResult, ScalarResult
+from nucleus.metrics.filters import confidence_filter
+from nucleus.prediction import CategoryPrediction, PredictionList
+
+F1_METHODS = {"micro", "macro", "samples", "weighted", "binary"}
+
+
+def to_taxonomy_labels(
+    anns_or_preds: Union[List[CategoryAnnotation], List[CategoryPrediction]]
+) -> Set[str]:
+    """Collect the distinct taxonomy labels of annotations or predictions.
+
+    An item with a truthy ``taxonomy_name`` contributes
+    ``"<taxonomy_name>-><label>"`` (the two parts joined with the ``->``
+    separator); an item without one contributes its bare ``label``.
+
+    Args:
+        anns_or_preds: Items exposing ``taxonomy_name`` and ``label``.
+
+    Returns:
+        The set of resulting label strings; duplicates collapse.
+    """
+    return {
+        f"{entry.taxonomy_name}->{entry.label}"
+        if entry.taxonomy_name
+        else entry.label
+        for entry in anns_or_preds
+    }
+
+
+@dataclass
+class CategorizationResult(MetricResult):
+    """Per-item categorization result: the annotations and predictions that were matched.
+
+    Attributes are filled in by the metric's ``eval``; the score over a whole
+    collection is computed later by the metric's ``aggregate_score``.
+    """
+
+    # Ground-truth category annotations kept for this item.
+    annotations: List[CategoryAnnotation]
+    # Category predictions kept for this item.
+    predictions: List[CategoryPrediction]
+
+    @property
+    def value(self) -> float:
+        """Exact-match indicator for this single item.
+
+        Returns:
+            ``1.0`` when the set of annotated taxonomy labels equals the set
+            of predicted taxonomy labels, otherwise ``0.0``.
+        """
+        annotation_labels = to_taxonomy_labels(self.annotations)
+        prediction_labels = to_taxonomy_labels(self.predictions)
+
+        # TODO: Change task.py interface such that we can return label matching
+        # NOTE: Returning 1 if all taxonomy labels match else 0
+        # The averaging method lives on the metric, not on this result, so the
+        # per-item value cannot be an F1 score; compare the label sets instead.
+        return 1.0 if annotation_labels == prediction_labels else 0.0
+
+
+class CategorizationMetric(Metric):
+    """Abstract base for metrics computed on category annotations and predictions.
+
+    Calling the metric filters out predictions whose confidence is below
+    ``confidence_threshold`` (only when the threshold is positive), keeps the
+    category annotations and predictions whose taxonomy appears on both
+    sides, and passes them to :meth:`eval`.
+    """
+
+    def __init__(
+        self,
+        confidence_threshold: float = 0.0,
+    ):
+        """Store the confidence threshold used when the metric is called.
+
+        Args:
+            confidence_threshold: minimum confidence threshold for predictions to be taken into account for evaluation. Must be in [0, 1]. Default 0.0
+        """
+        assert 0 <= confidence_threshold <= 1
+        self.confidence_threshold = confidence_threshold
+
+    @abstractmethod
+    def eval(
+        self,
+        annotations: List[
+            CategoryAnnotation
+        ],  # TODO(gunnar): List to conform with other APIs or single instance?
+        predictions: List[CategoryPrediction],
+    ) -> CategorizationResult:
+        """Main evaluation function that subclasses must override."""
+        # TODO(gunnar): Allow passing multiple predictions and selecting highest confidence? Allows us to show next
+        #  contender. Are top-5 scores something that we care about?
+        # TODO(gunnar): How do we handle multi-head classification?
+
+    @abstractmethod
+    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+        """Combine per-item categorization results into one ScalarResult."""
+
+    def __call__(
+        self, annotations: AnnotationList, predictions: PredictionList
+    ) -> CategorizationResult:
+        """Filter the inputs and evaluate them with :meth:`eval`."""
+        threshold = self.confidence_threshold
+        if threshold > 0:
+            predictions = confidence_filter(predictions, threshold)
+
+        common_annotations, common_predictions = self._filter_common_taxonomies(
+            annotations.category_annotations, predictions.category_predictions
+        )
+        return self.eval(common_annotations, common_predictions)
+
+    def _filter_common_taxonomies(
+        self,
+        annotations: List[CategoryAnnotation],
+        predictions: List[CategoryPrediction],
+    ) -> Tuple[List[CategoryAnnotation], List[CategoryPrediction]]:
+        """Keep only annotations and predictions whose taxonomy occurs on both sides."""
+        taxonomies_with_annotations = {ann.taxonomy_name for ann in annotations}
+        # Predictions decide which annotated taxonomies survive ...
+        kept_predictions, shared_taxonomies = self._filter_in_taxonomies(
+            predictions, taxonomies_with_annotations
+        )
+        # ... and annotations are then narrowed to those same taxonomies.
+        kept_annotations, _ = self._filter_in_taxonomies(
+            annotations, shared_taxonomies
+        )
+        return kept_annotations, kept_predictions  # type: ignore
+
+    def _filter_in_taxonomies(
+        self,
+        anns_or_preds: Union[
+            List[CategoryAnnotation], List[CategoryPrediction]
+        ],
+        filter_on_taxonomies: Set[Union[None, str]],
+    ) -> Tuple[
+        Union[List[CategoryAnnotation], List[CategoryPrediction]],
+        Set[Union[None, str]],
+    ]:
+        """Return the items whose taxonomy is in ``filter_on_taxonomies`` and the taxonomies they cover."""
+        kept = [
+            item
+            for item in anns_or_preds
+            if item.taxonomy_name in filter_on_taxonomies
+        ]
+        covered = {item.taxonomy_name for item in kept}
+        return kept, covered
+
+
+class CategorizationF1(CategorizationMetric):
+    """Evaluation method that matches categories and returns a CategorizationResult that aggregates to the F1 score"""
+
+    def __init__(
+        self, confidence_threshold: float = 0.0, f1_method: str = "macro"
+    ):
+        """
+        Args:
+            confidence_threshold: minimum confidence threshold for predictions to be taken into account for evaluation. Must be in [0, 1]. Default 0.0
+            f1_method: {'micro', 'macro', 'samples','weighted', 'binary'}, \
+                default='macro'
+            Passed to ``sklearn.metrics.f1_score`` as ``average``; it
+            determines the type of averaging performed on the data. Values
+            outside the listed set are rejected.
+
+            ``'binary'``:
+                Only report results for the class specified by ``pos_label``.
+                This is applicable only if targets (``y_{true,pred}``) are binary.
+            ``'micro'``:
+                Calculate metrics globally by counting the total true positives,
+                false negatives and false positives.
+            ``'macro'``:
+                Calculate metrics for each label, and find their unweighted
+                mean.  This does not take label imbalance into account.
+            ``'weighted'``:
+                Calculate metrics for each label, and find their average weighted
+                by support (the number of true instances for each label). This
+                alters 'macro' to account for label imbalance; it can result in an
+                F-score that is not between precision and recall.
+            ``'samples'``:
+                Calculate metrics for each instance, and find their average (only
+                meaningful for multilabel classification where this differs from
+                :func:`accuracy_score`).
+        """
+        super().__init__(confidence_threshold)
+        assert (
+            f1_method in F1_METHODS
+        ), f"Invalid f1_method {f1_method}, expected one of {F1_METHODS}"
+        self.f1_method = f1_method
+
+    def eval(
+        self,
+        annotations: List[CategoryAnnotation],
+        predictions: List[CategoryPrediction],
+    ) -> CategorizationResult:
+        """
+        Notes: This is a little weird eval function. It essentially only does matching of annotation to label and
+        the actual metric computation happens in the aggregate step since F1 score only makes sense on a collection.
+        """
+
+        return CategorizationResult(
+            annotations=annotations, predictions=predictions
+        )
+
+    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+        """Compute the F1 score over the labels of all per-item results.
+
+        Args:
+            results: Per-item results produced by :meth:`eval`.
+
+        Returns:
+            A ScalarResult wrapping the value returned by ``f1_score``.
+        """
+        gt: List[str] = []
+        predicted: List[str] = []
+        for result in results:
+            # f1_score pairs y_true[i] with y_pred[i], so both sides must list
+            # their labels in the same taxonomy order for every item.
+            gt.extend(self._ordered_taxonomy_labels(result.annotations))
+            predicted.extend(self._ordered_taxonomy_labels(result.predictions))
+        value = f1_score(gt, predicted, average=self.f1_method)
+        return ScalarResult(value)
+
+    @staticmethod
+    def _ordered_taxonomy_labels(
+        anns_or_preds: Union[List[CategoryAnnotation], List[CategoryPrediction]]
+    ) -> List[str]:
+        """Return the distinct taxonomy labels of the items, ordered by taxonomy name then label."""
+        ordered = sorted(
+            anns_or_preds,
+            key=lambda item: (item.taxonomy_name or "", item.label),
+        )
+        # dict.fromkeys drops duplicate labels while keeping the sorted order,
+        # unlike iterating a set, whose order is arbitrary.
+        return list(
+            dict.fromkeys(
+                label
+                for item in ordered
+                for label in to_taxonomy_labels([item])
+            )
+        )