Skip to content

Commit 2a49913

Browse files
author
Matt Sokoloff
committed
add confusion matrix metric and confidence
1 parent 23b9375 commit 2a49913

File tree

9 files changed

+119
-103
lines changed

9 files changed

+119
-103
lines changed

examples/model_diagnostics/custom-metrics.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@
164164
"metadata": {},
165165
"outputs": [],
166166
"source": [
167-
"from labelbox.data.annotation_types import ScalarMetric, MetricAggregation"
167+
"from labelbox.data.annotation_types import ScalarMetric, ScalarMetricAggregation"
168168
]
169169
},
170170
{
@@ -226,14 +226,14 @@
226226
" metric_name = \"true_positives\",\n",
227227
" feature_name = \"cat\",\n",
228228
" value = 3,\n",
229-
" aggregation = MetricAggregation.SUM\n",
229+
" aggregation = ScalarMetricAggregation.SUM\n",
230230
")\n",
231231
"\n",
232232
"feature_metric = ScalarMetric(\n",
233233
" metric_name = \"true_positives\",\n",
234234
" feature_name = \"dog\",\n",
235235
" value = 4,\n",
236-
" aggregation = MetricAggregation.SUM\n",
236+
" aggregation = ScalarMetricAggregation.SUM\n",
237237
")\n"
238238
]
239239
},

labelbox/data/annotation_types/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,6 @@
2929
from .collection import LabelGenerator
3030

3131
from .metrics import ScalarMetric
32-
from .metrics import MetricAggregation
32+
from .metrics import ScalarMetricAggregation
33+
from .metrics import ConfusionMatrixMetric
34+
from .metrics import ConfusionMatrixAggregation

labelbox/data/annotation_types/label.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from .classification import ClassificationAnswer
1212
from .data import VideoData, TextData, ImageData
1313
from .geometry import Mask
14-
from .metrics import ScalarMetric
14+
from .metrics import ScalarMetric, ConfusionMatrixMetric
1515
from .types import Cuid
1616
from .annotation import (ClassificationAnnotation, ObjectAnnotation,
1717
VideoClassificationAnnotation, VideoObjectAnnotation)
@@ -23,7 +23,7 @@ class Label(BaseModel):
2323
annotations: List[Union[ClassificationAnnotation, ObjectAnnotation,
2424
VideoObjectAnnotation,
2525
VideoClassificationAnnotation, ScalarMetric,
26-
ScalarMetric]] = []
26+
ConfusionMatrixMetric]] = []
2727
extra: Dict[str, Any] = {}
2828

2929
def object_annotations(self) -> List[ObjectAnnotation]:
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .scalar import ScalarMetric
2-
from .aggregations import MetricAggregation
1+
from .scalar import ScalarMetric, ScalarMetricAggregation
2+
from .confusion_matrix import ConfusionMatrixMetric, ConfusionMatrixAggregation

labelbox/data/annotation_types/metrics/aggregations.py

Lines changed: 0 additions & 10 deletions
This file was deleted.
Lines changed: 16 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
1-
from labelbox.data.annotation_types.metrics.aggregations import MetricAggregation
2-
from typing import Any, Dict, Optional, Tuple, Union
3-
from pydantic import BaseModel, Field, validator, confloat
1+
from typing import Dict, Optional, Union
2+
from enum import Enum
43

4+
from pydantic import confloat
55

6-
ScalarMetricConfidenceValue = Dict[confloat(ge=0, le=1), float]
7-
ConfusionMatrixMetricConfidenceValue = Dict[confloat(ge=0, le=1), Tuple[int,int,int,int]]
6+
from .base import ConfidenceValue, BaseMetric
87

8+
ScalarMetricValue = confloat(ge=0, le=10_000)
9+
ScalarMetricConfidenceValue = Dict[ConfidenceValue, ScalarMetricValue]
910

10-
class BaseMetric(BaseModel):
11-
metric_name: Optional[str] = None
12-
feature_name: Optional[str] = None
13-
subclass_name: Optional[str] = None
14-
extra: Dict[str, Any] = {}
11+
12+
class ScalarMetricAggregation(Enum):
13+
ARITHMETIC_MEAN = "ARITHMETIC_MEAN"
14+
GEOMETRIC_MEAN = "GEOMETRIC_MEAN"
15+
HARMONIC_MEAN = "HARMONIC_MEAN"
16+
SUM = "SUM"
1517

1618

1719
class ScalarMetric(BaseMetric):
@@ -22,33 +24,12 @@ class ScalarMetric(BaseMetric):
2224
This is not recommended and support for empty metric_name fields will be removed.
2325
aggregation will be ignored without providing a metric name.
2426
"""
27+
metric_name: Optional[str] = None
2528
value: Union[float, ScalarMetricConfidenceValue]
26-
aggregation: MetricAggregation = MetricAggregation.ARITHMETIC_MEAN
29+
aggregation: ScalarMetricAggregation = ScalarMetricAggregation.ARITHMETIC_MEAN
2730

2831
def dict(self, *args, **kwargs):
2932
res = super().dict(*args, **kwargs)
30-
if res['metric_name'] is None:
33+
if res.get('metric_name') is None:
3134
res.pop('aggregation')
32-
return {k: v for k, v in res.items() if v is not None}
33-
34-
@validator('aggregation')
35-
def validate_aggregation(cls, aggregation):
36-
if aggregation == MetricAggregation.CONFUSION_MATRIX:
37-
raise ValueError("Cannot assign `MetricAggregation.CONFUSION_MATRIX` to `ScalarMetric.aggregation`")
38-
39-
40-
41-
class ConfusionMatrixMetric(BaseMetric):
42-
""" Class representing confusion matrix metrics.
43-
44-
In the editor, this provides precision, recall, and f-scores.
45-
This should be used over multiple scalar metrics so that aggregations are accurate.
46-
47-
value should be a tuple representing:
48-
[True Positive Count, False Positive Count, True Negative Count, False Negative Count]
49-
50-
aggregation cannot be adjusted for confusion matrix metrics.
51-
"""
52-
value: Union[Tuple[int,int,int,int], ConfusionMatrixMetricConfidenceValue]
53-
aggregation: MetricAggregation = Field(MetricAggregation.CONFUSION_MATRIX, const = True)
54-
35+
return res

labelbox/data/serialization/ndjson/metric.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from labelbox.data.annotation_types.metrics.aggregations import MetricAggregation
1+
from labelbox.data.annotation_types.metrics import ScalarMetricAggregation
22
from typing import Union, Optional
33

44
from labelbox.data.annotation_types.data import ImageData, TextData
@@ -11,15 +11,16 @@ class NDScalarMetric(NDJsonBase):
1111
metric_name: Optional[str]
1212
feature_name: Optional[str] = None
1313
subclass_name: Optional[str] = None
14-
aggregation: MetricAggregation = MetricAggregation.ARITHMETIC_MEAN.value
14+
aggregation: ScalarMetricAggregation = ScalarMetricAggregation.ARITHMETIC_MEAN.value
1515

1616
def to_common(self) -> ScalarMetric:
17-
return ScalarMetric(value=self.metric_value,
18-
metric_name=self.metric_name,
19-
feature_name=self.feature_name,
20-
subclass_name=self.subclass_name,
21-
aggregation=MetricAggregation[self.aggregation],
22-
extra={'uuid': self.uuid})
17+
return ScalarMetric(
18+
value=self.metric_value,
19+
metric_name=self.metric_name,
20+
feature_name=self.feature_name,
21+
subclass_name=self.subclass_name,
22+
aggregation=ScalarMetricAggregation[self.aggregation],
23+
extra={'uuid': self.uuid})
2324

2425
@classmethod
2526
def from_common(cls, metric: ScalarMetric,

tests/data/annotation_types/test_metrics.py

Lines changed: 84 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
from pydantic import ValidationError
12
import pytest
23

3-
from labelbox.data.annotation_types.metrics.aggregations import MetricAggregation
4-
from labelbox.data.annotation_types.metrics.scalar import ScalarMetric
4+
from labelbox.data.annotation_types.metrics import ConfusionMatrixAggregation, ScalarMetricAggregation
5+
from labelbox.data.annotation_types.metrics import ConfusionMatrixMetric, ScalarMetric
56
from labelbox.data.annotation_types.collection import LabelList
67
from labelbox.data.annotation_types import ScalarMetric, Label, ImageData
78

@@ -30,23 +31,28 @@ def test_legacy_scalar_metric():
3031
'uid': None
3132
}
3233
assert label.dict() == expected
33-
next(LabelList([label])).dict() == expected
34+
assert next(LabelList([label])).dict() == expected
3435

3536

3637
# TODO: Test with confidence
3738

38-
@pytest.mark.parametrize('feature_name,subclass_name,aggregation', [
39-
("cat", "orange", MetricAggregation.ARITHMETIC_MEAN),
40-
("cat", None, MetricAggregation.ARITHMETIC_MEAN),
41-
(None, None, MetricAggregation.ARITHMETIC_MEAN),
42-
(None, None, None),
43-
("cat", "orange", MetricAggregation.ARITHMETIC_MEAN),
44-
("cat", None, MetricAggregation.HARMONIC_MEAN),
45-
(None, None, MetricAggregation.GEOMETRIC_MEAN),
46-
(None, None, MetricAggregation.SUM)
39+
40+
@pytest.mark.parametrize('feature_name,subclass_name,aggregation,value', [
41+
("cat", "orange", ScalarMetricAggregation.ARITHMETIC_MEAN, 0.5),
42+
("cat", None, ScalarMetricAggregation.ARITHMETIC_MEAN, 0.5),
43+
(None, None, ScalarMetricAggregation.ARITHMETIC_MEAN, 0.5),
44+
(None, None, None, 0.5),
45+
("cat", "orange", ScalarMetricAggregation.ARITHMETIC_MEAN, 0.5),
46+
("cat", None, ScalarMetricAggregation.HARMONIC_MEAN, 0.5),
47+
(None, None, ScalarMetricAggregation.GEOMETRIC_MEAN, 0.5),
48+
(None, None, ScalarMetricAggregation.SUM, 0.5),
49+
("cat", "orange", ScalarMetricAggregation.ARITHMETIC_MEAN, {
50+
0.1: 0.2,
51+
0.3: 0.5,
52+
0.4: 0.8
53+
}),
4754
])
48-
def test_custom_scalar_metric(feature_name, subclass_name, aggregation):
49-
value = 0.5
55+
def test_custom_scalar_metric(feature_name, subclass_name, aggregation, value):
5056
kwargs = {'aggregation': aggregation} if aggregation is not None else {}
5157
metric = ScalarMetric(metric_name="iou",
5258
value=value,
@@ -77,36 +83,37 @@ def test_custom_scalar_metric(feature_name, subclass_name, aggregation):
7783
**({
7884
'subclass_name': subclass_name
7985
} if subclass_name else {}), 'aggregation':
80-
aggregation or MetricAggregation.ARITHMETIC_MEAN,
86+
aggregation or ScalarMetricAggregation.ARITHMETIC_MEAN,
8187
'extra': {}
8288
}],
8389
'extra': {},
8490
'uid': None
8591
}
86-
assert label.dict() == expected
87-
next(LabelList([label])).dict() == expected
88-
8992

93+
assert label.dict() == expected
94+
assert next(LabelList([label])).dict() == expected
9095

9196

92-
@pytest.mark.parametrize('feature_name,subclass_name,aggregation', [
93-
("cat", "orange", MetricAggregation.ARITHMETIC_MEAN),
94-
("cat", None, MetricAggregation.ARITHMETIC_MEAN),
95-
(None, None, MetricAggregation.ARITHMETIC_MEAN),
96-
(None, None, None),
97-
("cat", "orange", MetricAggregation.ARITHMETIC_MEAN),
98-
("cat", None, MetricAggregation.HARMONIC_MEAN),
99-
(None, None, MetricAggregation.GEOMETRIC_MEAN),
100-
(None, None, MetricAggregation.SUM),
97+
@pytest.mark.parametrize('feature_name,subclass_name,aggregation,value', [
98+
("cat", "orange", ConfusionMatrixAggregation.CONFUSION_MATRIX,
99+
(0, 1, 2, 3)),
100+
("cat", None, ConfusionMatrixAggregation.CONFUSION_MATRIX, (0, 1, 2, 3)),
101+
(None, None, ConfusionMatrixAggregation.CONFUSION_MATRIX, (0, 1, 2, 3)),
102+
(None, None, None, (0, 1, 2, 3)),
103+
("cat", "orange", ConfusionMatrixAggregation.CONFUSION_MATRIX, {
104+
0.1: (0, 1, 2, 3),
105+
0.3: (0, 1, 2, 3),
106+
0.4: (0, 1, 2, 3)
107+
}),
101108
])
102-
def test_custom_scalar_metric(feature_name, subclass_name, aggregation):
103-
value = 0.5
109+
def test_custom_confusion_matrix_metric(feature_name, subclass_name,
110+
aggregation, value):
104111
kwargs = {'aggregation': aggregation} if aggregation is not None else {}
105-
metric = ScalarMetric(metric_name="iou",
106-
value=value,
107-
feature_name=feature_name,
108-
subclass_name=subclass_name,
109-
**kwargs)
112+
metric = ConfusionMatrixMetric(metric_name="confusion_matrix_50_pct_iou",
113+
value=value,
114+
feature_name=feature_name,
115+
subclass_name=subclass_name,
116+
**kwargs)
110117
assert metric.value == value
111118

112119
label = Label(data=ImageData(uid="ckrmd9q8g000009mg6vej7hzg"),
@@ -124,18 +131,58 @@ def test_custom_scalar_metric(feature_name, subclass_name, aggregation):
124131
'value':
125132
value,
126133
'metric_name':
127-
'iou',
134+
'confusion_matrix_50_pct_iou',
128135
**({
129136
'feature_name': feature_name
130137
} if feature_name else {}),
131138
**({
132139
'subclass_name': subclass_name
133140
} if subclass_name else {}), 'aggregation':
134-
aggregation or MetricAggregation.ARITHMETIC_MEAN,
141+
aggregation or ConfusionMatrixAggregation.CONFUSION_MATRIX,
135142
'extra': {}
136143
}],
137144
'extra': {},
138145
'uid': None
139146
}
140147
assert label.dict() == expected
141-
next(LabelList([label])).dict() == expected
148+
assert next(LabelList([label])).dict() == expected
149+
150+
151+
def test_name_exists():
152+
# Name is only required for ConfusionMatrixMetric for now.
153+
with pytest.raises(ValidationError) as exc_info:
154+
metric = ConfusionMatrixMetric(value=[0, 1, 2, 3])
155+
assert "field required (type=value_error.missing)" in str(exc_info.value)
156+
157+
158+
def test_invalid_aggregations():
159+
with pytest.raises(ValidationError) as exc_info:
160+
metric = ScalarMetric(
161+
metric_name="invalid aggregation",
162+
value=0.1,
163+
aggregation=ConfusionMatrixAggregation.CONFUSION_MATRIX)
164+
assert "value is not a valid enumeration member" in str(exc_info.value)
165+
with pytest.raises(ValidationError) as exc_info:
166+
metric = ConfusionMatrixMetric(metric_name="invalid aggregation",
167+
value=[0, 1, 2, 3],
168+
aggregation=ScalarMetricAggregation.SUM)
169+
assert "value is not a valid enumeration member" in str(exc_info.value)
170+
171+
172+
def test_invalid_number_of_confidence_scores():
173+
with pytest.raises(ValidationError) as exc_info:
174+
metric = ScalarMetric(metric_name="too few scores", value={0.1: 0.1})
175+
assert "Number of confidence scores must be greater" in str(exc_info.value)
176+
with pytest.raises(ValidationError) as exc_info:
177+
metric = ConfusionMatrixMetric(metric_name="too few scores",
178+
value={0.1: [0, 1, 2, 3]})
179+
assert "Number of confidence scores must be greater" in str(exc_info.value)
180+
with pytest.raises(ValidationError) as exc_info:
181+
metric = ScalarMetric(metric_name="too many scores",
182+
value={i / 20.: 0.1 for i in range(20)})
183+
assert "Number of confidence scores must be greater" in str(exc_info.value)
184+
with pytest.raises(ValidationError) as exc_info:
185+
metric = ConfusionMatrixMetric(
186+
metric_name="too many scores",
187+
value={i / 20.: [0, 1, 2, 3] for i in range(20)})
188+
assert "Number of confidence scores must be greater" in str(exc_info.value)

tests/integration/conftest.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -212,14 +212,9 @@ def datarow(dataset, image_url):
212212
@pytest.fixture
213213
def label_pack(project, rand_gen, image_url):
214214
client = project.client
215-
<<<<<<< HEAD
216215
dataset = client.create_dataset(name=rand_gen(str))
217216
project.datasets.connect(dataset)
218217
data_row = dataset.create_data_row(row_data=IMG_URL)
219-
=======
220-
dataset = client.create_dataset(name=rand_gen(str), projects=project)
221-
data_row = dataset.create_data_row(row_data=image_url)
222-
>>>>>>> 6970d60beebc6c969a81c891b4c88db7c57f98df
223218
label = project.create_label(data_row=data_row, label=rand_gen(str))
224219
time.sleep(10)
225220
yield LabelPack(project, dataset, data_row, label)

0 commit comments

Comments
 (0)