Commit 2687568

Merge pull request #143 from scaleapi/da-document-predictions
Document predictions, model, model_run and autocurate
2 parents dfaffe2 + eae5455 commit 2687568

File tree: 6 files changed, +145 -3 lines changed

docs/Makefile

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 # from the environment for the first two.
 SPHINXOPTS ?=
 SPHINXBUILD ?= sphinx-build
-SOURCEDIR = source
+SOURCEDIR = .
 BUILDDIR = build
 
 # Put it first so that "make" without argument is like "make help".
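Note: with SOURCEDIR now pointing at the docs directory itself instead of a source/ subfolder, Sphinx reads conf.py straight from docs/. As a rough sketch, the equivalent of `make html` through Sphinx's Python entry point would presumably be (output path inferred from BUILDDIR = build above)::

    from sphinx.cmd.build import build_main

    # Build HTML from docs/ (which now holds conf.py) into docs/build/html,
    # mirroring `make html` with the new SOURCEDIR.
    build_main(["-b", "html", "docs", "docs/build/html"])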

nucleus/autocurate.py

Lines changed: 7 additions & 0 deletions
@@ -1,3 +1,9 @@
+"""This module can be used to compute active learning metrics on your predictions.
+
+For more details on usage, see the example colab in scripts/autocurate_bdd.ipynb.
+"""
+
+
 import datetime
 import requests
 from nucleus.constants import (
@@ -9,6 +15,7 @@
 
 
 def entropy(name, model_run, client):
+    """Computes the mean entropy across all predictions for each image."""
     model_run_ids = [model_run.model_run_id]
     dataset_id = model_run.dataset_id
     response = client.make_request(
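For intuition about the entropy metric documented above: it is the Shannon entropy of each prediction's class probability distribution, averaged per image. A minimal local sketch (prediction_entropy is illustrative only, not part of the client, and the server-side aggregation is not shown in this diff)::

    import math

    def prediction_entropy(class_pdf: dict) -> float:
        # Shannon entropy of a class probability distribution, such as
        # BoxPrediction.class_pdf; higher values mean a less certain model.
        return -sum(p * math.log(p) for p in class_pdf.values() if p > 0)

    prediction_entropy({"label": 0.5, "other_label": 0.5})  # ~0.693, uncertain
    prediction_entropy({"label": 0.9, "other_label": 0.1})  # ~0.325, more confident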

nucleus/model.py

Lines changed: 83 additions & 0 deletions
@@ -1,3 +1,63 @@
+"""
+By uploading model predictions to Nucleus, you can compare your predictions to ground truth annotations and discover problems with your Models or Dataset.
+
+You can also upload predictions for unannotated images, letting you query them based on model predictions. This can help you prioritize which unlabeled data to label next.
+
+Within Nucleus, Models work in the following way:
+
+1. You first create a Model. You can do this just once and reuse the model on multiple datasets.
+2. You then upload predictions to a dataset.
+3. Finally, you trigger calculation of model metrics to view model debugging insights.
+
+Doing the three steps above allows you to visualize model performance within Nucleus, or compare multiple models that have been run on the same Dataset.
+
+
+Note that you can always add more predictions to a dataset, but you will then need to re-run the calculation of metrics for them to remain correct.
+
+::
+
+    import nucleus
+
+    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
+    dataset = client.get_dataset("YOUR_DATASET_ID")
+    prediction_1 = nucleus.BoxPrediction(
+        label="label",
+        x=0,
+        y=0,
+        width=10,
+        height=10,
+        reference_id="1",
+        confidence=0.9,
+        class_pdf={"label": 0.9, "other_label": 0.1},
+    )
+    prediction_2 = nucleus.BoxPrediction(
+        label="label",
+        x=0,
+        y=0,
+        width=10,
+        height=10,
+        reference_id="2",
+        confidence=0.2,
+        class_pdf={"label": 0.2, "other_label": 0.8},
+    )
+    model = client.add_model(
+        name="My Model", reference_id="My-CNN", metadata={"timestamp": "121012401"}
+    )
+    # For small ingestions, we recommend synchronous ingestion
+    response = dataset.upload_predictions(model, [prediction_1, prediction_2])
+    # For large ingestions, we recommend asynchronous ingestion
+    job = dataset.upload_predictions(
+        model, [prediction_1, prediction_2], asynchronous=True
+    )
+    # Check current status
+    job.status()
+    # Sleep until ingestion is done
+    job.sleep_until_complete()
+    # Check errors
+    job.errors()
+
+    dataset.calculate_evaluation_metrics(model)
+"""
 from typing import List, Optional, Dict, Union
 from .dataset import Dataset
 from .prediction import (
@@ -15,6 +75,19 @@
 
 
 class Model:
+    """A model that can be used to upload predictions to a dataset.
+
+    Attributes:
+        model_id: The Scale-generated unique id for this model.
+        name: A human-readable name for the model.
+        reference_id: A unique, user-controlled ID for the model. This can be
+            used, for example, to link to an external store of models which may
+            have its own id scheme.
+        metadata: An arbitrary dictionary of additional data about this model
+            that can be stored and retrieved. For example, you can store
+            information about the hyperparameters used in training this model.
+    """
+
     def __init__(
         self,
         model_id: str,
@@ -68,6 +141,16 @@ def create_run(
         metadata: Optional[Dict] = None,
         asynchronous: bool = False,
     ) -> ModelRun:
+        """Note: this method, and model runs in general, are now deprecated.
+
+        Models now automatically generate a model run when applied to a dataset
+        via dataset.upload_predictions(model, predictions), so there is no
+        longer any need to explicitly create a model run before uploading
+        predictions.
+
+        When uploading to a dataset twice using the same model, the same model
+        run will be reused by Nucleus.
+        """
         payload: dict = {
             NAME_KEY: name,
             REFERENCE_ID_KEY: self.reference_id,
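As the module docstring notes, more predictions can be added to a dataset at any time, but previously computed metrics then go stale. A short continuation of the docstring's own example (same names; more_predictions is a hypothetical later batch)::

    more_predictions = [prediction_2]
    dataset.upload_predictions(model, more_predictions)
    # Metrics computed earlier do not reflect the new predictions,
    # so trigger the calculation again:
    dataset.calculate_evaluation_metrics(model)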

nucleus/model_run.py

Lines changed: 16 additions & 2 deletions
@@ -1,3 +1,18 @@
+"""
+Model Runs are deprecated and will be removed in a future version of the python client.
+It is now possible to upload model predictions without creating a model run.
+
+For example:
+
+.. code-block:: python
+
+    import nucleus
+
+    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
+    dataset = client.get_dataset("YOUR_DATASET_ID")
+    prediction_1 = nucleus.BoxPrediction(label="label", x=0, y=0, width=10, height=10, reference_id="1", confidence=0.9, class_pdf={"label": 0.9, "other_label": 0.1})
+    prediction_2 = nucleus.BoxPrediction(label="label", x=0, y=0, width=10, height=10, reference_id="2", confidence=0.2, class_pdf={"label": 0.2, "other_label": 0.8})
+    model = client.add_model(name="My Model", reference_id="My-CNN", metadata={"timestamp": "121012401"})
+    response = dataset.upload_predictions(model, [prediction_1, prediction_2])
+"""
+
 from typing import List, Optional, Union
 
 import requests
@@ -26,8 +41,7 @@
 
 class ModelRun:
     """
-    Model runs represent detections of a specific model on your dataset.
-    Having an open model run is a prerequisite for uploading predictions to your dataset.
+    This class is deprecated and will be removed from the python client.
     """
 
     def __init__(self, model_run_id: str, dataset_id: str, client):
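For callers migrating off ModelRun, the replacement flow is the one shown in the module docstring above: upload through the dataset and let Nucleus create or reuse the run. An asynchronous variant, assuming the same names and the job helpers from the model.py docstring earlier in this commit::

    job = dataset.upload_predictions(
        model, [prediction_1, prediction_2], asynchronous=True
    )
    job.sleep_until_complete()
    assert not job.errors()  # sketch: expect an empty error list on success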

nucleus/prediction.py

Lines changed: 36 additions & 0 deletions
@@ -1,3 +1,7 @@
+"""All of the prediction types supported. In general, prediction types are the same
+as annotation types, but come with additional, optional data that can be attached,
+such as confidence or probability distributions.
+"""
 from typing import Dict, Optional, List
 from .annotation import (
     BoxAnnotation,
@@ -66,6 +70,16 @@ def from_json(cls, payload: dict):
 
 
 class BoxPrediction(BoxAnnotation):
+    """A prediction of a bounding box.
+
+    Attributes:
+        confidence: 0-1 indicating the confidence of the prediction.
+        class_pdf: An optional complete class probability distribution on this
+            annotation. Each value should be between 0 and 1 (inclusive), and the
+            values should sum to 1 as a complete distribution. This can be useful
+            for computing entropy to surface places where the model is most
+            uncertain.
+    """
+
     def __init__(
         self,
         label: str,
@@ -119,6 +133,16 @@ def from_json(cls, payload: dict):
 
 
 class PolygonPrediction(PolygonAnnotation):
+    """A prediction of a polygon.
+
+    Attributes:
+        confidence: 0-1 indicating the confidence of the prediction.
+        class_pdf: An optional complete class probability distribution on this
+            annotation. Each value should be between 0 and 1 (inclusive), and the
+            values should sum to 1 as a complete distribution. This can be useful
+            for computing entropy to surface places where the model is most
+            uncertain.
+    """
+
     def __init__(
         self,
         label: str,
@@ -165,6 +189,16 @@ def from_json(cls, payload: dict):
 
 
 class CuboidPrediction(CuboidAnnotation):
+    """A prediction of a 3D cuboid.
+
+    Attributes:
+        confidence: 0-1 indicating the confidence of the prediction.
+        class_pdf: An optional complete class probability distribution on this
+            annotation. Each value should be between 0 and 1 (inclusive), and the
+            values should sum to 1 as a complete distribution. This can be useful
+            for computing entropy to surface places where the model is most
+            uncertain.
+    """
+
     def __init__(
         self,
         label: str,
@@ -215,6 +249,8 @@ def from_json(cls, payload: dict):
 
 
 class CategoryPrediction(CategoryAnnotation):
+    """This class is not yet supported: Categorization support coming soon!"""
+
     def __init__(
         self,
         label: str,
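Because predictions mirror the annotation types and only add confidence data, the main new invariant is that class_pdf be a valid distribution, as the docstrings above require. A small illustrative check (is_valid_class_pdf is hypothetical, not part of the client)::

    def is_valid_class_pdf(class_pdf: dict, tol: float = 1e-6) -> bool:
        # Every probability must lie in [0, 1] and the values must sum to 1.
        values = list(class_pdf.values())
        return all(0.0 <= p <= 1.0 for p in values) and abs(sum(values) - 1.0) < tol

    is_valid_class_pdf({"label": 0.9, "other_label": 0.1})  # True
    is_valid_class_pdf({"label": 0.9, "other_label": 0.3})  # False: sums to 1.2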

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,7 @@ tqdm = "^4.41.0"
 dataclasses = { version = "^0.7", python = "^3.6.1, <3.7" }
 aiohttp = "^3.7.4"
 nest-asyncio = "^1.5.1"
+Sphinx = "^4.2.0"
 
 [tool.poetry.dev-dependencies]
 poetry = "^1.1.5"
@@ -50,6 +51,7 @@ coverage = "^5.5"
 pre-commit = "^2.12.1"
 jupyterlab = "^3.1.10"
 absl-py = "^0.13.0"
+furo = "^2021.10.9"
 
 [tool.pytest.ini_options]
 markers = [
