Commit 47a8ffb

FAI-884: Add SHAP background generators to bindings (#117)
* unified input/output types, conversion functions, and docstrings
* Initial creation of background generators
* fixed many-x conversions for 1d arrays, added more test cases
* fixing input -> output typo
* fixed randozm -> random typo
* further fleshed out cf generation
* added feature domains into data conversions
* extended cf generation tests
* changed default parameters, linting
1 parent 1563850 commit 47a8ffb

File tree

3 files changed: +244, -0 lines changed

src/trustyai/explainers/lime.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
     data_conversion_docstring,
     OneOutputUnionType,
 )
+
 from .explanation_results import SaliencyResults
 from trustyai.model import simple_prediction
 

src/trustyai/explainers/shap.py

Lines changed: 156 additions & 0 deletions
@@ -28,14 +28,23 @@
     OneInputUnionType,
     OneOutputUnionType,
     ManyInputsUnionType,
+    ManyOutputsUnionType,
     many_inputs_convert,
     data_conversion_docstring,
+    many_outputs_convert,
 )
 
 from org.kie.trustyai.explainability.local.shap import (
     ShapConfig as _ShapConfig,
     ShapKernelExplainer as _ShapKernelExplainer,
 )
+
+from org.kie.trustyai.explainability.local.shap.background import (
+    RandomGenerator,
+    KMeansGenerator,
+    CounterfactualGenerator,
+)
+
 from org.kie.trustyai.explainability.model import (
     PredictionProvider,
     Saliency,
@@ -410,6 +419,151 @@ def _get_bokeh_plot_dict(self):
         }
 
 
+class BackgroundGenerator:
+    r"""Generate a background for the SHAP explainer via one of three algorithms:
+
+    * `sample`: Randomly sample a set of provided points
+    * `kmeans`: Summarize a set of provided points into k centroids
+    * `counterfactual`: Generate a set of background points that meet certain criteria
+
+    """
+
+    @data_conversion_docstring("many_inputs")
+    def __init__(self, datapoints: ManyInputsUnionType, feature_domains=None, seed=0):
+        r"""Initialize the :class:`BackgroundGenerator`.
+
+        Parameters
+        ----------
+        datapoints : {}
+            The set of datapoints to be used to sample/generate the background, as a: {}
+        feature_domains : list or None
+            The list of feature domains (as returned by :func:`~trustyai.model.feature_domain`);
+            required for counterfactual background generation
+        seed : int
+            The random seed to use in the sampling/generation method
+        """
+        self.datapoints = many_inputs_convert(datapoints, feature_domains)
+        self.feature_domains = feature_domains
+        self.seed = seed
+        self._jrandom = Random()
+        self._jrandom.setSeed(self.seed)
+
+    def sample(self, k=100):
+        r"""Randomly sample `k` datapoints.
+
+        Parameters
+        ----------
+        k : int
+            The number of datapoints to select
+
+        Returns
+        -------
+        :list:`PredictionInput`
+            The background dataset to pass to the :class:`~SHAPExplainer`
+        """
+        perturbation_context = PerturbationContext(self._jrandom, 0)
+        return RandomGenerator(self.datapoints, perturbation_context).generate(k)
+
+    def kmeans(self, k=100):
+        r"""Use k-means clustering over `datapoints` and return k centroids as the
+        background dataset.
+
+        Parameters
+        ----------
+        k : int
+            The number of centroids to find
+
+        Returns
+        -------
+        :list:`PredictionInput`
+            The background dataset to pass to the :class:`~SHAPExplainer`
+        """
+        return KMeansGenerator(self.datapoints, self.seed).generate(k)
+
+    @data_conversion_docstring("many_outputs")
+    def counterfactual(
+        self,
+        goals: ManyOutputsUnionType,
+        model: PredictionProvider,
+        k_per_goal=100,
+        **kwargs,
+    ):
+        r"""Generate a background via the CounterfactualExplainer. This lets you specify
+        exact output values that the background dataset conforms to, and thus set the
+        reference point against which all SHAP values are compared. For example, if your
+        model is a regression model, choosing a counterfactual goal of 0 will create a
+        background dataset where :math:`f(x) \approx 0 \;\forall x \in \text{{background}}`,
+        and as such the SHAP values will compare against zero, which is a useful baseline
+        for regression.
+
+        Parameters
+        ----------
+        goals : {}
+            The set of goal outputs to generate the background around, as a: {}
+        model : :obj:`~trustyai.model.PredictionProvider`
+            The TrustyAI PredictionProvider, as generated by :class:`~trustyai.model.Model`
+        k_per_goal : int
+            The number of background datapoints to generate per goal.
+
+        Keyword Arguments:
+            * k_seeds: int
+                (default=5) For each goal, the number of starting seeds from `datapoints`
+                used to start the search. These are the `k_seeds` points within `datapoints`
+                whose corresponding outputs are closest to the goal output. Choose a larger
+                number to get a more diverse background dataset, but the search might then
+                require a larger `max_attempt_count`, `step_count`, and `timeout_seconds`
+                to get good results.
+            * goal_threshold: float
+                (default=.01) The distance (percentage) threshold defining whether a
+                particular output satisfies the goal. Set to 0 to require an exact match,
+                but this will likely require a larger `max_attempt_count`, `step_count`,
+                and `timeout_seconds` to get good results.
+            * chain: boolean
+                (default=False) If chaining is set to `True`, found counterfactual
+                datapoints are added to the search seeds for subsequent searches. This is
+                useful when a range of counterfactual outputs is desired; for example, if
+                the desired goals are [0, 1, 2, 3], the goal closest to the outputs of
+                `datapoints` is searched for first. The counterfactuals found by that
+                search are then included in the search for the second-closest goal, and
+                so on. This is especially helpful if the extremes of the goal range are
+                far outside the range produced by the `datapoints`.
+            * max_attempt_count: int
+                (default=5) If no valid counterfactual can be found for a starting seed,
+                the point is slightly perturbed and the search is retried. This parameter
+                sets the maximum number of perturbation-retry cycles allowed during
+                generation.
+            * step_count: int
+                (default=5,000) The number of datapoints to evaluate during the search
+            * timeout_seconds: int
+                (default=3) The maximum number of seconds allowed for each counterfactual
+                search
+
+        Returns
+        -------
+        :list:`PredictionInput`
+            The background dataset to pass to the :class:`~SHAPExplainer`
+        """
+        if self.feature_domains is None:
+            raise AttributeError(
+                "Feature domains must be passed to perform"
+                " meaningful counterfactual search"
+            )
+        goals_converted = many_outputs_convert(goals)
+        generator = (
+            CounterfactualGenerator.builder()
+            .withModel(model)
+            .withKSeeds(kwargs.get("k_seeds", 5))
+            .withRandom(self._jrandom)
+            .withTimeoutSeconds(kwargs.get("timeout_seconds", 3))
+            .withStepCount(kwargs.get("step_count", 5_000))
+            .withGoalThreshold(kwargs.get("goal_threshold", 0.01))
+            .withMaxAttemptCount(kwargs.get("max_attempt_count", 5))
+            .build()
+        )
+
+        if len(goals) == 1:
+            background = generator.generate(
+                self.datapoints, goals_converted[0], k_per_goal
+            )
+        else:
+            background = generator.generateRange(
+                self.datapoints, goals_converted, k_per_goal, kwargs.get("chain", False)
+            )
+        return background
+
+
 class SHAPExplainer:
     r"""*"By how much did each feature contribute to the outputs?"*
 
@@ -511,6 +665,8 @@ def explain(
         outputs : {}
             The corresponding model outputs for the provided features, that is,
             ``outputs = model(input_features)``. These can take the form of a: {}
+        model : :obj:`~trustyai.model.PredictionProvider`
+            The TrustyAI PredictionProvider, as generated by :class:`~trustyai.model.Model`
 
         Returns
         -------
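
A minimal usage sketch of the new `BackgroundGenerator` API (not part of this diff): it assumes numpy input is accepted by the `many_inputs` conversion shown above, and that `SHAPExplainer` takes the generated background via a `background` constructor argument, which this commit does not show; variable names are illustrative only.

```python
import numpy as np

from trustyai.explainers.shap import BackgroundGenerator, SHAPExplainer
from trustyai.model import Model

# Toy data and model, mirroring the test suite below
data = np.random.rand(100, 5)
model = Model(lambda x: x.sum(1), arrow=False)

# Randomly sample 10 of the provided points as the background...
random_background = BackgroundGenerator(data, seed=0).sample(10)

# ...or summarize the 100 points into 10 k-means centroids
kmeans_background = BackgroundGenerator(data, seed=0).kmeans(10)

# Assumption: SHAPExplainer accepts the background list via its constructor
explainer = SHAPExplainer(background=kmeans_background)
```
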
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+"""SHAP background generation test suite"""
+
+import pytest
+import numpy as np
+import math
+
+from trustyai.explainers.shap import BackgroundGenerator
+from trustyai.model import Model, feature_domain
+from trustyai.utils.data_conversions import prediction_object_to_numpy
+
+
+def test_random_generation():
+    """Test that random sampling recovers samples from the distribution"""
+    seed = 0
+    np.random.seed(seed)
+    data = np.random.rand(100, 5)
+    background_ta = BackgroundGenerator(data).sample(5)
+    background = prediction_object_to_numpy(background_ta)
+
+    assert len(background) == 5
+    for row in background:
+        assert row in data
+
+
+def test_kmeans_generation():
+    """Test that k-means recovers centroids of well-clustered data"""
+
+    seed = 0
+    clusters = 5
+    np.random.seed(seed)
+
+    data = []
+    ground_truth = []
+    for cluster in range(clusters):
+        data.append(np.random.rand(100 // clusters, 5) + cluster * 10)
+        ground_truth.append(np.array([cluster * 10] * 5))
+    data = np.vstack(data)
+    ground_truth = np.vstack(ground_truth)
+    background_ta = BackgroundGenerator(data).kmeans(clusters)
+    background = prediction_object_to_numpy(background_ta)
+
+    assert len(background) == clusters
+    for row in background:
+        ground_truth_idx = math.floor(row[0] / 10)
+        assert np.linalg.norm(row - ground_truth[ground_truth_idx]) < 2.5
+
+
+def test_counterfactual_generation_single_goal():
+    """Test that the counterfactual background meets requirements"""
+    seed = 0
+    np.random.seed(seed)
+    data = np.random.rand(100, 5)
+    model = Model(lambda x: x.sum(1), arrow=False)
+    goal = np.array([1.0])
+
+    # check that undomained backgrounds are caught
+    with pytest.raises(AttributeError):
+        BackgroundGenerator(data).counterfactual(goal, model, 10)
+
+    domains = [feature_domain((-10, 10)) for _ in range(5)]
+    background_ta = BackgroundGenerator(data, domains, seed)\
+        .counterfactual(goal, model, 5, step_count=5000, timeout_seconds=2)
+    background = prediction_object_to_numpy(background_ta)
+
+    for row in background:
+        assert np.linalg.norm(goal - model(row.reshape(1, -1))) < .01
+
+
+def test_counterfactual_generation_multi_goal():
+    """Test that the counterfactual background meets requirements for multiple goals"""
+
+    seed = 0
+    np.random.seed(seed)
+    data = np.random.rand(100, 5)
+    model = Model(lambda x: x.sum(1), arrow=False)
+    goals = np.arange(1, 10).reshape(-1, 1)
+    domains = [feature_domain((-10, 10)) for _ in range(5)]
+    background_ta = BackgroundGenerator(data, domains, seed)\
+        .counterfactual(goals, model, 1, step_count=5000, timeout_seconds=2, chain=True)
+    background = prediction_object_to_numpy(background_ta)
+
+    for i, goal in enumerate(goals):
+        assert np.linalg.norm(goal - model(background[i:i+1])) < goal[0] / 100
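
For completeness, a hedged end-to-end sketch showing a counterfactual background feeding a SHAP explanation, mirroring the regression baseline described in the `counterfactual()` docstring. The `background=` constructor argument and the `explain(inputs=..., outputs=..., model=...)` call are assumptions based on the docstring hunks above, not something this diff adds.

```python
import numpy as np

from trustyai.explainers.shap import BackgroundGenerator, SHAPExplainer
from trustyai.model import Model, feature_domain

data = np.random.rand(100, 5)
model = Model(lambda x: x.sum(1), arrow=False)

# Counterfactual generation requires feature domains so the search knows
# how far each feature may be moved
domains = [feature_domain((-10, 10)) for _ in range(5)]

# Build a background whose outputs are all approximately 0, so SHAP values
# are measured against a baseline of f(x) = 0 (the regression example from
# the counterfactual() docstring)
background = BackgroundGenerator(data, domains, seed=0).counterfactual(
    goals=np.array([0.0]),
    model=model,
    k_per_goal=25,
    step_count=5000,
    timeout_seconds=5,
)

# Assumption: the background is passed to the explainer's constructor and
# explain() takes (inputs, outputs, model), per the docstring hunk above
explainer = SHAPExplainer(background=background)
x = data[:1]
explanation = explainer.explain(inputs=x, outputs=model(x), model=model)
```
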
