Expose partial dependence plot in bindings (#133)

tteofili · web-flow · commit 3afedc06a379 · 2022-12-13T14:37:26.000+01:00
* set normalize_weights to False as in Java impl

* FAI-842 - draft pdp impl

* FAI-842 - improved code, pylint checks fixed

* FAI-842 - improved plot()

* FAI-842 - lint checks

* FAI-842 - reformatting

* FAI-842 - added unit tests, plot working with non-numeric data

* FAI-842 - made PredictionProviderMetadata private
diff --git a/src/trustyai/explainers/__init__.py b/src/trustyai/explainers/__init__.py
@@ -3,3 +3,4 @@
 from .counterfactuals import CounterfactualResult, CounterfactualExplainer
 from .lime import LimeExplainer, LimeResults
 from .shap import SHAPExplainer, SHAPResults, BackgroundGenerator
+from .pdp import PDPExplainer
diff --git a/src/trustyai/explainers/pdp.py b/src/trustyai/explainers/pdp.py
@@ -0,0 +1,193 @@
+"""Explainers.pdp module"""
+
+import math
+import matplotlib.pyplot as plt
+import pandas as pd
+from pandas.io.formats.style import Styler
+
+from jpype import (
+    JImplements,
+    JOverride,
+)
+
+# pylint: disable = import-error
+from org.kie.trustyai.explainability.global_ import pdp
+
+# pylint: disable = import-error
+from org.kie.trustyai.explainability.model import (
+    PredictionProvider,
+    PredictionInputsDataDistribution,
+    PredictionOutput,
+    Output,
+    Type,
+    Value,
+)
+
+from trustyai.utils.data_conversions import ManyInputsUnionType, many_inputs_convert
+
+from .explanation_results import ExplanationResults
+
+
+class PDPResults(ExplanationResults):
+    """
+    Results class for Partial Dependence Plots
+    """
+
+    def __init__(self, pdp_graphs):
+        self.pdp_graphs = pdp_graphs
+
+    def as_dataframe(self) -> pd.DataFrame:
+        """
+        Returns
+        -------
+        a pd.DataFrame with input values and feature name as
+        columns and marginal feature outputs as rows
+        """
+        pdp_series_list = []
+        for pdp_graph in self.pdp_graphs:
+            inputs = [self._to_plottable(x) for x in pdp_graph.getX()]
+            outputs = [self._to_plottable(y) for y in pdp_graph.getY()]
+            pdp_dict = dict(zip(inputs, outputs))
+            pdp_dict["feature"] = "" + str(pdp_graph.getFeature().getName())
+            pdp_series = pd.Series(index=inputs + ["feature"], data=pdp_dict)
+            pdp_series_list.append(pdp_series)
+        pdp_df = pd.DataFrame(pdp_series_list)
+        return pdp_df
+
+    def as_html(self) -> Styler:
+        """
+        Returns
+        -------
+        Style object from the PDP pd.DataFrame (see as_dataframe)
+        """
+        return self.as_dataframe().style
+
+    def plot(self, output_name=None, block=True) -> None:
+        """
+        Parameters
+        ----------
+        output_name: str
+            name of the output to be plotted
+            Default to None
+        block: bool
+            whether the plotting operation
+            should be blocking or not
+        """
+        fig, axs = plt.subplots(len(self.pdp_graphs), constrained_layout=True)
+        p_idx = 0
+        for pdp_graph in self.pdp_graphs:
+            if output_name is not None and output_name != str(
+                pdp_graph.getOutput().getName()
+            ):
+                continue
+            fig.suptitle(str(pdp_graph.getOutput().getName()))
+            pdp_x = []
+            for i in range(len(pdp_graph.getX())):
+                pdp_x.append(self._to_plottable(pdp_graph.getX()[i]))
+            pdp_y = []
+            for i in range(len(pdp_graph.getY())):
+                pdp_y.append(self._to_plottable(pdp_graph.getY()[i]))
+            axs[p_idx].plot(pdp_x, pdp_y)
+            axs[p_idx].set_title(
+                str(pdp_graph.getFeature().getName()), loc="left", fontsize="small"
+            )
+            axs[p_idx].grid()
+            p_idx += 1
+        fig.supylabel("Partial Dependence Plot")
+        plt.show(block=block)
+
+    @staticmethod
+    def _to_plottable(datum: Value):
+        plottable = datum.asNumber()
+        if math.isnan(plottable):
+            plottable = str(datum.asString())
+        return plottable
+
+
+# pylint: disable = too-few-public-methods
+class PDPExplainer:
+    """
+    Partial Dependence Plot explainer.
+    See https://christophm.github.io/interpretable-ml-book/pdp.html
+    """
+
+    def __init__(self, config=None):
+        if config is None:
+            config = pdp.PartialDependencePlotConfig()
+        self._explainer = pdp.PartialDependencePlotExplainer(config)
+
+    def explain(
+        self, model: PredictionProvider, data: ManyInputsUnionType, num_outputs: int = 1
+    ) -> PDPResults:
+        """
+        Parameters
+        ----------
+        model: PredictionProvider
+            the model to explain
+        data: ManyInputsUnionType
+            the data used to calculate the PDP
+        num_outputs: int
+            the number of outputs to calculate the PDP for
+
+        Returns
+        -------
+        pdp_results: PDPResults
+            the partial dependence plots associated to the model outputs
+        """
+        metadata = _PredictionProviderMetadata(many_inputs_convert(data), num_outputs)
+        pdp_graphs = self._explainer.explainFromMetadata(model, metadata)
+        return PDPResults(pdp_graphs)
+
+
+@JImplements(
+    "org.kie.trustyai.explainability.model.PredictionProviderMetadata", deferred=True
+)
+class _PredictionProviderMetadata:
+    """
+    Implementation of org.kie.trustyai.explainability.model.PredictionProviderMetadata interface
+    """
+
+    def __init__(self, data: list, size: int):
+        """
+        Parameters
+        ----------
+        data: ManyInputsUnionType
+            the data
+        size: int
+            the size of the model output
+        """
+        self.data = PredictionInputsDataDistribution(data)
+        outputs = []
+        for _ in range(size):
+            outputs.append(Output("", Type.UNDEFINED))
+        self.pred_out = PredictionOutput(outputs)
+
+    # pylint: disable = invalid-name
+    @JOverride
+    def getDataDistribution(self):
+        """
+        Returns
+        --------
+        the underlying data distribution
+        """
+        return self.data
+
+    # pylint: disable = invalid-name
+    @JOverride
+    def getInputShape(self):
+        """
+        Returns
+        --------
+        a PredictionInput from the underlying distribution
+        """
+        return self.data.sample()
+
+    # pylint: disable = invalid-name
+    @JOverride
+    def getOutputShape(self):
+        """
+        Returns
+        --------
+        a PredictionOutput
+        """
+        return self.pred_out
diff --git a/tests/general/test_pdp.py b/tests/general/test_pdp.py
@@ -0,0 +1,69 @@
+# pylint: disable=import-error, wrong-import-position, wrong-import-order, invalid-name
+"""PDP test suite"""
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.datasets import make_classification
+from trustyai.explainers import PDPExplainer
+from trustyai.model import Model
+from trustyai.utils import TestModels
+
+
+def create_random_df():
+    X, _ = make_classification(n_samples=5000, n_features=5, n_classes=2,
+                               n_clusters_per_class=2, class_sep=2, flip_y=0, random_state=23)
+
+    return pd.DataFrame({
+        'x1': X[:, 0],
+        'x2': X[:, 1],
+        'x3': X[:, 2],
+        'x4': X[:, 3],
+        'x5': X[:, 4],
+    })
+
+
+def test_pdp_sumskip():
+    """Test PDP with sum skip model on random generated data"""
+
+    df = create_random_df()
+    model = TestModels.getSumSkipModel(0)
+    pdp_explainer = PDPExplainer()
+    pdp_results = pdp_explainer.explain(model, df)
+    assert pdp_results is not None
+    assert pdp_results.as_dataframe() is not None
+
+
+def test_pdp_sumthreshold():
+    """Test PDP with sum threshold model on random generated data"""
+
+    df = create_random_df()
+    model = TestModels.getLinearThresholdModel([0.1, 0.2, 0.3, 0.4, 0.5], 0)
+    pdp_explainer = PDPExplainer()
+    pdp_results = pdp_explainer.explain(model, df)
+    assert pdp_results is not None
+    assert pdp_results.as_dataframe() is not None
+
+
+def pdp_plots(block):
+    """Test PDP plots"""
+    np.random.seed(0)
+    data = pd.DataFrame(np.random.rand(101, 5))
+
+    model_weights = np.random.rand(5)
+    predict_function = lambda x: np.stack([np.dot(x.values, model_weights), 2 * np.dot(x.values, model_weights)], -1)
+    model = Model(predict_function, dataframe_input=True)
+    pdp_explainer = PDPExplainer()
+    explanation = pdp_explainer.explain(model, data)
+
+    explanation.plot(block=block)
+    explanation.plot(block=block, output_name='output-0')
+
+
+# Runs the shared PDP plot driver with block=True (blocking plt.show).
+# NOTE(review): the name says "lime" but this exercises PDP plots via
+# pdp_plots; presumably a copy-paste leftover, consider renaming.
+@pytest.mark.block_plots
+def test_lime_plots_blocking():
+    pdp_plots(True)
+
+
+# Runs the shared PDP plot driver with block=False (non-blocking plt.show).
+# NOTE(review): the name says "lime" but this exercises PDP plots via
+# pdp_plots; presumably a copy-paste leftover, consider renaming.
+def test_lime_plots():
+    pdp_plots(False)