Added arrow and non-arrow contexts to automatically switch transmission based on usage (#119)

RobGeada · web-flow · commit b1f6f64d6115 · 2022-11-30T11:09:50.000Z
diff --git a/src/trustyai/explainers/counterfactuals.py b/src/trustyai/explainers/counterfactuals.py
@@ -1,7 +1,7 @@
 """Explainers.countefactual module"""
 # pylint: disable = import-error, too-few-public-methods, wrong-import-order, line-too-long,
 # pylint: disable = unused-argument
-from typing import Optional
+from typing import Optional, Union
 import matplotlib.pyplot as plt
 import matplotlib as mpl
 import pandas as pd
@@ -17,6 +17,7 @@
 from trustyai.model import (
     counterfactual_prediction,
     PredictionInput,
+    Model,
 )
 
 from trustyai.utils.data_conversions import (
@@ -179,7 +180,7 @@ def explain(
         self,
         inputs: OneInputUnionType,
         goal: OneOutputUnionType,
-        model: PredictionProvider,
+        model: Union[PredictionProvider, Model],
         data_distribution: Optional[DataDistribution] = None,
         uuid: Optional[_uuid.UUID] = None,
         timeout: Optional[float] = None,
@@ -215,6 +216,8 @@ def explain(
             uuid=uuid,
             timeout=timeout,
         )
-        return CounterfactualResult(
-            self._explainer.explainAsync(_prediction, model).get()
-        )
+
+        with Model.NonArrowTransmission(model):
+            return CounterfactualResult(
+                self._explainer.explainAsync(_prediction, model).get()
+            )
diff --git a/src/trustyai/explainers/lime.py b/src/trustyai/explainers/lime.py
@@ -1,7 +1,7 @@
 """Explainers.lime module"""
 # pylint: disable = import-error, too-few-public-methods, wrong-import-order, line-too-long,
 # pylint: disable = unused-argument, duplicate-code, consider-using-f-string, invalid-name
-from typing import Dict
+from typing import Dict, Union
 
 import bokeh.models
 import matplotlib.pyplot as plt
@@ -27,7 +27,7 @@
 )
 
 from .explanation_results import SaliencyResults
-from trustyai.model import simple_prediction
+from trustyai.model import simple_prediction, Model
 
 from org.kie.trustyai.explainability.local.lime import (
     LimeConfig as _LimeConfig,
@@ -42,6 +42,7 @@
 
 from java.util import Random
 
+
 LimeConfig = _LimeConfig
 
 
@@ -263,7 +264,7 @@ def explain(
         self,
         inputs: OneInputUnionType,
         outputs: OneOutputUnionType,
-        model: PredictionProvider,
+        model: Union[PredictionProvider, Model],
     ) -> LimeResults:
         """Produce a LIME explanation.
 
@@ -284,4 +285,6 @@ def explain(
             Object containing the results of the LIME explanation.
         """
         _prediction = simple_prediction(inputs, outputs)
-        return LimeResults(self._explainer.explainAsync(_prediction, model).get())
+
+        with Model.ArrowTransmission(model, inputs):
+            return LimeResults(self._explainer.explainAsync(_prediction, model).get())
diff --git a/src/trustyai/explainers/shap.py b/src/trustyai/explainers/shap.py
@@ -1,7 +1,7 @@
 """Explainers.shap module"""
 # pylint: disable = import-error, too-few-public-methods, wrong-import-order, line-too-long,
 # pylint: disable = unused-argument, consider-using-f-string, invalid-name
-from typing import Dict, Optional
+from typing import Dict, Optional, Union
 import matplotlib.pyplot as plt
 import matplotlib as mpl
 from bokeh.models import ColumnDataSource, HoverTool
@@ -21,9 +21,7 @@
     output_html,
     feature_html,
 )
-from trustyai.model import (
-    simple_prediction,
-)
+from trustyai.model import simple_prediction, Model
 from trustyai.utils.data_conversions import (
     OneInputUnionType,
     OneOutputUnionType,
@@ -54,6 +52,8 @@
 
 
 # pylint: disable=invalid-name
+
+
 class SHAPResults(SaliencyResults):
     """Wraps SHAP results. This object is returned by the :class:`~SHAPExplainer`,
     and provides a variety of methods to visualize and interact with the explanation.
@@ -654,7 +654,7 @@ def explain(
         self,
         inputs: OneInputUnionType,
         outputs: OneOutputUnionType,
-        model: PredictionProvider,
+        model: Union[PredictionProvider, Model],
     ) -> SHAPResults:
         """Produce a SHAP explanation.
 
@@ -674,6 +674,8 @@ def explain(
             Object containing the results of the SHAP explanation.
         """
         _prediction = simple_prediction(inputs, outputs)
-        return SHAPResults(
-            self._explainer.explainAsync(_prediction, model).get(), self.background
-        )
+
+        with Model.ArrowTransmission(model, inputs):
+            return SHAPResults(
+                self._explainer.explainAsync(_prediction, model).get(), self.background
+            )
diff --git a/src/trustyai/model/__init__.py b/src/trustyai/model/__init__.py
@@ -295,7 +295,7 @@ class Model:
     """
 
     def __init__(
-        self, predict_fun, dataframe_input=False, output_names=None, arrow=False
+        self, predict_fun, dataframe_input=False, output_names=None, disable_arrow=False
     ):
         """
         Wrap the model as a TrustyAI :obj:`PredictionProvider` Java class.
@@ -311,39 +311,75 @@ def __init__(
         output_names : List[String]:
             If the model outputs a numpy array, you can specify the names of the model outputs
             here.
-        arrow: bool
-            Whether to use Apache arrow to speed up data transfer between Java and Python.
-            In general, set this to ``true`` whenever LIME or SHAP explanations are needed,
-            and ``false`` for counterfactuals.
+        disable_arrow: bool
+            If true, Apache Arrow will not be used to accelerate data transfer between Java
+            and Python. If false, Arrow will be automatically used in situations where it is
+            advantageous to do so.
         """
-        self.arrow = arrow
+        self.disable_arrow = disable_arrow
         self.predict_fun = predict_fun
         self.output_names = output_names
+        self.dataframe_input = dataframe_input
 
-        if arrow:
-            self.prediction_provider = None
-            if not dataframe_input:
-                self.prediction_provider_arrow = PredictionProviderArrow(
-                    lambda x: self._cast_outputs_to_dataframe(predict_fun(x.values))
-                )
-            else:
-                self.prediction_provider_arrow = PredictionProviderArrow(
-                    lambda x: self._cast_outputs_to_dataframe(predict_fun(x))
+        self.prediction_provider_arrow = None
+        self.prediction_provider_normal = None
+        self.prediction_provider = None
+
+        # set model to use non-arrow by default, as this requires no dataset information
+        self._set_nonarrow()
+
+    def _set_arrow(self, paradigm_input: PredictionInput):
+        """
+        Ready the model for arrow-based prediction communication.
+
+        Parameters
+        ----------
+        paradigm_input: A single :obj:`PredictionInput` by which to establish the arrow schema.
+        All subsequent :obj:`PredictionInput`s communicated must have this schema.
+        """
+        if self.disable_arrow:
+            self._set_nonarrow()
+        else:
+            if self.prediction_provider_arrow is None:
+                raw_ppa = self._get_arrow_prediction_provider()
+                self.prediction_provider_arrow = raw_ppa.get_as_prediction_provider(
+                    paradigm_input
                 )
+            self.prediction_provider = self.prediction_provider_arrow
+
+    def _set_nonarrow(self):
+        """
+        Ready the model for non-arrow prediction communication.
+        """
+        if self.prediction_provider_normal is None:
+            self.prediction_provider_normal = self._get_nonarrow_prediction_provider()
+        self.prediction_provider = self.prediction_provider_normal
+
+    def _get_arrow_prediction_provider(self):
+        if not self.dataframe_input:
+            ppa = PredictionProviderArrow(
+                lambda x: self._cast_outputs_to_dataframe(self.predict_fun(x.values))
+            )
         else:
-            self.prediction_provider_arrow = None
-            if dataframe_input:
-                self.prediction_provider = PredictionProvider(
-                    lambda x: self._cast_outputs(
-                        predict_fun(prediction_object_to_pandas(x))
-                    )
+            ppa = PredictionProviderArrow(
+                lambda x: self._cast_outputs_to_dataframe(self.predict_fun(x))
+            )
+        return ppa
+
+    def _get_nonarrow_prediction_provider(self):
+        if self.dataframe_input:
+            ppn = PredictionProvider(
+                lambda x: self._cast_outputs(
+                    self.predict_fun(prediction_object_to_pandas(x))
                 )
-            else:
-                self.prediction_provider = PredictionProvider(
-                    lambda x: self._cast_outputs(
-                        predict_fun(prediction_object_to_numpy(x))
-                    )
+            )
+        else:
+            ppn = PredictionProvider(
+                lambda x: self._cast_outputs(
+                    self.predict_fun(prediction_object_to_numpy(x))
                 )
+            )
+        return ppn
 
     def _cast_outputs(self, output_array):
         return df_to_prediction_object(
@@ -388,12 +424,8 @@ def predictAsync(self, inputs: List[PredictionInput]) -> CompletableFuture:
         :obj:`CompletableFuture`
             A Java :obj:`CompletableFuture` containing the model outputs.
         """
-        if self.arrow and self.prediction_provider is None:
-            self.prediction_provider = (
-                self.prediction_provider_arrow.get_as_prediction_provider(inputs[0])
-            )
-        out = self.prediction_provider.predictAsync(inputs)
-        return out
+
+        return self.prediction_provider.predictAsync(inputs)
 
     def __call__(self, inputs):
         """
@@ -405,6 +437,51 @@ def __call__(self, inputs):
         """
         return self.predict_fun(inputs)
 
+    class ArrowTransmission:
+        """
+        Context class to make predictAsync calls within the context use arrow, unless disabled.
+
+        Parameters
+        ----------
+        model: The TrustyAI :obj:`Model` or PredictionProvider
+        paradigm_input: A single :obj:`PredictionInput` by which to establish the arrow schema.
+         All subsequent :obj:`PredictionInput`s communicated must have this schema.
+        """
+
+        def __init__(self, model, paradigm_input: OneInputUnionType):
+            self.model = model
+            self.model_is_python = isinstance(model, Model)
+            self.paradigm_input = one_input_convert(paradigm_input)
+            self.previous_model_state = None
+
+        def __enter__(self):
+            if self.model_is_python:
+                self.previous_model_state = self.model.prediction_provider
+                self.model._set_arrow(self.paradigm_input)
+
+        def __exit__(self, exit_type, value, traceback):
+            if self.model_is_python:
+                self.model.prediction_provider = self.previous_model_state
+
+    class NonArrowTransmission:
+        """
+        Context class to ensure all predictAsync calls within the context DO NOT use arrow.
+        """
+
+        def __init__(self, model):
+            self.model = model
+            self.model_is_python = isinstance(model, Model)
+            self.previous_model_state = None
+
+        def __enter__(self):
+            if self.model_is_python:
+                self.previous_model_state = self.model.prediction_provider
+                self.model._set_nonarrow()
+
+        def __exit__(self, exit_type, value, traceback):
+            if self.model_is_python:
+                self.model.prediction_provider = self.previous_model_state
+
 
 @_jcustomizer.JImplementationFor("org.kie.trustyai.explainability.model.Output")
 # pylint: disable=no-member
diff --git a/tests/general/test_limeexplainer.py b/tests/general/test_limeexplainer.py
@@ -112,7 +112,7 @@ def test_lime_v2():
     model_weights = np.random.rand(5)
     predict_function = lambda x: np.dot(x.values, model_weights)
 
-    model = Model(predict_function, dataframe_input=True, arrow=True)
+    model = Model(predict_function, dataframe_input=True)
     explainer = LimeExplainer(samples=100, perturbations=2, seed=23, normalise_weights=False)
     explanation = explainer.explain(inputs=data, outputs=model(data), model=model)
     for score in explanation.as_dataframe()["output-0_score"]:
diff --git a/tests/general/test_model.py b/tests/general/test_model.py
@@ -2,11 +2,12 @@
 """Test model provider interface"""
 
 from common import *
-from trustyai.model import Model, feature
-from trustyai.utils.data_conversions import numpy_to_prediction_object
+from trustyai.model import Model, Dataset, feature
 
 import pytest
 
+from trustyai.utils.data_conversions import numpy_to_prediction_object
+
 
 def test_basic_model():
     """Test basic model"""
@@ -18,41 +19,30 @@ def test_basic_model():
 
 
 def test_cast_output():
-    np2np = Model(lambda x: np.sum(x, 1), output_names=['sum'])
-    np2df = Model(lambda x: pd.DataFrame(x))
-    df2np = Model(lambda x: x.sum(1).values, dataframe_input=True, output_names=['sum'])
-    df2df = Model(lambda x: x, dataframe_input=True)
-    pis = numpy_to_prediction_object(np.arange(0., 125.).reshape(25, 5), feature)
-
-    output_val = np2np.predictAsync(pis).get()
-    assert len(output_val) == 25
+    np2np = Model(lambda x: np.sum(x, 1), output_names=['sum'], disable_arrow=True)
+    np2df = Model(lambda x: pd.DataFrame(x), disable_arrow=True)
+    df2np = Model(lambda x: x.sum(1).values,
+                  dataframe_input=True,
+                  output_names=['sum'],
+                  disable_arrow=True)
+    df2df = Model(lambda x: x, dataframe_input=True, disable_arrow=True)
 
-    output_val = np2df.predictAsync(pis).get()
-    assert len(output_val) == 25
-
-    output_val = df2np.predictAsync(pis).get()
-    assert len(output_val) == 25
+    pis = numpy_to_prediction_object(np.arange(0., 125.).reshape(25, 5), feature)
 
-    output_val = df2df.predictAsync(pis).get()
-    assert len(output_val) == 25
+    for m in [np2np, np2df, df2df, df2np]:
+        output_val = m.predictAsync(pis).get()
+        assert len(output_val) == 25
 
 
 def test_cast_output_arrow():
-    np2np = Model(lambda x: np.sum(x, 1), output_names=['sum'], arrow=True)
-    np2df = Model(lambda x: pd.DataFrame(x), arrow=True)
-    df2np = Model(lambda x: x.sum(1).values, dataframe_input=True, output_names=['sum'], arrow=True)
-    df2df = Model(lambda x: x, dataframe_input=True, arrow=True)
+    np2np = Model(lambda x: np.sum(x, 1), output_names=['sum'])
+    np2df = Model(lambda x: pd.DataFrame(x))
+    df2np = Model(lambda x: x.sum(1).values, dataframe_input=True, output_names=['sum'])
+    df2df = Model(lambda x: x, dataframe_input=True)
     pis = numpy_to_prediction_object(np.arange(0., 125.).reshape(25, 5), feature)
 
-    output_val = np2np.predictAsync(pis).get()
-    assert len(output_val) == 25
-
-    output_val = np2df.predictAsync(pis).get()
-    assert len(output_val) == 25
-
-    output_val = df2np.predictAsync(pis).get()
-    assert len(output_val) == 25
-
-    output_val = df2df.predictAsync(pis).get()
-    assert len(output_val) == 25
+    for m in [np2np, np2df, df2df, df2np]:
+        m._set_arrow(pis[0])
+        output_val = m.predictAsync(pis).get()
+        assert len(output_val) == 25
 
diff --git a/tests/general/test_shap.py b/tests/general/test_shap.py
diff --git a/tests/general/test_shap_background_generation.py b/tests/general/test_shap_background_generation.py
diff --git a/tests/general/test_tyrus.py b/tests/general/test_tyrus.py