
Commit 29df789

1 parent 3afedc0

File tree: 5 files changed (+345, -61 lines)

src/trustyai/metrics/fairness/group.py

Lines changed: 49 additions & 26 deletions
@@ -2,21 +2,23 @@
 # pylint: disable = import-error
 from typing import List, Optional, Any, Union

+import numpy as np
 import pandas as pd
 from jpype import JInt
 from org.kie.trustyai.explainability.metrics import FairnessMetrics

 from trustyai.model import Value, PredictionProvider, Model
 from trustyai.utils.data_conversions import (
-    pandas_to_trusty,
     OneOutputUnionType,
     one_output_convert,
+    to_trusty_dataframe,
 )

 ColumSelector = Union[List[int], List[str]]


 def _column_selector_to_index(columns: ColumSelector, dataframe: pd.DataFrame):
+    """Convert a list of column names or indices into a list of column indices"""
     if len(columns) == 0:
         raise ValueError("Must specify at least one column")

@@ -27,32 +29,40 @@ def _column_selector_to_index(columns: ColumSelector, dataframe: pd.DataFrame):


 def statistical_parity_difference(
-    privileged: pd.DataFrame,
-    unprivileged: pd.DataFrame,
+    privileged: Union[pd.DataFrame, np.ndarray],
+    unprivileged: Union[pd.DataFrame, np.ndarray],
     favorable: OneOutputUnionType,
     outputs: Optional[List[int]] = None,
+    feature_names: Optional[List[str]] = None,
 ) -> float:
     """Calculate Statistical Parity Difference between privileged and unprivileged dataframes"""
     favorable_prediction_object = one_output_convert(favorable)
     return FairnessMetrics.groupStatisticalParityDifference(
-        pandas_to_trusty(privileged, outputs),
-        pandas_to_trusty(unprivileged, outputs),
+        to_trusty_dataframe(
+            data=privileged, outputs=outputs, feature_names=feature_names
+        ),
+        to_trusty_dataframe(
+            data=unprivileged, outputs=outputs, feature_names=feature_names
+        ),
         favorable_prediction_object.outputs,
     )


-# pylint: disable = line-too-long
+# pylint: disable = line-too-long, too-many-arguments
 def statistical_parity_difference_model(
-    samples: pd.DataFrame,
+    samples: Union[pd.DataFrame, np.ndarray],
     model: Union[PredictionProvider, Model],
     privilege_columns: ColumSelector,
     privilege_values: List[Any],
     favorable: OneOutputUnionType,
+    feature_names: Optional[List[str]] = None,
 ) -> float:
     """Calculate Statistical Parity Difference using a samples dataframe and a model"""
     favorable_prediction_object = one_output_convert(favorable)
     _privilege_values = [Value(v) for v in privilege_values]
-    _jsamples = pandas_to_trusty(samples, no_outputs=True)
+    _jsamples = to_trusty_dataframe(
+        data=samples, no_outputs=True, feature_names=feature_names
+    )
     return FairnessMetrics.groupStatisticalParityDifference(
         _jsamples,
         model,
@@ -63,32 +73,40 @@ def statistical_parity_difference_model(


 def disparate_impact_ratio(
-    privileged: pd.DataFrame,
-    unprivileged: pd.DataFrame,
+    privileged: Union[pd.DataFrame, np.ndarray],
+    unprivileged: Union[pd.DataFrame, np.ndarray],
     favorable: OneOutputUnionType,
     outputs: Optional[List[int]] = None,
+    feature_names: Optional[List[str]] = None,
 ) -> float:
     """Calculate Disparate Impact Ratio between privileged and unprivileged dataframes"""
     favorable_prediction_object = one_output_convert(favorable)
     return FairnessMetrics.groupDisparateImpactRatio(
-        pandas_to_trusty(privileged, outputs),
-        pandas_to_trusty(unprivileged, outputs),
+        to_trusty_dataframe(
+            data=privileged, outputs=outputs, feature_names=feature_names
+        ),
+        to_trusty_dataframe(
+            data=unprivileged, outputs=outputs, feature_names=feature_names
+        ),
         favorable_prediction_object.outputs,
     )


 # pylint: disable = line-too-long
 def disparate_impact_ratio_model(
-    samples: pd.DataFrame,
+    samples: Union[pd.DataFrame, np.ndarray],
     model: Union[PredictionProvider, Model],
     privilege_columns: ColumSelector,
     privilege_values: List[Any],
     favorable: OneOutputUnionType,
+    feature_names: Optional[List[str]] = None,
 ) -> float:
     """Calculate Disparate Impact Ratio using a samples dataframe and a model"""
     favorable_prediction_object = one_output_convert(favorable)
     _privilege_values = [Value(v) for v in privilege_values]
-    _jsamples = pandas_to_trusty(samples, no_outputs=True)
+    _jsamples = to_trusty_dataframe(
+        data=samples, no_outputs=True, feature_names=feature_names
+    )
     return FairnessMetrics.groupDisparateImpactRatio(
         _jsamples,
         model,
@@ -100,12 +118,13 @@ def disparate_impact_ratio_model(

 # pylint: disable = too-many-arguments
 def average_odds_difference(
-    test: pd.DataFrame,
-    truth: pd.DataFrame,
+    test: Union[pd.DataFrame, np.ndarray],
+    truth: Union[pd.DataFrame, np.ndarray],
     privilege_columns: ColumSelector,
     privilege_values: OneOutputUnionType,
     positive_class: List[Any],
     outputs: Optional[List[int]] = None,
+    feature_names: Optional[List[str]] = None,
 ) -> float:
     """Calculate Average Odds Difference between two dataframes"""
     if test.shape != truth.shape:
@@ -117,23 +136,26 @@ def average_odds_difference(
     # determine privileged columns
     _privilege_columns = _column_selector_to_index(privilege_columns, test)
     return FairnessMetrics.groupAverageOddsDifference(
-        pandas_to_trusty(test, outputs),
-        pandas_to_trusty(truth, outputs),
+        to_trusty_dataframe(data=test, outputs=outputs, feature_names=feature_names),
+        to_trusty_dataframe(data=truth, outputs=outputs, feature_names=feature_names),
         _privilege_columns,
         _privilege_values,
         _positive_class,
     )


 def average_odds_difference_model(
-    samples: pd.DataFrame,
+    samples: Union[pd.DataFrame, np.ndarray],
     model: Union[PredictionProvider, Model],
     privilege_columns: ColumSelector,
     privilege_values: List[Any],
     positive_class: List[Any],
+    feature_names: Optional[List[str]] = None,
 ) -> float:
     """Calculate Average Odds Difference for a sample dataframe using the provided model"""
-    _jsamples = pandas_to_trusty(samples, no_outputs=True)
+    _jsamples = to_trusty_dataframe(
+        data=samples, no_outputs=True, feature_names=feature_names
+    )
     _privilege_values = [Value(v) for v in privilege_values]
     _positive_class = [Value(v) for v in positive_class]
     # determine privileged columns
@@ -144,12 +166,13 @@ def average_odds_difference_model(


 def average_predictive_value_difference(
-    test: pd.DataFrame,
-    truth: pd.DataFrame,
+    test: Union[pd.DataFrame, np.ndarray],
+    truth: Union[pd.DataFrame, np.ndarray],
     privilege_columns: ColumSelector,
     privilege_values: List[Any],
     positive_class: List[Any],
     outputs: Optional[List[int]] = None,
+    feature_names: Optional[List[str]] = None,
 ) -> float:
     """Calculate Average Predictive Value Difference between two dataframes"""
     if test.shape != truth.shape:
@@ -160,8 +183,8 @@ def average_predictive_value_difference(
     _positive_class = [Value(v) for v in positive_class]
     _privilege_columns = _column_selector_to_index(privilege_columns, test)
     return FairnessMetrics.groupAveragePredictiveValueDifference(
-        pandas_to_trusty(test, outputs),
-        pandas_to_trusty(truth, outputs),
+        to_trusty_dataframe(data=test, outputs=outputs, feature_names=feature_names),
+        to_trusty_dataframe(data=truth, outputs=outputs, feature_names=feature_names),
         _privilege_columns,
         _privilege_values,
         _positive_class,
@@ -170,14 +193,14 @@

 # pylint: disable = line-too-long
 def average_predictive_value_difference_model(
-    samples: pd.DataFrame,
+    samples: Union[pd.DataFrame, np.ndarray],
     model: Union[PredictionProvider, Model],
     privilege_columns: ColumSelector,
     privilege_values: List[Any],
     positive_class: List[Any],
 ) -> float:
     """Calculate Average Predictive Value Difference for a sample dataframe using the provided model"""
-    _jsamples = pandas_to_trusty(samples, no_outputs=True)
+    _jsamples = to_trusty_dataframe(samples, no_outputs=True)
     _privilege_values = [Value(v) for v in privilege_values]
     _positive_class = [Value(v) for v in positive_class]
     # determine privileged columns
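
With these changes, the group fairness metrics accept NumPy arrays in place of Pandas dataframes, with feature_names supplying the column labels an array cannot carry. A minimal sketch of the new call pattern follows; the loan-style data, the column names, and the use of the trustyai.model.output helper are illustrative assumptions, not part of this diff:

import numpy as np

from trustyai.model import output
from trustyai.metrics.fairness.group import statistical_parity_difference

# Hypothetical data: columns are [age, income, approved]; with no `outputs`
# argument, the right-most column is treated as the output.
privileged = np.array([[46.0, 60.0, 1.0], [52.0, 55.0, 1.0], [49.0, 40.0, 0.0]])
unprivileged = np.array([[24.0, 30.0, 0.0], [31.0, 20.0, 1.0], [27.0, 35.0, 0.0]])

spd = statistical_parity_difference(
    privileged=privileged,
    unprivileged=unprivileged,
    favorable=output("approved", dtype="number", value=1.0),
    feature_names=["age", "income", "approved"],  # arrays carry no column names
)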

src/trustyai/utils/data_conversions.py

Lines changed: 107 additions & 14 deletions
@@ -2,7 +2,7 @@
 # pylint: disable = import-error, line-too-long, trailing-whitespace, unused-import, cyclic-import
 # pylint: disable = consider-using-f-string, invalid-name, wrong-import-order
 import warnings
-from typing import Union, List, Optional
+from typing import Union, List, Optional, Tuple
 from itertools import filterfalse

 import trustyai.model
@@ -442,8 +442,49 @@ def prediction_object_to_pandas(
     return df


-def pandas_to_trusty(
-    df: pd.DataFrame, outputs: Optional[List[int]] = None, no_outputs=False
+def __partition_column_indices(
+    size: int, outputs: Optional[List[int]] = None
+) -> Tuple[List[int], List[int]]:
+    indices = list(range(size))
+    if not outputs:  # If no output column supplied, assume the right-most
+        output_indices = [size - 1]
+        input_indices = list(filterfalse(output_indices.__contains__, indices))
+    else:
+        output_indices = outputs
+        input_indices = list(filterfalse(outputs.__contains__, indices))
+    return input_indices, output_indices
+
+
+def to_trusty_dataframe(
+    data: Union[pd.DataFrame, np.ndarray],
+    outputs: Optional[List[int]] = None,
+    no_outputs=False,
+    feature_names: Optional[List[str]] = None,
+) -> Dataframe:
+    """Convert Pandas dataframes or NumPy arrays into TrustyAI dataframes"""
+    if isinstance(data, pd.DataFrame):
+        return df_to_trusty_dataframe(
+            data=data,
+            outputs=outputs,
+            no_outputs=no_outputs,
+            feature_names=feature_names,
+        )
+    if isinstance(data, np.ndarray):
+        return numpy_to_trusty_dataframe(
+            arr=data,
+            outputs=outputs,
+            no_outputs=no_outputs,
+            feature_names=feature_names,
+        )
+
+    raise ValueError("Only Pandas dataframes and NumPy arrays supported at the moment.")
+
+
+def df_to_trusty_dataframe(
+    data: pd.DataFrame,
+    outputs: Optional[List[int]] = None,
+    no_outputs=False,
+    feature_names: Optional[List[str]] = None,
 ) -> Dataframe:
     """
     Converts a Pandas :class:`pandas.DataFrame` into a TrustyAI :class:`Dataframe`.
@@ -457,22 +498,74 @@ def pandas_to_trusty(

     no_outputs : bool
         Specify if the :class:`Dataframe` is inputs-only
+
+    feature_names : Optional[List[str]]
+        Optional list of feature names. If not provided, the Pandas dataframe column names will be used
     """
-    df = df.reset_index(drop=True)
-    n_columns = len(df.columns)
-    indices = list(range(n_columns))
+    data = data.reset_index(drop=True)
+    n_columns = len(data.columns)
     if not no_outputs:
-        if not outputs:  # If no output column supplied, assume the right-most
-            output_indices = [n_columns - 1]
-            input_indices = list(filterfalse(output_indices.__contains__, indices))
+
+        input_indices, output_indices = __partition_column_indices(n_columns, outputs)
+
+        if feature_names:
+            input_names = [feature_names[i] for i in input_indices]
+            output_names = [feature_names[i] for i in output_indices]
         else:
-            output_indices = outputs
-            input_indices = list(filterfalse(outputs.__contains__, indices))
+            input_names = None
+            output_names = None
+
+        pi = many_inputs_convert(
+            python_inputs=data.iloc[:, input_indices], feature_names=input_names
+        )
+        po = many_outputs_convert(
+            python_outputs=data.iloc[:, output_indices], names=output_names
+        )
+
+        return Dataframe.createFrom(pi, po)
+
+    pi = many_inputs_convert(data)
+    return Dataframe.createFromInputs(pi)
+

-        pi = many_inputs_convert(df.iloc[:, input_indices])
-        po = many_outputs_convert(df.iloc[:, output_indices])
+def numpy_to_trusty_dataframe(
+    arr: np.ndarray,
+    feature_names: List[str],
+    outputs: Optional[List[int]] = None,
+    no_outputs=False,
+) -> Dataframe:
+    """
+    Converts a NumPy :class:`np.ndarray` into a TrustyAI :class:`Dataframe`.
+    Either outputs can be provided as a list of column indices or `no_outputs` can be specified, for an inputs-only
+    :class:`Dataframe`.
+
+    Parameters
+    ----------
+    outputs : List[int]
+        Optional list of column indices to be marked as outputs
+
+    no_outputs : bool
+        Specify if the :class:`Dataframe` is inputs-only
+
+    feature_names : List[str]
+        List of names for the array's columns
+    """
+    n_columns = arr.shape[1]
+    if not no_outputs:
+        input_indices, output_indices = __partition_column_indices(n_columns, outputs)
+
+        input_names = [feature_names[i] for i in input_indices]
+        output_names = [feature_names[i] for i in output_indices]
+        axis = 1
+
+        pi = many_inputs_convert(
+            python_inputs=np.take(arr, input_indices, axis), feature_names=input_names
+        )
+        po = many_outputs_convert(
+            python_outputs=np.take(arr, output_indices, axis), names=output_names
+        )

         return Dataframe.createFrom(pi, po)

-    pi = many_inputs_convert(df)
+    pi = many_inputs_convert(arr)
     return Dataframe.createFromInputs(pi)
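
For reference, a sketch of the dispatch behaviour that to_trusty_dataframe adds, using made-up data; both calls below should yield equivalent TrustyAI dataframes, since the NumPy path only differs in requiring explicit feature names:

import numpy as np
import pandas as pd

from trustyai.utils.data_conversions import to_trusty_dataframe

df = pd.DataFrame({"x1": [0.1, 0.2, 0.3], "x2": [1.0, 2.0, 3.0], "y": [0, 1, 0]})

# Pandas path: column names come from the dataframe itself, and the
# right-most column ("y") is assumed to be the output.
tdf_pandas = to_trusty_dataframe(df)

# NumPy path: the same data as a raw array needs explicit feature names.
tdf_numpy = to_trusty_dataframe(df.to_numpy(), feature_names=["x1", "x2", "y"])

# Any other input type is rejected:
# to_trusty_dataframe([[0.1, 1.0, 0]])  ->  ValueError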

tests/general/common.py

Lines changed: 19 additions & 0 deletions
@@ -2,6 +2,8 @@
 """Common methods and models for tests"""
 import os
 import sys
+from typing import Optional, List
+
 import numpy as np
 import pandas as pd  # pylint: disable=unused-import

@@ -21,3 +23,20 @@ def mock_feature(value, name='f-num'):
 def sum_skip_model(inputs: np.ndarray) -> np.ndarray:
     """SumSkip test model"""
     return np.sum(inputs[:, [i for i in range(inputs.shape[1]) if i != 5]], 1)
+
+
+def create_random_dataframe(weights: Optional[List[float]] = None):
+    """Create a simple random Pandas dataframe"""
+    from sklearn.datasets import make_classification
+    if not weights:
+        weights = [0.9, 0.1]
+
+    X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=2,
+                               n_clusters_per_class=2, class_sep=2, flip_y=0, weights=weights,
+                               random_state=23)
+
+    return pd.DataFrame({
+        'x1': X[:, 0],
+        'x2': X[:, 1],
+        'y': y
+    })
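
A sketch of how a test might combine this helper with the new NumPy support; the privilege split on x1, the column names, and the import path are assumptions for illustration, not taken from this commit:

from tests.general.common import create_random_dataframe  # assumed import path
from trustyai.model import output
from trustyai.metrics.fairness.group import disparate_impact_ratio

df = create_random_dataframe(weights=[0.8, 0.2])

# Arbitrary privilege split on the first feature, passed as NumPy arrays.
privileged = df[df.x1 > 0].to_numpy()
unprivileged = df[df.x1 <= 0].to_numpy()

dir_score = disparate_impact_ratio(
    privileged=privileged,
    unprivileged=unprivileged,
    favorable=output("y", dtype="number", value=1),
    feature_names=["x1", "x2", "y"],
)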
