data.corrwith(data['race'] == 'African-American').sort_values() #9

@andysingal

Description

Hi,
Thanks for the amazing book. While running the code:
data.corrwith(data['race'] == 'African-American').sort_values()
I am getting:

<ipython-input-51-0b46f54afff5>:1: FutureWarning:

The default value of numeric_only in DataFrame.corrwith is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

age                       -0.179095
end                       -0.125003
r_days_from_arrest        -0.011710
id                         0.007618
days_b_screening_arrest    0.025486
c_days_from_compas         0.044657
start                      0.054466
is_violent_recid           0.065909
event                      0.110068
juv_count                  0.111835
two_year_recid             0.131200
is_recid                   0.143022
priors_count.1             0.202897
priors_count               0.202897
is_med_or_high_risk        0.264078
v_decile_score             0.285604
decile_score.1             0.308340
decile_score               0.308340
violent_recid                   NaN
dtype: float64

which does not match your result. For reference, I will share the full notebook code below in case it helps.
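Incidentally, the warning message itself points at a fix for the deprecation notice (though it may not explain the mismatch): be explicit about which columns get correlated. A minimal sketch of both options, assuming a pandas version where corrwith already accepts numeric_only (as the warning implies):

# Option 1: pass numeric_only explicitly to silence the FutureWarning.
data.corrwith(data['race'] == 'African-American', numeric_only=True).sort_values()

# Option 2: select only the numeric columns up front.
data.select_dtypes(include=np.number).corrwith(
    data['race'] == 'African-American'
).sort_values()

The full notebook follows: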

# -*- coding: utf-8 -*-

"""Untitled64.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1hZcoArNPlc73O_BbVv-9jdLPKR_ZpPk4
"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mutual_info_score

import joblib
import os
import yaml
from IPython.display import display

import seaborn as sns
from matplotlib import pyplot as plt
# %matplotlib inline


# folder to load config file
CONFIG_PATH = "./config/"

# Function to load yaml configuration file
def load_config(config_name):
    with open(os.path.join(CONFIG_PATH, config_name)) as file:
        config = yaml.safe_load(file)

    return config


config = load_config("my_config.yaml")


# load data
data = pd.read_csv(os.path.join(config["data_directory"], config["data_name"]))
data.head()

data.groupby('race')['decile_score'].value_counts(
    normalize=True
).unstack().plot(
    kind='bar', figsize=(20, 7),
    title='Decile Score Histogram by Race', ylabel='% with Decile Score'
)

data.groupby('race')['two_year_recid'].describe()

# Courts generally use a decile score of 5 or higher to label someone as medium risk.
data['is_med_or_high_risk'] = (data['decile_score']>=5).astype(bool)

# Hmm, not great if the court's system of 5 or higher on the decile score is only accurate 65% of the time
(data['is_med_or_high_risk']==data['two_year_recid']).mean()

import seaborn as sns
import matplotlib.pyplot as plt

# heatmap of recidivism
cm = pd.crosstab(
    data['is_med_or_high_risk'], data['two_year_recid'], 
    rownames=['Predicted Recidivism'], colnames=['Actual Recidivism']
)
p = plt.figure(figsize=(7,6))
p = sns.heatmap(cm, annot=True, fmt='d')

# scatter plot 
data.groupby(
    ['race', 'decile_score']
)[['two_year_recid']].mean().unstack().T.plot(
    xlabel='Decile Score', ylabel='Recidivism Rate', title='Recidivism vs Decile Score by Race',
    style='.', figsize=(20, 7),
    ms=20
)

data['race'].value_counts(normalize=True)

# re-label two races as Other. 
# This is done purely for educational reasons and to avoid addressing issues with a skewed sample in our data
data.loc[data['race'].isin(['Native American', 'Asian']), 'race'] = 'Other'

data.groupby('race')['two_year_recid'].describe()

data.groupby('race')['two_year_recid'].value_counts(
    normalize=True
).unstack().plot(
    kind='bar', figsize=(10, 5), title='Actual Recidivism Rates by Race'
)

data['c_charge_degree'].value_counts(normalize=True).plot(
    kind='bar', title='% of Charge Degree', ylabel='%', xlabel='Charge Degree'
)

# Right skew on Age
data['age'].plot(
    title='Histogram of Age', kind='hist', xlabel='Age', figsize=(10, 5)
)

data['priors_count'].plot(
    title='Histogram of Priors Count', kind='hist', xlabel='Priors', figsize=(10, 5)
)

"""# Measuring bias and fairness"""

data.select_dtypes(include=np.number).columns

data.select_dtypes(include='object').columns

"""# Feature construction"""

data[["juv_fel_count", "juv_misd_count", "juv_other_count"]].describe()

# feature construction, add up our three juv columns and remove the original features
data['juv_count'] = data[["juv_fel_count", "juv_misd_count", "juv_other_count"]].sum(axis=1)

data[['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'juv_count']].describe()

data['juv_count'].plot(
    title='Count of Juvenile Infractions', kind='hist', xlabel='Count'
)

name = 'juv_count'
plt.savefig(f'{name}.svg', dpi=500)
plt.savefig(f'{name}.png', dpi=500)

data['juv_count'].mean(), data['juv_count'].std()

data = data.drop(["juv_fel_count", "juv_misd_count", "juv_other_count"], axis=1)

"""# Building our baseline pipeline"""



X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    data.drop('two_year_recid', axis=1), 
    data['two_year_recid'],
    data['race'],
    stratify=data['two_year_recid'],
    test_size=0.3,
    random_state=0
)

# our static classifier
classifier = RandomForestClassifier(max_depth=10, n_estimators=20, random_state=0)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler


categorical_features = ['race', 'sex', 'c_charge_degree']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='if_binary'))
])

numerical_features = ["age", "priors_count"]
numerical_transformer = Pipeline(steps=[
    ('scale', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
])

clf_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])


clf_tree.fit(X_train, y_train)

unaware_y_preds = clf_tree.predict(X_test)

preprocessor.fit_transform(X_train).shape

"""# Measuring bias in our baseline model

"""

!pip install dalex

import dalex as dx

exp_tree = dx.Explainer(clf_tree, X_test, y_test, label='Random Forest Bias Unaware', verbose=True)

exp_tree.model_performance()

from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=unaware_y_preds))

exp_tree.model_parts().plot()

"""Dalex is reporting importance in terms of drop-out loss, or how much the overall fit of our model would decrease if the feature in question were entirely removed. According to this chart, our model would lose a lot of information if we lost priors_count, but in theory, it would have been better if we dropped race. It would seem that our model isn’t even learning from the race at all! This speaks to the model’s unawareness of sensitive features."""

y_test.groupby(race_test).mean()

pd.Series(unaware_y_preds, index=y_test.index).groupby(
    race_test).mean()

mf_tree = exp_tree.model_fairness(protected=race_test, privileged='Caucasian')
mf_tree.fairness_check()

# mf_tree = exp_tree.model_fairness(
#     protected=race_test, privileged = "Caucasian")
mf_tree.metric_scores

mf_tree.plot()

mf_tree.plot(type = 'stacked')

def show_proportions(sensitive_features, y_pred, y=None, description=None):
    print(f'\n{description}\n')
    indices = {}
    positive_indices = {}
    negative_indices = {}
    groups = np.unique(sensitive_features.values)
    n_groups = len(groups)
    max_group_length = max([len(group) for group in groups])
    for index, group in enumerate(groups):
        indices[group] = sensitive_features.index[sensitive_features == group]
        group_recidivism_pct = round(sum(y_pred[indices[group]]) / len(indices[group]), 5)
        buffer = " " * (max_group_length - len(group))
        print(f'P(recidivism | {group})                {buffer}= {group_recidivism_pct}')

        if y is not None:
            positive_indices[group] = sensitive_features.index[
                (sensitive_features == group) & (y == True)
            ]
            negative_indices[group] = sensitive_features.index[
                (sensitive_features == group) & (y == False)
            ]
            prob_1 = round(sum(y_pred[positive_indices[group]]) / len(positive_indices[group]), 5)
            prob_0 = round(sum(y_pred[negative_indices[group]]) / len(negative_indices[group]), 5)
            
            print(f'P(recidivism | {group}, recidivism)    {buffer}= {prob_1}')
            print(f'P(recidivism | {group}, no recidivism) {buffer}= {prob_0}')

show_proportions(
    race_test,
    y_test,
    description="original test data:"
)
show_proportions(
    race_test,
    pd.Series(unaware_y_preds, index=y_test.index),
    y=y_test,
    description="fairness-unaware prediction on test data:"
)
plt.show()

"""# Mitigating bias

"""



"""When it comes to mitigating bias and promoting fairness in our models, we have three main opportunities to do so:

Preprocessing—Bias mitigation, as applied to the training data (i.e., before the model has had a chance to train on the training data)

In-processing—Bias mitigation applied to a model during the training phase

Postprocessing—Bias mitigation applied to the predicted labels after the model has been fit to the training data
"""

data.corrwith(data['race'] == 'African-American').sort_values()

data.groupby('race')['age'].plot(
    figsize=(20,5), 
    kind='hist', xlabel='Age', title='Histogram of Age'    
)
data.groupby('race')['age'].describe()

data.groupby('race')['priors_count'].plot(
    figsize=(20,5), 
    kind='hist', xlabel='Count of Priors', title='Histogram of Priors'
)
data.groupby('race')['priors_count'].describe()

"""here are two things to note:

African American priors are hugely right skewed, as evidenced by the mean being over twice the median.

African American priors are nearly twice as high as the other racial groups combined, due to a long history of systemic criminal justice issues.
"""

from sklearn.preprocessing import PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin

class NormalizeColumnByLabel(BaseEstimator, TransformerMixin):
    def __init__(self, col, label):
        self.col = col
        self.label = label
        self.transformers = {}
        
    def fit(self, X, y=None):
        for group in X[self.label].unique():
            self.transformers[group] = PowerTransformer(
                method='yeo-johnson', standardize=True
            )
            self.transformers[group].fit(
                X.loc[X[self.label]==group][self.col].values.reshape(-1, 1)
            )
        return self
    
    def transform(self, X, y=None):
        C = X.copy()
        for group in X[self.label].unique():
            C.loc[X[self.label]==group, self.col] = self.transformers[group].transform(
                X.loc[X[self.label]==group][self.col].values.reshape(-1, 1)
            )
        return C

n = NormalizeColumnByLabel(col='priors_count', label='race')

X_train_normalized = n.fit_transform(X_train, y_train)

X_train_normalized.groupby('race')['priors_count'].hist(figsize=(20,5))
X_train_normalized.groupby('race')['priors_count'].describe()

clf_tree_aware = Pipeline(steps=[
    ('normalize_priors', NormalizeColumnByLabel(col='priors_count', label='race')),
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

clf_tree_aware.fit(X_train, y_train)

aware_y_preds = clf_tree_aware.predict(X_test)

print(classification_report(y_true=y_test, y_pred=aware_y_preds))

exp_tree_aware = dx.Explainer(clf_tree_aware, X_test, y_test, label='Random Forest DIR', verbose=False)
mf_tree_aware = exp_tree_aware.model_fairness(protected=race_test, privileged = "Caucasian")

# performance is virtually unchanged overall
pd.concat([exp.model_performance().result for exp in [exp_tree, exp_tree_aware]])

# Still using the same features, note that race has become less important.
exp_tree.model_parts().plot(objects=[exp_tree_aware.model_parts()])

# We can see a small drop in parity loss
mf_tree.plot(objects=[mf_tree_aware], type='stacked')

mf_tree_aware.fairness_check()

show_proportions(
    race_test,
    y_test,
    description="original test data:"
)
show_proportions(
    race_test,
    pd.Series(unaware_y_preds, index=y_test.index),
    y=y_test,
    description="fairness-unaware prediction on test data:"
)
show_proportions(
    race_test,
    pd.Series(aware_y_preds, index=y_test.index),
    y=y_test,
    description="fairness-aware prediction on test data:"
)
plt.show()


Best,
Ankush Singal
