Package Installation#

First, install the holisticai package if you haven’t already:

!pip install holisticai[all]

Then, import the necessary libraries.

[22]:
import pandas as pd
from holisticai.bias.metrics import classification_bias_metrics
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Dataset loading#

[23]:
# Load the 'adult' dataset with 'sex' as the protected attribute.
dataset = load_dataset('adult', protected_attribute='sex')

# Split off a test portion (test_size=0.2) with a fixed random_state for the split.
train_test = dataset.train_test_split(test_size=0.2, random_state=42)
train, test = train_test['train'], train_test['test']

# Show the dataset summary as the cell output.
dataset
[23]:
[Dataset]
Instances: 45222
Features: X , y , p_attrs , group_a , group_b
Metadata: sex: {'group_a': 'Male', 'group_b': 'Female'}

1. Correlation Remover#

Traditional Implementation#

[24]:
# Define preprocessing bias-mitigation model (CorrelationRemover with default arguments)
from holisticai.bias.mitigation import CorrelationRemover

mitigator = CorrelationRemover()
# Display the mitigator summary as the cell output
mitigator
[24]:
[CorrelationRemover]
CorrelationRemover(alpha=1)

Type: Bias Mitigation Preprocessing

Training a Model#

[25]:
# Manual ("traditional") workflow: scale -> mitigate -> fit, each step called by hand.
model = LogisticRegression()
scaler = StandardScaler()

# Fit the scaler and the mitigator on the training split, then train the classifier
X_train = scaler.fit_transform(train['X'])
X_train_pre = mitigator.fit_transform(
    X_train,
    group_a=train['group_a'],
    group_b=train['group_b'],
)
model.fit(X_train_pre, train['y'])

# Re-use the fitted scaler and mitigator on the test split, then predict
X_test = scaler.transform(test['X'])
X_test_pre = mitigator.transform(
    X_test,
    group_a=test['group_a'],
    group_b=test['group_b'],
)
y_pred = model.predict(X_test_pre)

# Evaluate bias metrics.
# NOTE(review): the 'model' entry is computed on features that already went
# through the CorrelationRemover, so it is not an unmitigated baseline.
metrics = {}
metrics['model'] = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred,
    test['y'],
    metric_type='both',
)
metrics['model']
[25]:
Value Reference
Metric
Statistical Parity 0.088938 0
Disparate Impact 1.732300 1
Four Fifths Rule 0.577267 1
Cohen D 0.232029 0
2SD Rule 10.260307 0
Equality of Opportunity Difference -0.099203 0
False Positive Rate Difference 0.011345 0
Average Odds Difference -0.043929 0
Accuracy Difference -0.097202 0
Pipeline Implementation#

[26]:
# Define preprocessing model (a new CorrelationRemover instance for the pipeline below)
mitigator = CorrelationRemover()
mitigator
[26]:
[CorrelationRemover]
CorrelationRemover(alpha=1)

Type: Bias Mitigation Preprocessing
[27]:
# Classifier used as the final step of the pipeline
model = LogisticRegression()

# Chain scaling, bias-mitigation preprocessing and the estimator into one pipeline
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)

# The bm__-prefixed keyword arguments carry the group membership vectors
# (presumably routed to the bias-mitigation step -- see the holisticai Pipeline docs).
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)
pipeline
[27]:
[Pipeline]
scalar [StandardScaler]
StandardScaler(copy=True, with_mean=True, with_std=True)
bm_preprocessing [CorrelationRemover]
CorrelationRemover(alpha=1)
estimator [LogisticRegression]
LogisticRegression(penalty=l2, dual=False, tol=0.0001, C=1.0, ...)
[28]:
# Predict on the test split with the fitted pipeline
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Bias metrics for the pipeline predictions, stored under the mitigator's name
metrics['CorrelationRemover'] = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics['CorrelationRemover']
[28]:
Value Reference
Metric
Statistical Parity 0.088938 0
Disparate Impact 1.732300 1
Four Fifths Rule 0.577267 1
Cohen D 0.232029 0
2SD Rule 10.260307 0
Equality of Opportunity Difference -0.099203 0
False Positive Rate Difference 0.011345 0
Average Odds Difference -0.043929 0
Accuracy Difference -0.097202 0

2. Disparate Impact Remover#

Traditional Implementation#

[29]:
# Define preprocessing bias-mitigation model (DisparateImpactRemover with default arguments)
from holisticai.bias.mitigation import DisparateImpactRemover
mitigator = DisparateImpactRemover()
# Display the mitigator summary as the cell output
mitigator
[29]:
[DisparateImpactRemover]
DisparateImpactRemover(repair_level=1.0)

Type: Bias Mitigation Preprocessing

Pipeline Implementation#

[31]:
# Fresh mitigator and classifier for this pipeline
mitigator = DisparateImpactRemover()
model = LogisticRegression()

# Chain scaling, bias-mitigation preprocessing and the estimator into one pipeline
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predict on the test split with the fitted pipeline
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Bias metrics for the pipeline predictions.
# NOTE(review): in the recorded run these values are much further from their
# reference values than the other mitigators' results -- worth confirming that
# this step is configured as intended.
metrics['DisparateImpactRemover'] = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics['DisparateImpactRemover']
[31]:
Value Reference
Metric
Statistical Parity 0.453053 0
Disparate Impact 7.791141 1
Four Fifths Rule 0.128351 1
Cohen D 1.041918 0
2SD Rule 41.661359 0
Equality of Opportunity Difference 0.431984 0
False Positive Rate Difference 0.340087 0
Average Odds Difference 0.386036 0
Accuracy Difference -0.204678 0

3. Learning Fair Representations#

Traditional Implementation#

[32]:
# Define preprocessing bias-mitigation model
from holisticai.bias.mitigation import LearningFairRepresentation

# maxiter is the iteration limit handed to the optimiser that runs during fit
mitigator = LearningFairRepresentation(k=10, Ax=0.2, Ay=2.0, Az=4.0, verbose=1, maxiter=100, seed=100)
mitigator
[32]:
[LearningFairRepresentation]
LearningFairRepresentation(k=10, Ax=0.2, Ay=2.0, Az=4.0, ...)

Type: Bias Mitigation Preprocessing

Pipeline Implementation#

[34]:
# Define preprocessing model
mitigator = LearningFairRepresentation(k=10, Ax=0.2, Ay=2.0, Az=4.0, verbose=1, maxiter=100, seed=100)
model = LogisticRegression()

# Define pipeline
pipeline = Pipeline(steps=[('scalar', StandardScaler()), ("bm_preprocessing", mitigator), ("estimator", model),])
# NOTE(review): in the saved run this call was stopped by a KeyboardInterrupt while
# the L-BFGS-B optimisation inside LearningFairRepresentation.fit was still running,
# so the lines below never executed and metrics['LearningFR'] was not populated.
# The fit appears to be slow on the full training split -- consider a smaller
# maxiter or a subsample when re-running.
pipeline.fit(train['X'], train['y'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# Make predictions
y_pred_pipeline = pipeline.predict(test['X'], bm__group_a=test['group_a'], bm__group_b=test['group_b'])

# Evaluate bias metrics for pipeline model
metrics['LearningFR']  = classification_bias_metrics(test['group_a'], test['group_b'], y_pred_pipeline, test['y'], metric_type='both')
metrics['LearningFR']
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[34], line 7
      5 # Define pipeline
      6 pipeline = Pipeline(steps=[('scalar', StandardScaler()), ("bm_preprocessing", mitigator), ("estimator", model),])
----> 7 pipeline.fit(train['X'], train['y'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])
      9 # Make predictions
     10 y_pred_pipeline = pipeline.predict(test['X'], bm__group_a=test['group_a'], bm__group_b=test['group_b'])

File ~/holisticai/src/holisticai/pipeline/_pipeline_helper.py:55, in PipelineHelper.handle_pipeline_methods.<locals>.function(X, y, **kargs)
     51         output = object.__getattribute__(self, fn_name)(X, **params)
     53 else:
     54     # Other case the primitive method could be invoked
---> 55     output = getattr(SKLPipeline, fn_name)(self, X, **params)
     57 return output

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/pipeline.py:469, in Pipeline.fit(self, X, y, **params)
    426 """Fit the model.
    427
    428 Fit all the transformers one after the other and sequentially transform the
   (...)
    466     Pipeline with fitted steps.
    467 """
    468 routed_params = self._check_method_params(method="fit", props=params)
--> 469 Xt = self._fit(X, y, routed_params)
    470 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    471     if self._final_estimator != "passthrough":

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/pipeline.py:406, in Pipeline._fit(self, X, y, routed_params)
    404     cloned_transformer = clone(transformer)
    405 # Fit or load from cache the current transformer
--> 406 X, fitted_transformer = fit_transform_one_cached(
    407     cloned_transformer,
    408     X,
    409     y,
    410     None,
    411     message_clsname="Pipeline",
    412     message=self._log_message(step_idx),
    413     params=routed_params[name],
    414 )
    415 # Replace the transformer of the step with the fitted
    416 # transformer. This is necessary when loading the transformer
    417 # from the cache.
    418 self.steps[step_idx] = (name, fitted_transformer)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/joblib/memory.py:312, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    311 def __call__(self, *args, **kwargs):
--> 312     return self.func(*args, **kwargs)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/pipeline.py:1310, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, params)
   1308 with _print_elapsed_time(message_clsname, message):
   1309     if hasattr(transformer, "fit_transform"):
-> 1310         res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
   1311     else:
   1312         res = transformer.fit(X, y, **params.get("fit", {})).transform(
   1313             X, **params.get("transform", {})
   1314         )

File ~/holisticai/src/holisticai/utils/transformers/_transformer_base.py:92, in BMTransformerBase.reformat_function.<locals>.wrapped_func(*args, **kargs)
     90 kargs.update(self._load_data_pipeline())
     91 params = {v: kargs[v] for v in fun_varnames if v in kargs}
---> 92 return func(**params)

File ~/holisticai/src/holisticai/bias/mitigation/preprocessing/learning_fair_representation.py:272, in LearningFairRepresentation.fit_transform(self, X, y, group_a, group_b)
    247 def fit_transform(
    248     self,
    249     X: np.ndarray,
   (...)
    252     group_b: np.ndarray,
    253 ):
    254     """
    255     Fit and transform
    256
   (...)
    270         Self
    271     """
--> 272     return self.fit(X, y, group_a, group_b).transform(X, group_a, group_b)

File ~/holisticai/src/holisticai/utils/transformers/_transformer_base.py:92, in BMTransformerBase.reformat_function.<locals>.wrapped_func(*args, **kargs)
     90 kargs.update(self._load_data_pipeline())
     91 params = {v: kargs[v] for v in fun_varnames if v in kargs}
---> 92 return func(**params)

File ~/holisticai/src/holisticai/bias/mitigation/preprocessing/learning_fair_representation.py:203, in LearningFairRepresentation.fit(self, X, y, group_a, group_b)
    199 @jax.jit
    200 def objective(params):
    201     return obj_fun(params)
--> 203 result = minimize(
    204     objective,
    205     parameters_initialization,
    206     method="L-BFGS-B",
    207     bounds=parameters_bounds,
    208     options={"maxiter": self.maxiter, "disp": 0},
    209 )
    210 self.learned_model = result.x
    211 self.w = self.learned_model[: self.k]

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_minimize.py:731, in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    728     res = _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback,
    729                              **options)
    730 elif meth == 'l-bfgs-b':
--> 731     res = _minimize_lbfgsb(fun, x0, args, jac, bounds,
    732                            callback=callback, **options)
    733 elif meth == 'tnc':
    734     res = _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
    735                         **options)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_lbfgsb_py.py:347, in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
    344         iprint = disp
    346 # _prepare_scalar_function can use bounds=None to represent no bounds
--> 347 sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps,
    348                               bounds=bounds,
    349                               finite_diff_rel_step=finite_diff_rel_step)
    351 func_and_grad = sf.fun_and_grad
    353 fortran_int = _lbfgsb.types.intvar.dtype

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_optimize.py:288, in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess)
    284     bounds = (-np.inf, np.inf)
    286 # ScalarFunction caches. Reuse of fun(x) during grad
    287 # calculation reduces overall function evaluations.
--> 288 sf = ScalarFunction(fun, x0, args, grad, hess,
    289                     finite_diff_rel_step, bounds, epsilon=epsilon)
    291 return sf

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:231, in ScalarFunction.__init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon)
    224 # Initial gradient evaluation
    225 self._wrapped_grad, self._ngev = _wrapper_grad(
    226     grad,
    227     fun=self._wrapped_fun,
    228     args=args,
    229     finite_diff_options=finite_diff_options
    230 )
--> 231 self._update_grad()
    233 # Hessian evaluation
    234 if callable(hess):

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:306, in ScalarFunction._update_grad(self)
    304 if self._orig_grad in FD_METHODS:
    305     self._update_fun()
--> 306 self.g = self._wrapped_grad(self.x, f0=self.f)
    307 self.g_updated = True

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:47, in _wrapper_grad.<locals>.wrapped1(x, f0)
     45 def wrapped1(x, f0=None):
     46     ncalls[0] += 1
---> 47     return approx_derivative(
     48         fun, x, f0=f0, **finite_diff_options
     49     )

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:519, in approx_derivative(fun, x0, method, rel_step, abs_step, f0, bounds, sparsity, as_linear_operator, args, kwargs)
    516     use_one_sided = False
    518 if sparsity is None:
--> 519     return _dense_difference(fun_wrapped, x0, f0, h,
    520                              use_one_sided, method)
    521 else:
    522     if not issparse(sparsity) and len(sparsity) == 2:

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:592, in _dense_difference(fun, x0, f0, h, use_one_sided, method)
    590     x1[i] += h[i]
    591     dx = x1[i] - x0[i]  # Recompute dx as exactly representable number.
--> 592     df = fun(x1) - f0
    593 elif method == '3-point' and use_one_sided[i]:
    594     x1[i] += h[i]

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:470, in approx_derivative.<locals>.fun_wrapped(x)
    467 if xp.isdtype(x.dtype, "real floating"):
    468     x = xp.astype(x, x0.dtype)
--> 470 f = np.atleast_1d(fun(x, *args, **kwargs))
    471 if f.ndim > 1:
    472     raise RuntimeError("`fun` return value has "
    473                        "more than 1 dimension.")

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:24, in _wrapper_fun.<locals>.wrapped(x)
     22 if not np.isscalar(fx):
     23     try:
---> 24         fx = np.asarray(fx).item()
     25     except (TypeError, ValueError) as e:
     26         raise ValueError(
     27             "The user-provided objective function "
     28             "must return a scalar value."
     29         ) from e

KeyboardInterrupt:

4. Reweighing#

Traditional Implementation#

[ ]:
# Define preprocessing bias-mitigation model (Reweighing with default arguments)
from holisticai.bias.mitigation import Reweighing
mitigator = Reweighing()
# Display the mitigator summary as the cell output
mitigator
[Reweighing]
Reweighing()

Type: Bias Mitigation Preprocessing

Pipeline Implementation#

[35]:
# Fresh mitigator and classifier for this pipeline
mitigator = Reweighing()
model = LogisticRegression()

# Chain scaling, bias-mitigation preprocessing and the estimator into one pipeline
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predict on the test split with the fitted pipeline
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Bias metrics for the pipeline predictions, stored under the mitigator's name
metrics['Reweighing'] = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics['Reweighing']
[35]:
Value Reference
Metric
Statistical Parity 0.094554 0
Disparate Impact 1.787408 1
Four Fifths Rule 0.559469 1
Cohen D 0.245580 0
2SD Rule 10.851962 0
Equality of Opportunity Difference -0.107801 0
False Positive Rate Difference 0.018112 0
Average Odds Difference -0.044845 0
Accuracy Difference -0.102319 0
[36]:
from holisticai.utils import concatenate_metrics

# Combine the per-method metric tables currently stored in `metrics` into a single
# side-by-side table (one column per key). Only methods whose cell ran to
# completion have an entry in `metrics`.
concatenate_metrics(metrics)
[36]:
model CorrelationRemover DisparateImpactRemover Reweighing Reference
Metric
Statistical Parity 0.088938 0.088938 0.453053 0.094554 0
Disparate Impact 1.732300 1.732300 7.791141 1.787408 1
Four Fifths Rule 0.577267 0.577267 0.128351 0.559469 1
Cohen D 0.232029 0.232029 1.041918 0.245580 0
2SD Rule 10.260307 10.260307 41.661359 10.851962 0
Equality of Opportunity Difference -0.099203 -0.099203 0.431984 -0.107801 0
False Positive Rate Difference 0.011345 0.011345 0.340087 0.018112 0
Average Odds Difference -0.043929 -0.043929 0.386036 -0.044845 0
Accuracy Difference -0.097202 -0.097202 -0.204678 -0.102319 0