Packages Installation#

First, install the holisticai package if you haven’t already:

!pip install holisticai[all]

Then, import the necessary libraries.

[22]:

import pandas as pd
from holisticai.bias.metrics import classification_bias_metrics
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Dataset loading#

[23]:

# Load the 'adult' dataset with 'sex' as the protected attribute
dataset = load_dataset('adult', protected_attribute='sex')

# Hold out 20% of the instances; random_state is pinned to 42
train_test = dataset.train_test_split(test_size=0.2, random_state=42)
train, test = (train_test[split] for split in ('train', 'test'))

# Bare expression: the notebook displays the dataset summary
dataset

[23]:

[Dataset]

Instances: 45222

Features: X , y , p_attrs , group_a , group_b

Metadata: sex: {'group_a': 'Male', 'group_b': 'Female'}

1. Correlation Remover#

Traditional Implementation#

[24]:

# Define the preprocessing bias mitigator (the cell output reports
# "Type: Bias Mitigation Preprocessing")
from holisticai.bias.mitigation import CorrelationRemover

mitigator = CorrelationRemover()
# Bare expression: the notebook displays the mitigator's configuration
mitigator

[24]:

[CorrelationRemover]

CorrelationRemover(alpha=1)

Type: Bias Mitigation Preprocessing

Training a Model#

[25]:

# Manual ("traditional") workflow: every step (scaling, bias mitigation,
# estimator) is fitted and applied by hand instead of through a Pipeline.
model = LogisticRegression()

# Standardize data and fit model
# The scaler and the mitigator are both fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])
X_train_pre = mitigator.fit_transform(X_train, group_a=train['group_a'], group_b=train['group_b'])
model.fit(X_train_pre, train['y'])

# Predict on test data
# The test split reuses the already fitted scaler and mitigator via transform().
X_test = scaler.transform(test['X'])
X_test_pre = mitigator.transform(X_test, group_a= test['group_a'], group_b=test['group_b'])
y_pred = model.predict(X_test_pre)

# Evaluate bias metrics
# `metrics` collects one result table per approach. It is re-created here, so
# re-running this cell discards entries added by later cells.
# NOTE(review): the result is stored under 'model', yet the predictions come from
# features that went through `mitigator`; the recorded values are identical to the
# 'CorrelationRemover' pipeline results. Confirm whether an unmitigated baseline
# was intended for this key.
metrics = {}
metrics['model'] = classification_bias_metrics(test['group_a'], test['group_b'], y_pred, test['y'], metric_type='both')
metrics['model']

[25]:

	Value	Reference
Metric
Statistical Parity	0.088938	0
Disparate Impact	1.732300	1
Four Fifths Rule	0.577267	1
Cohen D	0.232029	0
2SD Rule	10.260307	0
Equality of Opportunity Difference	-0.099203	0
False Positive Rate Difference	0.011345	0
Average Odds Difference	-0.043929	0
Accuracy Difference	-0.097202	0

[26]:

# Create a fresh (unfitted) preprocessing mitigator for the pipeline version.
# CorrelationRemover must already be imported by an earlier cell.
mitigator = CorrelationRemover()
mitigator

[26]:

[CorrelationRemover]

CorrelationRemover(alpha=1)

Type: Bias Mitigation Preprocessing

[27]:

# Downstream classifier that the pipeline will train
model = LogisticRegression()

# Assemble the steps in execution order: scaling, bias mitigation, estimator
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)

# Group membership vectors are passed as bm__-prefixed keyword arguments
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Bare expression: the notebook displays the pipeline summary
pipeline

[27]:

[Pipeline]

scalar [StandardScaler]

StandardScaler(copy=True, with_mean=True, with_std=True)

bm_preprocessing [CorrelationRemover]

CorrelationRemover(alpha=1)

estimator [LogisticRegression]

LogisticRegression(penalty=l2, dual=False, tol=0.0001, C=1.0, ...)

[28]:

# Score the held-out split with the fitted pipeline
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Store the bias metrics of the pipeline predictions, then display them
metrics['CorrelationRemover'] = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics['CorrelationRemover']

[28]:

	Value	Reference
Metric
Statistical Parity	0.088938	0
Disparate Impact	1.732300	1
Four Fifths Rule	0.577267	1
Cohen D	0.232029	0
2SD Rule	10.260307	0
Equality of Opportunity Difference	-0.099203	0
False Positive Rate Difference	0.011345	0
Average Odds Difference	-0.043929	0
Accuracy Difference	-0.097202	0

2. Disparate Impact Remover#

Traditional Implementation#

[29]:

# Define the preprocessing bias mitigator (the cell output reports
# "Type: Bias Mitigation Preprocessing")
from holisticai.bias.mitigation import DisparateImpactRemover
mitigator = DisparateImpactRemover()
# Bare expression: the notebook displays the mitigator's configuration
mitigator

[29]:

[DisparateImpactRemover]

DisparateImpactRemover(repair_level=1.0)

Type: Bias Mitigation Preprocessing

Pipeline Implementation#

[31]:

# Fresh preprocessing mitigator and the classifier trained on its output
mitigator = DisparateImpactRemover()
model = LogisticRegression()

# Assemble the steps in execution order: scaling, bias mitigation, estimator
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)

# Group membership vectors are passed as bm__-prefixed keyword arguments
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Score the held-out split with the fitted pipeline
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Store the bias metrics of the pipeline predictions, then display them
# NOTE(review): the recorded output of this cell shows much larger disparities
# than the other approaches in this notebook (e.g. Statistical Parity 0.453
# versus about 0.09) - confirm that this is expected for this configuration.
metrics['DisparateImpactRemover'] = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics['DisparateImpactRemover']

[31]:

	Value	Reference
Metric
Statistical Parity	0.453053	0
Disparate Impact	7.791141	1
Four Fifths Rule	0.128351	1
Cohen D	1.041918	0
2SD Rule	41.661359	0
Equality of Opportunity Difference	0.431984	0
False Positive Rate Difference	0.340087	0
Average Odds Difference	0.386036	0
Accuracy Difference	-0.204678	0

3. Learning Fair Representations#

Traditional Implementation#

[32]:

# Define the preprocessing bias mitigator (the cell output reports
# "Type: Bias Mitigation Preprocessing")
from holisticai.bias.mitigation import LearningFairRepresentation

# maxiter is forwarded to the optimizer as its iteration limit (visible in the
# traceback recorded further down in this notebook).
# NOTE(review): seed=100 presumably makes the initialization repeatable - confirm
# against the library documentation.
mitigator = LearningFairRepresentation(k=10, Ax=0.2, Ay=2.0, Az=4.0, verbose=1, maxiter=100, seed=100)
mitigator

[32]:

[LearningFairRepresentation]

LearningFairRepresentation(k=10, Ax=0.2, Ay=2.0, Az=4.0, ...)

Type: Bias Mitigation Preprocessing

Pipeline Implementation#

[34]:

# Fresh preprocessing mitigator and the classifier trained on its output
mitigator = LearningFairRepresentation(k=10, Ax=0.2, Ay=2.0, Az=4.0, verbose=1, maxiter=100, seed=100)
model = LogisticRegression()

# Define pipeline
# NOTE(review): in the recorded run, fit() was stopped by a KeyboardInterrupt. The
# saved traceback shows the interrupt inside scipy's L-BFGS-B minimize call while
# a finite-difference gradient was being evaluated, so nothing below this line
# ran and metrics['LearningFR'] was never stored in that run.
pipeline = Pipeline(steps=[('scalar', StandardScaler()), ("bm_preprocessing", mitigator), ("estimator", model),])
pipeline.fit(train['X'], train['y'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# Make predictions
y_pred_pipeline = pipeline.predict(test['X'], bm__group_a=test['group_a'], bm__group_b=test['group_b'])

# Evaluate bias metrics for pipeline model
metrics['LearningFR']  = classification_bias_metrics(test['group_a'], test['group_b'], y_pred_pipeline, test['y'], metric_type='both')
metrics['LearningFR']

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[34], line 7
      5 # Define pipeline
      6 pipeline = Pipeline(steps=[('scalar', StandardScaler()), ("bm_preprocessing", mitigator), ("estimator", model),])
----> 7 pipeline.fit(train['X'], train['y'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])
      9 # Make predictions
     10 y_pred_pipeline = pipeline.predict(test['X'], bm__group_a=test['group_a'], bm__group_b=test['group_b'])

File ~/holisticai/src/holisticai/pipeline/_pipeline_helper.py:55, in PipelineHelper.handle_pipeline_methods.<locals>.function(X, y, **kargs)
     51         output = object.__getattribute__(self, fn_name)(X, **params)
     53 else:
     54     # Other case the primitive method could be invoked
---> 55     output = getattr(SKLPipeline, fn_name)(self, X, **params)
     57 return output

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/pipeline.py:469, in Pipeline.fit(self, X, y, **params)
    426 """Fit the model.
    427
    428 Fit all the transformers one after the other and sequentially transform the
   (...)
    466     Pipeline with fitted steps.
    467 """
    468 routed_params = self._check_method_params(method="fit", props=params)
--> 469 Xt = self._fit(X, y, routed_params)
    470 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    471     if self._final_estimator != "passthrough":

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/pipeline.py:406, in Pipeline._fit(self, X, y, routed_params)
    404     cloned_transformer = clone(transformer)
    405 # Fit or load from cache the current transformer
--> 406 X, fitted_transformer = fit_transform_one_cached(
    407     cloned_transformer,
    408     X,
    409     y,
    410     None,
    411     message_clsname="Pipeline",
    412     message=self._log_message(step_idx),
    413     params=routed_params[name],
    414 )
    415 # Replace the transformer of the step with the fitted
    416 # transformer. This is necessary when loading the transformer
    417 # from the cache.
    418 self.steps[step_idx] = (name, fitted_transformer)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/joblib/memory.py:312, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    311 def __call__(self, *args, **kwargs):
--> 312     return self.func(*args, **kwargs)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/sklearn/pipeline.py:1310, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, params)
   1308 with _print_elapsed_time(message_clsname, message):
   1309     if hasattr(transformer, "fit_transform"):
-> 1310         res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
   1311     else:
   1312         res = transformer.fit(X, y, **params.get("fit", {})).transform(
   1313             X, **params.get("transform", {})
   1314         )

File ~/holisticai/src/holisticai/utils/transformers/_transformer_base.py:92, in BMTransformerBase.reformat_function.<locals>.wrapped_func(*args, **kargs)
     90 kargs.update(self._load_data_pipeline())
     91 params = {v: kargs[v] for v in fun_varnames if v in kargs}
---> 92 return func(**params)

File ~/holisticai/src/holisticai/bias/mitigation/preprocessing/learning_fair_representation.py:272, in LearningFairRepresentation.fit_transform(self, X, y, group_a, group_b)
    247 def fit_transform(
    248     self,
    249     X: np.ndarray,
   (...)
    252     group_b: np.ndarray,
    253 ):
    254     """
    255     Fit and transform
    256
   (...)
    270         Self
    271     """
--> 272     return self.fit(X, y, group_a, group_b).transform(X, group_a, group_b)

File ~/holisticai/src/holisticai/utils/transformers/_transformer_base.py:92, in BMTransformerBase.reformat_function.<locals>.wrapped_func(*args, **kargs)
     90 kargs.update(self._load_data_pipeline())
     91 params = {v: kargs[v] for v in fun_varnames if v in kargs}
---> 92 return func(**params)

File ~/holisticai/src/holisticai/bias/mitigation/preprocessing/learning_fair_representation.py:203, in LearningFairRepresentation.fit(self, X, y, group_a, group_b)
    199 @jax.jit
    200 def objective(params):
    201     return obj_fun(params)
--> 203 result = minimize(
    204     objective,
    205     parameters_initialization,
    206     method="L-BFGS-B",
    207     bounds=parameters_bounds,
    208     options={"maxiter": self.maxiter, "disp": 0},
    209 )
    210 self.learned_model = result.x
    211 self.w = self.learned_model[: self.k]

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_minimize.py:731, in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    728     res = _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback,
    729                              **options)
    730 elif meth == 'l-bfgs-b':
--> 731     res = _minimize_lbfgsb(fun, x0, args, jac, bounds,
    732                            callback=callback, **options)
    733 elif meth == 'tnc':
    734     res = _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
    735                         **options)

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_lbfgsb_py.py:347, in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
    344         iprint = disp
    346 # _prepare_scalar_function can use bounds=None to represent no bounds
--> 347 sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps,
    348                               bounds=bounds,
    349                               finite_diff_rel_step=finite_diff_rel_step)
    351 func_and_grad = sf.fun_and_grad
    353 fortran_int = _lbfgsb.types.intvar.dtype

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_optimize.py:288, in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess)
    284     bounds = (-np.inf, np.inf)
    286 # ScalarFunction caches. Reuse of fun(x) during grad
    287 # calculation reduces overall function evaluations.
--> 288 sf = ScalarFunction(fun, x0, args, grad, hess,
    289                     finite_diff_rel_step, bounds, epsilon=epsilon)
    291 return sf

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:231, in ScalarFunction.__init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon)
    224 # Initial gradient evaluation
    225 self._wrapped_grad, self._ngev = _wrapper_grad(
    226     grad,
    227     fun=self._wrapped_fun,
    228     args=args,
    229     finite_diff_options=finite_diff_options
    230 )
--> 231 self._update_grad()
    233 # Hessian evaluation
    234 if callable(hess):

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:306, in ScalarFunction._update_grad(self)
    304 if self._orig_grad in FD_METHODS:
    305     self._update_fun()
--> 306 self.g = self._wrapped_grad(self.x, f0=self.f)
    307 self.g_updated = True

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:47, in _wrapper_grad.<locals>.wrapped1(x, f0)
     45 def wrapped1(x, f0=None):
     46     ncalls[0] += 1
---> 47     return approx_derivative(
     48         fun, x, f0=f0, **finite_diff_options
     49     )

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:519, in approx_derivative(fun, x0, method, rel_step, abs_step, f0, bounds, sparsity, as_linear_operator, args, kwargs)
    516     use_one_sided = False
    518 if sparsity is None:
--> 519     return _dense_difference(fun_wrapped, x0, f0, h,
    520                              use_one_sided, method)
    521 else:
    522     if not issparse(sparsity) and len(sparsity) == 2:

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:592, in _dense_difference(fun, x0, f0, h, use_one_sided, method)
    590     x1[i] += h[i]
    591     dx = x1[i] - x0[i]  # Recompute dx as exactly representable number.
--> 592     df = fun(x1) - f0
    593 elif method == '3-point' and use_one_sided[i]:
    594     x1[i] += h[i]

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:470, in approx_derivative.<locals>.fun_wrapped(x)
    467 if xp.isdtype(x.dtype, "real floating"):
    468     x = xp.astype(x, x0.dtype)
--> 470 f = np.atleast_1d(fun(x, *args, **kwargs))
    471 if f.ndim > 1:
    472     raise RuntimeError("`fun` return value has "
    473                        "more than 1 dimension.")

File ~/.local/share/hatch/env/virtual/holisticai/4NjQH6EQ/testing/lib/python3.11/site-packages/scipy/optimize/_differentiable_functions.py:24, in _wrapper_fun.<locals>.wrapped(x)
     22 if not np.isscalar(fx):
     23     try:
---> 24         fx = np.asarray(fx).item()
     25     except (TypeError, ValueError) as e:
     26         raise ValueError(
     27             "The user-provided objective function "
     28             "must return a scalar value."
     29         ) from e

KeyboardInterrupt:

4. Reweighing#

Traditional Implementation#

[ ]:

# Define the preprocessing bias mitigator (the cell output reports
# "Type: Bias Mitigation Preprocessing")
from holisticai.bias.mitigation import Reweighing
mitigator = Reweighing()
# Bare expression: the notebook displays the mitigator's configuration
mitigator

[Reweighing]

Reweighing()

Type: Bias Mitigation Preprocessing

Pipeline Implementation#

[35]:

# Fresh preprocessing mitigator and the classifier trained on its output
mitigator = Reweighing()
model = LogisticRegression()

# Assemble the steps in execution order: scaling, bias mitigation, estimator
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)

# Group membership vectors are passed as bm__-prefixed keyword arguments
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Score the held-out split with the fitted pipeline
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Store the bias metrics of the pipeline predictions, then display them
metrics['Reweighing'] = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics['Reweighing']

[35]:

	Value	Reference
Metric
Statistical Parity	0.094554	0
Disparate Impact	1.787408	1
Four Fifths Rule	0.559469	1
Cohen D	0.245580	0
2SD Rule	10.851962	0
Equality of Opportunity Difference	-0.107801	0
False Positive Rate Difference	0.018112	0
Average Odds Difference	-0.044845	0
Accuracy Difference	-0.102319	0

[36]:

# Combine the per-approach metric tables collected in `metrics` into a single
# table with one column per approach.
# NOTE(review): the recorded output has no 'LearningFR' column, consistent with
# the Learning Fair Representations cell having been interrupted before it
# stored its result.
from holisticai.utils import concatenate_metrics

concatenate_metrics(metrics)

[36]:

	model	CorrelationRemover	DisparateImpactRemover	Reweighing	Reference
Metric
Statistical Parity	0.088938	0.088938	0.453053	0.094554	0
Disparate Impact	1.732300	1.732300	7.791141	1.787408	1
Four Fifths Rule	0.577267	0.577267	0.128351	0.559469	1
Cohen D	0.232029	0.232029	1.041918	0.245580	0
2SD Rule	10.260307	10.260307	41.661359	10.851962	0
Equality of Opportunity Difference	-0.099203	-0.099203	0.431984	-0.107801	0
False Positive Rate Difference	0.011345	0.011345	0.340087	0.018112	0
Average Odds Difference	-0.043929	-0.043929	0.386036	-0.044845	0
Accuracy Difference	-0.097202	-0.097202	-0.204678	-0.102319	0