from __future__ import annotations
import logging
from typing import Optional
import jax
import jax.numpy as jnp
import numpy as np
import pandas as pd
from holisticai.bias.mitigation.inprocessing.adversarial_debiasing.models import (
ADModel,
AdversarialModel,
ClassifierModel,
create_train_state,
train_step,
)
from holisticai.datasets import DataLoader, Dataset
from holisticai.utils.transformers.bias import BMInprocessing as BMImp
from holisticai.utils.transformers.bias import SensitiveGroups
logger = logging.getLogger(__name__)
def is_numeric(df):
    """Return whether every value container in ``df`` has a numeric dtype.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        Data to inspect.

    Returns
    -------
    bool
        For a DataFrame, True when every column dtype satisfies
        ``pandas.api.types.is_numeric_dtype``. For an ndarray, True when the
        array dtype is a sub-dtype of ``numpy.number``.

    Raises
    ------
    ValueError
        If ``df`` is neither a pandas DataFrame nor a numpy array.
    """
    if isinstance(df, pd.DataFrame):
        # Iterate dtypes positionally instead of indexing by label: with
        # duplicate column names ``df[col]`` yields a sub-DataFrame, which
        # ``is_numeric_dtype`` would wrongly report as non-numeric.
        return all(pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes)
    if isinstance(df, np.ndarray):
        return np.issubdtype(df.dtype, np.number)
    raise ValueError("Input must be a pandas DataFrame or numpy array.")
[docs]
class AdversarialDebiasing(BMImp):
    """Adversarial Debiasing

    Adversarial debiasing [1]_ learns a classifier to maximize prediction accuracy and simultaneously reduce an
    adversary's ability to determine the protected attribute from the predictions. This approach leads to a fair
    classifier as the predictions cannot carry any group discrimination information that the adversary can exploit.

    Obs: JAX must be installed in order to use this technique.

    Parameters
    ----------
    features_dim: int
        Number of input features, X: (n_samples, features_dim)

    keep_prob: float
        Dropout parameter for classifier

    hidden_size: int
        Number of neurons on hidden layer

    batch_size: int
        Number of examples used for each iteration

    shuffle: bool
        Shuffle data after each epoch.
        NOTE: stored on the instance but not read by ``fit`` in this class.

    epochs: int
        Number of epochs

    learning_rate: float
        Learning rate passed to ``create_train_state``

    use_debias: bool
        If False, train a simple classifier

    adversary_loss_weight: float
        Adversarial loss importance

    verbose : int
        Log progress if value > 0.

    print_interval : int
        Log information every `print_interval` steps.

    device: str
        Device name (e.g. "cpu", "cuda").
        NOTE: stored on the instance but not read by the methods of this class.

    seed: int
        Seed for the JAX PRNG key. When None, a random integer in [0, 1000) is drawn.

    Examples
    --------
    >>> from holisticai.bias.mitigation.inprocessing import AdversarialDebiasing
    >>> mitigator = AdversarialDebiasing(**params).transform_estimator()
    >>> mitigator.fit(X_train, y_train, group_a, group_b)
    >>> y_pred = mitigator.predict(X_test)

    References
    ----------
    .. [1] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating Unwanted
        Biases with Adversarial Learning," AAAI/ACM Conference on Artificial
        Intelligence, Ethics, and Society, 2018.
    """

    def __init__(
        self,
        features_dim: Optional[int] = None,
        keep_prob: Optional[float] = 0.1,
        hidden_size: Optional[int] = 128,
        batch_size: Optional[int] = 32,
        shuffle: Optional[bool] = True,
        epochs: Optional[int] = 10,
        learning_rate: Optional[float] = 0.01,
        use_debias: Optional[bool] = True,
        adversary_loss_weight: Optional[float] = 0.1,
        verbose: Optional[int] = 1,
        print_interval: Optional[int] = 100,
        device: Optional[str] = "cpu",
        seed: Optional[int] = None,
    ):
        # default classifier config
        self.features_dim = features_dim
        self.keep_prob = keep_prob
        self.hidden_size = hidden_size

        # training config
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.epochs = epochs
        self.learning_rate = learning_rate

        # bias config
        self.adversary_loss_weight = adversary_loss_weight
        self.use_debias = use_debias

        # other configs
        self.verbose = verbose
        self.print_interval = print_interval
        self.device = device
        # Draw a random seed when none is given so a PRNG key can always be built.
        self.seed = seed if seed is not None else np.random.randint(0, 1000)
        self._sensgroups = SensitiveGroups()

    def transform_estimator(self, estimator=None):
        """
        Configure the classifier used by `fit`.

        Parameters
        ----------
        estimator : object, optional
            When None, a default ``ClassifierModel`` is built from
            ``features_dim``, ``hidden_size`` and ``keep_prob``.

        Returns
        -------
        the same object
        """
        if estimator is None:
            self.classifier = ClassifierModel(
                features_dim=self.features_dim, hidden_size=self.hidden_size, keep_prob=self.keep_prob
            )
        else:
            # NOTE(review): a custom estimator is only stored as ``self.estimator``;
            # ``fit`` reads ``self.classifier``, which this branch does not set.
            self.estimator = estimator
        return self

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        group_a: np.ndarray,
        group_b: np.ndarray,
    ):
        """
        Fit the model

        Description
        -----------
        Learn a fair classifier.

        Parameters
        ----------
        X : numpy array
            input matrix
        y : numpy array
            Target vector
        group_a : numpy array
            binary mask vector
        group_b : numpy array
            binary mask vector (passed to ``_load_data``; only ``group_a`` is
            fed to the training dataset)

        Returns
        -------
        the same object

        Raises
        ------
        ValueError
            If the features are not all numeric.
        """
        params = self._load_data(X=X, y=y, group_a=group_a, group_b=group_b)
        x = pd.DataFrame(params["X"])
        if not is_numeric(x):
            raise ValueError("Adversarial Debiasing only works with numeric features.")

        y = pd.Series(params["y"])
        group_a = pd.Series(params["group_a"])
        self.classes_ = params["classes_"]

        dataset = Dataset(X=x, y=y, group=group_a)
        data_loader = DataLoader(dataset, batch_size=self.batch_size, dtype="jax")

        feature_dim = x.shape[1]
        rng = jax.random.PRNGKey(self.seed)
        adversary_model = AdversarialModel()
        model = ADModel(classifier=self.classifier, adversarial=adversary_model)
        cls_state, adv_state = create_train_state(
            rng, model, learning_rate=self.learning_rate, feature_dim=feature_dim
        )

        total_steps = self.epochs * data_loader.num_batches
        step = 0
        for _ in range(self.epochs):
            # Losses are accumulated per epoch; the logged value is the running epoch mean.
            losses_cls = []
            losses_adv = []
            for batch in data_loader:
                # Fresh sub-key for every training step.
                rng, step_rng = jax.random.split(rng)
                cls_state, adv_state, loss_cls, loss_adv = train_step(
                    cls_state,
                    adv_state,
                    batch,
                    use_debias=self.use_debias,
                    adversary_loss_weight=self.adversary_loss_weight,
                    rng=step_rng,
                )
                losses_cls.append(loss_cls)
                losses_adv.append(loss_adv)

                if self.verbose > 0 and step % self.print_interval == 0:
                    adv_loss_mean = f"{np.mean(losses_adv):.6f}" if self.use_debias else None
                    logger.info(
                        f"Step {step+1}/{total_steps}: Classifier Loss = {np.mean(losses_cls):.6f}, Adversarial Loss = {adv_loss_mean}"
                    )
                step += 1

        self.cls_state, self.adv_state = cls_state, adv_state
        return self

    def _predict_proba(self, X: np.ndarray):
        """Apply the trained classifier to ``X`` and return its output flattened to 1-D."""
        inputs = jnp.array(X)
        y_prob = self.classifier.apply({"params": self.cls_state.params}, inputs, trainable=False)
        return np.array(y_prob).ravel()

    def predict(self, X):
        """
        Prediction

        Description
        ----------
        Predict output for the given samples.

        Parameters
        ----------
        X : np.ndarray
            input matrix

        Returns
        -------
        np.ndarray: Predicted output per sample (index of the most probable column).

        Raises
        ------
        ValueError
            If the features are not all numeric.
        """
        # The numeric-feature validation is performed by ``predict_proba``.
        p = self.predict_proba(X)
        return np.argmax(p, axis=1).ravel()

    def predict_proba(self, X):
        """
        Prediction

        Description
        ----------
        Predict matrix probability for the given samples.

        Parameters
        ----------
        X : np.ndarray
            input matrix

        Returns
        -------
        np.ndarray: Predicted matrix probability per sample, shape (n_samples, 2).
        Column 1 holds the classifier output, column 0 its complement.

        Raises
        ------
        ValueError
            If the features are not all numeric.
        """
        if not is_numeric(X):
            raise ValueError("Adversarial Debiasing only works with numeric features.")
        proba = np.empty((X.shape[0], 2))
        proba[:, 1] = self._predict_proba(X)
        proba[:, 0] = 1.0 - proba[:, 1]
        return proba

    def predict_score(self, X):
        """
        Prediction

        Description
        ----------
        Predict probability for the given samples.

        Parameters
        ----------
        X : np.ndarray
            input matrix

        Returns
        -------
        np.ndarray: Predicted probability per sample.

        Raises
        ------
        ValueError
            If the features are not all numeric.
        """
        if not is_numeric(X):
            raise ValueError("Adversarial Debiasing only works with numeric features.")
        # Use the classifier output directly; this class defines no ``_predict`` method.
        p = self._predict_proba(X).reshape([-1])
        return p