Package Installation#
First, install the holisticai package if you haven’t already:
# %pip installs into the environment of the running kernel; the quotes stop the shell from glob-expanding "[all]".
%pip install "holisticai[all]"
Then, import the necessary libraries.
[2]:
import warnings

import pandas as pd
from holisticai.bias.metrics import clustering_bias_metrics
from holisticai.bias.mitigation import MCMF
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore")
Data Loading#
[3]:
# Load the clinical records dataset with "sex" as the protected attribute
dataset = load_dataset('clinical_records', protected_attribute="sex")
# Split into train/test partitions (test_size=0.2); the fixed random_state keeps the split reproducible
train_test = dataset.train_test_split(test_size=0.2, random_state=42)
train, test = train_test['train'], train_test['test']
# Show the dataset summary as the cell output
dataset
[3]:
[Dataset]
Instances: 299
Features: X , y , p_attrs , group_a , group_b
Metadata: sex: {'group_a': '0', 'group_b': '1'}
[5]:
# Baseline: cluster the training data with plain KMeans (no mitigation applied in this cell)
model = KMeans(n_clusters=2, random_state=42)
model.fit(train['X'])
centroids = model.cluster_centers_
# Cluster assignments predicted by the fitted model on the training data
y_pred = model.predict(train['X'])
# Bias metrics of these assignments, computed against the two protected groups
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics
[5]:
| Value | Reference | |
|---|---|---|
| Metric | ||
| Cluster Balance | 0.739749 | 1 |
| Minimum Cluster Ratio | 0.472441 | 1 |
| Cluster Distribution Total Variation | 0.118793 | 0 |
| Cluster Distribution KL Div | 0.042365 | 0 |
| Social Fairness Ratio | 1.153405 | 1 |
| Silhouette Difference | -0.019309 | 0 |
1. MCMF#
Traditional Implementation#
[16]:
# Fit the clustering model
model = KMeans(n_clusters=2, random_state=42)
model.fit(train['X'])
# Keep the model's own assignments under a separate name so they are not
# overwritten by the mitigated assignments computed below
y_pred_raw = model.predict(train['X'])
# Post-process the assignments with the MCMF mitigator
mitigator = MCMF(metric='L1', verbose=1, group_mode='ab')
# fit_transform returns a mapping; the adjusted assignments are stored under 'y_pred'
y_pred = mitigator.fit_transform(
    train['X'],
    y_pred_raw,
    train["group_a"],
    train['group_b'],
    model.cluster_centers_,
)['y_pred']
mitigator
[16]:
[MCMF]
Type: Bias Mitigation Postprocessing
[13]:
# Centroids of the fitted clustering model
centroids = model.cluster_centers_
# Evaluate the bias metrics for the assignments currently held in y_pred
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics
[13]:
| Value | Reference | |
|---|---|---|
| Metric | ||
| Cluster Balance | 0.992388 | 1 |
| Minimum Cluster Ratio | 0.545455 | 1 |
| Cluster Distribution Total Variation | 0.005882 | 0 |
| Cluster Distribution KL Div | 0.000069 | 0 |
| Social Fairness Ratio | 1.153405 | 1 |
| Silhouette Difference | 0.024097 | 0 |
Pipeline Implementation#
[14]:
# Mitigator used as the post-processing step of the pipeline
mitigator = MCMF(metric='L1', verbose=1, group_mode='ab')
# Chain the clustering model with the mitigator and fit on the training features
pipeline = Pipeline(
    steps=[
        ('model', model),
        ('bm_postprocessing', mitigator),
    ]
)
pipeline.fit(train['X'])
# Predict the clusters; the bm__-prefixed keyword arguments are presumably
# forwarded to the mitigation step (confirm against the holisticai Pipeline docs)
y_pred = pipeline.predict(
    train['X'],
    bm__group_a=train["group_a"],
    bm__group_b=train["group_b"],
    bm__centroids="cluster_centers_",
)
# Centroids come from the pipeline's 'model' step
centroids = pipeline['model'].cluster_centers_
# Bias metrics of the pipeline's assignments
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline
[14]:
| Value | Reference | |
|---|---|---|
| Metric | ||
| Cluster Balance | 0.992388 | 1 |
| Minimum Cluster Ratio | 0.545455 | 1 |
| Cluster Distribution Total Variation | 0.005882 | 0 |
| Cluster Distribution KL Div | 0.000069 | 0 |
| Social Fairness Ratio | 1.153405 | 1 |
| Silhouette Difference | 0.024097 | 0 |
[15]:
# Side-by-side view: the 'Value' column of `metrics` next to the full `metrics_pipeline` table
pd.concat(
    [metrics['Value'], metrics_pipeline],
    keys=['Traditional', 'Pipeline'],
    axis=1,
)
[15]:
| Traditional | Pipeline | ||
|---|---|---|---|
| Value | Value | Reference | |
| Metric | |||
| Cluster Balance | 0.992388 | 0.992388 | 1 |
| Minimum Cluster Ratio | 0.545455 | 0.545455 | 1 |
| Cluster Distribution Total Variation | 0.005882 | 0.005882 | 0 |
| Cluster Distribution KL Div | 0.000069 | 0.000069 | 0 |
| Social Fairness Ratio | 1.153405 | 1.153405 | 1 |
| Silhouette Difference | 0.024097 | 0.024097 | 0 |