Package Installation#
First, install the holisticai package if you haven’t already:
!pip install holisticai[all]
Then, import the necessary libraries.
[3]:
import warnings
import pandas as pd
from holisticai.bias.metrics import clustering_bias_metrics
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore")
Data Loading#
[4]:
# Load the preprocessed clinical-records dataset with `sex` as the protected attribute.
dataset = load_dataset(
    'clinical_records',
    protected_attribute='sex',
    preprocessed=True,
)

# Split with test_size=0.2 and a fixed random_state, then keep both parts by name.
train_test = dataset.train_test_split(test_size=0.2, random_state=42)
train, test = train_test['train'], train_test['test']

# Last expression of the cell: display the dataset summary.
dataset
[4]:
[Dataset]
Instances: 299
Features: X , y , p_attrs , group_a , group_b
Metadata: sex: {'group_a': '0', 'group_b': '1'}
1. Variational Fair Clustering#
Traditional Implementation#
[5]:
from holisticai.bias.mitigation import VariationalFairClustering

# Traditional implementation: build and fit the mitigator directly.
mitigator = VariationalFairClustering(
    n_clusters=3,
    lmbda=7,
    method='kmeans',
    verbose=True,
    seed=42,
)
mitigator.fit(
    train['X'],
    group_a=train['group_a'],
    group_b=train['group_b'],
)

# Cluster assignments for the training data and the fitted cluster centres.
y_pred = mitigator.predict(train['X'], train['group_a'], train['group_b'])
centroids = mitigator.cluster_centers_

# Clustering bias metrics; stored in `metrics` for the comparison cell.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
[elapsed time: 00:00:00 | iter:8/100 | fairness_error:0.0318 | fair_cluster_energy:204.2278 | cluster_energy:190.2894]
Pipeline Implementation#
[6]:
# Pipeline implementation: the same mitigator wrapped in a single-step Pipeline.
inprocessing_model = VariationalFairClustering(
    n_clusters=3,
    lmbda=7,
    method='kmeans',
    verbose=True,
    seed=42,
)
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])

# Group membership is passed with the `bm__` prefix to both fit and predict.
bm_params = {
    'bm__group_a': train['group_a'],
    'bm__group_b': train['group_b'],
}
pipeline.fit(train['X'], **bm_params)

# Cluster assignments from the pipeline and centres from the wrapped step.
y_pred = pipeline.predict(train['X'], **bm_params)
centroids = pipeline['bm_inprocessing'].cluster_centers_

# Clustering bias metrics for the pipeline run, displayed as the cell output.
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline
[elapsed time: 00:00:00 | iter:8/100 | fairness_error:0.0318 | fair_cluster_energy:204.2278 | cluster_energy:190.2894]
[6]:
| Value | Reference | |
|---|---|---|
| Metric | ||
| Cluster Balance | 0.738088 | 1 |
| Minimum Cluster Ratio | 0.355932 | 1 |
| Cluster Distribution Total Variation | 0.136058 | 0 |
| Cluster Distribution KL Div | 0.043728 | 0 |
| Social Fairness Ratio | 1.118376 | 1 |
| Silhouette Difference | 0.006792 | 0 |
Comparison#
[7]:
# Side by side: the traditional run's values next to the pipeline metrics table.
comparison = pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)
comparison
[7]:
| Traditional | Pipeline | ||
|---|---|---|---|
| Value | Value | Reference | |
| Metric | |||
| Cluster Balance | 0.738088 | 0.738088 | 1 |
| Minimum Cluster Ratio | 0.355932 | 0.355932 | 1 |
| Cluster Distribution Total Variation | 0.136058 | 0.136058 | 0 |
| Cluster Distribution KL Div | 0.043728 | 0.043728 | 0 |
| Social Fairness Ratio | 1.118376 | 1.118376 | 1 |
| Silhouette Difference | 0.006792 | 0.006792 | 0 |
2. Fair K-Center#
[8]:
from holisticai.bias.mitigation import FairKCenterClustering

# Traditional implementation: build and fit the mitigator directly.
mitigator = FairKCenterClustering(
    req_nr_per_group=(1, 1),
    nr_initially_given=0,
    seed=42,
)
mitigator.fit(
    train['X'],
    group_a=train['group_a'],
    group_b=train['group_b'],
)

# Cluster assignments for the training data; this estimator exposes its
# centres through `all_centroids`.
y_pred = mitigator.predict(train['X'], train['group_a'], train['group_b'])
centroids = mitigator.all_centroids

# Clustering bias metrics; stored in `metrics` for the comparison cell.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
[9]:
# Pipeline implementation: the same mitigator wrapped in a single-step Pipeline.
inprocessing_model = FairKCenterClustering(
    req_nr_per_group=(1, 1),
    nr_initially_given=0,
    seed=42,
)
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])

# Group membership is passed with the `bm__` prefix to both fit and predict.
bm_params = {
    'bm__group_a': train['group_a'],
    'bm__group_b': train['group_b'],
}
pipeline.fit(train['X'], **bm_params)

# Cluster assignments from the pipeline and centres from the wrapped step.
y_pred = pipeline.predict(train['X'], **bm_params)
centroids = pipeline['bm_inprocessing'].all_centroids

# Clustering bias metrics for the pipeline run, displayed as the cell output.
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline
[9]:
| Value | Reference | |
|---|---|---|
| Metric | ||
| Cluster Balance | 0.819967 | 1 |
| Minimum Cluster Ratio | 0.457944 | 1 |
| Cluster Distribution Total Variation | 0.118335 | 0 |
| Cluster Distribution KL Div | 0.031148 | 0 |
| Social Fairness Ratio | 1.008144 | 1 |
| Silhouette Difference | -0.010421 | 0 |
[10]:
# Side by side: the traditional run's values next to the pipeline metrics table.
comparison = pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)
comparison
[10]:
| Traditional | Pipeline | ||
|---|---|---|---|
| Value | Value | Reference | |
| Metric | |||
| Cluster Balance | 0.819967 | 0.819967 | 1 |
| Minimum Cluster Ratio | 0.457944 | 0.457944 | 1 |
| Cluster Distribution Total Variation | 0.118335 | 0.118335 | 0 |
| Cluster Distribution KL Div | 0.031148 | 0.031148 | 0 |
| Social Fairness Ratio | 1.008144 | 1.008144 | 1 |
| Silhouette Difference | -0.010421 | -0.010421 | 0 |
3. Fair K-Median#
[11]:
from holisticai.bias.mitigation import FairKMedianClustering

# Traditional implementation: build and fit the mitigator directly.
mitigator = FairKMedianClustering(
    n_clusters=2,
    strategy='GA',
    seed=42,
)
mitigator.fit(
    train['X'],
    group_a=train['group_a'],
    group_b=train['group_b'],
)

# No predict call here: the training labels and centres are read straight
# from the fitted estimator's attributes.
y_pred = mitigator.labels_
centroids = mitigator.cluster_centers_

# Clustering bias metrics; stored in `metrics` for the comparison cell.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
[12]:
# Pipeline implementation: the same mitigator wrapped in a single-step Pipeline.
# Bound to `inprocessing_model` (as in the other pipeline cells) so the fitted
# traditional `mitigator` is not overwritten.
inprocessing_model = FairKMedianClustering(n_clusters=2, strategy='GA', seed=42)

# set the pipeline
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])
pipeline.fit(train['X'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# read the training labels and centroids from the fitted pipeline step
y_pred = pipeline['bm_inprocessing'].labels_
centroids = pipeline['bm_inprocessing'].cluster_centers_

# compute clustering bias metrics and display them as the cell output
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline
[12]:
| Value | Reference | |
|---|---|---|
| Metric | ||
| Cluster Balance | 0.823072 | 1 |
| Minimum Cluster Ratio | 0.462963 | 1 |
| Cluster Distribution Total Variation | 0.113063 | 0 |
| Cluster Distribution KL Div | 0.028764 | 0 |
| Social Fairness Ratio | 1.134138 | 1 |
| Silhouette Difference | -0.009503 | 0 |
[13]:
# Side by side: the traditional run's values next to the pipeline metrics table.
comparison = pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)
comparison
[13]:
| Traditional | Pipeline | ||
|---|---|---|---|
| Value | Value | Reference | |
| Metric | |||
| Cluster Balance | 0.823072 | 0.823072 | 1 |
| Minimum Cluster Ratio | 0.462963 | 0.462963 | 1 |
| Cluster Distribution Total Variation | 0.113063 | 0.113063 | 0 |
| Cluster Distribution KL Div | 0.028764 | 0.028764 | 0 |
| Social Fairness Ratio | 1.134138 | 1.134138 | 1 |
| Silhouette Difference | -0.009503 | -0.009503 | 0 |
4. Fairlet#
[14]:
from holisticai.bias.mitigation import FairletClustering

# Traditional implementation: build and fit the mitigator directly.
# The seed matches the pipeline run (seed=42) so that the Traditional vs.
# Pipeline comparison uses identical configurations.
mitigator = FairletClustering(
    decomposition='Vanilla',
    clustering_model='KMedoids',
    p=10,
    q=20,
    n_clusters=2,
    seed=42,
)
mitigator.fit(train['X'], group_a=train['group_a'], group_b=train['group_b'])

# make predictions and get centroids
y_pred = mitigator.predict(train['X'])
centroids = mitigator.cluster_centers_

# compute clustering bias metrics; stored in `metrics` for the comparison cell
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
[15]:
# Pipeline implementation: the same mitigator wrapped in a single-step Pipeline.
# Bound to `inprocessing_model` (as in the other pipeline cells) so the fitted
# traditional `mitigator` is not overwritten.
inprocessing_model = FairletClustering(
    decomposition='Vanilla',
    clustering_model='KMedoids',
    p=10,
    q=20,
    n_clusters=2,
    seed=42,
)

# set the pipeline
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])
pipeline.fit(train['X'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# make predictions and get centroids
y_pred = pipeline.predict(train['X'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])
centroids = pipeline['bm_inprocessing'].cluster_centers_

# compute clustering bias metrics and display them as the cell output
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline
[15]:
| Value | Reference | |
|---|---|---|
| Metric | ||
| Cluster Balance | 0.937255 | 1 |
| Minimum Cluster Ratio | 0.500000 | 1 |
| Cluster Distribution Total Variation | 0.024446 | 0 |
| Cluster Distribution KL Div | 0.001588 | 0 |
| Social Fairness Ratio | 1.250599 | 1 |
| Silhouette Difference | 0.001338 | 0 |
[16]:
# Side by side: the traditional run's values next to the pipeline metrics table.
comparison = pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)
comparison
[16]:
| Traditional | Pipeline | ||
|---|---|---|---|
| Value | Value | Reference | |
| Metric | |||
| Cluster Balance | 0.937255 | 0.937255 | 1 |
| Minimum Cluster Ratio | 0.500000 | 0.500000 | 1 |
| Cluster Distribution Total Variation | 0.024446 | 0.024446 | 0 |
| Cluster Distribution KL Div | 0.001588 | 0.001588 | 0 |
| Social Fairness Ratio | 1.250599 | 1.250599 | 1 |
| Silhouette Difference | 0.001338 | 0.001338 | 0 |