Source code for holisticai.bias.metrics._clustering

import numpy as np
import pandas as pd
from holisticai.utils._recommender_tools import entropy
from holisticai.utils._validation import _clustering_checks

# sklearn imports
from sklearn.metrics import (
    adjusted_mutual_info_score,
    mean_absolute_error,
    silhouette_samples,
)


[docs] def cluster_balance(group_a, group_b, y_pred): """Cluster Balance Given a clustering and protected attribute. The cluster balance is\ the minimum over all groups and clusters of the ratio of the representation\ of members of that group in that cluster to the representation overall. Interpretation -------------- A value of 1 is desired. That is when all clusters have the exact same\ representation as the data. Lower values imply the existence of clusters\ where either group_a or group_b is underrepresented. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) Returns ------- float Cluster Balance Examples -------- >>> import numpy as np >>> from holisticai.bias.metrics import cluster_balance >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> cluster_balance(group_a, group_b, y_pred_cluster) 0.5 """ # check and coerce inputs group_a, group_b, y_pred, _, _, _ = _clustering_checks(group_a, group_b, y_pred) # Get clusters clusters = np.unique(y_pred) # group_a ratio overall r_tot_a = group_a.sum() / len(group_a) # group_b ratio overall r_tot_b = group_b.sum() / len(group_b) min_ratio = 1 # loop over clusters for c in clusters: # variables members = y_pred == c n_members = members.sum() + 1.0e-20 n_a = group_a[members].sum() + 1.0e-20 n_b = group_b[members].sum() + 1.0e-20 # group_a ratios ratio_a = (n_a / n_members) / r_tot_a min_a = min(ratio_a, 1 / ratio_a) # group_b ratios ratio_b = (n_b / n_members) / r_tot_b min_b = min(ratio_b, 1 / ratio_b) min_ratio = min(min_ratio, min_a, min_b) # return minimum balance in list return min_ratio
[docs] def min_cluster_ratio(group_a, group_b, y_pred): """Minimum Cluster Ratio Given a clustering and protected attributes. The min cluster ratio is\ the minimum over all clusters of the ratio of number of group_a members\ to the number of group_b members. Interpretation -------------- A value of 1 is desired. That is when all clusters are perfectly\ balanced. Low values imply the existence of clusters where\ group_a has fewer members than group_b. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) Returns ------- float Minimum Cluster Ratio Examples -------- >>> import numpy as np >>> from holisticai.bias.metrics import min_cluster_ratio >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> min_cluster_ratio(group_a, group_b, y_pred_cluster) 0.2499999999375 """ # check and coerce inputs group_a, group_b, y_pred, _, _, _ = _clustering_checks(group_a, group_b, y_pred) # Get clusters clusters = np.unique(y_pred) min_ratio = np.inf # Get balance of each cluster for c in clusters: members = y_pred == c n_a = group_a[members].sum() n_b = group_b[members].sum() min_ratio = min(min_ratio, (n_a / (n_b + 1.0e-32))) return min_ratio
def _avg_cluster_ratio(group_a, group_b, y_pred): """Average Cluster Ratio Given a clustering and protected attributes. The average cluster ratio is\ the average over all clusters of the ratio of group_a members to group_b\ members in that cluster. Interpretation -------------- A value of 1 is desired. Low values imply the predominance of clusters where\ group_a is underrepresented. High values imply the predominance of clusters where\ group_b is underrepresented. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) Returns ------- float Average Cluster Ratio Examples ------- >>> import numpy as np >>> from holisticai.bias.metrics import avg_cluster_ratio >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> avg_cluster_ratio(group_a, group_b, y_pred_cluster) 1.0833333323124998 """ # check and coerce inputs group_a, group_b, y_pred, _, _, _ = _clustering_checks(group_a, group_b, y_pred) # Get clusters clusters = np.unique(y_pred) n_clusters = len(clusters) balances = np.zeros((n_clusters,)) # Get balance of each cluster for i, c in zip(range(n_clusters), clusters): members = y_pred == c n_a = group_a[members].sum() n_b = group_b[members].sum() # if n_b zero we get infinity for mean if n_b == 0: return np.inf balances[i] = n_a / n_b # Check non zero for b balances[i] = n_a / (n_b + 1.0e-32) return balances.mean() def _cluster_dist(y_pred_g, clusters): """Group distribution over clusters This function computes the distribution of the group across clusters. Parameters ---------- y_pred_g : array-like Cluster predictions (categorical) clusters : array-like Cluster ground truth (categorical) Returns ------- numpy array Cluster Distribution """ bin_mat = y_pred_g.reshape(-1, 1) == clusters.reshape(1, -1) dist = bin_mat.sum(axis=0) return dist / dist.sum()
[docs] def cluster_dist_l1(group_a, group_b, y_pred): """Cluster Distribution Total Variation This function computes the distribution of group_a and group_b across clusters.\ It then outputs the total variation distance between these distributions. Interpretation -------------- A value of 0 is desired. That indicates that both groups are distributed\ similarly amongst the clusters. The metric ranges between 0 and 1,\ with higher values indicating the groups are distributed in very\ different ways. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) Returns ------- float Cluster Distribution Total Variation Examples ------- >>> import numpy as np >>> from holisticai.bias.metrics import cluster_dist_l1 >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> cluster_dist_l1(group_a, group_b, y_pred_cluster) 0.4166666666666667 """ # check and coerce inputs group_a, group_b, y_pred, _, _, _ = _clustering_checks(group_a, group_b, y_pred) # get unique clusters clusters = np.unique(y_pred) # split data by group y_pred_a = y_pred[group_a == 1] y_pred_b = y_pred[group_b == 1] # compute distributions dist_a = _cluster_dist(y_pred_a, clusters) dist_b = _cluster_dist(y_pred_b, clusters) # return total variation norm return 0.5 * len(dist_a) * mean_absolute_error(dist_a, dist_b)
[docs] def cluster_dist_kl(group_a, group_b, y_pred): """Cluster Distribution KL This function computes the distribution of group_a and group_b\ membership across the clusters. It then returns the KL distance\ from the distribution of group_a to the distribution of group_b. Interpretation -------------- A value of 0 is desired. That indicates that both groups are distributed\ similarly amongst the clusters. Higher values indicate the distributions\ of both groups amongst the clusters differ more. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) Returns ------- float Cluster Distribution KL Notes ----- :math:`KL(P_a,P_b)` Examples ------- >>> import numpy as np >>> from holisticai.bias.metrics import cluster_dist_kl >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> cluster_dist_kl(group_a, group_b, y_pred_cluster) 0.4054651081081642 """ # check and coerce inputs group_a, group_b, y_pred, _, _, _ = _clustering_checks(group_a, group_b, y_pred) # get unique clusters clusters = np.unique(y_pred) # split data by group y_pred_a = y_pred[group_a == 1] y_pred_b = y_pred[group_b == 1] # compute distributions dist_a = _cluster_dist(y_pred_a, clusters) dist_b = _cluster_dist(y_pred_b, clusters) # return KL return entropy(dist_a, dist_b)
[docs] def cluster_dist_entropy(group, y_pred): """Minority Cluster Distribution Entropy The entropy of the distribution of the group over the clusters. Interpretation -------------- Lower values indicate most members of the group are allocated to\ the same cluaster. Hence we encourage higher values of\ the entropy, which indicate more homogeneity. Parameters ---------- group : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) Returns ------- float Group Presence Entropy Notes ----- :math:`Entropy(P_{group})` Examples ------- >>> import numpy as np >>> from holisticai.bias.metrics import cluster_dist_entropy >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> cluster_dist_entropy(group_b, y_pred_cluster) 0.8675632284814613 """ # check and coerce inputs group, _, y_pred, _, _, _ = _clustering_checks(group, group, y_pred) # get unique clusters clusters = np.unique(y_pred) # split data by group y_pred_group = y_pred[group == 1] # compute distribution dist_b = _cluster_dist(y_pred_group, clusters) # return Entropy return entropy(dist_b)
def _ami_diff(group_a, group_b, y_pred, y_true): """Adjusted Mutual information Difference We compute the difference of the adjusted mutual information\ on group_a and group_b. Interpretation -------------- The MI difference ranges from -1 to 1, with lower values indicating bias\ towards group_a and larger values indicating bias against group_b. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) y_true : array-like Cluster ground truth (categorical) Returns ------- float Mutual information Difference Examples ------- >>> import numpy as np >>> from holisticai.bias.metrics import ami_diff >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> y_pred = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> y_true = np.array([0, 1, 1, 2, 1, 0, 1, 2, 0, 2]) >>> ami_diff(group_a, group_b, y_pred, y_true) 0.6342556627317533 """ # check and coerce inputs group_a, group_b, y_pred, y_true, _, _ = _clustering_checks(group_a, group_b, y_pred, y_true) # Slice by min and maj groups y_pred_a = y_pred[group_a == 1] y_pred_b = y_pred[group_b == 1] y_true_a = y_true[group_a == 1] y_true_b = y_true[group_b == 1] # Compute AMI scores ami_a = adjusted_mutual_info_score(y_true_a, y_pred_a) ami_b = adjusted_mutual_info_score(y_true_b, y_pred_b) # Return Spread return ami_a - ami_b
[docs] def social_fairness_ratio(group_a, group_b, data, centroids): """Social Fairness Ratio Given a centroid based clustering, this function compute the average\ distance to the nearest centroid for both groups. The metric is the\ ratio of the resulting distance for group_a to group_b. Interpretation -------------- A value of 1 is desired. Lower values indicate the group_a\ is on average closer to the respective centroids. Higher\ values indicate that group_a is on average further from the\ respective centroids. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) data : matrix-like Data matrix of shape (num_inst, dim) centroids : matrix-like Centroids (centers) of shape (num_centroids, dim) Returns ------- float Social Fairness Ratio Examples ------- >>> import numpy as np >>> from holisticai.bias.metrics import social_fairness_ratio >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> data = np.array( ... [ ... [-1, 1], ... [1, 1], ... [1, 1], ... [0, -1], ... [-1, 1], ... [-1, 1], ... [-1, 1], ... [-1, 1], ... [1, 1], ... [0, -1], ... ] ... ) >>> centroids = np.array([[-2, 1], [1, 2], [0, -2]]) >>> social_fairness_ratio(group_a, group_b, data, centroids) 1.0 """ # check and coerce inputs group_a, group_b, _, _, data, centroids = _clustering_checks( group_a, group_b, y_pred=None, y_true=None, data=data, centroids=centroids ) # Split by group data_a = data[group_a == 1] data_b = data[group_b == 1] # Reshape matrices for vectorization data_a = data_a.reshape(data_a.shape[0], 1, data_a.shape[1]) data_b = data_b.reshape(data_b.shape[0], 1, data_b.shape[1]) centroids = centroids.reshape(1, centroids.shape[0], centroids.shape[1]) # Calculate distances dist_a = np.sqrt(((data_a - centroids) ** 2).sum(axis=-1)) dist_b = np.sqrt(((data_b - centroids) ** 2).sum(axis=-1)) # Take minimum over centroids and average over instances dist_a = dist_a.min(axis=1).mean(axis=0) dist_b = dist_b.min(axis=1).mean(axis=0) # return ratio of averages return dist_a / dist_b
[docs] def silhouette_diff(group_a, group_b, data, y_pred): """Silhouette Difference We compute the difference of the mean silhouette score for both\ groups. Interpretation -------------- The silhouette difference ranges from -1 to 1, with lower values indicating bias\ towards group_a and larger values indicating bias against group_b. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) data : matrix-like Data matrix of shape (num_inst, dim) y_pred : array-like Cluster predictions (categorical) Returns ------- float Silhouette difference Notes ----- :math:`\texttt{mean_silhouette_a - mean_silhouette_b}` Examples ------- >>> import numpy as np >>> from holisticai.bias.metrics import silhouette_diff >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) >>> data = np.array( ... [ ... [-1, 1], ... [1, 1], ... [1, 1], ... [0, -1], ... [-1, 1], ... [-1, 1], ... [-1, 1], ... [-1, 1], ... [1, 1], ... [0, -1], ... ] ... ) >>> y_pred = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2]) >>> silhouette_diff(group_a, group_b, data, y_pred) 0.0 """ # check and coerce inputs group_a, group_b, y_pred, _, data, _ = _clustering_checks(group_a, group_b, y_pred, data=data) # Compute silhouette scores scores = silhouette_samples(data, y_pred, metric="euclidean") # Split min and maj scores_a = scores[group_a == 1] scores_b = scores[group_b == 1] return 0.5 * (np.mean(scores_a) - np.mean(scores_b))
[docs] def clustering_bias_metrics( group_a, group_b, y_pred, data=None, centroids=None, metric_type="equal_outcome", ): """Clustering bias metrics batch computation This function computes all the relevant clustering bias metrics,\ and displays them as a pandas dataframe. Parameters ---------- group_a : array-like Group membership vector (binary) group_b : array-like Group membership vector (binary) y_pred : array-like Cluster predictions (categorical) data : matrix-like, optional Data matrix of shape (num_inst, dim) centroids : matrix-like, optional Centroids (centers) metric_type : str, optional Specifies which metrics we compute: 'both', 'equal_outcome' or 'equal_opportunity' Returns ------- pandas DataFrame Metrics | Values | Reference """ perform = { "Cluster Balance": cluster_balance, "Minimum Cluster Ratio": min_cluster_ratio, "Cluster Distribution Total Variation": cluster_dist_l1, "Cluster Distribution KL Div": cluster_dist_kl, } centroids_data_perform = {"Social Fairness Ratio": social_fairness_ratio} pred_data_perform = {"Silhouette Difference": silhouette_diff} ref_vals = { "Cluster Balance": 1, "Minimum Cluster Ratio": 1, "Cluster Distribution Total Variation": 0, "Cluster Distribution KL Div": 0, "Social Fairness Ratio": 1, "Silhouette Difference": 0, "Mutual Information Difference": 0, } out_metrics = [] opp_metrics = [] out_metrics += [[pf, fn(group_a, group_b, y_pred), ref_vals[pf]] for pf, fn in perform.items()] if data is not None and centroids is not None: out_metrics += [ [pf, fn(group_a, group_b, data, centroids), ref_vals[pf]] for pf, fn in centroids_data_perform.items() ] if data is not None and y_pred is not None: out_metrics += [[pf, fn(group_a, group_b, data, y_pred), ref_vals[pf]] for pf, fn in pred_data_perform.items()] if metric_type == "both": metrics = out_metrics + opp_metrics return pd.DataFrame(metrics, columns=["Metric", "Value", "Reference"]).set_index("Metric") if metric_type == "equal_outcome": return pd.DataFrame(out_metrics, columns=["Metric", "Value", "Reference"]).set_index("Metric") if metric_type == "equal_opportunity": return pd.DataFrame(opp_metrics, columns=["Metric", "Value", "Reference"]).set_index("Metric") msg = "metric_type is not one of : both, equal_outcome, equal_opportunity" raise ValueError(msg)