# Source code for holisticai.bias.metrics._clustering

import numpy as np
import pandas as pd
from holisticai.utils._recommender_tools import entropy
from holisticai.utils._validation import _clustering_checks

# sklearn imports
from sklearn.metrics import (
    adjusted_mutual_info_score,
    mean_absolute_error,
    silhouette_samples,
)



# [docs]  (Sphinx viewcode marker; not code)
def cluster_balance(group_a, group_b, y_pred):
    """Cluster Balance

    For every cluster and for each of the two groups, the share of the
    cluster made up by that group is divided by the group's share of the
    whole data set. Each ratio is compared with its reciprocal, so that
    over- and under-representation count alike, and the smallest value over
    all clusters and both groups is returned.

    Interpretation
    --------------
    A value of 1 is desired: every cluster then has the same group
    composition as the data. Lower values mean that in at least one cluster
    group_a or group_b is over- or underrepresented.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)

    Returns
    -------
    float
        Cluster Balance (never larger than 1)

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics import cluster_balance
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> cluster_balance(group_a, group_b, y_pred_cluster)
    0.5
    """
    # validate / coerce inputs; the y_true, data and centroids slots are unused here
    # (presumably the helper returns numpy arrays -- boolean-mask indexing below relies on it)
    group_a, group_b, y_pred, _y_true, _data, _centroids = _clustering_checks(group_a, group_b, y_pred)

    # share of each group in the whole data set
    overall_a = group_a.sum() / len(group_a)
    overall_b = group_b.sum() / len(group_b)

    # keeps the reciprocal below from dividing by zero when a group is absent from a cluster
    tiny = 1.0e-20

    balance = 1
    for label in np.unique(y_pred):
        in_cluster = y_pred == label
        size = in_cluster.sum() + tiny

        # representation inside the cluster relative to representation overall
        rel_a = ((group_a[in_cluster].sum() + tiny) / size) / overall_a
        rel_b = ((group_b[in_cluster].sum() + tiny) / size) / overall_b

        # fold each ratio with its reciprocal and keep the running minimum
        balance = min(balance, min(rel_a, 1 / rel_a), min(rel_b, 1 / rel_b))

    return balance




# [docs]  (Sphinx viewcode marker; not code)
def min_cluster_ratio(group_a, group_b, y_pred):
    """Minimum Cluster Ratio

    For each cluster the number of group_a members is divided by the number
    of group_b members; the smallest of these ratios is returned.

    Interpretation
    --------------
    A value of 1 is desired. That is when all clusters are perfectly
    balanced. Low values imply the existence of clusters where
    group_a has fewer members than group_b.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)

    Returns
    -------
    float
        Minimum Cluster Ratio (``inf`` when there are no clusters at all)

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics import min_cluster_ratio
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> min_cluster_ratio(group_a, group_b, y_pred_cluster)
    0.25
    """
    # validate / coerce inputs; the y_true, data and centroids slots are unused here
    group_a, group_b, y_pred, _y_true, _data, _centroids = _clustering_checks(group_a, group_b, y_pred)

    def _ratio(label):
        # group_a count over group_b count inside one cluster; the tiny offset
        # only matters when the cluster holds no group_b member at all
        in_cluster = y_pred == label
        return group_a[in_cluster].sum() / (group_b[in_cluster].sum() + 1.0e-32)

    # inf comes first so it is the result when there is nothing to compare
    return min([np.inf, *map(_ratio, np.unique(y_pred))])



def _avg_cluster_ratio(group_a, group_b, y_pred):
    """Average Cluster Ratio

    Given a clustering and protected attributes, the average cluster ratio is
    the average over all clusters of the ratio of group_a members to group_b
    members in that cluster.

    Interpretation
    --------------
    A value of 1 is desired. Low values imply the predominance of clusters where
    group_a is underrepresented. High values imply the predominance of clusters
    where group_b is underrepresented.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)

    Returns
    -------
    float
        Average Cluster Ratio. ``inf`` is returned as soon as a cluster
        without any group_b member is found.

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics._clustering import _avg_cluster_ratio
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> _avg_cluster_ratio(group_a, group_b, y_pred_cluster)
    1.0833333333333333
    """
    # check and coerce inputs
    group_a, group_b, y_pred, _, _, _ = _clustering_checks(group_a, group_b, y_pred)

    # one ratio per distinct cluster label
    clusters = np.unique(y_pred)
    balances = np.zeros((len(clusters),))

    for i, c in enumerate(clusters):
        members = y_pred == c
        n_a = group_a[members].sum()
        n_b = group_b[members].sum()
        # a cluster without group_b members makes the mean infinite
        if n_b == 0:
            return np.inf
        # n_b is non-zero here, so no epsilon is needed in the denominator
        balances[i] = n_a / n_b

    return balances.mean()


def _cluster_dist(y_pred_g, clusters):
    """Group distribution over clusters

    Counts how many of the given predictions fall on each cluster label and
    normalises the counts so that they sum to one.

    Parameters
    ----------
    y_pred_g : numpy array
        Cluster predictions of the group's members (categorical)
    clusters : numpy array
        Cluster labels to count against; callers in this module pass the
        unique labels of the full prediction vector

    Returns
    -------
    numpy array
        Cluster Distribution, one entry per element of ``clusters``
    """
    # column of predictions against row of labels -> (n_samples, n_clusters) match table
    as_column = y_pred_g.reshape(-1, 1)
    as_row = clusters.reshape(1, -1)
    counts = (as_column == as_row).sum(axis=0)

    total = counts.sum()
    return counts / total



# [docs]  (Sphinx viewcode marker; not code)
def cluster_dist_l1(group_a, group_b, y_pred):
    """Cluster Distribution Total Variation

    Computes how the members of group_a and of group_b are distributed
    across the clusters and returns the total variation distance between
    the two distributions (half of their L1 distance).

    Interpretation
    --------------
    A value of 0 is desired. That indicates that both groups are distributed
    similarly amongst the clusters. The metric ranges between 0 and 1,
    with higher values indicating the groups are distributed in very
    different ways.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)

    Returns
    -------
    float
        Cluster Distribution Total Variation

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics import cluster_dist_l1
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> cluster_dist_l1(group_a, group_b, y_pred_cluster)
    0.4166666666666667
    """
    # validate / coerce inputs; the y_true, data and centroids slots are unused here
    group_a, group_b, y_pred, _y_true, _data, _centroids = _clustering_checks(group_a, group_b, y_pred)

    labels = np.unique(y_pred)

    # distribution over the cluster labels, first for group_a then for group_b
    p_a, p_b = (_cluster_dist(y_pred[member == 1], labels) for member in (group_a, group_b))

    # mean absolute error times the number of clusters is the L1 distance; halve it
    return 0.5 * len(p_a) * mean_absolute_error(p_a, p_b)




# [docs]  (Sphinx viewcode marker; not code)
def cluster_dist_kl(group_a, group_b, y_pred):
    """Cluster Distribution KL

    Computes how the members of group_a and of group_b are distributed
    across the clusters and returns the KL divergence from the group_a
    distribution to the group_b distribution.

    Interpretation
    --------------
    A value of 0 is desired. That indicates that both groups are distributed
    similarly amongst the clusters. Higher values indicate the distributions
    of both groups amongst the clusters differ more.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)

    Returns
    -------
    float
        Cluster Distribution KL

    Notes
    -----
    :math:`KL(P_a,P_b)`

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics import cluster_dist_kl
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> cluster_dist_kl(group_a, group_b, y_pred_cluster)
    0.4054651081081642
    """
    # validate / coerce inputs; the y_true, data and centroids slots are unused here
    group_a, group_b, y_pred, _y_true, _data, _centroids = _clustering_checks(group_a, group_b, y_pred)

    labels = np.unique(y_pred)

    # per-group distribution over the same set of cluster labels
    p_a = _cluster_dist(y_pred[group_a == 1], labels)
    p_b = _cluster_dist(y_pred[group_b == 1], labels)

    # the two-argument form of the project's entropy helper yields the divergence
    return entropy(p_a, p_b)




# [docs]  (Sphinx viewcode marker; not code)
def cluster_dist_entropy(group, y_pred):
    """Minority Cluster Distribution Entropy

    The entropy of the distribution of the group over the clusters.

    Interpretation
    --------------
    Lower values indicate most members of the group are allocated to
    the same cluster. Hence we encourage higher values of the entropy,
    which indicate the group is spread more evenly over the clusters.

    Parameters
    ----------
    group : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)

    Returns
    -------
    float
        Group Presence Entropy

    Notes
    -----
    :math:`Entropy(P_{group})`

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics import cluster_dist_entropy
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> y_pred_cluster = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> cluster_dist_entropy(group_b, y_pred_cluster)
    0.8675632284814613
    """
    # the checker expects two group vectors, so the single group is passed twice
    group, _group_again, y_pred, _y_true, _data, _centroids = _clustering_checks(group, group, y_pred)

    labels = np.unique(y_pred)

    # distribution of the group's members over the cluster labels
    group_dist = _cluster_dist(y_pred[group == 1], labels)

    return entropy(group_dist)



def _ami_diff(group_a, group_b, y_pred, y_true):
    """Adjusted Mutual Information Difference

    Computes the adjusted mutual information between ground truth and
    predicted clusters separately on group_a and on group_b, and returns
    the group_a score minus the group_b score.

    Interpretation
    --------------
    A value of 0 is desired. Positive values mean the predicted clusters
    agree with the ground truth better on group_a than on group_b; negative
    values mean the opposite.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)
    y_true : array-like
        Cluster ground truth (categorical)

    Returns
    -------
    float
        Mutual information Difference

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics._clustering import _ami_diff
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> y_pred = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> y_true = np.array([0, 1, 1, 2, 1, 0, 1, 2, 0, 2])
    >>> _ami_diff(group_a, group_b, y_pred, y_true)
    0.6342556627317533
    """
    # validate / coerce inputs; the data and centroids slots are unused here
    group_a, group_b, y_pred, y_true, _data, _centroids = _clustering_checks(group_a, group_b, y_pred, y_true)

    # row selectors for the two groups
    in_a = group_a == 1
    in_b = group_b == 1

    # agreement between truth and prediction, scored within each group
    score_a = adjusted_mutual_info_score(y_true[in_a], y_pred[in_a])
    score_b = adjusted_mutual_info_score(y_true[in_b], y_pred[in_b])

    return score_a - score_b



# [docs]  (Sphinx viewcode marker; not code)
def social_fairness_ratio(group_a, group_b, data, centroids):
    """Social Fairness Ratio

    Given a centroid based clustering, computes for each group the average
    Euclidean distance from its members to their nearest centroid, and
    returns the group_a average divided by the group_b average.

    Interpretation
    --------------
    A value of 1 is desired. Lower values indicate that group_a
    is on average closer to the respective centroids. Higher
    values indicate that group_a is on average further from the
    respective centroids.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    data : matrix-like
        Data matrix of shape (num_inst, dim)
    centroids : matrix-like
        Centroids (centers) of shape (num_centroids, dim)

    Returns
    -------
    float
        Social Fairness Ratio

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics import social_fairness_ratio
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> data = np.array(
    ...     [
    ...         [-1, 1],
    ...         [1, 1],
    ...         [1, 1],
    ...         [0, -1],
    ...         [-1, 1],
    ...         [-1, 1],
    ...         [-1, 1],
    ...         [-1, 1],
    ...         [1, 1],
    ...         [0, -1],
    ...     ]
    ... )
    >>> centroids = np.array([[-2, 1], [1, 2], [0, -2]])
    >>> social_fairness_ratio(group_a, group_b, data, centroids)
    1.0
    """
    # validate / coerce inputs; no cluster labels are involved in this metric
    group_a, group_b, _y_pred, _y_true, data, centroids = _clustering_checks(
        group_a, group_b, y_pred=None, y_true=None, data=data, centroids=centroids
    )

    def _mean_nearest_distance(points):
        # (n, 1, dim) against (1, k, dim) broadcasts to one offset per point/centroid pair
        stacked = points.reshape(points.shape[0], 1, points.shape[1])
        centers = centroids.reshape(1, centroids.shape[0], centroids.shape[1])
        euclid = np.sqrt(np.square(stacked - centers).sum(axis=-1))
        # nearest centroid per point, then the average over the points
        return euclid.min(axis=1).mean(axis=0)

    cost_a = _mean_nearest_distance(data[group_a == 1])
    cost_b = _mean_nearest_distance(data[group_b == 1])

    return cost_a / cost_b




# [docs]  (Sphinx viewcode marker; not code)
def silhouette_diff(group_a, group_b, data, y_pred):
    """Silhouette Difference

    Computes the silhouette score of every sample (Euclidean metric),
    averages the scores within group_a and within group_b, and returns
    half of the difference between the two averages.

    Interpretation
    --------------
    A value of 0 is desired. The silhouette difference ranges from -1 to 1.
    Positive values mean group_a members have, on average, higher
    silhouette scores than group_b members; negative values mean the
    opposite.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    data : matrix-like
        Data matrix of shape (num_inst, dim)
    y_pred : array-like
        Cluster predictions (categorical)

    Returns
    -------
    float
        Silhouette difference

    Notes
    -----
    ``0.5 * (mean_silhouette_a - mean_silhouette_b)``

    Examples
    --------
    >>> import numpy as np
    >>> from holisticai.bias.metrics import silhouette_diff
    >>> group_a = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    >>> group_b = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> data = np.array(
    ...     [
    ...         [-1, 1],
    ...         [1, 1],
    ...         [1, 1],
    ...         [0, -1],
    ...         [-1, 1],
    ...         [-1, 1],
    ...         [-1, 1],
    ...         [-1, 1],
    ...         [1, 1],
    ...         [0, -1],
    ...     ]
    ... )
    >>> y_pred = np.array([0, 1, 1, 2, 0, 0, 0, 0, 1, 2])
    >>> silhouette_diff(group_a, group_b, data, y_pred)
    0.0
    """
    # validate / coerce inputs; the y_true and centroids slots are unused here
    group_a, group_b, y_pred, _y_true, data, _centroids = _clustering_checks(group_a, group_b, y_pred, data=data)

    # one silhouette value per sample, computed on the full data set
    per_sample = silhouette_samples(data, y_pred, metric="euclidean")

    mean_a = np.mean(per_sample[group_a == 1])
    mean_b = np.mean(per_sample[group_b == 1])

    # halving maps the raw difference of two [-1, 1] averages back into [-1, 1]
    return 0.5 * (mean_a - mean_b)




# [docs]  (Sphinx viewcode marker; not code)
def clustering_bias_metrics(
    group_a,
    group_b,
    y_pred,
    data=None,
    centroids=None,
    metric_type="equal_outcome",
):
    """Clustering bias metrics batch computation

    This function computes all the relevant clustering bias metrics,
    and returns them as a pandas dataframe.

    Cluster Balance, Minimum Cluster Ratio, Cluster Distribution Total
    Variation and Cluster Distribution KL Div are always computed. Social
    Fairness Ratio is added when both ``data`` and ``centroids`` are given,
    and Silhouette Difference when ``data`` is given.

    Parameters
    ----------
    group_a : array-like
        Group membership vector (binary)
    group_b : array-like
        Group membership vector (binary)
    y_pred : array-like
        Cluster predictions (categorical)
    data : matrix-like, optional
        Data matrix of shape (num_inst, dim)
    centroids : matrix-like, optional
        Centroids (centers)
    metric_type : str, optional
        Specifies which metrics we compute: 'both', 'equal_outcome' or
        'equal_opportunity'. Defaults to 'equal_outcome'.

    Returns
    -------
    pandas DataFrame
        Metrics | Values | Reference

    Raises
    ------
    ValueError
        If ``metric_type`` is not one of the three accepted values.
    """
    # Reject an unknown metric_type before doing any work, so that a typo does
    # not cost a full metric computation (silhouettes included) first.
    if metric_type not in ("both", "equal_outcome", "equal_opportunity"):
        msg = "metric_type is not one of : both, equal_outcome, equal_opportunity"
        raise ValueError(msg)

    # metrics that only need the group vectors and the cluster labels
    perform = {
        "Cluster Balance": cluster_balance,
        "Minimum Cluster Ratio": min_cluster_ratio,
        "Cluster Distribution Total Variation": cluster_dist_l1,
        "Cluster Distribution KL Div": cluster_dist_kl,
    }

    # metrics that need the data matrix together with the centroids
    centroids_data_perform = {"Social Fairness Ratio": social_fairness_ratio}

    # metrics that need the data matrix together with the cluster labels
    pred_data_perform = {"Silhouette Difference": silhouette_diff}

    # value each metric takes for an unbiased clustering
    ref_vals = {
        "Cluster Balance": 1,
        "Minimum Cluster Ratio": 1,
        "Cluster Distribution Total Variation": 0,
        "Cluster Distribution KL Div": 0,
        "Social Fairness Ratio": 1,
        "Silhouette Difference": 0,
        "Mutual Information Difference": 0,
    }

    out_metrics = [[pf, fn(group_a, group_b, y_pred), ref_vals[pf]] for pf, fn in perform.items()]
    # no equal-opportunity metric is registered in this function, so this stays empty
    opp_metrics = []

    if data is not None and centroids is not None:
        out_metrics += [
            [pf, fn(group_a, group_b, data, centroids), ref_vals[pf]] for pf, fn in centroids_data_perform.items()
        ]
    if data is not None and y_pred is not None:
        out_metrics += [[pf, fn(group_a, group_b, data, y_pred), ref_vals[pf]] for pf, fn in pred_data_perform.items()]

    # pick the rows requested by metric_type and build the frame once
    rows_by_type = {
        "both": out_metrics + opp_metrics,
        "equal_outcome": out_metrics,
        "equal_opportunity": opp_metrics,
    }
    return pd.DataFrame(rows_by_type[metric_type], columns=["Metric", "Value", "Reference"]).set_index("Metric")