# Source code for holisticai.bias.mitigation.postprocessing.fair_topk.transformer

from __future__ import annotations

from typing import Optional

import pandas as pd
from holisticai.bias.mitigation.postprocessing.fair_topk.algorithm_utils.fail_prob import (
    RecursiveNumericFailProbabilityCalculator,
)
from holisticai.bias.mitigation.postprocessing.fair_topk.algorithm_utils.valitation_utils import (
    check_ranking,
    validate_basic_parameters,
)
from holisticai.utils.transformers.bias import BMPostprocessing as BMPost



class FairTopK(BMPost):
    """
    Fair Top K bias mitigation [1]_ can be used for Recommender Systems.
    The strategy extends the group fairness definition using the standard notion
    of protected groups and is based on ensuring that the proportion of protected
    candidates in every prefix of the top-k ranking remains statistically above
    a given minimum.

    Parameters
    ----------
    top_n : int
        The total number of elements (maximum length of each re-ranked list).

    p : float
        The proportion of protected candidates in the top-k ranking.

    alpha : float
        The significance level.

    query_col : str
        The name of the column in data that contains query ids.

    doc_col : str
        The name of the column in data that contains document ids.

    group_col : str
        The name of the column in data that contains protected attribute.

    score_col : str
        The name of the column in data that contains judgment values.

    Examples
    --------
    >>> from holisticai.bias.mitigation import FairTopK
    >>> mitigator = FairTopK(**params)
    >>> new_rankings = mitigator.transform(rankings)

    References
    ----------
    .. [1] Zehlike, Meike, et al. "FA*IR: A fair top-k ranking algorithm."
        Proceedings of the 2017 ACM on Conference on Information and
        Knowledge Management. 2017.
    """

    def __init__(
        self,
        top_n: Optional[int],
        p: Optional[float],
        alpha: Optional[float],
        query_col: Optional[str] = "query_id",
        doc_col: Optional[str] = "doc_id",
        group_col: Optional[str] = "group_id",
        score_col: Optional[str] = "score",
    ):
        # Validate before storing anything so an invalid configuration never
        # yields a partially initialised object.
        validate_basic_parameters(top_n, p, alpha)

        # Column names used to read the ranking dataframes.
        self.query_col = query_col
        self.doc_col = doc_col
        self.group_col = group_col
        self.score_col = score_col

        # Algorithm parameters.
        self.top_n = top_n  # the total number of elements
        self.p = p  # the proportion of protected candidates in the top-k ranking
        self.alpha = alpha  # the significance level

        # Generated mtables, keyed by (top_n, p, alpha).
        self._cache = {}

    def transform(self, rankings, p_attr=None):
        """
        Apply transform to prediction scores.

        Each query (group of rows sharing ``query_col``) is checked with
        :meth:`is_fair`; queries that are already fair are passed through
        unchanged, the others are re-ranked with :meth:`transform_ranking`.

        Parameters
        ----------
        rankings : DataFrame
            Predicted matrix scores (nb_examples*top_n, 3) [query_id, doc_id, scores].
            It is not modified by this method.

        p_attr : DataFrame, optional
            Item groups (nb_examples, 3) [query_id, doc_id, protected]. When given,
            it is left-joined onto ``rankings`` on the query and document columns
            and replaces any group column already present in ``rankings``.

        Returns
        -------
        DataFrame
            The re-ranked dataframe.

        Raises
        ------
        ValueError
            If ``p_attr`` is None and ``rankings`` has no group column.
        """
        if p_attr is None:
            if self.group_col not in rankings.columns:
                raise ValueError("protected groups must be provided")
            new_rankings = rankings
        else:
            # Drop a pre-existing group column on a copy: `del rankings[...]`
            # would silently mutate the caller's dataframe.
            without_group = rankings.drop(columns=[self.group_col], errors="ignore")
            new_rankings = pd.merge(without_group, p_attr, on=[self.query_col, self.doc_col], how="left")

        re_rankings = [
            df if self.is_fair(df) else self.transform_ranking(df)
            for _, df in new_rankings.groupby(self.query_col)
        ]
        if not re_rankings:
            # No queries at all: pd.concat rejects an empty list, so hand back
            # the (empty) frame instead of failing.
            return new_rankings.reset_index(drop=True)
        return pd.concat(re_rankings).reset_index(drop=True)

    def transform_ranking(self, ranking):
        """
        Applies FA*IR re-ranking to the input ranking using an adjusted mtable

        Parameters
        ----------
        ranking : DataFrame
            The ranking of a single query. Rows are consumed in their current
            order within each group, so the frame is assumed to be already
            sorted best-first (no sorting is done here).

        Returns
        -------
        DataFrame
            The re-ranked dataframe.

        Raises
        ------
        ValueError
            If the group column contains missing values.
        """
        groups = ranking[self.group_col]
        if groups.isna().any():
            raise ValueError("protected attribute contains missing values")

        # Normalise to a boolean mask so that 0/1 encoded groups work too;
        # indexing with a non-bool Series would be read as column labels.
        is_protected = groups.astype(bool)
        protected = ranking[is_protected]
        non_protected = ranking[~is_protected]

        mtable = self._create_adjusted_mtable()
        return pd.DataFrame(self._fair_top_k(protected, non_protected, mtable)).reset_index(drop=True)

    def _create_adjusted_mtable(self):
        """
        Creates an adjusted mtable by using the alpha value.

        The table is computed once per (top_n, p, alpha) combination and then
        served from the in-memory cache.

        Returns
        -------
        list
            mtable as list of int elements
        """
        key = (self.top_n, self.p, self.alpha)
        mtable = self._cache.get(key)
        if mtable is None:
            # create the mtable and store it as a plain list of ints
            fail_prob_pair = RecursiveNumericFailProbabilityCalculator(self.top_n, self.p, self.alpha).adjust_alpha()
            mtable = [int(i) for i in fail_prob_pair.mtable.m.tolist()]
            self._cache[key] = mtable
        return mtable

    def is_fair(self, ranking):
        """
        Checks if the ranking is fair for the given parameters

        Parameters
        ----------
        ranking : DataFrame
            The ranking to be checked; only its group column is inspected.

        Returns
        -------
        bool
            True if the ranking is fair, False otherwise.
        """
        return check_ranking(ranking[self.group_col], self._create_adjusted_mtable())

    def _fair_top_k(self, protected_candidates, non_protected_candidates, mtable):
        """
        Reorganize the results info ensuring true the mtable condition (#protected[:i] >= mtable[i]).

        Parameters
        ----------
        protected_candidates : pd.DataFrame
            ranking dataframe filtered with only protected candidates

        non_protected_candidates : pd.DataFrame
            ranking dataframe filtered with only non protected candidates

        mtable : list
            adjusted mtable

        Returns
        -------
        list
            List of re-ranked results (one dataframe row per element). It holds
            at most ``top_n`` rows and is shorter when fewer candidates exist.
        """
        n_protected = len(protected_candidates)
        n_non_protected = len(non_protected_candidates)

        result = []
        # Position of the next unused row in each frame. idx_protected is also
        # the number of protected rows placed so far, which is what the mtable
        # constraint is compared against.
        idx_protected = 0
        idx_non_protected = 0

        for i in range(self.top_n):
            has_protected = idx_protected < n_protected
            has_non_protected = idx_non_protected < n_non_protected

            if not has_protected and not has_non_protected:
                # no more candidates available, return list shorter than k
                break

            if not has_protected:
                # no more protected candidates available, take non-protected instead
                take_protected = False
            elif not has_non_protected:
                # no more non-protected candidates available, take protected instead
                take_protected = True
            elif idx_protected < mtable[i]:
                # fairness constraint not met yet: a protected candidate is forced
                take_protected = True
            else:
                # constraint satisfied: take the best scored one, ties go to protected
                take_protected = (
                    protected_candidates.iloc[idx_protected][self.score_col]
                    >= non_protected_candidates.iloc[idx_non_protected][self.score_col]
                )

            if take_protected:
                result.append(protected_candidates.iloc[idx_protected])
                idx_protected += 1
            else:
                result.append(non_protected_candidates.iloc[idx_non_protected])
                idx_non_protected += 1

        return result