# Source code for holisticai.bias.mitigation.postprocessing.debiasing_exposure.transformer

import numpy as np
import pandas as pd
from holisticai.bias.mitigation.postprocessing.debiasing_exposure.algorithm import DELTRAlgorithm
from holisticai.bias.mitigation.postprocessing.debiasing_exposure.algorithm_utils import Standarizer


class DebiasingExposure:
    """
    Disparate Exposure Learning to Rank (DELTR) [1]_ incorporates a measure of performance and a
    measure of disparate exposure into its loss function. Trains a linear model based on
    performance and fairness for a protected group.

    Parameters
    ----------
    group_col : str
        Name of the column in data that contains protected attribute.

    query_col : str
        Name of the column in data that contains query ids (optional).

    doc_col : str
        Name of the column in data that contains document ids (optional).

    score_col : str
        Name of the column in data that contains judgment values (optional).

    feature_cols : list
        Name of the columns in data that contains feature values (optional).
        When left empty, ``fit`` uses every column of the training data except
        ``query_col``, ``doc_col`` and ``score_col``.

    gamma : float
        Gamma parameter for the cost calculation in the training phase
        (recommended to be around 1).

    number_of_iterations : int
        Number of iteration in gradient descent (optional).

    learning_rate : float
        Learning rate in gradient descent (optional).

    lambdaa : float
        Regularization constant (optional).

    init_var : float
        Range of values for initialization of weights (optional).

    standardize : bool
        Boolean indicating whether the data should be standardized or not (optional).

    verbose : int
        If > 0, print progress.

    References
    ----------
    .. [1] Zehlike, Meike, and Carlos Castillo. "Reducing disparate exposure in ranking:
        A learning to rank approach." Proceedings of The Web Conference 2020. 2020.
    """

    # Learned weights. Declared at class level so that ``transform`` can detect
    # an unfitted model (and raise its intended error) instead of failing with
    # an AttributeError on a missing attribute.
    _omega = None

    def __init__(
        self,
        group_col: str,
        query_col: str = "query_id",
        doc_col: str = "doc_id",
        score_col: str = "judgment",
        feature_cols=None,
        gamma: float = 1.0,
        number_of_iterations: int = 3000,
        learning_rate: float = 0.001,
        lambdaa: float = 0.001,
        init_var: float = 0.01,
        standardize: bool = False,
        verbose: int = 0,
    ):
        # Avoid a shared mutable default: every instance gets its own list.
        if feature_cols is None:
            feature_cols = []

        # check if mandatory parameters are present
        if group_col is None:
            raise ValueError("The name of column in data `group_col` must be initialized")
        if gamma is None:
            raise ValueError("The `gamma` parameter must be initialized")

        # assign mandatory parameters
        self.group_col = group_col
        self.query_col = query_col
        self.doc_col = doc_col
        self.score_col = score_col
        self.feature_cols = feature_cols
        self._gamma = gamma

        # assign optional parameters
        self._number_of_iterations = number_of_iterations
        self._learning_rate = learning_rate
        self._lambda = lambdaa
        self._init_var = init_var
        self._standardize = standardize
        self.verbose = verbose

        # no model has been trained yet
        self._omega = None

        self.standarizer = Standarizer(group_col=group_col)
        self.algorithm = DELTRAlgorithm(
            self._gamma,
            self._number_of_iterations,
            self._learning_rate,
            self._lambda,
            self._init_var,
            verbose=verbose,
        )

    def _filter_invalid_examples(self, rankings):
        """
        Keep only the queries whose protected-attribute column sums to more than zero.

        Parameters
        ----------
        rankings : DataFrame
            Data containing at least ``query_col`` and ``group_col``.

        Returns
        -------
        DataFrame
            The retained rows, concatenated query by query (in ``groupby`` order)
            with a fresh ``0..n-1`` index.

        Raises
        ------
        ValueError
            If no query passes the filter (including empty input).
        """
        valid_rankings = [
            ranking
            for _, ranking in rankings.groupby(self.query_col)
            if ranking[self.group_col].sum() > 0
        ]
        if not valid_rankings:
            # pd.concat would raise an opaque "No objects to concatenate" here.
            raise ValueError(
                f"No valid rankings found: every query in `{self.query_col}` must contain "
                f"at least one item with a positive `{self.group_col}` value"
            )
        return pd.concat(valid_rankings, axis=0).reset_index(drop=True)

    def fit(self, rankings: pd.DataFrame):
        """
        Trains a Disparate Exposure model on a given training set.

        Queries whose protected-attribute column does not sum to more than zero
        are dropped before training. If no feature columns were configured, they
        are derived from the training data and stored on the instance.

        Parameters
        ----------
        rankings : DataFrame
            Training data with query ids, document ids, protected attribute,
            feature columns and judgment values.

        Returns
        -------
        Self
        """
        rankings = self._filter_invalid_examples(rankings)

        # Length check rather than ``== []`` so any empty sequence is handled.
        if len(self.feature_cols) == 0:
            restricted_cols = [self.query_col, self.doc_col, self.score_col]
            self.feature_cols = [col for col in rankings.columns.to_list() if col not in restricted_cols]

        # prepare data (document ids are not needed for training)
        (
            query_ids,
            _doc_ids,
            protected_feature,
            feature_matrix,
            training_scores,
        ) = self._prepare_data(rankings, has_judgment=True)

        # standardize data if allowed
        if self._standardize:
            feature_matrix = self.standarizer.fit_transform(feature_matrix)

        self._omega = self.algorithm.fit(query_ids, protected_feature, training_scores, feature_matrix)

        # return model
        return self

    def transform(self, rankings: pd.DataFrame):
        """
        Rank the prediction set with the previously trained Disparate Exposure model.

        Parameters
        ----------
        rankings : DataFrame
            Data with query ids, document ids, protected attribute and feature columns.

        Returns
        -------
        DataFrame
            Columns ``query_col``, ``doc_col``, ``group_col`` and ``score_col``
            (the predicted score), sorted by score in descending order.

        Raises
        ------
        SystemError
            If called before ``fit``. The exception type is kept for backward
            compatibility with existing callers.
        """
        if self._omega is None:
            raise SystemError("You need to train a model first!")

        # prepare data
        query_ids, doc_ids, protected_attributes, feature_matrix = self._prepare_data(rankings, has_judgment=False)

        # standardize data if allowed
        if self._standardize:
            feature_matrix = self.standarizer.transform(feature_matrix)

        # calculate the predictions: linear score of the features under the learned weights
        predictions = np.dot(feature_matrix, self._omega)

        # create the resulting data frame
        result = pd.DataFrame(
            {
                self.query_col: query_ids,
                self.doc_col: doc_ids,
                self.group_col: protected_attributes,
                self.score_col: predictions,
            }
        )

        # sort by the score in descending order
        return result.sort_values([self.score_col], ascending=False)

    def _prepare_data(self, data, has_judgment=False):
        """
        Extracts the different columns of the input data.

        Parameters
        ----------
        data : DataFrame
            Input data containing the configured columns.

        has_judgment : bool
            Whether the judgment column (``score_col``) should also be returned.

        Returns
        -------
        tuple
            ``(query_ids, doc_ids, protected_attributes, feature_matrix)``, with
            ``scores`` appended as a fifth element when ``has_judgment`` is true.
        """
        query_ids = data[self.query_col]
        doc_ids = data[self.doc_col]
        protected_attributes = data[self.group_col]
        feature_matrix = data[self.feature_cols]

        if has_judgment:
            scores = data[self.score_col]
            return query_ids, doc_ids, protected_attributes, feature_matrix, scores
        return query_ids, doc_ids, protected_attributes, feature_matrix