# Source code for holisticai.security.metrics._attribute_attack

from __future__ import annotations

from typing import Any, Callable, Union

import pandas as pd
from holisticai.security.commons import BlackBoxAttack
from holisticai.security.metrics._utils import check_valid_output_type
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error


def to_numerical_or_categorical(y: pd.Series):
    """
    Return ``y`` unchanged when it is floating point, otherwise as a categorical series.

    Parameters
    ----------
    y: pd.Series
        The target values to normalise.

    Returns
    -------
        pd.Series: ``y`` itself for float dtypes; ``y.astype("category")`` for every other dtype.

    Raises
    ------
    ValueError
        If ``y`` is neither float nor (signed/unsigned) integer and has fewer than two unique values.
    """
    dtype_kind = y.dtype.kind

    # Floating-point targets are treated as numerical and passed through untouched.
    if dtype_kind == "f":
        return y

    # Integer targets are converted directly; only the remaining dtypes
    # (object, bool, ...) go through the unique-value sanity check.
    is_integer = dtype_kind in ("i", "u")
    if not is_integer and len(y.unique()) < 2:
        raise ValueError("The target variable must have more than 1 unique value")

    return y.astype("category")


class AttributeAttackScore:
    """
    Callable metric that scores a black-box attribute inference attack.

    An attacker estimator is fitted through ``BlackBoxAttack`` on the training
    data and then asked to recover ``attribute_attack`` on the test data; the
    recovered values are compared with the true ones using ``metric_fn``.
    """

    # NOTE(review): presumably the reference (ideal) value of the metric — confirm with callers.
    reference: float = 0
    name: str = "Attribute Attack Score"

    def __call__(
        self,
        x_train: pd.DataFrame,
        x_test: pd.DataFrame,
        y_train: pd.Series,
        y_test: pd.Series,
        attribute_attack: str,
        attack_train_ratio: float = 0.5,
        metric_fn: Union[str, Callable, None] = None,
        attacker_estimator: Any = None,
    ) -> float:
        """
        Run the attack and return the score of the attacker's predictions.

        Parameters
        ----------
        x_train: pd.DataFrame
            The training features; must contain the ``attribute_attack`` column.
        x_test: pd.DataFrame
            The testing features.
        y_train: pd.Series
            The training labels.
        y_test: pd.Series
            The testing labels.
        attribute_attack: str
            Name of the column to attack.
        attack_train_ratio: float
            Forwarded unchanged to ``BlackBoxAttack``.
        metric_fn: str, callable or None
            Either a callable ``metric_fn(y_true, y_pred)`` or one of the names
            ``"accuracy"``, ``"f1"``, ``"mean_squared_error"``, ``"mean_absolute_error"``.
            When ``None``, mean squared error is used if the attacked column has an
            integer or float dtype and accuracy otherwise.
        attacker_estimator: Any
            Estimator handed to ``BlackBoxAttack``. When ``None``, ``LinearRegression``
            is used if the attacked column has an integer or float dtype and
            ``LogisticRegression`` otherwise.

        Returns
        -------
            float: The value of ``metric_fn`` computed on the true and predicted attribute values.

        Raises
        ------
        ValueError
            If ``metric_fn`` is a string that is not one of the supported metric names.
        """
        check_valid_output_type(y_train)

        y_train = to_numerical_or_categorical(y_train)
        y_test = to_numerical_or_categorical(y_test)

        # The attribute's dtype is only inspected when a default has to be chosen.
        if attacker_estimator is None or metric_fn is None:
            is_numeric_attribute = x_train[attribute_attack].dtype.kind in ("i", "u", "f")

            if attacker_estimator is None:
                attacker_estimator = LinearRegression() if is_numeric_attribute else LogisticRegression()

            # Chosen independently of the estimator default, so that passing a
            # custom estimator without a metric no longer leaves metric_fn as None.
            if metric_fn is None:
                metric_fn = mean_squared_error if is_numeric_attribute else accuracy_score

        if isinstance(metric_fn, str):
            named_metrics = {
                "accuracy": accuracy_score,
                "f1": f1_score,
                "mean_squared_error": mean_squared_error,
                "mean_absolute_error": mean_absolute_error,
            }
            if metric_fn not in named_metrics:
                # Fail early with a clear message instead of "'str' object is not callable".
                raise ValueError(f"Unknown metric name {metric_fn!r}; expected one of {sorted(named_metrics)}")
            metric_fn = named_metrics[metric_fn]

        attacker = BlackBoxAttack(
            attacker_estimator=attacker_estimator,
            attack_feature=attribute_attack,
            attack_train_ratio=attack_train_ratio,
        )

        attacker.fit(x_train, y_train)

        # transform yields the true attribute values and the attacker's predictions.
        y_attack, y_pred_attack = attacker.transform(x_test, y_test)

        return metric_fn(y_attack, y_pred_attack)



# [docs]
def attribute_attack_score(
    x_train: pd.DataFrame,
    x_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    attribute_attack: str,
    attack_train_ratio: float = 0.5,
    **kargs,
) -> float:
    """
    Calculate the score of a black box attribute attack. It is done as follows:
    - The attack attribute is removed from the training data.
    - The label is added as an input feature, and a machine learning model is trained.
    - The model is used to predict the removed attribute, and the prediction is compared with the actual value.

    Parameters
    ----------
    x_train: pd.DataFrame
        The training features.
    x_test: pd.DataFrame
        The testing features.
    y_train: pd.Series
        The training labels.
    y_test: pd.Series
        The testing labels.
    attribute_attack: str
        The attribute column in the x_train dataframe to attack.
    attack_train_ratio: float
        The ratio of the attack data to the training data.
    kargs:
        Additional keyword arguments passed to AttributeAttackScore
        (``metric_fn`` and ``attacker_estimator``).

    Returns
    -------
        float: The attack score. When neither ``metric_fn`` nor ``attacker_estimator`` is supplied,
        this is the mean squared error for an integer/float attribute and the accuracy otherwise.
    """
    scorer = AttributeAttackScore()
    return scorer(
        x_train=x_train,
        x_test=x_test,
        y_train=y_train,
        y_test=y_test,
        attribute_attack=attribute_attack,
        attack_train_ratio=attack_train_ratio,
        **kargs,
    )