# Source code for holisticai.security.metrics._attribute_attack

from __future__ import annotations

from typing import Any, Callable, Union

import pandas as pd
from holisticai.security.commons import BlackBoxAttack
from holisticai.security.metrics._utils import check_valid_output_type
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error


def to_numerical_or_categorical(y: pd.Series):
    """Return ``y`` unchanged if it is float-typed, otherwise as a categorical.

    Parameters
    ----------
    y : pd.Series
        Target values.

    Returns
    -------
    pd.Series
        The same object when the dtype kind is ``"f"``; otherwise ``y`` cast
        to the ``"category"`` dtype.

    Raises
    ------
    ValueError
        If ``y`` is neither float nor (un)signed integer typed and has fewer
        than two unique values.
    """
    dtype_kind = y.dtype.kind

    # Floating-point targets are treated as numerical and passed through.
    if dtype_kind == "f":
        return y

    # Integer targets are cast directly; only the remaining dtypes get the
    # "at least two distinct values" check before the cast.
    is_integer = dtype_kind in ("i", "u")
    if not is_integer and len(y.unique()) < 2:
        raise ValueError("The target variable must have more than 1 unique value")

    return y.astype("category")


class AttributeAttackScore:
    """Callable metric that scores a black-box attribute inference attack.

    Calling an instance fits a ``BlackBoxAttack`` on the training data and
    scores its predictions of ``attribute_attack`` on the test data with the
    selected metric function.
    """

    # Metric metadata (presumably consumed by reporting code elsewhere; the
    # consumers are not visible in this file).
    reference: float = 0
    name: str = "Attribute Attack Score"

    @staticmethod
    def _resolve_metric_name(metric_fn: Union[str, Callable, None]) -> Union[Callable, None]:
        """Map a metric name to its scikit-learn function; pass through non-strings."""
        if not isinstance(metric_fn, str):
            return metric_fn
        if metric_fn == "accuracy":
            return accuracy_score
        if metric_fn == "f1":
            return f1_score
        if metric_fn == "mean_squared_error":
            return mean_squared_error
        if metric_fn == "mean_absolute_error":
            return mean_absolute_error
        # An unrecognised name used to fall through and fail much later with
        # "'str' object is not callable"; reject it explicitly instead.
        raise ValueError(
            f"Unknown metric_fn {metric_fn!r}; expected 'accuracy', 'f1', "
            "'mean_squared_error', 'mean_absolute_error' or a callable."
        )

    def __call__(
        self,
        x_train: pd.DataFrame,
        x_test: pd.DataFrame,
        y_train: pd.Series,
        y_test: pd.Series,
        attribute_attack: str,
        attack_train_ratio: float = 0.5,
        metric_fn: Union[str, Callable, None] = None,
        attacker_estimator: Any = None,
    ) -> float:
        """Run the attack and return the metric value.

        Parameters
        ----------
        x_train : pd.DataFrame
            Training features, including the ``attribute_attack`` column.
        x_test : pd.DataFrame
            Testing features.
        y_train : pd.Series
            Training labels.
        y_test : pd.Series
            Testing labels.
        attribute_attack : str
            Name of the column the attacker tries to infer.
        attack_train_ratio : float
            Forwarded to ``BlackBoxAttack``.
        metric_fn : str, callable or None
            One of ``"accuracy"``, ``"f1"``, ``"mean_squared_error"``,
            ``"mean_absolute_error"``, or a callable taking
            ``(y_true, y_pred)``. When ``None``, mean squared error is used
            for a numeric attribute column and accuracy otherwise.
        attacker_estimator : Any
            Estimator handed to ``BlackBoxAttack``. When ``None``,
            ``LinearRegression`` is used for a numeric attribute column and
            ``LogisticRegression`` otherwise.

        Returns
        -------
        float
            Result of ``metric_fn`` on the true and predicted attribute values.

        Raises
        ------
        ValueError
            If ``metric_fn`` is a string that is not a supported metric name.
        """
        # Validate the metric name first so a typo fails before any fitting.
        metric_fn = self._resolve_metric_name(metric_fn)

        check_valid_output_type(y_train)

        y_train = to_numerical_or_categorical(y_train)
        y_test = to_numerical_or_categorical(y_test)

        # Defaults for the estimator and the metric are resolved independently:
        # a caller-supplied estimator without a metric must still get a metric.
        if attacker_estimator is None or metric_fn is None:
            has_continuous_values = x_train[attribute_attack].dtype.kind in ("i", "u", "f")
            if attacker_estimator is None:
                attacker_estimator = LinearRegression() if has_continuous_values else LogisticRegression()
            if metric_fn is None:
                metric_fn = mean_squared_error if has_continuous_values else accuracy_score

        attacker = BlackBoxAttack(
            attacker_estimator=attacker_estimator,
            attack_feature=attribute_attack,
            attack_train_ratio=attack_train_ratio,
        )

        attacker.fit(x_train, y_train)

        # transform yields the true attribute values and the attacker's guesses.
        y_attack, y_pred_attack = attacker.transform(x_test, y_test)

        return metric_fn(y_attack, y_pred_attack)


def attribute_attack_score(
    x_train: pd.DataFrame,
    x_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    attribute_attack: str,
    attack_train_ratio: float = 0.5,
    **kargs,
) -> float:
    """
    Calculate the score of a black-box attribute attack.

    It is done as follows:

    - The attack attribute is removed from the training data.
    - The label is added as an input feature, and a machine learning model is trained.
    - The model is used to predict the removed attribute, and the prediction is
      compared with the actual value.

    Parameters
    ----------
    x_train: pd.DataFrame
        The training features.
    x_test: pd.DataFrame
        The testing features.
    y_train: pd.Series
        The training labels.
    y_test: pd.Series
        The testing labels.
    attribute_attack: str
        The attribute column in the x_train dataframe to attack.
    attack_train_ratio: float
        The ratio of the attack data to the training data.
    kargs:
        Additional keyword arguments passed to ``AttributeAttackScore``
        (``metric_fn`` and ``attacker_estimator``). When neither is given, a
        numeric attribute is scored with mean squared error using linear
        regression, and any other attribute with accuracy using logistic
        regression.

    Returns
    -------
    float:
        The value of the selected metric for the black-box attribute attack.
    """
    bb = AttributeAttackScore()
    return bb(x_train, x_test, y_train, y_test, attribute_attack, attack_train_ratio, **kargs)