Source code for holisticai.security.metrics._data_minimization

from __future__ import annotations

from typing import Union

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error


class DataMinimizationAccuracyRatio:
    """
    Accuracy-based data minimization metric for classification predictions.

    For every data minimization candidate the ``Score`` is
    ``accuracy(y_true, y_pred) / accuracy(y_true, candidate predictions)``
    and the metric value is the smallest ``Score`` among the candidates.
    """

    # Reference value exposed alongside the metric; how it is consumed is not
    # visible in this module.
    reference: float = np.inf
    name: str = "Data Minimization Accuracy Ratio"

    def __call__(self, y_true, y_pred, y_pred_dm, return_results=False):
        """
        Compute the accuracy ratio of each candidate and return the smallest one.

        Parameters
        ----------
        y_true: array-like
            The true labels.
        y_pred: array-like
            The labels predicted by the baseline model.
        y_pred_dm: iterable of dict
            One entry per data minimization candidate. Each entry is read
            through the keys ``"selector_type"``, ``"modifier_type"``,
            ``"n_feats"``, ``"feats"`` and ``"predictions"``.
        return_results: bool
            If True, also return the per-candidate table. Default is False.

        Returns
        -------
        float
            The smallest ``Score`` when ``return_results`` is False.
        tuple
            ``(results, score)`` when ``return_results`` is True, where
            ``results`` is a ``pd.DataFrame`` with one row per candidate plus
            a trailing ``"Base"`` row describing the baseline predictions.

        Raises
        ------
        ValueError
            If ``y_pred_dm`` contains no candidates.
        """
        # Materialise first so that emptiness can be checked for any iterable.
        candidates = list(y_pred_dm)
        if not candidates:
            # Without this guard the empty frame has no "Score" column and the
            # lookup below fails with an uninformative KeyError.
            raise ValueError("y_pred_dm must contain at least one data minimization result.")

        metrics_results = pd.DataFrame(
            [
                {
                    "Selection Type": yp["selector_type"],
                    "Modifier Type": yp["modifier_type"],
                    "N_feats": yp["n_feats"],
                    "Feats": yp["feats"],
                    "Score": relative_performance(accuracy_score, y_pred, yp["predictions"], y_true=y_true),
                    "Accuracy": accuracy_score(y_true, yp["predictions"]),
                }
                for yp in candidates
            ]
        )
        # argmin() yields a *position*, so it is paired with iloc rather than loc.
        best_position = metrics_results["Score"].argmin()
        best_score = metrics_results["Score"].iloc[best_position]
        if return_results:
            # The baseline row is appended after the best score is selected, so
            # it never takes part in the minimum.
            pred_row = pd.DataFrame(
                [
                    {
                        "Selection Type": "Base",
                        "Modifier Type": "Base",
                        "N_feats": 0,
                        "Feats": [],
                        "Score": 1,
                        "Accuracy": accuracy_score(y_true, y_pred),
                    }
                ]
            )
            metrics_results = pd.concat([metrics_results, pred_row], ignore_index=True)
            return metrics_results, best_score
        return float(best_score)


class DataMinimizationMSERatio:
    """
    MSE-based data minimization metric for regression predictions.

    For every data minimization candidate the ``Score`` is
    ``mse(y_true, y_pred) / mse(y_true, candidate predictions)`` and the
    metric value is the smallest ``Score`` among the candidates.
    """

    # Reference value exposed alongside the metric; how it is consumed is not
    # visible in this module.
    reference: float = 0
    name: str = "Data Minimization MSE Ratio"

    def __call__(self, y_true, y_pred, y_pred_dm, return_results=False):
        """
        Compute the MSE ratio of each candidate and return the smallest one.

        Parameters
        ----------
        y_true: array-like
            The true target values.
        y_pred: array-like
            The values predicted by the baseline model.
        y_pred_dm: iterable of dict
            One entry per data minimization candidate. Each entry is read
            through the keys ``"selector_type"``, ``"modifier_type"``,
            ``"n_feats"``, ``"feats"`` and ``"predictions"``.
        return_results: bool
            If True, also return the per-candidate table. Default is False.

        Returns
        -------
        float
            The smallest ``Score`` when ``return_results`` is False.
        tuple
            ``(results, score)`` when ``return_results`` is True, where
            ``results`` is a ``pd.DataFrame`` with one row per candidate plus
            a trailing ``"Base"`` row describing the baseline predictions.

        Raises
        ------
        ValueError
            If ``y_pred_dm`` contains no candidates.
        """
        # Materialise first so that emptiness can be checked for any iterable.
        candidates = list(y_pred_dm)
        if not candidates:
            # Without this guard the empty frame has no "Score" column and the
            # lookup below fails with an uninformative KeyError.
            raise ValueError("y_pred_dm must contain at least one data minimization result.")

        metrics_results = pd.DataFrame(
            [
                {
                    "Selection Type": yp["selector_type"],
                    "Modifier Type": yp["modifier_type"],
                    "N_feats": yp["n_feats"],
                    "Feats": yp["feats"],
                    "Score": relative_performance(mean_squared_error, y_pred, yp["predictions"], y_true=y_true),
                    "MSE": mean_squared_error(y_true, yp["predictions"]),
                }
                for yp in candidates
            ]
        )
        # argmin() yields a *position*, so it is paired with iloc rather than loc.
        # NOTE(review): the numerator is the same for every candidate, so the
        # smallest ratio belongs to the candidate with the largest MSE —
        # confirm this is the intended selection.
        best_position = metrics_results["Score"].argmin()
        best_score = metrics_results["Score"].iloc[best_position]
        if return_results:
            # The baseline row is appended after the best score is selected, so
            # it never takes part in the minimum.
            pred_row = pd.DataFrame(
                [
                    {
                        "Selection Type": "Base",
                        "Modifier Type": "Base",
                        "N_feats": 0,
                        "Feats": [],
                        "Score": 1,
                        "MSE": mean_squared_error(y_true, y_pred),
                    }
                ]
            )
            metrics_results = pd.concat([metrics_results, pred_row], ignore_index=True)
            return metrics_results, best_score
        return float(best_score)


def get_learning_task(y_true: pd.Series):
    """
    Infer the learning task from the dtype of the target vector.

    Parameters
    ----------
    y_true: pd.Series
        The target vector. Only its ``dtype.kind`` is inspected.

    Returns
    -------
    str
        ``"classification"`` for boolean, signed/unsigned integer, object and
        NumPy string dtypes; ``"regression"`` for floating point dtypes.

    Raises
    ------
    ValueError
        If the dtype kind is none of the above (e.g. datetime or complex).
    """
    kind = y_true.dtype.kind
    # "b" (bool) and "U"/"S" (NumPy unicode/bytes strings) are discrete labels
    # just like integers and objects, so they are treated as classification
    # targets instead of being rejected.
    if kind in ("b", "i", "u", "O", "U", "S"):
        return "classification"
    if kind == "f":
        return "regression"
    raise ValueError(f"Unknown learning task. dtype: {y_true.dtype.kind}")



[docs]
def data_minimization_score(
    y_true: pd.Series,
    y_pred: pd.Series,
    y_pred_dm: list[dict],
    return_results=False,
    learning_task: Union[str, None] = None,
):
    """
    Calculate the data minimization score of a set of data minimization candidates.

    For classification the score is based on the accuracy ratio
    (``DataMinimizationAccuracyRatio``); for regression it is based on the MSE
    ratio (``DataMinimizationMSERatio``). In both cases the ratio is the metric of
    the original model divided by the metric of the data minimization model.

    Parameters
    ----------
    y_true: pd.Series
        The true labels.
    y_pred: pd.Series
        The labels predicted by the original model.
    y_pred_dm: list[dict]
        One entry per data minimization technique. Each entry is read through the
        keys ``"selector_type"``, ``"modifier_type"``, ``"n_feats"``, ``"feats"``
        and ``"predictions"``.
    return_results: bool
        Whether to return the results or not. Default is False.
    learning_task: str (Optional)
        The learning task. Can be either "classification" or "regression". If None,
        it will be inferred from the dtype of ``y_true``.

    Returns
    -------
        float: The data minimization score, if return_results is False.
        tuple: ``(pd.DataFrame, score)`` with the per-technique results and the
        score, if return_results is True.

    Raises
    ------
    ValueError
        If ``learning_task`` is neither "classification" nor "regression", or if it
        cannot be inferred from ``y_true``.
    """
    if learning_task is None:
        learning_task = get_learning_task(y_true)

    if learning_task == "classification":
        dm = DataMinimizationAccuracyRatio()
    elif learning_task == "regression":
        dm = DataMinimizationMSERatio()
    else:
        # Previously an unsupported value fell through both checks and surfaced
        # as an UnboundLocalError on ``dm``.
        raise ValueError(
            f"Unknown learning task: {learning_task!r}. Expected 'classification' or 'regression'."
        )
    return dm(y_true, y_pred, y_pred_dm, return_results)



def relative_performance(metric_fn, y_pred, y_pred_dm, y_true):
    """
    Ratio between the metric of the full model and the metric of the data
    minimization model, i.e. ``metric_fn(y_true, y_pred) / metric_fn(y_true, y_pred_dm)``.

    Parameters
    ----------
    metric_fn: function
        metric function used to compare, called as ``metric_fn(y_true, predictions)``.

    y_pred: array-like
        predicted vector fitted with all features

    y_pred_dm: array-like
        predicted vector fitted with data minimization

    y_true: array-like
        vector-target

    Return
    ------
        relative performance metric. When the data minimization metric is zero
        the result is ``inf`` (or ``nan`` if the full-model metric is zero too)
        instead of an exception.
    """
    # All inputs are flattened to 1-D before being handed to the metric.
    y = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()
    y_pred_dm = np.array(y_pred_dm).flatten()

    baseline_metric = metric_fn(y, y_pred)
    minimized_metric = metric_fn(y, y_pred_dm)
    # np.divide gives the same inf/nan result for a zero denominator whether the
    # metric returns Python floats (where `/` would raise ZeroDivisionError) or
    # NumPy scalars; errstate silences the associated RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.divide(baseline_metric, minimized_metric)


def relative_clustering_performance(metric_fn, y_pred, y_pred_dm, x):
    """
    Ratio between a clustering metric of the full model and the same metric of
    the data minimization model, i.e. ``metric_fn(x, y_pred) / metric_fn(x, y_pred_dm)``.

    Parameters
    ----------
    metric_fn: function
        metric function used to compare, called as ``metric_fn(x, labels)``.

    y_pred: array-like
        predicted vector fitted with all features

    y_pred_dm: array-like
        predicted vector fitted with data minimization

    x: array-like
        input matrix

    Return
    ------
        relative performance metric, or ``nan`` if the metric or the division
        raises an exception.
    """
    y_pred = np.array(y_pred).flatten()
    y_pred_dm = np.array(y_pred_dm).flatten()
    try:
        dn = metric_fn(x, y_pred) / metric_fn(x, y_pred_dm)
    except Exception:
        # Best effort: any failure of the metric (or a zero division) yields
        # NaN. ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit still propagate.
        dn = np.nan

    return dn