# Source code for holisticai.pipeline._pipeline

import inspect

from sklearn.pipeline import Pipeline as SKLPipeline
from sklearn.utils.metaestimators import available_if

from holisticai.pipeline._pipeline_helper import PipelineHelper
from holisticai.utils.obj_rep.object_repr import PipelineReprObj


def _fulfill_conditions(fn_name: str):
    def check(self):
        if fn_name == "predict_proba":
            return hasattr(self._final_estimator, "predict_proba") and not self.post_estimator_transformers
        if fn_name == "predict_score":
            return hasattr(self, "predict_proba") and self.post_estimator_transformers
        if fn_name == "predictions":
            return self.post_estimator_transformers
        return None

    return check



# [docs]
class Pipeline(PipelineReprObj, SKLPipeline, PipelineHelper):
    """
    Holistic AI Pipeline

    Description
    -----------
    The Holistic AI pipeline wraps the sklearn pipeline to support unconventional transformers.
    Unconventional transformers (u-transformers) are transformers that do not follow the typical
    sklearn workflow. For example, a bias mitigator needs to update inputs, outputs, and other
    parameters during the fit and transform process. The current version of this pipeline supports
    only binary classification.
    """

    def __init__(self, steps, *, memory=None, verbose=False):
        """
        Initialize Holistic AI Pipeline

        Description
        -----------
        The given steps are first passed through ``preprocessing_steps`` and the result is
        handed to the sklearn pipeline constructor. The preprocessing is described as mapping
        the bias mitigators and wrapping the estimator so that parameters can be shared during
        the fit and transform functions (its implementation lives outside this class body).

        Parameters
        ----------
        steps: list
            A list of transformers/u-transformers and estimator

        memory : str or object with the joblib.Memory interface, default=None
            Used to cache the fitted transformers of the pipeline. By default,
            no caching is performed. If a string is given, it is the path to
            the caching directory. Enabling caching triggers a clone of
            the transformers before fitting. Therefore, the transformer
            instance given to the pipeline cannot be inspected
            directly. Use the attribute ``named_steps`` or ``steps`` to
            inspect estimators within the pipeline. Caching the
            transformers is advantageous when fitting is time consuming.

        verbose : bool, default=False
            If True, the time elapsed while fitting each step will be printed as it
            is completed.
        """
        prepared_steps = self.preprocessing_steps(steps)
        super(Pipeline, self).__init__(  # noqa: UP008
            steps=prepared_steps,
            memory=memory,
            verbose=verbose,
        )


# [docs]
    def fit(self, X, y=None, **fit_params):
        """Fit the model.

        Runs the sklearn pipeline ``fit`` first (transformers/u-transformers followed by
        the final estimator). The input is then transformed by every step except the
        final one, and the post-estimator u-transformers are fitted on that transformed
        data.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``. They are forwarded to the sklearn ``fit`` only.

        Returns
        -------
        self : object
            Pipeline with fitted steps.
        """
        # Stage 1: regular sklearn fitting of all steps.
        super().fit(X, y, **fit_params)

        # Stage 2: the post-estimator transformers are fitted on the data as seen by
        # the final estimator.
        transformed_X = self._transform_without_final(X)
        self.fit_post_estimator_transformers(transformed_X, y)
        return self



# [docs]
    @available_if(_fulfill_conditions("predict_proba"))
    def predict_proba(self, X, **predict_proba_params):
        """
        Delegate to the sklearn pipeline ``predict_proba``.

        The method is overridden only to replace its availability condition with the
        ``"predict_proba"`` check built by ``_fulfill_conditions``; the computation itself
        is the inherited one.
        """
        probabilities = super().predict_proba(X, **predict_proba_params)
        return probabilities



# [docs]
    @available_if(_fulfill_conditions("predict_score"))
    def predict_score(self, X, **predict_score_params):
        """
        Return probability vector

        Description
        -----------

        Calls ``predictions`` and returns the ``"y_score"`` entry of its result.
        Only available with post-processor bias mitigators.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        **predict_score_params : dict of string -> object
            Forwarded unchanged to ``predictions``.

        Returns
        -------
        np.ndarray
            probability value for each example
        """
        postprocessor_outputs = self.predictions(X, **predict_score_params)
        return postprocessor_outputs["y_score"]



# [docs]
    @available_if(_fulfill_conditions("predictions"))
    def predictions(self, X, **params):
        """
        Post-processor prediction

        Description
        -----------

        Transforms the data with every step except the final one and lets the
        post-estimator u-transformers compute the predictions from it. The method is
        guarded by the ``"predictions"`` availability check.

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        **params : dict of string -> object
            Extra parameters forwarded to the post-estimator transformation.

        Returns
        -------
        dict
            dictionary with postprocessor outputs
        """
        transformed_X = self._transform_without_final(X)
        outputs = self._transform_post_estimator_transformers(transformed_X, **params)
        return outputs


    def repr_info(self):
        """
        Build the nested description of this pipeline used for its representation.

        Every pipeline step, followed by every post-estimator transformer, is summarised
        by its name, its class name and a short ``ClassName(param=value, ...)`` subtitle.
        The subtitle lists at most four constructor parameters whose value can be read
        back from the object as an attribute of the same name. An ellipsis is appended
        only when constructor parameters remain after the last one shown.

        Returns
        -------
        dict
            Dictionary with the keys ``dtype`` (class name of the pipeline),
            ``attributes`` (always empty) and ``nested_objects`` (one dictionary with
            ``dtype``, ``name`` and ``subtitle`` per step).
        """
        max_shown = 4
        # ``or []`` tolerates an empty or unset collection of post-estimator transformers.
        all_steps = list(self.steps) + list(self.post_estimator_transformers or [])

        nested_objects = []
        for step in all_steps:
            step_name, step_obj = step[0], step[1]
            class_name = step_obj.__class__.__name__

            try:
                param_names = list(inspect.signature(step_obj.__init__).parameters)
            except (TypeError, ValueError):
                # No introspectable constructor signature: describe the step without
                # parameters instead of failing the whole representation.
                param_names = []

            shown = []
            for position, param in enumerate(param_names, start=1):
                try:
                    shown.append(f"{param}={getattr(step_obj, param)}")
                except Exception:  # noqa: BLE001, S112
                    # Best effort: parameters that are not stored on the object, or
                    # whose value cannot be formatted, are left out.
                    continue
                if len(shown) == max_shown:
                    # Only signal truncation when something was actually left out.
                    if position < len(param_names):
                        shown.append("...")
                    break

            nested_objects.append(
                {
                    "dtype": class_name,
                    "name": step_name,
                    "subtitle": class_name + "(" + ", ".join(shown) + ")",
                }
            )

        return {
            "dtype": self.__class__.__name__,
            "attributes": {},
            "nested_objects": nested_objects,
        }