#!/usr/bin/env python
"""
Aggregated conformal predictors
"""
# Authors: Henrik Linusson

from copy import deepcopy

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

from .base import BaseEstimator
from .util import calc_p
# -----------------------------------------------------------------------------
# Sampling strategies
# -----------------------------------------------------------------------------
class BootstrapSampler(object):
"""Bootstrap sampler.
See also
--------
CrossSampler, RandomSubSampler
Examples
--------
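    A minimal sketch of drawing bootstrap train/calibration splits
    (indices vary between runs, so the printed sizes are illustrative
    only):

    >>> import numpy as np
    >>> y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    >>> sampler = BootstrapSampler()
    >>> for train, cal in sampler.gen_samples(y, 2, 'classification'):
    ...     print(train.size, cal.size)  # doctest: +SKIP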
"""
    def gen_samples(self, y, n_samples, problem_type):
        for i in range(n_samples):
            idx = np.arange(y.size)
            # Draw a training sample with replacement; the out-of-bag
            # examples form the calibration set.
            train = np.random.choice(y.size, y.size, replace=True)
            cal_mask = np.ones(idx.size, dtype=bool)
            cal_mask[train] = False
            cal = idx[cal_mask]
            yield train, cal
class CrossSampler(object):
"""Cross-fold sampler.
See also
--------
BootstrapSampler, RandomSubSampler
Examples
--------
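    A minimal sketch of drawing cross-fold train/calibration splits
    (for classification, fold assignment varies between runs because the
    examples are shuffled before splitting):

    >>> import numpy as np
    >>> y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    >>> sampler = CrossSampler()
    >>> for train, cal in sampler.gen_samples(y, 4, 'classification'):
    ...     print(train.size, cal.size)  # doctest: +SKIP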
"""
def gen_samples(self, y, n_samples, problem_type):
if problem_type == 'classification':
            # Shuffle the examples before splitting into stratified folds.
            kf = StratifiedKFold(n_splits=n_samples, shuffle=True)
else:
kf = KFold(n_splits=n_samples)
for train, cal in kf.split(X=np.zeros(len(y)), y=y):
yield train, cal
class RandomSubSampler(object):
"""Random subsample sampler.
Parameters
----------
calibration_portion : float
Ratio (0-1) of examples to use for calibration.
See also
--------
BootstrapSampler, CrossSampler
Examples
--------
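    A minimal sketch of drawing random train/calibration splits, holding
    out 30% of the examples for calibration each time:

    >>> import numpy as np
    >>> y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
    >>> sampler = RandomSubSampler(calibration_portion=0.3)
    >>> for train, cal in sampler.gen_samples(y, 2, 'classification'):
    ...     print(train.size, cal.size)  # doctest: +SKIP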
"""
def __init__(self, calibration_portion=0.3):
self.cal_portion = calibration_portion
def gen_samples(self, y, n_samples, problem_type):
if problem_type == 'classification':
splits = StratifiedShuffleSplit(n_splits=n_samples,
test_size=self.cal_portion)
else:
splits = ShuffleSplit(n_splits=n_samples,
test_size=self.cal_portion)
for train, cal in splits.split(X=np.zeros(len(y)), y=y):
yield train, cal
# -----------------------------------------------------------------------------
# Conformal ensemble
# -----------------------------------------------------------------------------
class AggregatedCp(BaseEstimator):
"""Aggregated conformal predictor.
Combines multiple IcpClassifier or IcpRegressor predictors into an
aggregated model.
Parameters
----------
predictor : object
Prototype conformal predictor (e.g. IcpClassifier or IcpRegressor)
used for defining conformal predictors included in the aggregate model.
sampler : object
Sampler object used to generate training and calibration examples
for the underlying conformal predictors.
    aggregation_func : callable
        Function used to aggregate the predictions of the underlying
        conformal predictors. Defaults to taking the mean over the
        underlying models (``numpy.mean`` along the stacked model axis).
n_models : int
Number of models to aggregate.
Attributes
----------
predictor : object
Prototype conformal predictor.
predictors : list
List of underlying conformal predictors.
sampler : object
Sampler object used to generate training and calibration examples.
    agg_func : callable
        Function used to aggregate the predictions of the underlying
        conformal predictors.
References
----------
.. [1] Vovk, V. (2013). Cross-conformal predictors. Annals of Mathematics
and Artificial Intelligence, 1-20.
.. [2] Carlsson, L., Eklund, M., & Norinder, U. (2014). Aggregated
Conformal Prediction. In Artificial Intelligence Applications and
Innovations (pp. 231-240). Springer Berlin Heidelberg.
Examples
--------
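    A minimal usage sketch; the ``IcpClassifier``, ``ClassifierNc`` and
    ``ClassifierAdapter`` import paths are assumed to follow the usual
    nonconformist package layout:

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> from nonconformist.base import ClassifierAdapter
    >>> from nonconformist.icp import IcpClassifier
    >>> from nonconformist.nc import ClassifierNc
    >>> data = load_iris()
    >>> icp = IcpClassifier(ClassifierNc(ClassifierAdapter(
    ...     DecisionTreeClassifier())))
    >>> acp = AggregatedCp(icp, sampler=BootstrapSampler(), n_models=10)
    >>> acp.fit(data.data, data.target)
    >>> acp.predict(data.data, significance=0.1)  # doctest: +SKIP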
"""
def __init__(self,
predictor,
sampler=BootstrapSampler(),
aggregation_func=None,
n_models=10):
self.predictors = []
self.n_models = n_models
self.predictor = predictor
self.sampler = sampler
if aggregation_func is not None:
self.agg_func = aggregation_func
else:
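            # Default: average the stacked predictions over the model axis.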
self.agg_func = lambda x: np.mean(x, axis=2)
    def fit(self, x, y):
"""Fit underlying conformal predictors.
Parameters
----------
x : numpy array of shape [n_samples, n_features]
Inputs of examples for fitting the underlying conformal predictors.
y : numpy array of shape [n_samples]
Outputs of examples for fitting the underlying conformal predictors.
Returns
-------
None
"""
self.n_train = y.size
self.predictors = []
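        # Shuffle the examples so that sampling is independent of the
        # original ordering of the data.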
idx = np.random.permutation(y.size)
x, y = x[idx, :], y[idx]
problem_type = self.predictor.__class__.get_problem_type()
samples = self.sampler.gen_samples(y,
self.n_models,
problem_type)
        for train, cal in samples:
            # Deep-copy the prototype so that each underlying predictor
            # is fitted and calibrated independently.
            predictor = deepcopy(self.predictor)
            predictor.fit(x[train, :], y[train])
            predictor.calibrate(x[cal, :], y[cal])
            # Drop the underlying model's cached inputs and outputs.
            predictor.nc_function.model.last_x = None
            predictor.nc_function.model.last_y = None
            self.predictors.append(predictor)
if problem_type == 'classification':
self.classes = self.predictors[0].classes
    def predict(self, x, significance=None):
"""Predict the output values for a set of input patterns.
Parameters
----------
        x : numpy array of shape [n_samples, n_features]
            Inputs of patterns for which to predict output values.
        significance : float or None
            Significance level (maximum allowed error rate) of predictions.
            Should be a float between 0 and 1. If ``None``, then p-values
            are output for classification problems, while prediction
            intervals for each significance level in {0.01, ..., 0.99}
            are output for regression problems.
        Returns
        -------
        p : numpy array of shape [n_samples, n_classes] or [n_samples, 2]
            For classification problems: if significance is ``None``, then p
            contains the p-values for each sample-class pair; if significance
            is a float between 0 and 1, then p is a boolean array denoting
            which labels are included in the prediction sets.
            For regression problems: prediction intervals (minimum and
            maximum boundaries) for the set of test patterns; if significance
            is ``None``, the returned array has shape [n_samples, 2, 99],
            with one interval per significance level.
"""
is_regression = \
self.predictor.__class__.get_problem_type() == 'regression'
n_examples = x.shape[0]
if is_regression and significance is None:
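            # No significance level given: compute prediction intervals
            # for each significance level in {0.01, 0.02, ..., 0.99}.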
signs = np.arange(0.01, 1.0, 0.01)
pred = np.zeros((n_examples, 2, signs.size))
for i, s in enumerate(signs):
predictions = np.dstack([p.predict(x, s)
for p in self.predictors])
predictions = self.agg_func(predictions)
pred[:, :, i] = predictions
return pred
        else:
            def f(p, x):
                # Regression predictors take the significance level up
                # front; classification predictors output p-values.
                return p.predict(x, significance if is_regression else None)
            predictions = np.dstack([f(p, x) for p in self.predictors])
            predictions = self.agg_func(predictions)
            if significance and not is_regression:
                # Convert aggregated p-values into boolean prediction sets.
                return predictions >= significance
            else:
                return predictions