Source code for h2o4gpu.base

"""
Base classes for all estimators.

Used for VotingClassifier
"""

# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause

import copy
import warnings
from collections import defaultdict
import platform
import inspect
import re

import numpy as np

from . import __version__
from ._config import get_config
from .utils import _IS_32BIT
from .utils.validation import check_X_y
from .utils.validation import check_array
from .utils._estimator_html_repr import estimator_html_repr
from .utils.validation import _deprecate_positional_args

_DEFAULT_TAGS = {
    'non_deterministic': False,
    'requires_positive_X': False,
    'requires_positive_y': False,
    'X_types': ['2darray'],
    'poor_score': False,
    'no_validation': False,
    'multioutput': False,
    "allow_nan": False,
    'stateless': False,
    'multilabel': False,
    '_skip_test': False,
    '_xfail_checks': False,
    'multioutput_only': False,
    'binary_only': False,
    'requires_fit': True,
    'requires_y': False,
    }
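

# Illustrative sketch (not part of the original module): individual
# estimators override entries of ``_DEFAULT_TAGS`` through a ``_more_tags``
# method, and ``BaseEstimator._get_tags`` (defined further down) merges the
# overrides into these defaults. ``_example_tag_merge`` is a hypothetical
# helper that only demonstrates the effective merge semantics.
def _example_tag_merge():
    overrides = {'allow_nan': True}               # what an estimator's
    effective = dict(_DEFAULT_TAGS, **overrides)  # _more_tags would add
    assert effective['allow_nan'] is True         # overridden tag
    assert effective['requires_fit'] is True      # untouched default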


@_deprecate_positional_args
def clone(estimator, *, safe=True):
    """Constructs a new estimator with the same parameters.

    Clone does a deep copy of the model in an estimator
    without actually copying attached data. It yields a new estimator
    with the same parameters that has not been fit on any data.

    Parameters
    ----------
    estimator : {list, tuple, set} of estimator objects or estimator object
        The estimator or group of estimators to be cloned.

    safe : bool, default=True
        If safe is False, clone will fall back to a deep copy on objects
        that are not estimators.
    """
    estimator_type = type(estimator)
    # XXX: not handling dictionaries
    if estimator_type in (list, tuple, set, frozenset):
        return estimator_type([clone(e, safe=safe) for e in estimator])
    elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
        if not safe:
            return copy.deepcopy(estimator)
        else:
            if isinstance(estimator, type):
                raise TypeError("Cannot clone object. "
                                "You should provide an instance of "
                                "h2o4gpu estimator instead of a class.")
            else:
                raise TypeError("Cannot clone object '%s' (type %s): "
                                "it does not seem to be a h2o4gpu "
                                "estimator as it does not implement a "
                                "'get_params' method."
                                % (repr(estimator), type(estimator)))

    klass = estimator.__class__
    new_object_params = estimator.get_params(deep=False)
    for name, param in new_object_params.items():
        new_object_params[name] = clone(param, safe=False)
    new_object = klass(**new_object_params)
    params_set = new_object.get_params(deep=False)

    # quick sanity check of the parameters of the clone
    for name in new_object_params:
        param1 = new_object_params[name]
        param2 = params_set[name]
        if param1 is not param2:
            raise RuntimeError('Cannot clone object %s, as the constructor '
                               'either does not set or modifies parameter %s'
                               % (estimator, name))
    return new_object
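

# Illustrative sketch (``TinyEstimator`` is a hypothetical class used only
# for demonstration). ``clone`` only needs a ``get_params`` method and a
# constructor that accepts those parameters; the clone shares the
# hyper-parameters of the original but none of its fitted state.
def _example_clone():
    class TinyEstimator:
        def __init__(self, alpha=1.0):
            self.alpha = alpha

        def get_params(self, deep=True):
            return {'alpha': self.alpha}

    original = TinyEstimator(alpha=0.5)
    duplicate = clone(original)
    assert duplicate is not original
    assert duplicate.get_params() == original.get_params()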


def _pprint(params, offset=0, printer=repr):
    """Pretty print the dictionary 'params'

    Parameters
    ----------
    params : dict
        The dictionary to pretty print

    offset : int, default=0
        The offset in characters to add at the beginning of each line.

    printer : callable, default=repr
        The function to convert entries to strings, typically
        the builtin str or repr
    """
    # Do a multi-line justified repr:
    options = np.get_printoptions()
    np.set_printoptions(precision=5, threshold=64, edgeitems=2)
    params_list = list()
    this_line_length = offset
    line_sep = ',\n' + (1 + offset // 2) * ' '
    for i, (k, v) in enumerate(sorted(params.items())):
        if type(v) is float:
            # use str for representing floating point numbers
            # this way we get consistent representation across
            # architectures and versions.
            this_repr = '%s=%s' % (k, str(v))
        else:
            # use repr of the rest
            this_repr = '%s=%s' % (k, printer(v))
        if len(this_repr) > 500:
            this_repr = this_repr[:300] + '...' + this_repr[-100:]
        if i > 0:
            if (this_line_length + len(this_repr) >= 75 or '\n' in this_repr):
                params_list.append(line_sep)
                this_line_length = len(line_sep)
            else:
                params_list.append(', ')
                this_line_length += 2
        params_list.append(this_repr)
        this_line_length += len(this_repr)

    np.set_printoptions(**options)
    lines = ''.join(params_list)
    # Strip trailing space to avoid nightmare in doctests
    lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n'))
    return lines


class BaseEstimator:
    """Base class for all estimators in h2o4gpu.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator"""
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [p for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("h2o4gpu estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs)."
                                   " %s with constructor %s doesn't "
                                   " follow this convention."
                                   % (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        out = dict()
        for key in self._get_param_names():
            try:
                value = getattr(self, key)
            except AttributeError:
                warnings.warn('From version 0.24, get_params will raise an '
                              'AttributeError if a parameter cannot be '
                              'retrieved as an instance attribute. Previously '
                              'it would return None.',
                              FutureWarning)
                value = None
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        """
        Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : object
            Estimator instance.
        """
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)
                valid_params[key] = value

        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def __repr__(self, N_CHAR_MAX=700):
        # N_CHAR_MAX is the (approximate) maximum number of non-blank
        # characters to render. We pass it as an optional parameter to ease
        # the tests.

        from .utils._pprint import _EstimatorPrettyPrinter

        N_MAX_ELEMENTS_TO_SHOW = 30  # number of elements to show in sequences

        # use ellipsis for sequences with a lot of elements
        pp = _EstimatorPrettyPrinter(
            compact=True, indent=1, indent_at_name=True,
            n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)

        repr_ = pp.pformat(self)

        # Use bruteforce ellipsis when there are a lot of non-blank characters
        n_nonblank = len(''.join(repr_.split()))
        if n_nonblank > N_CHAR_MAX:
            lim = N_CHAR_MAX // 2  # approx number of chars to keep on both ends
            regex = r'^(\s*\S){%d}' % lim
            # The regex '^(\s*\S){%d}' % n
            # matches from the start of the string until the nth non-blank
            # character:
            # - ^ matches the start of string
            # - (pattern){n} matches n repetitions of pattern
            # - \s*\S matches a non-blank char following zero or more blanks
            left_lim = re.match(regex, repr_).end()
            right_lim = re.match(regex, repr_[::-1]).end()

            if '\n' in repr_[left_lim:-right_lim]:
                # The left side and right side aren't on the same line.
                # To avoid weird cuts, e.g.:
                # categoric...ore',
                # we need to start the right side with an appropriate newline
                # character so that it renders properly as:
                # categoric...
                # handle_unknown='ignore',
                # so we add [^\n]*\n which matches until the next \n
                regex += r'[^\n]*\n'
                right_lim = re.match(regex, repr_[::-1]).end()

            ellipsis = '...'
            if left_lim + len(ellipsis) < len(repr_) - right_lim:
                # Only add ellipsis if it results in a shorter repr
                repr_ = repr_[:left_lim] + '...' + repr_[-right_lim:]

        return repr_

    def __getstate__(self):
        try:
            state = super().__getstate__()
        except AttributeError:
            state = self.__dict__.copy()

        if type(self).__module__.startswith('h2o4gpu.'):
            return dict(state.items(), _h2o4gpu_version=__version__)
        else:
            return state

    def __setstate__(self, state):
        if type(self).__module__.startswith('h2o4gpu.'):
            pickle_version = state.pop("_h2o4gpu_version", "pre-0.18")
            if pickle_version != __version__:
                warnings.warn(
                    "Trying to unpickle estimator {0} from version {1} when "
                    "using version {2}. This might lead to breaking code or "
                    "invalid results. Use at your own risk.".format(
                        self.__class__.__name__, pickle_version, __version__),
                    UserWarning)
        try:
            super().__setstate__(state)
        except AttributeError:
            self.__dict__.update(state)

    def _more_tags(self):
        return _DEFAULT_TAGS

    def _get_tags(self):
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
            if hasattr(base_class, '_more_tags'):
                # need the if because mixins might not have _more_tags
                # but might do redundant work in estimators
                # (i.e. calling more tags on BaseEstimator multiple times)
                more_tags = base_class._more_tags(self)
                collected_tags.update(more_tags)
        return collected_tags

    def _check_n_features(self, X, reset):
        """Set the `n_features_in_` attribute, or check against it.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        reset : bool
            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
            Else, the attribute must already exist and the function checks
            that it is equal to `X.shape[1]`.
        """
        n_features = X.shape[1]

        if reset:
            self.n_features_in_ = n_features
        else:
            if not hasattr(self, 'n_features_in_'):
                raise RuntimeError(
                    "The reset parameter is False but there is no "
                    "n_features_in_ attribute. Is this estimator fitted?"
                )
            if n_features != self.n_features_in_:
                raise ValueError(
                    'X has {} features, but this {} is expecting {} features '
                    'as input.'.format(n_features, self.__class__.__name__,
                                       self.n_features_in_)
                )

    def _validate_data(self, X, y=None, reset=True,
                       validate_separately=False, **check_params):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,), default=None
            The targets. If None, `check_array` is called on `X` and
            `check_X_y` is called otherwise.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        validate_separately : False or tuple of dicts, default=False
            Only used if y is not None.
            If False, call check_X_y(). Else, it must be a tuple of kwargs
            to be used for calling check_array() on X and y respectively.
        **check_params : kwargs
            Parameters passed to :func:`h2o4gpu.utils.check_array` or
            :func:`h2o4gpu.utils.check_X_y`. Ignored if validate_separately
            is not False.

        Returns
        -------
        out : {ndarray, sparse matrix} or tuple of these
            The validated input. A tuple is returned if `y` is not None.
        """
        if y is None:
            if self._get_tags()['requires_y']:
                raise ValueError(
                    f"This {self.__class__.__name__} estimator "
                    f"requires y to be passed, but the target y is None."
                )
            X = check_array(X, **check_params)
            out = X
        else:
            if validate_separately:
                # We need this because some estimators validate X and y
                # separately, and in general, separately calling check_array()
                # on X and y isn't equivalent to just calling check_X_y()
                # :(
                check_X_params, check_y_params = validate_separately
                X = check_array(X, **check_X_params)
                y = check_array(y, **check_y_params)
            else:
                X, y = check_X_y(X, y, **check_params)
            out = X, y

        if check_params.get('ensure_2d', True):
            self._check_n_features(X, reset=reset)

        return out

    @property
    def _repr_html_(self):
        """HTML representation of estimator.

        This is redundant with the logic of `_repr_mimebundle_`. The latter
        should be favored in the long term, `_repr_html_` is only
        implemented for consumers who do not interpret `_repr_mimebundle_`.
        """
        if get_config()["display"] != 'diagram':
            raise AttributeError("_repr_html_ is only defined when the "
                                 "'display' configuration option is set to "
                                 "'diagram'")
        return self._repr_html_inner

    def _repr_html_inner(self):
        """This function is returned by the @property `_repr_html_` to make
        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
        on `get_config()["display"]`.
        """
        return estimator_html_repr(self)

    def _repr_mimebundle_(self, **kwargs):
        """Mime bundle used by jupyter kernels to display estimator"""
        output = {"text/plain": repr(self)}
        if get_config()["display"] == 'diagram':
            output["text/html"] = estimator_html_repr(self)
        return output
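

# Illustrative sketch (``InnerEstimator`` and ``WrapperEstimator`` are
# hypothetical classes): how ``get_params`` and ``set_params`` cooperate
# for nested estimators via ``<component>__<parameter>`` keys.
def _example_nested_params():
    class InnerEstimator(BaseEstimator):
        def __init__(self, depth=3):
            self.depth = depth

    class WrapperEstimator(BaseEstimator):
        def __init__(self, inner=None, gamma=0.1):
            self.inner = inner
            self.gamma = gamma

    wrapper = WrapperEstimator(inner=InnerEstimator())
    # deep=True expands sub-estimator parameters as ``inner__depth``.
    assert wrapper.get_params(deep=True)['inner__depth'] == 3
    # set_params routes double-underscore keys to the nested object.
    wrapper.set_params(inner__depth=7, gamma=0.5)
    assert wrapper.inner.depth == 7 and wrapper.gamma == 0.5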


class ClassifierMixin:
    """Mixin class for all classifiers in h2o4gpu."""

    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """
        Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        from .metrics import accuracy_score
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def _more_tags(self):
        return {'requires_y': True}


class RegressorMixin:
    """Mixin class for all regression estimators in h2o4gpu."""
    _estimator_type = "regressor"

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples. For some estimators this may be a precomputed
            kernel matrix or a list of generic objects instead,
            shape = (n_samples, n_samples_fitted), where n_samples_fitted
            is the number of samples used in the fitting for the estimator.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.

        Notes
        -----
        The R2 score used when calling ``score`` on a regressor uses
        ``multioutput='uniform_average'`` from version 0.23 to keep consistent
        with default value of :func:`~h2o4gpu.metrics.r2_score`.
        This influences the ``score`` method of all the multioutput
        regressors (except for
        :class:`~h2o4gpu.multioutput.MultiOutputRegressor`).
        """

        from .metrics import r2_score
        y_pred = self.predict(X)
        return r2_score(y, y_pred, sample_weight=sample_weight)

    def _more_tags(self):
        return {'requires_y': True}
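

# Illustrative sketch (``MeanRegressor`` is hypothetical): a constant model
# that always predicts the mean of y. As the RegressorMixin docstring notes,
# such a model obtains an R^2 score of 0.0 from the inherited ``score``.
def _example_regressor_mixin():
    class MeanRegressor(RegressorMixin, BaseEstimator):
        def fit(self, X, y):
            self.mean_ = np.mean(y)
            return self

        def predict(self, X):
            return np.full(len(X), self.mean_)

    X = np.arange(6).reshape(3, 2)
    y = np.array([1.0, 2.0, 3.0])
    model = MeanRegressor().fit(X, y)
    assert abs(model.score(X, y)) < 1e-12  # R^2 of the constant-mean model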


class ClusterMixin:
    """Mixin class for all cluster estimators in h2o4gpu."""
    _estimator_type = "clusterer"

    def fit_predict(self, X, y=None):
        """
        Perform clustering on X and return cluster labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given clustering algorithm
        self.fit(X)
        return self.labels_


class BiclusterMixin:
    """Mixin class for all bicluster estimators in h2o4gpu."""

    @property
    def biclusters_(self):
        """Convenient way to get row and column indicators together.

        Returns the ``rows_`` and ``columns_`` members.
        """
        return self.rows_, self.columns_

    def get_indices(self, i):
        """Row and column indices of the i'th bicluster.

        Only works if ``rows_`` and ``columns_`` attributes exist.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        row_ind : ndarray, dtype=np.intp
            Indices of rows in the dataset that belong to the bicluster.
        col_ind : ndarray, dtype=np.intp
            Indices of columns in the dataset that belong to the bicluster.
        """
        rows = self.rows_[i]
        columns = self.columns_[i]
        return np.nonzero(rows)[0], np.nonzero(columns)[0]

    def get_shape(self, i):
        """Shape of the i'th bicluster.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        shape : tuple (int, int)
            Number of rows and columns (resp.) in the bicluster.
        """
        indices = self.get_indices(i)
        return tuple(len(i) for i in indices)

    def get_submatrix(self, i, data):
        """Return the submatrix corresponding to bicluster `i`.

        Parameters
        ----------
        i : int
            The index of the cluster.
        data : array-like
            The data.

        Returns
        -------
        submatrix : ndarray
            The submatrix corresponding to bicluster i.

        Notes
        -----
        Works with sparse matrices. Only works if ``rows_`` and
        ``columns_`` attributes exist.
        """
        from .utils.validation import check_array
        data = check_array(data, accept_sparse='csr')
        row_ind, col_ind = self.get_indices(i)
        return data[row_ind[:, np.newaxis], col_ind]


class TransformerMixin:
    """Mixin class for all transformers in h2o4gpu."""

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Fits transformer to X and y with optional parameters fit_params
        and returns a transformed version of X.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)

        y : ndarray of shape (n_samples,), default=None
            Target values.

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_features_new)
            Transformed array.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given estimator
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)
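

# Illustrative sketch (``Log1pTransformer`` is hypothetical): a stateless
# transformer that gets ``fit_transform`` for free from the TransformerMixin
# above, i.e. ``fit(X).transform(X)`` chained by the mixin.
def _example_transformer_mixin():
    class Log1pTransformer(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return np.log1p(X)

    X = np.array([[0.0, 1.0], [2.0, 3.0]])
    X_new = Log1pTransformer().fit_transform(X)
    assert X_new.shape == X.shape and X_new[0, 0] == 0.0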


class DensityMixin:
    """Mixin class for all density estimators in h2o4gpu."""
    _estimator_type = "DensityEstimator"

    def score(self, X, y=None):
        """Return the score of the model on the data X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        score : float
        """
        pass


class OutlierMixin:
    """Mixin class for all outlier detection estimators in h2o4gpu."""
    _estimator_type = "outlier_detector"

    def fit_predict(self, X, y=None):
        """Perform fit on X and return labels for X.

        Returns -1 for outliers and 1 for inliers.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            1 for inliers, -1 for outliers.
        """
        # override for transductive outlier detectors like LocalOutlierFactor
        return self.fit(X).predict(X)


class MetaEstimatorMixin:
    """Mixin class for all meta estimators in h2o4gpu."""
    _required_parameters = ["estimator"]


class MultiOutputMixin:
    """Mixin to mark estimators that support multioutput."""
    def _more_tags(self):
        return {'multioutput': True}


class _UnstableArchMixin:
    """Mark estimators that are non-deterministic on 32bit or PowerPC"""
    def _more_tags(self):
        return {'non_deterministic': (
            _IS_32BIT or platform.machine().startswith(('ppc', 'powerpc')))}


def is_classifier(estimator):
    """Return True if the given estimator is (probably) a classifier.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a classifier and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "classifier"


def is_regressor(estimator):
    """Return True if the given estimator is (probably) a regressor.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a regressor and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "regressor"


def is_outlier_detector(estimator):
    """Return True if the given estimator is (probably) an outlier detector.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is an outlier detector and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "outlier_detector"
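

# Illustrative sketch (``DemoClassifier`` is hypothetical): the ``is_*``
# helpers above only inspect the ``_estimator_type`` attribute contributed
# by the mixins, so a ClassifierMixin subclass is reported as a classifier
# and nothing else.
def _example_estimator_type_checks():
    class DemoClassifier(ClassifierMixin, BaseEstimator):
        def fit(self, X, y):
            return self

        def predict(self, X):
            return [0] * len(X)

    model = DemoClassifier()
    assert is_classifier(model)
    assert not is_regressor(model)
    assert not is_outlier_detector(model)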