# Source code for h2o4gpu.solvers.kmeans

# -*- encoding: utf-8 -*-
# pylint: disable=fixme, line-too-long
"""
KMeans clustering solver.

:copyright: 2017-2018 H2O.ai, Inc.
:license:   Apache License Version 2.0 (see LICENSE for details)
"""
import sys

import numpy as np

from ..solvers.utils import _check_data_content, \
    _get_data, _setter
from ..typecheck.typechecks import assert_satisfies


class KMeansH2O:
    """K-Means clustering

    Wrapper class calling an underlying (e.g. GPU or CPU)
    implementation of the K-Means clustering algorithm.

    Approximate GPU Memory Use:
     n_clusters*rows + rows*cols + cols*n_clusters

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.

    init : string, {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence.
        *Not supported yet* - if chosen we will use SKLearn's methods.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
        *Not supported yet* - if chosen we will use SKLearn's methods.
        Any other string raises ValueError at construction time; an
        ndarray raises ValueError when fit() is called on this class.

    n_init : int, default: 1
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.
        *Not supported yet* - always runs 1.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the algorithm.

    tol : float, optional, default: 1e-4
        Relative tolerance to declare convergence.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).
        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.
        True : always precompute distances
        False : never precompute distances
        *Not supported yet* - always uses auto if running h2o4gpu version.

    verbose : int, optional, default 0
        Logger verbosity level.

    random_state : int or array_like, optional, default: None
        random_state for RandomState.
        Must be convertible to 32 bit unsigned integers.
        When None, a random integer in [0, 32000] is drawn.

    copy_x : boolean, default True
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True, then the original data is not
        modified.  If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.
        *Not supported yet* - always uses True if running h2o4gpu version.

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
        *Not supported yet* - CPU backend not yet implemented.

    algorithm : string, "auto", "full" or "elkan", default="auto"
        K-means algorithm to use. The classical EM-style algorithm is "full".
        The "elkan" variation is more efficient by using the triangle
        inequality, but currently doesn't support sparse data. "auto" chooses
        "elkan" for dense data and "full" for sparse data.
        *Not supported yet* - always uses full if running h2o4gpu version.

    gpu_id : int, optional, default: 0
        ID of the GPU on which the algorithm should run.

    n_gpus : int, optional, default: -1
        Number of GPUs on which the algorithm should run.
        < 0 means all possible GPUs on the machine.
        0 means no GPUs, run on CPU.

    do_checks : int, optional, default: 1
        If set to 0 GPU error check will not be performed.

    Attributes:
    ----------
    cluster_centers_ : array, [n_clusters, n_features]
        Cluster centers

    labels_ : array, [n_rows,],
        Labels assigned to each row during fitting.

    inertia_ : float
        Sum of distances of samples to their closest cluster center.
        Currently never populated by this class (stays None).

    Example:
    -------
        >>> from h2o4gpu import KMeans
        >>> import numpy as np
        >>> X = np.array([[1, 2], [1, 4], [1, 0],
        ...               [4, 2], [4, 4], [4, 0]])
        >>> kmeans = KMeans(n_clusters=2).fit(X)
        >>> kmeans.labels_
        >>> kmeans.predict(X)
        >>> kmeans.cluster_centers_
    """

    # pylint: disable=unused-argument
    def __init__(
            self,
            # sklearn API (but with possibly different choices for defaults)
            n_clusters=8,
            init='k-means++',
            n_init=1,
            max_iter=300,
            tol=1e-4,
            precompute_distances='auto',
            verbose=0,
            random_state=None,
            copy_x=True,
            n_jobs=1,
            algorithm='auto',
            # Beyond sklearn (with optimal defaults)
            gpu_id=0,
            n_gpus=-1,
            do_checks=1):

        # Unwrap any NumPy scalar (np.float32, np.float64, ...) into a plain
        # Python number before it is handed to the native library.
        if isinstance(tol, np.generic):
            tol = tol.item()

        # An ndarray `init` is deliberately NOT rejected here: the KMeans
        # wrapper instantiates this class even when it is going to fall back
        # to scikit-learn. It is rejected in _fit() instead.
        if isinstance(init, str) and init not in ('random', 'k-means++'):
            raise ValueError(
                "Invalid initialization method. "
                "Should be 'k-means++' or 'random' but got '%s'." % init)

        self.init = init
        self._n_clusters = n_clusters
        self._gpu_id = gpu_id
        from ..util.gpu import device_count
        (self.n_gpus, self.devices) = device_count(n_gpus)

        self._max_iter = max_iter
        self.tol = tol
        self._did_sklearn_fit = 0
        self.verbose = verbose
        self.do_checks = do_checks

        # Accepted for scikit-learn API compatibility only. They are stored
        # so that get_params()/set_params() report what was passed instead
        # of None; the native solver does not read them.
        self.n_init = n_init
        self.precompute_distances = precompute_distances
        self.copy_x = copy_x
        self.n_jobs = n_jobs
        self.algorithm = algorithm

        if random_state is None:
            import random
            self.random_state = random.randint(0, 32000)
        else:
            self.random_state = random_state

        # Updated by _toc() from the dtype of the data it converts.
        self.double_precision = 0

        self.cluster_centers_ = None

        self.labels_ = None

        self.inertia_ = None  # TODO: Not set yet

        self.sklearn_model = None

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator.

        :returns list: sorted names of the constructor arguments,
            excluding ``self`` and ``**kwargs``.
        :raises RuntimeError: if the constructor declares ``*args``.
        """
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # Prefer the standard library; fall back to the bundled backport
        # only where `inspect.signature` does not exist.
        try:
            from inspect import signature
        except ImportError:
            from ..utils.fixes import signature
        init_signature = signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [
            p for p in init_signature.parameters.values()
            if p.name != 'self' and p.kind != p.VAR_KEYWORD
        ]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("h2o4gpu KMeans estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs)."
                                   " %s with constructor %s doesn't"
                                   " follow this convention." %
                                   (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        :param bool deep : If True, will return the parameters for this
            estimator and contained subobjects that are estimators.

        :returns dict params : Parameter names mapped to their values.
        """
        import warnings

        out = dict()
        for key in self._get_param_names():
            # Deprecation warnings must be visible while reading the
            # attribute so deprecated parameters can be skipped. Setting the
            # filter inside catch_warnings() means the global filter list is
            # restored automatically when the block exits.
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always", DeprecationWarning)
                value = getattr(self, key, None)
            if caught and issubclass(caught[0].category, DeprecationWarning):
                # if the parameter is deprecated, don't show it
                continue

            # XXX : should we rather test if instance of estimator ?
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        """Set the parameters of this solver.

        Keys of the form ``name__sub`` are forwarded to the ``set_params``
        of the object currently stored under ``name``.

        :return: self
        :raises ValueError: if a key is not a known parameter.
        """
        if not params:
            # Simple optimization to gain speed(inspect is slow)
            return self
        valid_params = self.get_params(deep=True)
        for key, value in params.items():
            split = key.split('__', 1)
            if len(split) > 1:
                # nested objects case
                name, sub_name = split
                if name not in valid_params:
                    raise ValueError('Invalid parameter %s for estimator %s. '
                                     'Check the list of available parameters '
                                     'with `estimator.get_params().keys()`.' %
                                     (name, self.__class__.__name__))
                sub_object = valid_params[name]
                sub_object.set_params(**{sub_name: value})
            else:
                # simple objects case
                if key not in valid_params:
                    raise ValueError('Invalid parameter %s for estimator %s. '
                                     'Check the list of available parameters '
                                     'with `estimator.get_params().keys()`.' %
                                     (key, self.__class__.__name__))
                setattr(self, key, value)
        return self

    def fit(self, X, y=None):
        """Compute cluster centers using KMeans algorithm.

        The memory used by this algorithm depends on:
        - m - number of rows in X
        - n - number of dimensions in X
        - k - number of clusters
        - type of data in X (float32 or float64)

        and should be approximately:

        For float32 = 4*(m*n + k*n + 3*m + k + m*k) + 2*(4*m + k)
        For float64 = 8*(m*n + k*n + 3*m + k + m*k) + 2*(4*m + k)

        In case of running on the GPU, a CUDA context size should be
        also taken into account.

        :param X: array-like, shape=(n_samples, n_features)
            Training instances.
        :param y: ignored; present for scikit-learn API compatibility.
        :return: self
        """
        X_np, _, _, _, _, _ = _get_data(X, ismatrix=True)

        _check_data_content(self.do_checks, "X", X_np)

        self._fit(X_np)

        # New centroids invalidate any scikit-learn model built earlier.
        self._did_sklearn_fit = 0

        return self

    # y is here just for compatibility with sklearn api
    # pylint: disable=unused-argument
    def sklearn_fit(self, X, y=None):
        """Instantiates a scikit-learn model using previously found,
        with fit(), centroids.

        The model is built once per fit(); later calls are no-ops until
        fit() is run again.
        """

        assert self.cluster_centers_ is not None, \
            "Centroids are None. Run fit() first."

        if self._did_sklearn_fit == 0:
            X_np, _, _, _, _, _ = _get_data(X, ismatrix=True)
            _check_data_content(self.do_checks, "X", X_np)

            self._did_sklearn_fit = 1
            import sklearn.cluster as sk_cluster
            self.sklearn_model = sk_cluster.KMeans(
                self._n_clusters,
                max_iter=1,
                init=self.cluster_centers_,
                n_init=1)
            self.sklearn_model.fit(X_np)
            # The code above initializes the SKlearn KMeans model,
            # but due to validations we need to run 1 extra iteration,
            # which might alter the cluster centers so we override them
            self.sklearn_model.cluster_centers_ = self.cluster_centers_

    def predict(self, X):
        """ Assign the each record in X to the closest cluster.

        :param X: array-like or sparse matrix of shape [n_samples, n_features]
                  Contains data points to be clustered.
        :return: array of shape [n_samples,]
                A cluster index for each record
        :raises RuntimeError: if no native library is available.
        """
        lib = self._require_lib()

        cols, rows = self._validate_centroids(X)

        X_np, _, _, _, _, _ = _get_data(X, ismatrix=True)
        _check_data_content(self.do_checks, "X", X_np)
        c_X_np = self._toc(X_np)

        # NOTE: this second _toc() call leaves double_precision set from the
        # dtype of the centroids, not of X.
        cluster_centers_ = self._toc(self.cluster_centers_, convert=False)
        c_res = np.zeros(rows, np.int32)

        if self.double_precision == 0:
            c_kmeans = lib.make_ptr_float_kmeans
        else:
            c_kmeans = lib.make_ptr_double_kmeans

        c_kmeans(1, self.verbose,
                 self.random_state, self._gpu_id, self.n_gpus, rows, cols,
                 self._n_clusters, self._max_iter, 0,
                 self.tol, c_X_np, cluster_centers_,
                 np.empty([], X_np.dtype), c_res)

        return c_res

    # y is here just for compatibility with sklearn api
    # pylint: disable=unused-argument
    def sklearn_predict(self, X, y=None):
        """
        Instantiates, if necessary, a scikit-learn model using centroids
        found by running fit() and predicts labels using that model.
        This method always runs on CPU, not on GPUs.
        """
        _check_data_content(self.do_checks, "X", X)

        self.sklearn_fit(X)
        return self.sklearn_model.predict(X)

    def transform(self, X, y=None):
        """Transform X to a cluster-distance space.

        Each dimension is the distance to a cluster center.

        :param X: {array-like, sparse matrix}, shape = [n_samples, n_features]
                Data to be transformed.
        :param y: ignored; present for scikit-learn API compatibility.

        :return: array, shape [n_samples, k]
            Distances to each cluster for each row.
        :raises RuntimeError: if no native library is available.
        """
        lib = self._require_lib()

        cols, rows = self._validate_centroids(X)

        X_np, _, _, _, _, _ = _get_data(X, ismatrix=True)
        c_X_np = self._toc(X_np)
        cluster_centers_ = self._toc(self.cluster_centers_, convert=False)
        c_res = np.zeros(rows * self._n_clusters, X_np.dtype)

        if self.double_precision == 0:
            lib.kmeans_transform_float(
                self.verbose, self._gpu_id, self.n_gpus, rows, cols,
                self._n_clusters, c_X_np, cluster_centers_,
                c_res)
        else:
            lib.kmeans_transform_double(
                self.verbose, self._gpu_id, self.n_gpus, rows, cols,
                self._n_clusters, c_X_np, cluster_centers_,
                c_res)

        # The flat result buffer is read back in column-major order.
        transformed = np.reshape(
            c_res, (rows, self._n_clusters), order='F')
        return transformed

    def sklearn_transform(self, X, y=None):
        """
        Instantiates, if necessary, a scikit-learn model using centroids
        found by running fit() and transforms matrix X using that model.
        This method always runs on CPU, not on GPUs.
        """

        _check_data_content(self.do_checks, "X", X)
        self.sklearn_fit(X)
        # pylint: disable=too-many-function-args
        return self.sklearn_model.transform(X, y)

    def fit_transform(self, X, y=None):
        """Perform fitting and transform X.

        Same as calling fit(X, y).transform(X).

        :param X: {array-like, sparse matrix}, shape = [n_samples, n_features]
                Data to be transformed.
        :param y: array-like, optional, shape=(n_samples, 1)
            Initial labels for training.

        :return: array, shape [n_samples, k]
            Distances to each cluster for each row.
        """
        return self.fit(X, y).transform(X)

    def fit_predict(self, X, y=None):
        """Perform fitting and prediction on X.

        Same as calling fit(X, y).labels_.

        :param X: {array-like, sparse matrix}, shape = [n_samples, n_features]
            Data to be used for fitting and predictions.
        :param y: array-like, optional, shape=(n_samples, 1)
            Initial labels for training.

        :return: array of shape [n_samples,]
            A cluster index for each record
        """
        return self.fit(X, y).labels_

    def _fit(self, data):
        """Actual method calling the underlying fitting implementation.

        :param data: 2-D numpy array of training rows.
        :return: tuple (cluster_centers_, labels_)
        :raises ValueError: if ``init`` is an ndarray (not supported yet).
        :raises RuntimeError: if no native library is available.
        """
        if isinstance(self.init, np.ndarray):
            raise ValueError("Passing initial centroids not yet supported.")

        lib = self._require_lib()

        c_data = self._toc(data)

        # The native call takes the init method as a flag:
        # 1 for 'k-means++', 0 for anything else.
        if self.init == "k-means++":
            c_init = 1
        else:
            c_init = 0

        rows = np.shape(data)[0]
        cols = np.shape(data)[1]

        centroids = np.empty([])
        pred_centers = np.zeros(cols * self._n_clusters, data.dtype)
        pred_labels = np.zeros(rows, dtype=np.int32)

        if self.double_precision == 0:
            lib.make_ptr_float_kmeans(
                0, self.verbose,
                self.random_state, self._gpu_id, self.n_gpus, rows, cols,
                self._n_clusters, self._max_iter, c_init,
                self.tol, c_data, centroids,
                pred_centers, pred_labels)
        else:
            lib.make_ptr_double_kmeans(
                0, self.verbose,
                self.random_state, self._gpu_id, self.n_gpus, rows, cols,
                self._n_clusters, self._max_iter, c_init,
                self.tol, c_data, centroids,
                pred_centers, pred_labels)

        centroids = np.reshape(pred_centers, (self._n_clusters, cols))
        self.cluster_centers_ = centroids

        self.labels_ = np.reshape(pred_labels, rows)

        return self.cluster_centers_, self.labels_

    # FIXME : This function duplicates others
    # in solvers / utils.py as used in GLM
    def _toc(self, data, convert=True):
        """Transform input data into a type which can be passed into C land.

        Records the precision of ``data`` in ``self.double_precision``
        (1 for float64, 0 for float32) and returns a flattened copy.

        :param data: numpy array.
        :param bool convert: if True, anything that is not float32/float64
            is cast to float32.
        :return: 1-D numpy array.
        """
        if convert and data.dtype != np.float64 and data.dtype != np.float32:
            self._print_verbose(1, "Detected numeric data format which is not "
                                "supported. Casting to np.float32.")
            # np.asarray copies only when needed; np.array(..., copy=False)
            # raises on NumPy 2.x whenever a cast requires a copy.
            data = np.asarray(data, dtype=np.float32)
        if data.dtype == np.float64:
            self._print_verbose(1, "Detected np.float64 data")
            self.double_precision = 1
        elif data.dtype == np.float32:
            self._print_verbose(1, "Detected np.float32 data")
            self.double_precision = 0

        return data.flatten()

    def _print_verbose(self, level, msg):
        """Print ``msg`` when ``self.verbose`` is strictly above ``level``."""
        if self.verbose > level:
            print(msg)
            sys.stdout.flush()

    def _print_set(self, param_name, old_val, new_val):
        """Log a change of an integer parameter at verbosity above 1."""
        self._print_verbose(1, "Changing %s from %d to %d." %
                            (param_name, old_val, new_val))

    def _load_lib(self):
        """Load library.

        :return: the GPU library, or None when no GPU library/device is
            usable (CPU KMeans is not supported yet).
        :raises RuntimeError: if neither branch below applies.
        """
        from ..libs.lib_utils import GPUlib, CPUlib

        gpu_lib = GPUlib().get()
        cpu_lib = CPUlib().get()

        if (self.n_gpus == 0) or (gpu_lib is None) or (self.devices == 0):
            self._print_verbose(0, "H2O KMeans for CPU not yet supported.")
            return None
        if (self.n_gpus > 0) or (cpu_lib is None) or (self.devices == 0):
            self._print_verbose(
                0, "\nUsing GPU KMeans solver with %d GPUs.\n" % self.n_gpus)
            return gpu_lib
        raise RuntimeError("Couldn't instantiate KMeans Solver")

    def _require_lib(self):
        """Return the native library, raising if none is usable.

        Turns the ``None`` returned by _load_lib() into an explicit error
        instead of an AttributeError on ``None`` further down.
        """
        lib = self._load_lib()
        if lib is None:
            raise RuntimeError(
                "No usable H2O KMeans library is available "
                "(CPU backend not yet supported).")
        return lib

    def _validate_centroids(self, X):
        """Check X against the fitted centroids.

        :return: tuple (cols, rows) of X.
        :raises AssertionError: if fit() has not been run or the number of
            columns of X differs from that of the centroids.
        """
        assert self.cluster_centers_ is not None, \
            "Centroids are None. Run fit() first."
        rows = np.shape(X)[0]
        cols = np.shape(X)[1]
        centroids_dim = np.shape(self.cluster_centers_)[1]
        assert cols == centroids_dim, \
            "The dimension of X [%d] and centroids [%d] is not equal." % \
            (cols, centroids_dim)
        return cols, rows

    # Properties and setters of properties

    @property
    def n_clusters(self):
        """Number of clusters (validated to be positive on assignment)."""
        return self._n_clusters

    @n_clusters.setter
    def n_clusters(self, value):
        assert_satisfies(value, value > 0,
                         "Number of clusters must be positive.")
        self._n_clusters = value

    @property
    def gpu_id(self):
        """GPU ID (validated to be non-negative on assignment)."""
        return self._gpu_id

    @gpu_id.setter
    def gpu_id(self, value):
        assert_satisfies(value, value >= 0, "GPU ID must be non-negative.")
        self._gpu_id = value

    @property
    def max_iter(self):
        """Maximum number of iterations (validated positive on assignment)."""
        return self._max_iter

    @max_iter.setter
    def max_iter(self, value):
        assert_satisfies(value, value > 0,
                         "Maximum number of iterations must be positive.")
        self._max_iter = value


class KMeans:
    """
    K-Means clustering Wrapper

    Selects between h2o4gpu.cluster.KMeansSklearn
    and h2o4gpu.solvers.kmeans.KMeansH2O

    Both backends are instantiated with the given parameters; ``self.model``
    refers to the one that is actually used.

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.

    init : string, {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence.
        *Not supported yet* - if chosen we will use SKLearn's methods.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
        *Not supported yet* - if chosen we will use SKLearn's methods.

    n_init : int, default: 1
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.
        *Not supported yet* - always runs 1.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the algorithm.

    tol : float, optional, default: 1e-4
        Relative tolerance to declare convergence.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).
        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.
        True : always precompute distances
        False : never precompute distances
        *Not supported yet* - always uses auto if running h2o4gpu version.

    verbose : int, optional, default 0
        Logger verbosity level.

    random_state : int or array_like, optional, default: None
        random_state for RandomState.
        Must be convertible to 32 bit unsigned integers.

    copy_x : boolean, default True
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True, then the original data is not
        modified.  If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.
        *Not supported yet* - always uses True if running h2o4gpu version.

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
        *Not supported yet* - CPU backend not yet implemented.

    algorithm : string, "auto", "full" or "elkan", default="auto"
        K-means algorithm to use. The classical EM-style algorithm is "full".
        The "elkan" variation is more efficient by using the triangle
        inequality, but currently doesn't support sparse data. "auto" chooses
        "elkan" for dense data and "full" for sparse data.
        *Not supported yet* - always uses full if running h2o4gpu version.

    gpu_id : int, optional, default: 0
        ID of the GPU on which the algorithm should run.

    n_gpus : int, optional, default: -1
        Number of GPUs on which the algorithm should run.
        < 0 means all possible GPUs on the machine.
        0 means no GPUs, run on CPU.

    do_checks : int, optional, default: 1
        If set to 0 GPU error check will not be performed.

    backend : string, (Default="auto")
        Which backend to use.
        Options are 'auto', 'sklearn', 'h2o4gpu'.
        Saves as attribute for actual backend used.
        The H2O4GPU_BACKEND environment variable, when set, overrides
        this argument.

    """

    def __init__(
            self,
            n_clusters=8,
            init='k-means++',
            n_init=1,
            max_iter=300,
            tol=1e-4,
            precompute_distances='auto',
            verbose=0,
            random_state=None,
            copy_x=True,
            n_jobs=1,
            algorithm='auto',
            # Beyond sklearn (with optimal defaults)
            gpu_id=0,
            n_gpus=-1,
            do_checks=1,
            backend='auto'):

        import os
        # The environment variable takes precedence over the argument.
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend

        # FIXME: Add init as array and kmeans++ to h2o4gpu
        # setup backup to sklearn class
        # (can remove if fully implement sklearn functionality)
        self.do_sklearn = False
        if backend == 'auto':
            if isinstance(init, np.ndarray):
                KMeans._print_verbose(
                    verbose, 0,
                    "'init' as ndarray of centers not yet supported. "
                    "Running ScikitLearn CPU version.")
                self.do_sklearn = True
            # FIXME: Add n_init to h2o4gpu
            if n_init != 1:
                KMeans._print_verbose(verbose, 0, "'n_init' not supported. "
                                      "Running h2o4gpu with n_init = 1.")
            if precompute_distances != "auto":
                KMeans._print_verbose(verbose, 0,
                                      "'precompute_distances' not used.")
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False
        # Any other backend string leaves do_sklearn False (h2o4gpu).
        if self.do_sklearn:
            self.backend = 'sklearn'
        else:
            self.backend = 'h2o4gpu'

        from h2o4gpu.cluster import k_means_
        self.model_sklearn = k_means_.KMeansSklearn(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            precompute_distances=precompute_distances,
            verbose=verbose,
            random_state=random_state,
            copy_x=copy_x,
            n_jobs=n_jobs,
            algorithm=algorithm)
        self.model_h2o4gpu = KMeansH2O(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            precompute_distances=precompute_distances,
            verbose=verbose,
            random_state=random_state,
            copy_x=copy_x,
            n_jobs=n_jobs,
            algorithm=algorithm,
            # H2O4GPU
            gpu_id=gpu_id,
            n_gpus=n_gpus,
            do_checks=do_checks)
        # Fall back to scikit-learn when it was requested or when the
        # h2o4gpu model cannot load a native library.
        # pylint: disable=protected-access
        if self.do_sklearn or self.model_h2o4gpu._load_lib() is None:
            self.model = self.model_sklearn
            KMeans._print_verbose(verbose, 0, "Using ScikitLearn backend.")
        else:
            self.model = self.model_h2o4gpu
            KMeans._print_verbose(verbose, 0, "Using h2o4gpu backend.")

    def fit(self, X, y=None):
        """Fit the selected backend on X and mirror its fitted attributes.

        :return: whatever the backend's fit() returns.
        """
        res = self.model.fit(X, y)
        self.set_attributes()
        return res

    def fit_predict(self, X, y=None):
        """Fit the selected backend on X and return its fit_predict()."""
        res = self.model.fit_predict(X, y)
        self.set_attributes()
        return res

    def fit_transform(self, X, y=None):
        """Fit the selected backend on X and return its fit_transform()."""
        res = self.model.fit_transform(X, y)
        self.set_attributes()
        return res

    def get_params(self, deep=True):
        """Return the parameters of the selected backend model."""
        res = self.model.get_params(deep)
        self.set_attributes()
        return res

    def predict(self, X):
        """Return the selected backend's predict() for X."""
        res = self.model.predict(X)
        self.set_attributes()
        return res

    def score(self, X, y=None):
        """Return the scikit-learn model's score() for X.

        NOTE(review): this always uses ``model_sklearn``, even when the
        h2o4gpu backend is the one that was fitted - confirm whether that
        model is fitted in that case.
        """
        # FIXME: Add score to h2o4gpu
        res = self.model_sklearn.score(X, y)
        self.set_attributes()
        return res

    def set_params(self, **params):
        """Forward the parameters to the selected backend's set_params()."""
        res = self.model.set_params(**params)
        self.set_attributes()
        return res

    def transform(self, X):
        """Return the selected backend's transform() for X."""
        res = self.model.transform(X)
        self.set_attributes()
        return res

    def set_attributes(self):
        """Copy the fitted attributes of the backend model onto the wrapper.

        ``inertia_`` is reset to None first so it is always defined, even
        if the backend does not provide it.
        """
        s = _setter(oself=self, e1=NameError, e2=AttributeError)

        s('oself.cluster_centers_ = oself.model.cluster_centers_')
        s('oself.labels_ = oself.model.labels_')
        self.inertia_ = None
        # Attribute name fixed: it was misspelled 'intertia_', so the
        # backend's inertia was never copied.
        s('oself.inertia_ = oself.model.inertia_')

    # TODO use a proper logger in Python classes
    @staticmethod
    def _print_verbose(verbose, level, msg):
        """Print ``msg`` when ``verbose`` is strictly above ``level``."""
        if verbose > level:
            print(msg)
            sys.stdout.flush()