Source code for h2o4gpu.solvers.pca

# -*- encoding: utf-8 -*-
# pylint: disable=fixme, line-too-long
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license:   Apache License Version 2.0 (see LICENSE for details)
"""
import numpy as np
from ..solvers.utils import _setter
from ..solvers.truncated_svd import TruncatedSVDH2O, TruncatedSVD


class PCAH2O(TruncatedSVDH2O):
    """Principal Component Analysis (PCA)

    Dimensionality reduction using truncated Singular Value Decomposition
    on the GPU. This implementation uses the cuSOLVER implementation of
    the truncated SVD. Contrary to plain SVD, this estimator centers the
    data before computing the singular value decomposition.

    Parameters
    ----------
    n_components : int, default=2
        Desired dimensionality of output data.

    whiten : bool, optional
        When True (False by default) the `components_` vectors are
        multiplied by the square root of n_samples and divided by the
        singular values to ensure uncorrelated outputs with unit
        component-wise variances.

        Whitening removes some information from the transformed signal
        (the relative variance scales of the components) but can
        sometimes improve the predictive accuracy of downstream
        estimators by making their data respect some hard-wired
        assumptions.

    verbose : bool
        Verbose or not.

    gpu_id : int, optional, default: 0
        ID of the GPU on which the algorithm should run.
    """

    def __init__(self, n_components=2, whiten=False, verbose=0, gpu_id=0):
        super().__init__(n_components)
        self.whiten = whiten
        self.n_components_ = n_components
        self.mean_ = None
        self.noise_variance_ = None
        self.algorithm = "cusolver"
        self.verbose = verbose
        self.gpu_id = gpu_id

    # pylint: disable=unused-argument
    def fit(self, X, y=None):
        """Fit PCA on matrix X.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.

        :param y: Ignored, for scikit-learn compatibility.

        :returns self: self object
        """
        self.fit_transform(X)
        return self
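
    # Illustrative usage sketch (not part of the library source): `fit`
    # delegates to `fit_transform`, so the fitted attributes are populated
    # after a single call, assuming a CUDA-capable GPU and that the
    # trailing-underscore properties are inherited from TruncatedSVDH2O:
    #
    #   X = np.random.rand(1000, 20).astype(np.float32)
    #   pca = PCAH2O(n_components=3).fit(X)
    #   pca.explained_variance_ratio_  # per-component variance fractions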

    # pylint: disable=unused-argument
    def fit_transform(self, X, y=None):
        """Fit PCA on matrix X and perform dimensionality reduction on X.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.

        :param y: Ignored, for scikit-learn compatibility.

        :returns X_new: array, shape (n_samples, n_components)
            Reduced version of X. This will always be a dense array.
        """
        # SWIG takes care of mapping to Fortran order
        X = self._check_double(X)
        matrix_type = np.float64 if self.double_precision == 1 else np.float32
        X = np.asfortranarray(X, dtype=matrix_type)

        Q = np.empty((self.n_components, X.shape[1]), dtype=matrix_type)
        U = np.empty((X.shape[0], self.n_components), dtype=matrix_type)
        w = np.empty(self.n_components, dtype=matrix_type)
        explained_variance = np.empty(self.n_components, dtype=matrix_type)
        explained_variance_ratio = np.empty(self.n_components,
                                            dtype=matrix_type)
        mean = np.empty(X.shape[1], dtype=matrix_type)
        X_transformed = np.empty((U.shape[0], self.n_components),
                                 dtype=matrix_type)

        lib = self._load_lib()
        param = lib.params_pca()
        param.X_m = X.shape[0]
        param.X_n = X.shape[1]
        param.k = self.n_components
        param.algorithm = self.algorithm
        param.n_iter = self.n_iter
        param.random_state = self.random_state
        param.tol = self.tol
        param.verbose = 1 if self.verbose else 0
        param.gpu_id = self.gpu_id
        param.whiten = self.whiten

        if self.double_precision == 1:
            lib.pca_double(X, Q, w, U, X_transformed, explained_variance,
                           explained_variance_ratio, mean, param)
        else:
            lib.pca_float(X, Q, w, U, X_transformed, explained_variance,
                          explained_variance_ratio, mean, param)

        self._w = w
        self._U = U
        self._Q = Q
        self._X = X

        # Recompute on the host to match scikit-learn.
        # TODO: port to CUDA?
        n_samples, n_features = X.shape
        self.explained_variance = self.singular_values_ ** 2 / (n_samples - 1)
        total_var = np.var(X, ddof=1, axis=0)
        self.explained_variance_ratio = \
            self.explained_variance / total_var.sum()
        self.mean_ = mean

        # TODO: the noise_variance_ calculation can be done inside
        # lib.pca if it becomes a bottleneck
        if self.n_components_ < min(n_features, n_samples):
            self.noise_variance_ = \
                (total_var.sum() - self.explained_variance_.sum())
            self.noise_variance_ /= \
                min(n_features, n_samples) - self.n_components
        else:
            self.noise_variance_ = 0.

        return X_transformed
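
    # A minimal NumPy sketch (illustrative, not library code) of the
    # scikit-learn-matching math above: for mean-centered X with the top
    # k singular values w, the explained variance and its ratio are
    #
    #   Xc = X - X.mean(axis=0)
    #   w = np.linalg.svd(Xc, full_matrices=False)[1][:k]
    #   explained_variance = w ** 2 / (X.shape[0] - 1)
    #   ratio = explained_variance / np.var(X, ddof=1, axis=0).sum()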

    def _check_double(self, data, convert=True):
        """Transform input data into a type which can be passed into C land."""
        if convert and data.dtype != np.float64 and data.dtype != np.float32:
            self._print_verbose(0, "Detected numeric data format which is "
                                   "not supported. Casting to np.float32.")
            data = np.ascontiguousarray(data, dtype=np.float32)
        if data.dtype == np.float64:
            self._print_verbose(0, "Detected np.float64 data")
            self.double_precision = 1
            data = np.ascontiguousarray(data, dtype=np.float64)
        elif data.dtype == np.float32:
            self._print_verbose(0, "Detected np.float32 data")
            self.double_precision = 0
            data = np.ascontiguousarray(data, dtype=np.float32)
        else:
            raise ValueError("Unsupported data type %s, should be either "
                             "np.float32 or np.float64" % data.dtype)
        return data

    # Util to load the GPU lib
    def _load_lib(self):
        from ..libs.lib_utils import GPUlib

        gpu_lib = GPUlib().get()
        return gpu_lib
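
# Dtype-handling sketch (illustrative, not library code): `_check_double`
# keeps float32 and float64 inputs as-is and casts any other numeric dtype
# to float32, recording the choice in `double_precision`:
#
#   pca = PCAH2O()
#   data = pca._check_double(np.arange(6).reshape(3, 2))  # int64 -> float32
#   pca.double_precision  # 0 after float32 input, 1 after float64 input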

class PCA(TruncatedSVD):
    """PCA Wrapper

    Selects between h2o4gpu.decomposition.PCASklearn
    and h2o4gpu.solvers.pca.PCAH2O.

    Parameters
    ----------
    n_components : int, default=2
        Desired dimensionality of output data.

    copy : bool (default True)
        If False, data passed to fit are overwritten and running
        fit(X).transform(X) will not yield the expected results;
        use fit_transform(X) instead.

    whiten : bool, optional
        When True (False by default) the `components_` vectors are
        multiplied by the square root of n_samples and divided by the
        singular values to ensure uncorrelated outputs with unit
        component-wise variances.

        Whitening removes some information from the transformed signal
        (the relative variance scales of the components) but can
        sometimes improve the predictive accuracy of downstream
        estimators by making their data respect some hard-wired
        assumptions.

    svd_solver : string {'auto', 'full', 'arpack', 'randomized'}
        'auto' is selected by a default policy based on `X.shape` and
        `n_components`: if the input data is larger than 500x500 and the
        number of components to extract is lower than 80 percent of the
        smallest dimension of the data, then the more efficient
        'randomized' method is enabled. Otherwise the exact full SVD is
        computed and optionally truncated afterwards.

        'full' runs exact full SVD calling the standard LAPACK solver
        via `scipy.linalg.svd` and selects the components by
        postprocessing.

        'arpack' runs SVD truncated to n_components calling the ARPACK
        solver via `scipy.sparse.linalg.svds`. It requires strictly
        0 < n_components < columns.

        'randomized' runs randomized SVD by the method of Halko et al.

    tol : float >= 0, optional (default .0)
        Tolerance for singular values computed by svd_solver == 'arpack'.

    iterated_power : int >= 0, or 'auto', (default 'auto')
        Number of iterations for the power method computed by
        svd_solver == 'randomized'.

    random_state : int, RandomState instance or None, optional (default None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the random number generator is the
        RandomState instance used by `np.random`. Used when
        ``svd_solver`` == 'arpack' or 'randomized'.

    verbose : bool
        Verbose or not.

    backend : string, (Default="auto")
        Which backend to use. Options are 'auto', 'sklearn', 'h2o4gpu'.
        Saved as an attribute reflecting the backend actually used.

    gpu_id : int, optional, default: 0
        ID of the GPU on which the algorithm should run.
        Only used by the h2o4gpu backend.
    """

    # pylint: disable=unused-argument
    def __init__(self,
                 n_components=2,
                 copy=True,
                 whiten=False,
                 svd_solver="arpack",
                 tol=0.,
                 iterated_power="auto",
                 random_state=None,
                 verbose=False,
                 backend='auto',
                 gpu_id=0):
        super().__init__(n_components, random_state, tol, verbose, backend,
                         gpu_id)
        self.svd_solver = svd_solver
        self.whiten = whiten

        import os
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend

        # Fall back to sklearn
        # Can remove this once the sklearn functionality is fully implemented
        self.do_sklearn = False
        if backend == 'auto':
            params_string = [
                'svd_solver', 'random_state', 'tol', 'iterated_power'
            ]
            params = [svd_solver, random_state, tol, iterated_power]
            params_default = ['arpack', None, 0., 'auto']

            i = 0
            for param in params:
                if param != params_default[i]:
                    self.do_sklearn = True
                    if verbose:
                        print("WARNING: The sklearn parameter " +
                              params_string[i] +
                              " has been changed from default to " +
                              str(param) + ". Will run Sklearn PCA.")
                i = i + 1
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False

        if self.do_sklearn:
            self.backend = 'sklearn'
        else:
            self.backend = 'h2o4gpu'

        from h2o4gpu.decomposition.pca import PCASklearn
        self.model_sklearn = PCASklearn(
            n_components=n_components,
            copy=copy,
            whiten=whiten,
            svd_solver=svd_solver,
            tol=tol,
            iterated_power=iterated_power,
            random_state=random_state)
        self.model_h2o4gpu = PCAH2O(
            n_components=self.n_components,
            whiten=self.whiten,
            verbose=self.verbose,
            gpu_id=self.gpu_id)

        if self.do_sklearn:
            self.model = self.model_sklearn
        else:
            self.model = self.model_h2o4gpu
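
    # Backend-selection sketch (illustrative, not library code): under
    # backend='auto', any sklearn parameter changed from its default
    # forces the sklearn backend, while backend='h2o4gpu' always picks
    # the GPU solver (unless the H2O4GPU_BACKEND environment variable
    # overrides the choice):
    #
    #   PCA().backend                    # 'h2o4gpu' (all defaults)
    #   PCA(svd_solver='full').backend   # 'sklearn'
    #   PCA(backend='sklearn').backend   # 'sklearn'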

    def set_attributes(self):
        """Set attributes on this wrapper from the underlying model."""
        s = _setter(oself=self, e1=NameError, e2=AttributeError)
        s('oself.components_ = oself.model.components_')
        s('oself.explained_variance_ = oself.model.explained_variance_')
        s('oself.explained_variance_ratio_ = '
          'oself.model.explained_variance_ratio_')
        s('oself.singular_values_ = oself.model.singular_values_')
        s('oself.mean_ = oself.model.mean_')
        s('oself.n_components_ = oself.model.n_components_')
        s('oself.noise_variance_ = oself.model.noise_variance_')
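
# End-to-end usage sketch (illustrative; assumes h2o4gpu is installed and
# that the inherited TruncatedSVD wrapper delegates `fit_transform` to the
# selected backend model):
#
#   import numpy as np
#   from h2o4gpu.solvers.pca import PCA
#
#   X = np.random.rand(100, 10).astype(np.float32)
#   pca = PCA(n_components=2)
#   X_new = pca.fit_transform(X)   # shape (100, 2)
#   pca.set_attributes()           # copy fitted attributes onto the wrapper
#   print(pca.explained_variance_ratio_)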