# -*- encoding: utf-8 -*-
# pylint: disable=fixme, line-too-long
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license: Apache License Version 2.0 (see LICENSE for details)
"""
import numpy as np
from ..solvers.utils import _setter
from ..solvers.truncated_svd import TruncatedSVDH2O, TruncatedSVD
class PCAH2O(TruncatedSVDH2O):
    """Principal Component Analysis (PCA)

    Dimensionality reduction using truncated Singular Value Decomposition
    for GPU.

    This implementation uses the Cusolver implementation of the truncated
    SVD. Contrary to SVD, this estimator does center the data before
    computing the singular value decomposition.

    Parameters
    ----------
    n_components: int, Default=2
        Desired dimensionality of output data

    whiten : bool, optional
        When True (False by default) the `components_` vectors are multiplied
        by the square root of (n_samples) and divided by the singular values to
        ensure uncorrelated outputs with unit component-wise variances.
        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometime
        improve the predictive accuracy of the downstream estimators by
        making their data respect some hard-wired assumptions.

    verbose: bool
        Verbose or not

    gpu_id : int, optional, default: 0
        ID of the GPU on which the algorithm should run.
    """

    def __init__(self, n_components=2, whiten=False,
                 verbose=0, gpu_id=0):
        super().__init__(n_components)
        self.whiten = whiten
        self.n_components_ = n_components
        self.mean_ = None  # populated during fit
        self.noise_variance_ = None
        self.algorithm = "cusolver"
        self.verbose = verbose
        self.gpu_id = gpu_id

    # pylint: disable=unused-argument
    def fit(self, X, y=None):
        """Fit PCA on matrix X.

        :param X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.

        :param y : Ignored,
            For ScikitLearn compatibility

        :returns self : self
            object
        """
        self.fit_transform(X)
        return self

    # pylint: disable=unused-argument
    def _check_double(self, data, convert=True):
        """Transform input data into a type which can be passed into C land.

        Ensures ``data`` is a C-contiguous np.float32 or np.float64 array and
        records the chosen precision in ``self.double_precision``.
        """
        if convert and data.dtype != np.float64 and data.dtype != np.float32:
            self._print_verbose(0, "Detected numeric data format which is not "
                                   "supported. Casting to np.float32.")
            # BUG FIX: was np.floa32 (typo), which raised AttributeError
            # instead of casting unsupported dtypes to float32.
            data = np.ascontiguousarray(data, dtype=np.float32)
        if data.dtype == np.float64:
            self._print_verbose(0, "Detected np.float64 data")
            self.double_precision = 1
            data = np.ascontiguousarray(data, dtype=np.float64)
        elif data.dtype == np.float32:
            self._print_verbose(0, "Detected np.float32 data")
            self.double_precision = 0
            data = np.ascontiguousarray(data, dtype=np.float32)
        else:
            raise ValueError(
                "Unsupported data type %s, "
                "should be either np.float32 or np.float64" % data.dtype)
        return data

    # Util to load gpu lib
    def _load_lib(self):
        """Load and return the GPU shared-library wrapper."""
        from ..libs.lib_utils import GPUlib

        gpu_lib = GPUlib().get()
        return gpu_lib
class PCA(TruncatedSVD):
    """PCA Wrapper

    Selects between h2o4gpu.decomposition.PCASklearn
    and h2o4gpu.solvers.pca.PCAH2O

    Parameters
    ----------
    n_components: int, Default=2
        Desired dimensionality of output data

    copy : bool (default True)
        If False, data passed to fit are overwritten and running
        fit(X).transform(X) will not yield the expected results,
        use fit_transform(X) instead.

    whiten : bool, optional
        When True (False by default) the `components_` vectors are multiplied
        by the square root of (n_samples) and divided by the singular values to
        ensure uncorrelated outputs with unit component-wise variances.
        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometime
        improve the predictive accuracy of the downstream estimators by
        making their data respect some hard-wired assumptions.

    svd_solver : string {'auto', 'full', 'arpack', 'randomized'}
        'auto' is selected by a default policy based on `X.shape`
        and `n_components`: if the input data is larger than 500x500 and the
        number of components to extract is lower than 80 percent of the
        smallest dimension of the data, then the more efficient 'randomized'
        method is enabled. Otherwise the exact full SVD is computed and
        optionally truncated afterwards. 'full' runs exact full SVD calling
        the standard LAPACK solver via `scipy.linalg.svd` and select the
        components by postprocessing. 'arpack' runs SVD truncated to
        n_components calling ARPACK solver via `scipy.sparse.linalg.svds`.
        It requires strictly 0 < n_components < columns. 'randomized' runs
        randomized SVD by the method of Halko et al.

    tol : float >= 0, optional (default .0)
        Tolerance for singular values computed by svd_solver == 'arpack'.

    iterated_power : int >= 0, or 'auto', (default 'auto')
        Number of iterations for the power method computed by
        svd_solver == 'randomized'.

    random_state : int, RandomState instance or None, optional (default None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'.

    verbose: bool
        Verbose or not

    backend : string, (Default="auto")
        Which backend to use.
        Options are 'auto', 'sklearn', 'h2o4gpu'.
        Saves as attribute for actual backend used.

    gpu_id : int, optional, default: 0
        ID of the GPU on which the algorithm should run. Only used by
        h2o4gpu backend.
    """

    # pylint: disable=unused-argument
    def __init__(self,
                 n_components=2,
                 copy=True,
                 whiten=False,
                 svd_solver="arpack",
                 tol=0.,
                 iterated_power="auto",
                 random_state=None,
                 verbose=False,
                 backend='auto',
                 gpu_id=0):
        super().__init__(n_components, random_state, tol, verbose, backend,
                         gpu_id)
        self.svd_solver = svd_solver
        self.whiten = whiten

        import os
        # Environment variable overrides the constructor argument.
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend

        # Fall back to Sklearn
        # Can remove if fully implement sklearn functionality
        self.do_sklearn = False
        if backend == 'auto':
            # Any sklearn-only parameter changed from its default forces
            # the sklearn backend, since the GPU solver ignores them.
            params_string = ['svd_solver', 'random_state', 'tol',
                             'iterated_power']
            params = [svd_solver, random_state, tol, iterated_power]
            params_default = ['arpack', None, 0., 'auto']
            for name, param, default in zip(params_string, params,
                                            params_default):
                if param != default:
                    self.do_sklearn = True
                    if verbose:
                        print("WARNING:"
                              " The sklearn parameter " + name +
                              " has been changed from default to " +
                              str(param) + ". Will run Sklearn PCA.")
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False

        if self.do_sklearn:
            self.backend = 'sklearn'
        else:
            self.backend = 'h2o4gpu'

        from h2o4gpu.decomposition.pca import PCASklearn
        # Build both backends up front; ``self.model`` points at the one
        # actually used.
        self.model_sklearn = PCASklearn(
            n_components=n_components,
            copy=copy,
            whiten=whiten,
            svd_solver=svd_solver,
            tol=tol,
            iterated_power=iterated_power,
            random_state=random_state)
        self.model_h2o4gpu = PCAH2O(
            n_components=self.n_components,
            whiten=self.whiten,
            verbose=self.verbose,
            gpu_id=self.gpu_id)

        if self.do_sklearn:
            self.model = self.model_sklearn
        else:
            self.model = self.model_h2o4gpu

    def set_attributes(self):
        """Copy fitted attributes from the backend model onto this wrapper.

        Missing attributes on the backend are tolerated via the
        NameError/AttributeError guards inside ``_setter``.
        """
        s = _setter(oself=self, e1=NameError, e2=AttributeError)

        s('oself.components_ = oself.model.components_')
        s('oself.explained_variance_= oself.model.explained_variance_')
        s('oself.explained_variance_ratio_ = '
          'oself.model.explained_variance_ratio_')
        s('oself.singular_values_ = oself.model.singular_values_')
        s('oself.mean_ = oself.model.mean_')
        s('oself.n_components_ = oself.model.n_components_')
        s('oself.noise_variance_ = oself.model.noise_variance_')