# -*- encoding: utf-8 -*-
# pylint: disable=fixme, line-too-long
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license: Apache License Version 2.0 (see LICENSE for details)
"""
# pylint: disable=unused-import
from h2o4gpu.solvers import elastic_net
from h2o4gpu.linear_model import ridge as sk
from ..solvers.utils import _setter
class Ridge:
"""H2O Ridge Regression Solver
Parameters
----------
alpha : {float, array-like}, shape (n_targets)
Regularization strength; must be a positive float. Regularization
improves the conditioning of the problem and reduces the variance of
the estimates. Larger values specify stronger regularization.
Alpha corresponds to ``C^-1`` in other linear models such as
LogisticRegression or LinearSVC. If an array is passed, penalties are
assumed to be specific to the targets. Hence they must correspond in
number.
fit_intercept : boolean
Whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean, optional, default False
This parameter is ignored when ``fit_intercept`` is set to False.
If True, the regressors X will be normalized before regression by
subtracting the mean and dividing by the l2-norm.
If you wish to standardize, please use
:class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
on an estimator with ``normalize=False``.
copy_X : boolean, optional, default True
If True, X will be copied; else, it may be overwritten.
max_iter : int, optional
Maximum number of iterations for conjugate gradient solver.
For 'sparse_cg' and 'lsqr' solvers, the default value is determined
by scipy.sparse.linalg. For 'sag' solver, the default value is 1000.
tol : float
Precision of the solution.
solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}
Solver to use in the computational routines:
- 'auto' chooses the solver automatically based on the type of data.
- 'svd' uses a Singular Value Decomposition of X to compute the Ridge
coefficients. More stable for singular matrices than
'cholesky'.
- 'cholesky' uses the standard scipy.linalg.solve function to
obtain a closed-form solution.
- 'sparse_cg' uses the conjugate gradient solver as found in
scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
more appropriate than 'cholesky' for large-scale data
          (`tol` and `max_iter` can be set).
- 'lsqr' uses the dedicated regularized least-squares routine
scipy.sparse.linalg.lsqr. It is the fastest but may not be available
in old scipy versions. It also uses an iterative procedure.
- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
its improved, unbiased version named SAGA. Both methods also use an
iterative procedure, and are often faster than other solvers when
          both n_samples and n_features are large. Note that fast
          convergence of 'sag' and 'saga' is only guaranteed on features
          with approximately the same scale. You can preprocess the data
          with a scaler from sklearn.preprocessing.
        The last five solvers support both dense and sparse data. However,
        only 'sag' and 'saga' support sparse input when ``fit_intercept``
        is True.
.. versionadded:: 0.17
Stochastic Average Gradient descent solver.
.. versionadded:: 0.19
SAGA solver.
random_state : int, RandomState instance or None, optional, default None
The seed of the pseudo random number generator to use when shuffling
the data. If int, random_state is the seed used by the random number
generator; If RandomState instance, random_state is the random number
generator; If None, the random number generator is the RandomState
instance used by `np.random`. Used when ``solver`` == 'sag'.
.. versionadded:: 0.17
*random_state* to support Stochastic Average Gradient.
    n_gpus : int
        Number of GPUs to use in the Ridge solver. Default is -1 (use all
        available GPUs).
glm_stop_early : bool, (Default=True)
Stop early when there is no more relative
improvement in the primary and dual residuals for ADMM.
glm_stop_early_error_fraction : float, (Default=1.0)
Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at
least this much).
verbose : int, (Default=0)
Print verbose information to the console if set to > 0.
backend : string, (Default="auto")
Which backend to use.
Options are 'auto', 'sklearn', 'h2o4gpu'.
Saves as attribute for actual backend used.
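
    Examples
    --------
    A minimal usage sketch: the synthetic data and the import path below
    are illustrative assumptions, not guarantees made by this module.

    >>> import numpy as np
    >>> from h2o4gpu.solvers.ridge import Ridge  # doctest: +SKIP
    >>> X = np.random.rand(100, 10)
    >>> y = X @ np.arange(10.0)
    >>> model = Ridge(alpha=1.0)                 # doctest: +SKIP
    >>> _ = model.fit(X, y)                      # doctest: +SKIP
    >>> preds = model.predict(X)                 # doctest: +SKIP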
"""
def __init__(
self,
alpha=1.0, # h2o4gpu
fit_intercept=True, # h2o4gpu
normalize=False,
copy_X=True,
max_iter=5000, # h2o4gpu
tol=1e-2, # h2o4gpu
solver='auto',
random_state=None,
n_gpus=-1, # h2o4gpu
glm_stop_early=True, # h2o4gpu
glm_stop_early_error_fraction=1.0, # h2o4gpu
verbose=False,
backend='auto',
**kwargs): # h2o4gpu
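        # The H2O4GPU_BACKEND environment variable, when set, overrides
        # the backend argument passed in by the caller.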
import os
_backend = os.environ.get('H2O4GPU_BACKEND', None)
if _backend is not None:
backend = _backend
        self.do_daal = False
        # Fall back to Sklearn
        # Can remove once sklearn functionality is fully implemented
        self.do_sklearn = False
        if backend == 'auto':
            # Fall back to sklearn if any sklearn-only parameter was
            # changed from its default.
            params_string = ['normalize', 'solver']
            params = [normalize, solver]
            params_default = [False, 'auto']
            for name, param, default in zip(params_string, params,
                                            params_default):
                if param != default:
                    self.do_sklearn = True
                    if verbose:
                        print("WARNING:"
                              " The sklearn parameter " + name +
                              " has been changed from default to " +
                              str(param) +
                              ". Will run Sklearn Ridge Regression.")
            self.backend = 'sklearn' if self.do_sklearn else 'h2o4gpu'
elif backend == 'sklearn':
self.do_sklearn = True
self.backend = 'sklearn'
elif backend == 'h2o4gpu':
self.do_sklearn = False
self.backend = 'h2o4gpu'
elif backend == 'daal':
from h2o4gpu import DAAL_SUPPORTED
if DAAL_SUPPORTED:
from h2o4gpu.solvers.daal_solver.regression \
import RidgeRegression as DRR
self.do_daal = True
self.backend = 'daal'
self.model_daal = DRR(alpha=alpha,
fit_intercept=fit_intercept,
normalize=normalize,
**kwargs)
            else:
                import platform
                print("WARNING: "
                      "DAAL is supported only on x86_64; detected "
                      "architecture {}. The sklearn model will be "
                      "used instead.".format(platform.machine()))
                self.do_sklearn = True
                self.backend = 'sklearn'
self.model_sklearn = sk.RidgeSklearn(
alpha=alpha,
fit_intercept=fit_intercept,
normalize=normalize,
copy_X=copy_X,
max_iter=max_iter,
tol=tol,
solver=solver,
random_state=random_state)
# Equivalent Ridge parameters for h2o4gpu
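        # Ridge is elastic net with an L2-only penalty: alpha_min ==
        # alpha_max == 0.0 pins the elastic-net mixing parameter to 0
        # (pure L2), while sklearn's ``alpha`` becomes the single GLM
        # regularization strength (n_lambdas == 1, lambda_max == alpha).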
n_threads = None
n_alphas = 1
n_lambdas = 1
n_folds = 1
lambda_max = alpha
lambda_min_ratio = 1.0
lambda_stop_early = False
store_full_path = 1
alphas = None
lambdas = None
alpha_min = 0.0
alpha_max = 0.0
self.model_h2o4gpu = elastic_net.ElasticNetH2O(
n_threads=n_threads,
n_gpus=n_gpus,
fit_intercept=fit_intercept,
lambda_min_ratio=lambda_min_ratio,
n_lambdas=n_lambdas,
n_folds=n_folds,
n_alphas=n_alphas,
tol=tol,
lambda_stop_early=lambda_stop_early,
glm_stop_early=glm_stop_early,
glm_stop_early_error_fraction=glm_stop_early_error_fraction,
max_iter=max_iter,
verbose=verbose,
store_full_path=store_full_path,
lambda_max=lambda_max,
alpha_max=alpha_max,
alpha_min=alpha_min,
alphas=alphas,
lambdas=lambdas,
order=None)
if self.do_sklearn:
if verbose:
print("Running sklearn Ridge Regression")
self.model = self.model_sklearn
elif self.do_daal:
if verbose:
print("Running PyDAAL Ridge Regression")
self.model = self.model_daal
else:
if verbose:
print("Running h2o4gpu Ridge Regression")
self.model = self.model_h2o4gpu
self.verbose = verbose
    def fit(self, X, y=None, sample_weight=None):
if self.do_sklearn:
res = self.model.fit(X, y, sample_weight)
self.set_attributes()
return res
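        # sample_weight is not forwarded to the h2o4gpu/daal backends;
        # only the sklearn path above uses it.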
res = self.model.fit(X, y)
self.set_attributes()
return res
    def get_params(self):
return self.model.get_params()
    def predict(self, X):
res = self.model.predict(X)
self.set_attributes()
return res
    def score(self, X, y, sample_weight=None):
# TODO add for h2o4gpu
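        # There is no native h2o4gpu score(); fall back to the sklearn
        # model's R^2 score, refitting it when it was not the model used
        # for training.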
if self.verbose:
print("WARNING: score() is using sklearn")
if not self.do_sklearn:
self.model_sklearn.fit(X, y) # Need to re-fit
res = self.model_sklearn.score(X, y, sample_weight)
return res
    def set_params(self, **params):
return self.model.set_params(**params)
    def set_attributes(self):
""" set attributes for Ridge
"""
s = _setter(oself=self, e1=NameError, e2=AttributeError)
s('oself.coef_ = oself.model.coef_')
s('oself.intercept_ = oself.model.intercept_')
s('oself.n_iter_ = oself.model.n_iter_')
self.time_prepare = None
s('oself.time_prepare = oself.model.time_prepare')
self.time_upload_data = None
s('oself.time_upload_data = oself.model.time_upload_data')
self.time_fitonly = None
s('oself.time_fitonly = oself.model.time_fitonly')