# -*- encoding: utf-8 -*-
# pylint: disable=fixme, line-too-long
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license: Apache License Version 2.0 (see LICENSE for details)
"""
import sys
import time
import warnings
from ctypes import c_float, c_double, cast, POINTER
import numpy as np
import pandas as pd
from tabulate import tabulate
from h2o4gpu.linear_model import coordinate_descent as sk
from ..solvers.utils import _setter
from ..libs.lib_utils import get_lib
from ..solvers.utils import prepare_and_upload_data, free_sols
class ElasticNetH2O:
"""H2O Elastic Net Solver for GPUs
Parameters
----------
n_threads : int, (Default=None)
Number of threads to use on the GPUs.
Each thread is an independent model builder.
gpu_id : int, optional, (default=0)
ID of the GPU on which the algorithm should run.
n_gpus : int, (Default=-1)
Number of GPUs to use in the GLM solver.
fit_intercept : bool, (default=True)
Include constant term in the model.
lambda_min_ratio: float, (Default=1E-7).
Minimum lambda ratio to maximum lambda, used
in lambda search.
n_lambdas : int, (Default=100)
Number of lambdas to be used in a search.
n_folds : int, (Default=5)
Number of cross validation folds.
n_alphas : int, (Default=5)
Number of alphas to be used in a search.
tol : float, (Default=1E-2)
Relative tolerance.
tol_seek_factor : float, (Default=1E-1)
Factor of tolerance to seek
once below null model accuracy. Default is 1E-1, so seeks tolerance
of 1E-3 once below null model accuracy for tol=1E-2.
lambda_stop_early : bool, (Default=True)
Stop early when there is no more relative
improvement on train or validation.
glm_stop_early : bool, (Default=True)
Stop early when there is no more relative
improvement in the primary and dual residuals for ADMM.
glm_stop_early_error_fraction : float, (Default=1.0)
Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at
least this much).
max_iter : int, (Default=5000)
Maximum number of iterations.
verbose : int, (Default=0)
Print verbose information to the console if set to > 0.
family : string, (Default="elasticnet")
"logistic" for classification with logistic regression.
Defaults to "elasticnet" for regression.
Must be "logistic" or "elasticnet".
store_full_path: int, (Default=0)
Whether to store the full solution for all alphas
and lambdas. If 1, predict() will compute both the best
and the full-path predictions.
lambda_max : float, (Default=None)
Maximum lambda value to use.
Default is None, in which case the standard maximum is computed internally.
alpha_max : float, (Default=1.0)
Maximum alpha.
alpha_min : float, (Default=0.0)
Minimum alpha.
alphas: list, tuple, array, or numpy 1D array of alphas (Default=None)
overrides n_alphas, alpha_min, and alpha_max.
lambdas: list, tuple, array, or numpy 1D array of lambdas (Default=None)
overrides n_lambdas, lambda_max, and lambda_min_ratio.
double_precision: int, (Default=None)
Internally set unless using _ptr methods. Value can either be
0 (float32) or 1 (float64).
order : string, (Default=None)
Order of data. Default is None, and internally
determined (unless using _ptr methods) whether
row 'r' or column 'c' major order.
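Examples
--------
A minimal usage sketch (assumes a working GPU-enabled h2o4gpu build;
the synthetic data below is illustrative only)::

    import numpy as np
    from h2o4gpu.solvers.elastic_net import ElasticNetH2O

    X = np.random.rand(1000, 10).astype(np.float32)
    y = X @ np.arange(10, dtype=np.float32)

    model = ElasticNetH2O(n_alphas=5, n_lambdas=100)
    model.fit(X, y)
    print(model.X_best.shape)  # best coefficients, one row per alpha
    model.summary()            # error per alpha (train, CV, valid)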
"""
def __init__(self,
n_threads=None,
gpu_id=0,
n_gpus=-1,
fit_intercept=True,
lambda_min_ratio=1E-7,
n_lambdas=100,
n_folds=5,
n_alphas=5,
tol=1E-2,
tol_seek_factor=1E-1,
lambda_stop_early=True,
glm_stop_early=True,
glm_stop_early_error_fraction=1.0,
max_iter=5000,
verbose=0,
family='elasticnet',
store_full_path=0,
lambda_max=None,
alpha_max=1.0,
alpha_min=0.0,
alphas=None,
lambdas=None,
double_precision=None,
order=None):
assert family in ['logistic',
'elasticnet'], \
"family should be 'logistic' or 'elasticnet' but got " + family
self.double_precision = double_precision
self.ord = order
self.dtype = None
##############################
# overrides of input parameters
# override these if pass alphas or lambdas
if alphas is not None:
alphas = np.ascontiguousarray(np.asarray(alphas))
n_alphas = np.shape(alphas)[0]
if lambdas is not None:
lambdas = np.ascontiguousarray(np.asarray(lambdas))
n_lambdas = np.shape(lambdas)[0]
##############################
# self assignments
self.n = 0
self.m_train = 0
self.m_valid = 0
self.source_dev = 0 # assume Dev=0 is source of data for upload_data
self.source_me = 0 # assume thread=0 is source of data for upload_data
self.fit_intercept = 1 if fit_intercept else 0
self.lambda_min_ratio = lambda_min_ratio
self.n_lambdas = n_lambdas
self.n_folds = n_folds
self.n_alphas = n_alphas
self.uploaded_data = 0
self.did_fit_ptr = 0
self.did_predict = 0
self.tol = tol
self.tol_seek_factor = tol_seek_factor
self.lambda_stop_early = 1 if lambda_stop_early else 0
self.glm_stop_early = 1 if glm_stop_early else 0
self.glm_stop_early_error_fraction = glm_stop_early_error_fraction
self.max_iter = max_iter
self.verbose = verbose
self._family_str = family # Hold string value for family
self._family = family.split()[0][0]
self.store_full_path = store_full_path
if lambda_max is None:
self.lambda_max = -1.0 # to trigger C code to compute
else:
self.lambda_max = lambda_max
self.alpha_min = alpha_min # as default
self.alpha_max = alpha_max
self.alphas_list = alphas
self.lambdas_list = lambdas
# default None for _full stuff
self.error_vs_alpha_lambda = None
self.intercept_ = None
self._tols2 = None
self._lambdas2 = None
self._alphas2 = None
self.error_vs_alpha = None
self.valid_pred_vs_alphapure = None
self.x_vs_alphapure = None
self.x_vs_alpha_lambdanew = None
self.x_vs_alpha_lambdapure = None
self.valid_pred_vs_alpha_lambdapure = None
self._lambdas = None
self._alphas = None
self._tols = None
self.intercept2_ = None
# Experimental features
# TODO _shared_a and _standardize do not work currently.
# TODO Always need to set to 0.
self._shared_a = 0
self._standardize = 0
from ..util.gpu import device_count
(self.n_gpus, devices) = device_count(n_gpus)
gpu_id = gpu_id % devices if devices != 0 else 0
self._gpu_id = gpu_id
self._total_n_gpus = devices
if n_threads is None:
# No particular thread count is required; this is just a good default.
# Two threads can be slightly better on CPU,
# but 1 thread per GPU is optimal.
n_threads = 1 if self.n_gpus == 0 else self.n_gpus
self.n_threads = n_threads
self.lib = get_lib(self.n_gpus, devices)
self.x_vs_alpha_lambda = None
self.x_vs_alpha = None
self.valid_pred_vs_alpha_lambda = None
self.valid_pred_vs_alpha = None
self.count_full = None
self.count_short = None
self.count_more = None
# TODO Add typechecking
def fit(self,
train_x=None,
train_y=None,
valid_x=None,
valid_y=None,
sample_weight=None,
free_input_data=1):
"""Train a GLM
:param ndarray train_x : Training features array
:param ndarray train_y : Training response array
:param ndarray valid_x : Validation features
:param ndarray valid_y : Validation response
:param ndarray sample_weight : Observation weights
:param int free_input_data : Indicate if input data should be freed
at the end of fit(). Default is 1.
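A minimal sketch with a validation split (the array names here are
illustrative, not part of this module)::

    model = ElasticNetH2O()
    model.fit(train_x=X_train, train_y=y_train,
              valid_x=X_valid, valid_y=y_valid)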
"""
source_dev = 0
if not (train_x is None and train_y is None and valid_x is None and
valid_y is None and sample_weight is None):
if self.family == "logistic" and train_y is not None:
self.classes_ = np.unique(train_y)
train_y = np.searchsorted(self.classes_, train_y)
if valid_y is not None:
valid_y = np.searchsorted(self.classes_, valid_y)
self.prepare_and_upload_data = prepare_and_upload_data(
self,
train_x=train_x,
train_y=train_y,
valid_x=valid_x,
valid_y=valid_y,
sample_weight=sample_weight,
source_dev=source_dev)
else:
# if all None, just assume fitting with new parameters
# and all else uses self.
pass
self.fit_ptr(
self.m_train,
self.n,
self.m_valid,
self.double_precision,
self.ord,
self.a,
self.b,
self.c,
self.d,
self.e,
free_input_data=free_input_data,
source_dev=source_dev)
return self
# TODO Add typechecking
def predict(self,
valid_x=None,
valid_y=None,
sample_weight=None,
free_input_data=1):
"""Predict on a fitted GLM and get back class predictions for binomial models
for classification and predicted values for regression.
:param ndarray valid_x : Validation features
:param ndarray valid_y : Validation response
:param ndarray sample_weight : Observation weights
:param int free_input_data : Indicate if input data should be freed at
the end of predict(). Default is 1.
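For family="logistic", the probabilities from predict_proba() are
thresholded at 0.5 and mapped back through ``classes_``; a sketch with
illustrative names::

    model = ElasticNetH2O(family='logistic')
    model.fit(X_train, y_train)
    labels = model.predict(X_valid)  # entries drawn from model.classes_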
"""
res = self.predict_proba(
valid_x, valid_y, sample_weight, free_input_data)
if self.family == "logistic":
return self.classes_[(res >= 0.5).astype(np.int8)]
return res
def predict_proba(self,
valid_x=None,
valid_y=None,
sample_weight=None,
free_input_data=1):
"""Predict on a fitted GLM and get back uncalibrated probabilities for classification models
:param ndarray valid_x : Validation features
:param ndarray valid_y : Validation response
:param ndarray sample_weight : Observation weights
:param int free_input_data : Indicate if input data should be freed at
the end of predict_proba(). Default is 1.
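With store_full_path=0 the returned array holds one row of predictions
per alpha, i.e. shape ``(n_alphas, m_valid)``; a sketch with
illustrative names::

    probs = model.predict_proba(X_valid)  # uncalibrated probabilities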
"""
source_dev = 0
if not (valid_x is None and valid_y is None and sample_weight is None):
prepare_and_upload_data(
self,
train_x=None,
train_y=None,
valid_x=valid_x,
valid_y=valid_y,
sample_weight=sample_weight,
source_dev=source_dev)
else:
pass
# save global variable
oldstorefullpath = self.store_full_path
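# When store_full_path == 1 the backend is called twice: the first
# pass fills predictions for the full lambda path, the second (with
# store_full_path temporarily 0) fills the best-model predictions.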
if self.store_full_path == 1:
self.store_full_path = 1
self._fitorpredict_ptr(
source_dev,
self.m_train,
self.n,
self.m_valid,
self.double_precision,
self.ord,
self.a,
self.b,
self.c,
self.d,
self.e,
do_predict=1,
free_input_data=free_input_data)
self.store_full_path = 0
self._fitorpredict_ptr(
source_dev,
self.m_train,
self.n,
self.m_valid,
self.double_precision,
self.ord,
self.a,
self.b,
self.c,
self.d,
self.e,
do_predict=1,
free_input_data=free_input_data)
# restore variable
self.store_full_path = oldstorefullpath
return self.valid_pred_vs_alphapure # something like valid_y
# TODO Add type checking
# source_dev here because generally want to take in any pointer,
# not just from our test code
def fit_ptr(
self,
m_train,
n,
m_valid,
double_precision,
order,
a, # trainX_ptr or train_xptr
b, # trainY_ptr
c, # validX_ptr
d, # validY_ptr or valid_xptr # keep consistent with later uses
e, # weight_ptr
free_input_data=0,
source_dev=0):
"""Train a GLM with pointers to data on the GPU
(if fit_intercept, then you should have added 1's as
last column to m_train)
:param m_train Number of rows in the training set
:param n Number of columns in the training set
:param m_valid Number of rows in the validation set
:param double_precision float32 (0) or float64 (1) precision of fit
No Default.
:param order: Order of data.
Default is None, and assumed set by constructor or upload_data
whether row 'r' or column 'c' major order.
:param a Pointer to training features array
:param b Pointer to training response array
:param c Pointer to validation features
:param d Pointer to validation response
:param e Pointer to weight column
:param int free_input_data : Indicate if input data should be freed at
the end of fit(). Default is 0.
:param source_dev GPU ID of device
"""
time_fit0 = time.time()
self._fitorpredict_ptr(
source_dev,
m_train,
n,
m_valid,
double_precision,
order,
a,
b,
c,
d,
e,
do_predict=0,
free_input_data=free_input_data)
self.time_fitonly = time.time() - time_fit0
# TODO Add type checking
# source_dev here because generally want to take in any pointer,
# not just from our test code
# pylint: disable=unused-argument
def _fitorpredict_ptr(
self,
source_dev,
m_train,
n,
m_valid,
double_precision,
order,
a, # trainX_ptr or train_xptr
b, # trainY_ptr
c, # validX_ptr
d, # validY_ptr or valid_xptr # keep consistent with later uses
e, # weight_ptr
do_predict=0,
free_input_data=0):
"""Train a GLM with pointers to data on the GPU
(if fit_intercept, then you should have added 1's as
last column to m_train)
:param source_dev GPU ID of device
:param m_train Number of rows in the training set
:param n Number of columns in the training set
:param m_valid Number of rows in the validation set
:param double_precision float32 (0) or float64 (1) precision of fit
No Default.
:param order: Order of data. Default is None and set elsewhere
whether row 'r' or column 'c' major order.
:param a Pointer to training features array
:param b Pointer to training response array
:param c Pointer to validation features
:param d Pointer to validation response
:param e Pointer to weight column
:param int do_predict : Indicate if prediction should be done on
validation set after train. Default is 0.
:param int free_input_data : Indicate if input data should be freed at
the end of fit(). Default is 0.
"""
# store some things for later call to predict_ptr()
self.source_dev = source_dev
self.m_train = m_train
self.n = n
self.m_valid = m_valid
self.a = a
self.b = b
self.c = c
self.d = d
self.e = e
# ########## #
# if fitted earlier clear
# otherwise don't clear solution, just use it
if do_predict == 0 and self.did_fit_ptr == 1:
free_sols(self)
# ############## #
self.did_fit_ptr = 1
# ##############
# not calling with self.source_dev because want option to never use
# default but instead input pointers from foreign code's pointers
self.ord = order
if hasattr(self,
'double_precision') and self.double_precision is not None:
which_precision = self.double_precision
else:
which_precision = double_precision
self.double_precision = double_precision
# ############ #
if do_predict == 0:
# initialize if doing fit
self.x_vs_alpha_lambda = None
self.x_vs_alpha = None
self.valid_pred_vs_alpha_lambda = None
self.valid_pred_vs_alpha = None
count_full = 0
count_short = 0
count_more = 0
else:
# restore if predict
count_full = self.count_full
count_short = self.count_short
count_more = self.count_more
# ############## #
#
if which_precision == 1:
c_elastic_net = self.lib.elastic_net_ptr_double
self.dtype = np.float64
c_type = c_double
if self.verbose > 0:
print('double precision fit')
sys.stdout.flush()
else:
c_elastic_net = self.lib.elastic_net_ptr_float
self.dtype = np.float32
c_type = c_float
if self.verbose > 0:
print('single precision fit')
sys.stdout.flush()
# precision-independent commands
if self.alphas_list is not None:
c_alphas = (self.alphas_list.astype(self.dtype, copy=False))
else:
c_alphas = None
if self.lambdas_list is not None:
c_lambdas = (self.lambdas_list.astype(self.dtype, copy=False))
else:
c_lambdas = None
# call elastic net in C backend
_, x_vs_alpha_lambda, x_vs_alpha, \
valid_pred_vs_alpha_lambda, valid_pred_vs_alpha, \
count_full, count_short, count_more = c_elastic_net(
self._family,
do_predict,
source_dev,
1,
self._shared_a,
self.n_threads,
self._gpu_id,
self.n_gpus,
self._total_n_gpus,
self.ord, # 10
m_train,
n,
m_valid,
self.fit_intercept,
self._standardize,
self.lambda_max,
self.lambda_min_ratio,
self.n_lambdas,
self.n_folds,
self.n_alphas, # 20
self.alpha_min,
self.alpha_max,
c_alphas,
c_lambdas,
self.tol,
self.tol_seek_factor,
self.lambda_stop_early,
self.glm_stop_early,
self.glm_stop_early_error_fraction,
self.max_iter, # 30
self.verbose,
int(a.p) if a.p is not None else -1,
int(b.p) if b.p is not None else -1,
int(c.p) if c.p is not None else -1,
int(d.p) if d.p is not None else -1,
int(e.p) if e.p is not None else -1,
self.store_full_path,
self.x_vs_alpha_lambda,
self.x_vs_alpha,
self.valid_pred_vs_alpha_lambda, # 40
self.valid_pred_vs_alpha,
count_full,
count_short,
count_more
)
# Now that we are done using a, b, c, d, e, save or free the input
# data as requested. After freeing, upload_data() must be called again
# before fit_ptr() or predict_ptr(); alternatively, only use fit()
# and predict().
self.x_vs_alpha_lambda = x_vs_alpha_lambda
self.x_vs_alpha = x_vs_alpha
self.valid_pred_vs_alpha_lambda = valid_pred_vs_alpha_lambda
self.valid_pred_vs_alpha = valid_pred_vs_alpha
self.count_full = count_full
self.count_short = count_short
self.count_more = count_more
# ####################################
# PROCESS OUTPUT
# save pointers
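# Layout of each per-(alpha[, lambda]) record of length num_all, as
# unpacked by the slicing below: n coefficients (the last one is the
# intercept when fit_intercept == 1), then num_error = 3 error metrics
# (train, CV, valid; see summary()), then lambda, alpha, and tol.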
if self.store_full_path == 1:
num_all = int(count_full / (self.n_alphas * self.n_lambdas))
else:
num_all = int(count_short / self.n_alphas)
num_all_other = num_all - n
num_error = 3 # should be consistent w/ src/common/elastic_net_ptr.cpp
num_other = num_all_other - num_error
if num_other != 3:
print('num_other=%d but expected 3' % num_other)
print('count_full=%d '
'count_short=%d '
'count_more=%d '
'num_all=%d num_all_other=%d' % (int(count_full),
int(count_short),
int(count_more),
int(num_all),
int(num_all_other)))
sys.stdout.flush()
# TODO raise an exception instead
sys.exit(0)
if self.store_full_path == 1 and do_predict == 0:
# x_vs_alpha_lambda contains solution(and other data)
# for all lambda and alpha
self.x_vs_alpha_lambdanew = \
np.fromiter(cast(self.x_vs_alpha_lambda.__int__(), POINTER(c_type)),
dtype=self.dtype,
count=count_full)
self.x_vs_alpha_lambdanew = \
np.reshape(self.x_vs_alpha_lambdanew, (self.n_lambdas,
self.n_alphas, num_all))
self.x_vs_alpha_lambdapure = \
self.x_vs_alpha_lambdanew[:, :, 0:n]
self.error_vs_alpha_lambda = \
self.x_vs_alpha_lambdanew[:, :, n:n + num_error]
self._lambdas = \
self.x_vs_alpha_lambdanew[:, :,
n + num_error:n + num_error + 1]
self._alphas = self.x_vs_alpha_lambdanew[:, :, n + num_error + 1:
n + num_error + 2]
self._tols = self.x_vs_alpha_lambdanew[:, :, n + num_error + 2:
n + num_error + 3]
if self.fit_intercept == 1:
self.intercept_ = self.x_vs_alpha_lambdapure[:, :, -1]
else:
self.intercept_ = None
if self.store_full_path == 1 and do_predict == 1:
thecount = int(count_full / (n + num_all_other) * m_valid)
self.valid_pred_vs_alpha_lambdanew = \
np.fromiter(cast(self.valid_pred_vs_alpha_lambda.__int__(), POINTER(c_type)),
dtype=self.dtype,
count=thecount)
self.valid_pred_vs_alpha_lambdanew = \
np.reshape(self.valid_pred_vs_alpha_lambdanew,
(self.n_lambdas, self.n_alphas, m_valid))
self.valid_pred_vs_alpha_lambdapure = \
self.valid_pred_vs_alpha_lambdanew[:, :, 0:m_valid]
if do_predict == 0: # store_full_path==0 or 1
# x_vs_alpha contains only best of all lambda for each alpha
self.x_vs_alphanew = np.fromiter(
cast(self.x_vs_alpha.__int__(), POINTER(c_type)),
dtype=self.dtype,
count=count_short)
self.x_vs_alphanew = np.reshape(self.x_vs_alphanew,
(self.n_alphas, num_all))
self.x_vs_alphapure = self.x_vs_alphanew[:, 0:n]
self.error_vs_alpha = self.x_vs_alphanew[:, n:n + num_error]
self._lambdas2 = self.x_vs_alphanew[:, n + num_error:
n + num_error + 1]
self._alphas2 = self.x_vs_alphanew[:, n + num_error + 1:
n + num_error + 2]
self._tols2 = self.x_vs_alphanew[:, n + num_error + 2:
n + num_error + 3]
if self.fit_intercept == 1:
self.intercept2_ = self.x_vs_alphapure[:, -1]
else:
self.intercept2_ = None
# preds exclusively operate for x_vs_alpha or x_vs_alpha_lambda
if self.store_full_path == 0 and do_predict == 1:
thecount = int(count_short / (n + num_all_other) * m_valid)
if self.verbose > 0:
print('thecount=%d '
'count_full=%d '
'count_short=%d '
'n=%d num_all_other=%d '
'm_valid=%d' % (
thecount,
count_full,
count_short,
n,
num_all_other,
m_valid,
))
sys.stdout.flush()
self.valid_pred_vs_alphanew = \
np.fromiter(cast(self.valid_pred_vs_alpha.__int__(), POINTER(c_type)),
dtype=self.dtype,
count=thecount)
self.valid_pred_vs_alphanew = \
np.reshape(self.valid_pred_vs_alphanew, (self.n_alphas,
m_valid))
self.valid_pred_vs_alphapure = \
self.valid_pred_vs_alphanew[:, 0:m_valid]
return self
# pylint: disable=unused-argument
def predict_ptr(self,
valid_xptr=None,
valid_yptr=None,
free_input_data=0,
order=None):
"""Predict on a fitted GLM with with pointers to data on the GPU
:param ndarray valid_xptr : Pointer to validation features
:param ndarray valid_yptr : Pointer to validation response
:param int free_input_data : Indicate if input data should be freed
at the end of predict_ptr(). Default is 0.
:param order: Order of data. Default is None, and internally determined
whether row 'r' or column 'c' major order.
"""
# assume self.ord already set by fit_ptr() at least
# override self if chose to pass this option
oldstorefullpath = self.store_full_path
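# First pass fills the full-path predictions; the second pass, with
# store_full_path forced to 0, fills the best-model predictions.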
if self.store_full_path == 1: # then need to run twice
self.store_full_path = 1
self._fitorpredict_ptr(
self.source_dev,
self.m_train,
self.n,
self.m_valid,
self.double_precision,
self.ord,
self.a,
self.b,
valid_xptr,
valid_yptr,
self.e,
do_predict=1,
free_input_data=free_input_data,
)
self.store_full_path = 0
self._fitorpredict_ptr(
self.source_dev,
self.m_train,
self.n,
self.m_valid,
self.double_precision,
self.ord,
self.a,
self.b,
valid_xptr,
valid_yptr,
self.e,
do_predict=1,
)
# restore global variable
self.store_full_path = oldstorefullpath
return self.valid_pred_vs_alphapure # something like valid_y
# pylint: disable=unused-argument
def fit_predict(self,
train_x,
train_y,
valid_x=None,
valid_y=None,
sample_weight=None,
free_input_data=1,
order=None):
"""Train a model using GLM and predict on validation set
:param ndarray train_x : Training features array
:param ndarray train_y : Training response array
:param ndarray valid_x : Validation features
:param ndarray valid_y : Validation response
:param ndarray sample_weight : Observation weights
:param int free_input_data : Indicate if input data should be freed at
the end of fit(). Default is 1.
:param order: Order of data. Default is None, and internally determined
whether row 'r' or column 'c' major order.
"""
# let fit() check and convert(to numpy)
#train_x, train_y, valid_x, valid_y, weight
self.fit(
train_x,
train_y,
valid_x,
valid_y,
sample_weight,
free_input_data=0,
)
if valid_x is None:
self.prediction = self.predict(
valid_x=train_x,
valid_y=train_y,
sample_weight=sample_weight,
free_input_data=free_input_data)
else:
self.prediction = self.predict(
valid_x=valid_x,
valid_y=valid_y,
sample_weight=sample_weight,
free_input_data=free_input_data)
return self.prediction # something like valid_y
# pylint: disable=unused-argument
def fit_predict_ptr(self,
m_train,
n,
m_valid,
double_precision,
order,
a,
b,
c,
d,
e,
free_input_data=0,
source_dev=0):
"""Train a GLM with pointers to data on the GPU and predict
on validation set that also has a pointer on the GPU
:param m_train Number of rows in the training set
:param n Number of columns in the training set
:param m_valid Number of rows in the validation set
:param double_precision float32 (0) or float64 (1) precision of fit.
Default None.
:param order: Order of data. Default is None, and internally determined
whether row 'r' or column 'c' major order.
:param a Pointer to training features array
:param b Pointer to training response array
:param c Pointer to validation features
:param d Pointer to validation response
:param e Pointer to weight column
:param int free_input_data : Indicate if input data should be freed
at the end of fit(). Default is 0.
:param source_dev GPU ID of device
"""
do_predict = 0 # only fit at first
self._fitorpredict_ptr(
source_dev,
m_train,
n,
m_valid,
double_precision,
self.ord,
a,
b,
c,
d,
e,
do_predict,
free_input_data=0)
if c is None:
self.prediction = self.predict_ptr(
valid_xptr=a, valid_yptr=b, free_input_data=free_input_data)
else:
self.prediction = self.predict_ptr(
valid_xptr=c, valid_yptr=d, free_input_data=free_input_data)
return self.prediction # something like valid_y
def summary(self):
"""
Obtain model summary: the error per alpha across the train,
CV, and validation sets.
Error is logloss for classification and
RMSE (Root Mean Squared Error) for regression.
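A usage sketch (illustrative names)::

    model.fit(X_train, y_train, X_valid, y_valid)
    model.summary()  # table with columns Alphas | Train | CV | Valid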
"""
error_train = pd.DataFrame(self.error_best, index=self.alphas)
if self.family == "logistic":
print("Logloss per alpha value (-1.00 = missing)\n")
else:
print("RMSE per alpha value (-1.00 = missing)\n")
headers = ["Alphas", "Train", "CV", "Valid"]
print(
tabulate(
error_train, headers=headers, tablefmt="pipe", floatfmt=".2f"))
# ################## # Properties and setters of properties
@property
def total_n_gpus(self):
return self._total_n_gpus
@property
def gpu_id(self):
return self._gpu_id
@gpu_id.setter
def gpu_id(self, value):
assert value >= 0, "GPU ID must be non-negative."
self._gpu_id = value
@property
def family(self):
return self._family_str
@family.setter
def family(self, value):
# add check
# `self.family = value` here would recurse into this setter;
# update the backing fields instead (mirrors __init__)
self._family_str = value
self._family = value.split()[0][0]
@property
def shared_a(self):
return self._shared_a
@shared_a.setter
def shared_a(self, value):
# add check
self._shared_a = value  # was self.__shared_a; name-mangling made that a dead store
@property
def standardize(self):
return self._standardize
@standardize.setter
def standardize(self, value):
# add check
self._standardize = value
@property
def coef_(self):
return self.x_vs_alphapure
@property
def X(self):
return self.x_vs_alphapure
@property
def X_full(self):
''' Returns full solution if store_full_path=1
X[which lambda][which alpha]
'''
return self.x_vs_alpha_lambdapure
@property
def X_best(self):
return self.x_vs_alphapure
@property
def validPreds(self):
return self.valid_pred_vs_alphapure
@property
def validPreds_best(self):
return self.valid_pred_vs_alphapure
@property
def intercept_(self):
return self.intercept2_
@intercept_.setter
def intercept_(self, value):
self._intercept_ = value
@property
def intercept_best(self):
return self.intercept2_
@property
def error(self):
return self.error_vs_alpha
@property
def lambdas(self):
return self._lambdas2
@lambdas.setter
def lambdas(self, value):
self._lambdas = value
@property
def alphas(self):
return self._alphas2
@alphas.setter
def alphas(self, value):
self._alphas = value
@property
def tols(self):
return self._tols2
@tols.setter
def tols(self, value):
self._tols = value
@property
def validPreds_full(self):
''' Returns full predictions if store_full_path=1
validPreds[which lambda][which alpha]
'''
return self.valid_pred_vs_alpha_lambdapure
@property
def intercept_full(self):
''' Returns full intercept if store_full_path=1
intercept[which lambda][which alpha]
'''
return self._intercept_  # set by the intercept_ setter from the full path
@property
def error_full(self):
return self.error_vs_alpha_lambda
@property
def lambdas_full(self):
''' Returns full lambda path if store_full_path=1
lambda[which lambda][which alpha]
'''
return self._lambdas
@property
def alphas_full(self):
''' Returns full alpha if store_full_path=1
alpha[which lambda][which alpha]
'''
return self._alphas
@property
def tols_full(self):
''' Returns full tols if store_full_path=1
tols[which lambda][which alpha]
'''
return self._tols
@property
def error_best(self):
return self.error_vs_alpha
@property
def lambdas_best(self):
return self._lambdas2
@property
def alphas_best(self):
return self._alphas2
@property
def tols_best(self):
return self._tols2
# def score(self, X=None, y=None, sample_weight=None):
# if X is not None and y is not None:
# self.prediction = self.predict(
# valid_x=X, valid_y=y, sample_weight=sample_weight)
# #otherwise score makes no sense, need both X and y,
# #else just return existing error
# #TODO : Should return R ^ 2 and redo predict if X and y are passed
# return self.error
@classmethod
def _get_param_names(cls):
"""Get parameter names for the estimator"""
# fetch the constructor or the original constructor before
# deprecation wrapping if any
init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
if init is object.__init__:
# No explicit constructor to introspect
return []
# introspect the constructor arguments to find the model parameters
# to represent
from ..utils.fixes import signature
init_signature = signature(init)
# Consider the constructor parameters excluding 'self'
parameters = [
p for p in init_signature.parameters.values()
if p.name != 'self' and p.kind != p.VAR_KEYWORD
]
for p in parameters:
if p.kind == p.VAR_POSITIONAL:
raise RuntimeError("h2o4gpu GLM estimator should always "
"specify their parameters in the signature"
" of their __init__ (no varargs)."
" %s with constructor %s doesn't "
" follow this convention." %
(cls, init_signature))
# Extract and sort argument names excluding 'self'
return sorted([p.name for p in parameters])
def get_params(self, deep=True):
"""Get parameters for this estimator.
:param bool deep : If True, will return the parameters for this
estimator and contained subobjects that are estimators.
:returns dict params : Parameter names mapped to their values.
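A usage sketch::

    params = model.get_params()
    model.set_params(**params)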
"""
out = dict()
for key in self._get_param_names():
# We need deprecation warnings to always be on in order to
# catch deprecated param values.
# This is set in utils / __init__.py but it gets overwritten
# when running under python3 somehow.
warnings.simplefilter("always", DeprecationWarning)
try:
with warnings.catch_warnings(record=True) as w:
value = getattr(self, key, None)
if w and w[0].category == DeprecationWarning:
# if the parameter is deprecated, don't show it
continue
finally:
warnings.filters.pop(0)
# XXX : should we rather test if instance of estimator ?
if deep and hasattr(value, 'get_params'):
deep_items = value.get_params().items()
out.update((key + '__' + k, val) for k, val in deep_items)
out[key] = value
return out
def set_params(self, **params):
"""Set the parameters of this estimator."""
if not params:
# Simple optimization to gain speed(inspect is slow)
return self
valid_params = self.get_params(deep=True)
from ..externals import six
for key, value in six.iteritems(params):
split = key.split('__', 1)
if len(split) > 1:
# nested objects case
name, sub_name = split
if name not in valid_params:
raise ValueError('Invalid parameter %s for estimator %s. '
'Check the list of available parameters '
'with `estimator.get_params().keys()`.' %
(name, self))
sub_object = valid_params[name]
sub_object.set_params(**{sub_name: value})
else:
# simple objects case
if key not in valid_params:
raise ValueError('Invalid parameter %s for estimator %s. '
'Check the list of available parameters '
'with `estimator.get_params().keys()`.' %
(key, self.__class__.__name__))
setattr(self, key, value)
return self
class ElasticNet:
"""H2O ElasticNet Solver
Selects between h2o4gpu.solvers.elastic_net.ElasticNetH2O
and h2o4gpu.linear_model.coordinate_descent.ElasticNetSklearn
Parameters
----------
alpha : float, optional
Constant that multiplies the penalty terms. Defaults to 1.0.
See the notes for the exact mathematical meaning of this
parameter. ``alpha = 0`` is equivalent to ordinary least squares,
solved by the :class:`LinearRegressionSklearn` object. For numerical
reasons, using ``alpha = 0`` with the ``LassoSklearn`` object is not
advised. Given this, you should use the
:class:`LinearRegressionSklearn` object.
l1_ratio : float
The ElasticNetSklearn mixing parameter, with ``0 <= l1_ratio <= 1``. For
``l1_ratio = 0`` the penalty is an L2 penalty. For ``l1_ratio = 1`` it
is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a
combination of L1 and L2.
fit_intercept : bool
Whether the intercept should be estimated or not. If ``False``, the
data is assumed to be already centered.
normalize : boolean, optional, default False
This parameter is ignored when ``fit_intercept`` is set to False.
If True, the regressors X will be normalized before regression by
subtracting the mean and dividing by the l2-norm.
If you wish to standardize, please use
:class:`h2o4gpu.preprocessing.StandardScaler` before calling ``fit``
on an estimator with ``normalize=False``.
precompute : True | False | array-like
Whether to use a precomputed Gram matrix to speed up
calculations. The Gram matrix can also be passed as argument.
For sparse input this option is always ``True`` to preserve sparsity.
max_iter : int, optional
The maximum number of iterations
copy_X : boolean, optional, default True
If ``True``, X will be copied; else, it may be overwritten.
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
positive : bool, optional
When set to ``True``, forces the coefficients to be positive.
random_state : int, RandomState instance or None, optional, default None
The seed of the pseudo random number generator that selects a random
feature to update. If int, random_state is the seed used by the random
number generator; If RandomState instance, random_state is the random
number generator; If None, the random number generator is the
RandomState instance used by `np.random`. Used when ``selection`` ==
'random'.
selection : str, default 'cyclic'
If set to 'random', a random coefficient is updated every iteration
rather than looping over features sequentially by default. This
(setting to 'random') often leads to significantly faster convergence
especially when tol is higher than 1e-4.
n_gpus : int, (Default=-1)
Number of GPUs to use in the GLM solver.
lambda_stop_early : bool, (Default=True)
Stop early when there is no more relative
improvement on train or validation.
glm_stop_early : bool, (Default=True)
Stop early when there is no more relative
improvement in the primary and dual residuals for ADMM.
glm_stop_early_error_fraction : float, (Default=1.0)
Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at
least this much).
verbose : int, (Default=0)
Print verbose information to the console if set to > 0.
n_threads : int, (Default=None)
Number of threads to use on the GPUs.
Each thread is an independent model builder.
gpu_id : int, optional, (default=0)
ID of the GPU on which the algorithm should run.
lambda_min_ratio: float, (Default=1E-7).
Minimum lambda ratio to maximum lambda, used
in lambda search.
n_lambdas : int, (Default=100)
Number of lambdas to be used in a search.
n_folds : int, (Default=5)
Number of cross validation folds.
n_alphas : int, (Default=5)
Number of alphas to be used in a search.
tol_seek_factor : float, (Default=1E-1)
Factor of tolerance to seek
once below null model accuracy. Default is 1E-1, so seeks tolerance
of 1E-3 once below null model accuracy for tol=1E-2.
family : string, (Default="elasticnet")
"logistic" for classification with logistic regression.
Defaults to "elasticnet" for regression.
Must be "logistic" or "elasticnet".
store_full_path: int, (Default=0)
Whether to store the full solution for all alphas
and lambdas. If 1, predict() will compute both the best
and the full-path predictions.
lambda_max : float, (Default=None)
Maximum lambda value to use.
Default is None, in which case the standard maximum is computed internally.
alpha_max : float, (Default=1.0)
Maximum alpha.
alpha_min : float, (Default=0.0)
Minimum alpha.
alphas: list, tuple, array, or numpy 1D array of alphas (Default=None)
overrides n_alphas, alpha_min, and alpha_max.
lambdas: list, tuple, array, or numpy 1D array of lambdas (Default=None)
overrides n_lambdas, lambda_max, and lambda_min_ratio.
double_precision: int, (Default=None)
Internally set unless using _ptr methods. Value can either be
0 (float32) or 1 (float64).
order : string, (Default=None)
Order of data. Default is None, and internally
determined (unless using _ptr methods) whether
row 'r' or column 'c' major order.
backend : string, (Default="auto")
Which backend to use.
Options are 'auto', 'sklearn', 'h2o4gpu'.
The backend actually used is saved as the ``backend`` attribute.
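Examples
--------
A minimal sketch (assumes an installed h2o4gpu; the data names are
illustrative)::

    from h2o4gpu.solvers.elastic_net import ElasticNet

    model = ElasticNet(backend='auto')
    model.fit(X, y)
    print(model.backend)  # backend actually selected
    preds = model.predict(X)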
"""
def __init__(
self,
alpha=1.0, # scikit
l1_ratio=0.5, # scikit
fit_intercept=True, # h2o4gpu and scikit
normalize=False, # scikit
precompute=False, # scikit
max_iter=5000, # scikit
copy_X=True, # scikit
tol=1e-2, # h2o4gpu and scikit
warm_start=False, # scikit
positive=False, # scikit
random_state=None, # scikit
selection='cyclic', # scikit
n_gpus=-1, # h2o4gpu
lambda_stop_early=True, # h2o4gpu
glm_stop_early=True, # h2o4gpu
glm_stop_early_error_fraction=1.0, # h2o4gpu
verbose=False, # h2o4gpu
n_threads=None, # h2o4gpu
gpu_id=0, # h2o4gpu
lambda_min_ratio=1E-7, # h2o4gpu
n_lambdas=100, # h2o4gpu
n_folds=5, # h2o4gpu
n_alphas=5, # h2o4gpu
tol_seek_factor=1E-1, # h2o4gpu
family='elasticnet', # h2o4gpu
store_full_path=0, # h2o4gpu
lambda_max=None, # h2o4gpu
alpha_max=1.0, # h2o4gpu
alpha_min=0.0, # h2o4gpu
alphas=None, # h2o4gpu
lambdas=None, # h2o4gpu
double_precision=None, # h2o4gpu
order=None, # h2o4gpu
backend='auto'): # h2o4gpu
import os
_backend = os.environ.get('H2O4GPU_BACKEND', None)
if _backend is not None:
backend = _backend
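# e.g. exporting H2O4GPU_BACKEND=sklearn forces the sklearn solver
# regardless of the backend argument passed to the constructor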
# Fall back to sklearn.
# Can be removed once the sklearn functionality is fully implemented.
# Parameters not listed below are ignored because they do not affect
# backend selection.
self.do_sklearn = False
if backend == 'auto':
params_string = ['alpha', 'l1_ratio', 'normalize', 'precompute',
'max_iter', 'copy_X',
'warm_start', 'positive',
'random_state', 'selection']
params = [alpha, l1_ratio, normalize, precompute,
max_iter, copy_X,
warm_start, positive,
random_state, selection]
params_default = [1.0, 0.5, False, False, 5000, True,
False, False, None, 'cyclic']
for i, param in enumerate(params):
if param != params_default[i]:
self.do_sklearn = True
if verbose:
print("WARNING:"
" The sklearn parameter " + params_string[i] +
" has been changed from default to " +
str(param) + ". Will use Sklearn.")
elif backend == 'sklearn':
self.do_sklearn = True
elif backend == 'h2o4gpu':
self.do_sklearn = False
if self.do_sklearn:
self.backend = 'sklearn'
else:
self.backend = 'h2o4gpu'
self.model_sklearn = sk.ElasticNetSklearn(
alpha=alpha,
l1_ratio=l1_ratio,
fit_intercept=fit_intercept,
normalize=normalize,
precompute=precompute,
max_iter=max_iter,
copy_X=copy_X,
tol=tol,
warm_start=warm_start,
positive=positive,
random_state=random_state,
selection=selection)
self.model_h2o4gpu = ElasticNetH2O(
gpu_id=gpu_id,
tol_seek_factor=tol_seek_factor,
family=family,
n_threads=n_threads,
n_gpus=n_gpus,
double_precision=double_precision,
fit_intercept=fit_intercept,
lambda_min_ratio=lambda_min_ratio,
n_lambdas=n_lambdas,
n_folds=n_folds,
n_alphas=n_alphas,
tol=tol,
lambda_stop_early=lambda_stop_early,
glm_stop_early=glm_stop_early,
glm_stop_early_error_fraction=glm_stop_early_error_fraction,
max_iter=max_iter,
verbose=verbose,
store_full_path=store_full_path,
lambda_max=lambda_max,
alpha_max=alpha_max,
alpha_min=alpha_min,
alphas=alphas,
lambdas=lambdas,
order=order)
if self.do_sklearn:
if verbose:
print("Running sklearn Lasso Regression")
self.model = self.model_sklearn
else:
if verbose:
print("Running h2o4gpu Lasso Regression")
self.model = self.model_h2o4gpu
self.verbose = verbose
def fit(self, X, y=None, check_input=True):
if self.do_sklearn:
res = self.model.fit(X, y, check_input)
self.set_attributes()
return res
res = self.model.fit(X, y)
self.set_attributes()
return res
def get_params(self):
return self.model.get_params()
def predict(self, X):
res = self.model.predict(X)
self.set_attributes()
return res
def predict_proba(self, X):
res = self.model.predict_proba(X)
self.set_attributes()
return res
def score(self, X, y, sample_weight=None):
# TODO: add for h2o4gpu
if self.verbose:
print("WARNING: score() is using sklearn")
if not self.do_sklearn:
self.model_sklearn.fit(X, y) # Need to re-fit
res = self.model_sklearn.score(X, y, sample_weight)
return res
def set_params(self, **params):
return self.model.set_params(**params)
def set_attributes(self):
"""
Set attributes and don't fail if not yet present
"""
s = _setter(oself=self, e1=NameError, e2=AttributeError)
self.coef_ = None
s('oself.coef_ = oself.model.coef_')
self.sparse_coef_ = None
s('oself.sparse_coef_ = oself.model.sparse_coef_')
self.intercept_ = None
s('oself.intercept_ = oself.model.intercept_')
self.n_iter_ = None
s('oself.n_iter_ = oself.model.n_iter_')
self.time_prepare = None
s('oself.time_prepare = oself.model.time_prepare')
self.time_upload_data = None
s('oself.time_upload_data = oself.model.time_upload_data')
self.time_fitonly = None
s('oself.time_fitonly = oself.model.time_fitonly')