Source code for h2o4gpu.solvers.elastic_net

# -*- encoding: utf-8 -*-
# pylint: disable=fixme, line-too-long
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license:   Apache License Version 2.0 (see LICENSE for details)
"""
import sys
import time
import warnings

from ctypes import c_float, c_double, cast, POINTER
import numpy as np
import pandas as pd
from tabulate import tabulate
from h2o4gpu.linear_model import coordinate_descent as sk
from ..solvers.utils import _setter

from ..libs.lib_utils import get_lib
from ..solvers.utils import prepare_and_upload_data, free_sols


class ElasticNetH2O:
    """H2O Elastic Net Solver for GPUs

    Parameters
    ----------
    n_threads : int, (Default=None)
        Number of threads to use on the GPU. Each thread is an independent
        model builder.

    gpu_id : int, optional, (default=0)
        ID of the GPU on which the algorithm should run.

    n_gpus : int, (Default=-1)
        Number of GPUs to use in the GLM solver.

    fit_intercept : bool, (default=True)
        Include constant term in the model.

    lambda_min_ratio : float, (Default=1E-7)
        Minimum lambda ratio to maximum lambda, used in lambda search.

    n_lambdas : int, (Default=100)
        Number of lambdas to be used in a search.

    n_folds : int, (Default=5)
        Number of cross validation folds.

    n_alphas : int, (Default=5)
        Number of alphas to be used in a search.

    tol : float, (Default=1E-2)
        Relative tolerance.

    tol_seek_factor : float, (Default=1E-1)
        Factor of tolerance to seek once below null model accuracy.
        Default is 1E-1, so seeks tolerance of 1E-3 once below null model
        accuracy for tol=1E-2.

    lambda_stop_early : bool, (Default=True)
        Stop early when there is no more relative improvement on train
        or validation.

    glm_stop_early : bool, (Default=True)
        Stop early when there is no more relative improvement in the
        primary and dual residuals for ADMM.

    glm_stop_early_error_fraction : float, (Default=1.0)
        Relative tolerance for metric-based stopping criterion (stop if
        relative improvement is not at least this much).

    max_iter : int, (Default=5000)
        Maximum number of iterations.

    verbose : int, (Default=0)
        Print verbose information to the console if set to > 0.

    family : string, (Default="elasticnet")
        "logistic" for classification with logistic regression.
        Defaults to "elasticnet" for regression.
        Must be "logistic" or "elasticnet".

    store_full_path : int, (Default=0)
        Whether to store full solution for all alphas and lambdas.
        If 1, then during predict will compute best and full predictions.

    lambda_max : float, (Default=None)
        Maximum lambda value to use. Default is None, in which case the
        standard maximum is computed internally.

    alpha_max : float, (Default=1.0)
        Maximum alpha.

    alpha_min : float, (Default=0.0)
        Minimum alpha.

    alphas : list, tuple, array, or numpy 1D array of alphas, (Default=None)
        Overrides n_alphas, alpha_min, and alpha_max.

    lambdas : list, tuple, array, or numpy 1D array of lambdas, (Default=None)
        Overrides n_lambdas, lambda_max, and lambda_min_ratio.

    double_precision : int, (Default=None)
        Internally set unless using _ptr methods.
        Value can either be 0 (float32) or 1 (float64).

    order : string, (Default=None)
        Row or column major order for the C/C++ backend. Must be 'r'
        (row major), 'c' (column major), or None, in which case it is
        determined internally (unless using _ptr methods).
    """
    class info:
        pass
    class solution:
        pass
    def __init__(self,
                 n_threads=None,
                 gpu_id=0,
                 n_gpus=-1,
                 fit_intercept=True,
                 lambda_min_ratio=1E-7,
                 n_lambdas=100,
                 n_folds=5,
                 n_alphas=5,
                 tol=1E-2,
                 tol_seek_factor=1E-1,
                 lambda_stop_early=True,
                 glm_stop_early=True,
                 glm_stop_early_error_fraction=1.0,
                 max_iter=5000,
                 verbose=0,
                 family='elasticnet',
                 store_full_path=0,
                 lambda_max=None,
                 alpha_max=1.0,
                 alpha_min=0.0,
                 alphas=None,
                 lambdas=None,
                 double_precision=None,
                 order=None):
        assert family in ['logistic', 'elasticnet'], \
            "family should be 'logistic' or 'elasticnet' but got " + family

        self.double_precision = double_precision
        self.ord = order
        self.dtype = None

        ##############################
        # overrides of input parameters
        # override these if alphas or lambdas are passed
        if alphas is not None:
            alphas = np.ascontiguousarray(np.asarray(alphas))
            n_alphas = np.shape(alphas)[0]
        if lambdas is not None:
            lambdas = np.ascontiguousarray(np.asarray(lambdas))
            n_lambdas = np.shape(lambdas)[0]

        ##############################
        # self assignments
        self.n = 0
        self.m_train = 0
        self.m_valid = 0
        self.source_dev = 0  # assume Dev=0 is source of data for upload_data
        self.source_me = 0  # assume thread=0 is source of data for upload_data
        self.fit_intercept = 1 if fit_intercept else 0
        self.lambda_min_ratio = lambda_min_ratio
        self.n_lambdas = n_lambdas
        self.n_folds = n_folds
        self.n_alphas = n_alphas
        self.uploaded_data = 0
        self.did_fit_ptr = 0
        self.did_predict = 0
        self.tol = tol
        self.tol_seek_factor = tol_seek_factor
        self.lambda_stop_early = 1 if lambda_stop_early else 0
        self.glm_stop_early = 1 if glm_stop_early else 0
        self.glm_stop_early_error_fraction = glm_stop_early_error_fraction
        self.max_iter = max_iter
        self.verbose = verbose
        self._family_str = family  # hold string value for family
        self._family = family.split()[0][0]
        self.store_full_path = store_full_path
        if lambda_max is None:
            self.lambda_max = -1.0  # trigger C code to compute
        else:
            self.lambda_max = lambda_max
        self.alpha_min = alpha_min  # as default
        self.alpha_max = alpha_max
        self.alphas_list = alphas
        self.lambdas_list = lambdas

        # default None for _full stuff
        self.error_vs_alpha_lambda = None
        self.intercept_ = None
        self._tols2 = None
        self._lambdas2 = None
        self._alphas2 = None
        self.error_vs_alpha = None
        self.valid_pred_vs_alphapure = None
        self.x_vs_alphapure = None
        self.x_vs_alpha_lambdanew = None
        self.x_vs_alpha_lambdapure = None
        self.valid_pred_vs_alpha_lambdapure = None
        self._lambdas = None
        self._alphas = None
        self._tols = None
        self.intercept2_ = None

        # Experimental features
        # TODO _shared_a and _standardize do not work currently.
        # TODO Always need to set to 0.
        self._shared_a = 0
        self._standardize = 0

        from ..util.gpu import device_count
        (self.n_gpus, devices) = device_count(n_gpus)
        gpu_id = gpu_id % devices if devices != 0 else 0
        self._gpu_id = gpu_id
        self._total_n_gpus = devices

        if n_threads is None:
            # Not a required number of threads, but a sensible default.
            # Slightly more optimal to use 2 threads for CPU,
            # but 1 thread per GPU is optimal.
            n_threads = 1 if self.n_gpus == 0 else self.n_gpus
        self.n_threads = n_threads

        self.lib = get_lib(self.n_gpus, devices)

        self.x_vs_alpha_lambda = None
        self.x_vs_alpha = None
        self.valid_pred_vs_alpha_lambda = None
        self.valid_pred_vs_alpha = None
        self.count_full = None
        self.count_short = None
        self.count_more = None
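    # Illustrative construction sketch (comments only; values are assumed
    # for the example, not taken from the library):
    #
    #   # Passing explicit grids overrides n_alphas/n_lambdas and the
    #   # alpha/lambda range parameters:
    #   enet = ElasticNetH2O(alphas=[0.0, 0.5, 1.0],
    #                        lambdas=[1.0, 0.1, 0.01])
    #   # enet.n_alphas == 3 and enet.n_lambdas == 3, regardless of the
    #   # n_alphas/n_lambdas defaults.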
    # TODO Add typechecking
    def fit(self, train_x=None, train_y=None, valid_x=None, valid_y=None,
            sample_weight=None, free_input_data=1):
        """Train a GLM

        :param ndarray train_x: Training features array
        :param ndarray train_y: Training response array
        :param ndarray valid_x: Validation features
        :param ndarray valid_y: Validation response
        :param ndarray sample_weight: Observation weights
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 1.
        """
        source_dev = 0
        if not (train_x is None and train_y is None and valid_x is None
                and valid_y is None and sample_weight is None):
            if self.family == "logistic" and train_y is not None:
                self.classes_ = np.unique(train_y)
                train_y = np.searchsorted(self.classes_, train_y)
                if valid_y is not None:
                    valid_y = np.searchsorted(self.classes_, valid_y)
            self.prepare_and_upload_data = prepare_and_upload_data(
                self,
                train_x=train_x,
                train_y=train_y,
                valid_x=valid_x,
                valid_y=valid_y,
                sample_weight=sample_weight,
                source_dev=source_dev)
        else:
            # if all None, just assume fitting with new parameters
            # and all else uses self.
            pass

        self.fit_ptr(
            self.m_train,
            self.n,
            self.m_valid,
            self.double_precision,
            self.ord,
            self.a,
            self.b,
            self.c,
            self.d,
            self.e,
            free_input_data=free_input_data,
            source_dev=source_dev)
        return self
    # TODO Add typechecking
    def predict(self, valid_x=None, valid_y=None, sample_weight=None,
                free_input_data=1):
        """Predict on a fitted GLM and get back class predictions for
        binomial models for classification and predicted values for
        regression.

        :param ndarray valid_x: Validation features
        :param ndarray valid_y: Validation response
        :param ndarray sample_weight: Observation weights
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 1.
        """
        res = self.predict_proba(valid_x, valid_y, sample_weight,
                                 free_input_data)
        if self.family == "logistic":
            return self.classes_[(res >= 0.5).astype(np.int8)]
        return res
    def predict_proba(self, valid_x=None, valid_y=None, sample_weight=None,
                      free_input_data=1):
        """Predict on a fitted GLM and get back uncalibrated probabilities
        for classification models.

        :param ndarray valid_x: Validation features
        :param ndarray valid_y: Validation response
        :param ndarray sample_weight: Observation weights
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 1.
        """
        source_dev = 0
        if not (valid_x is None and valid_y is None and sample_weight is None):
            prepare_and_upload_data(
                self,
                train_x=None,
                train_y=None,
                valid_x=valid_x,
                valid_y=valid_y,
                sample_weight=sample_weight,
                source_dev=source_dev)

        # save global variable
        oldstorefullpath = self.store_full_path

        if self.store_full_path == 1:
            # run once for the full-path predictions ...
            self.store_full_path = 1
            self._fitorpredict_ptr(
                source_dev,
                self.m_train,
                self.n,
                self.m_valid,
                self.double_precision,
                self.ord,
                self.a,
                self.b,
                self.c,
                self.d,
                self.e,
                do_predict=1,
                free_input_data=free_input_data)
            self.store_full_path = 0

        # ... and always once for the best-model predictions
        self._fitorpredict_ptr(
            source_dev,
            self.m_train,
            self.n,
            self.m_valid,
            self.double_precision,
            self.ord,
            self.a,
            self.b,
            self.c,
            self.d,
            self.e,
            do_predict=1,
            free_input_data=free_input_data)

        # restore variable
        self.store_full_path = oldstorefullpath
        return self.valid_pred_vs_alphapure  # something like valid_y
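    # Label handling sketch for family='logistic' (comments only; the data
    # is assumed for illustration): fit() stores the sorted unique labels
    # and trains on their integer positions; predict() maps back:
    #
    #   train_y = np.array(['cat', 'dog', 'cat'])
    #   classes_ = np.unique(train_y)                 # ['cat', 'dog']
    #   encoded = np.searchsorted(classes_, train_y)  # [0, 1, 0]
    #   # ... fit on encoded labels, then in predict():
    #   classes_[(probs >= 0.5).astype(np.int8)]      # back to 'cat'/'dog'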
    # TODO Add type checking
    # source_dev here because generally want to take in any pointer,
    # not just from our test code
    def fit_ptr(
            self,
            m_train,
            n,
            m_valid,
            double_precision,
            order,
            a,  # trainX_ptr or train_xptr
            b,  # trainY_ptr
            c,  # validX_ptr
            d,  # validY_ptr or valid_xptr  # keep consistent with later uses
            e,  # weight_ptr
            free_input_data=0,
            source_dev=0):
        """Train a GLM with pointers to data on the GPU
        (if fit_intercept, then you should have added 1's as the last
        column of the training set)

        :param m_train: Number of rows in the training set
        :param n: Number of columns in the training set
        :param m_valid: Number of rows in the validation set
        :param double_precision: single (0, float32) or double (1, float64)
            precision of fit. No default.
        :param order: Order of data. Default is None, and assumed set by
            constructor or upload_data whether row 'r' or column 'c'
            major order.
        :param a: Pointer to training features array
        :param b: Pointer to training response array
        :param c: Pointer to validation features
        :param d: Pointer to validation response
        :param e: Pointer to weight column
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 0.
        :param source_dev: GPU ID of device
        """
        time_fit0 = time.time()
        self._fitorpredict_ptr(
            source_dev,
            m_train,
            n,
            m_valid,
            double_precision,
            order,
            a,
            b,
            c,
            d,
            e,
            do_predict=0,
            free_input_data=free_input_data)
        self.time_fitonly = time.time() - time_fit0
    # TODO Add type checking
    # source_dev here because generally want to take in any pointer,
    # not just from our test code
    # pylint: disable=unused-argument
    def _fitorpredict_ptr(
            self,
            source_dev,
            m_train,
            n,
            m_valid,
            double_precision,
            order,
            a,  # trainX_ptr or train_xptr
            b,  # trainY_ptr
            c,  # validX_ptr
            d,  # validY_ptr or valid_xptr  # keep consistent with later uses
            e,  # weight_ptr
            do_predict=0,
            free_input_data=0):
        """Train a GLM with pointers to data on the GPU
        (if fit_intercept, then you should have added 1's as the last
        column of the training set)

        :param source_dev: GPU ID of device
        :param m_train: Number of rows in the training set
        :param n: Number of columns in the training set
        :param m_valid: Number of rows in the validation set
        :param double_precision: single (0, float32) or double (1, float64)
            precision of fit. No default.
        :param order: Order of data. Default is None and set elsewhere
            whether row 'r' or column 'c' major order.
        :param a: Pointer to training features array
        :param b: Pointer to training response array
        :param c: Pointer to validation features
        :param d: Pointer to validation response
        :param e: Pointer to weight column
        :param int do_predict: Indicate if prediction should be done on
            validation set after train. Default is 0.
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 0.
        """
        # store some things for later call to predict_ptr()
        self.source_dev = source_dev
        self.m_train = m_train
        self.n = n
        self.m_valid = m_valid
        self.a = a
        self.b = b
        self.c = c
        self.d = d
        self.e = e

        # ###########
        # if fitted earlier, clear the old solution;
        # otherwise don't clear the solution, just use it
        if do_predict == 0 and self.did_fit_ptr == 1:
            free_sols(self)

        # ##############
        self.did_fit_ptr = 1

        # ##############
        # not calling with self.source_dev because want option to never use
        # default but instead input pointers from foreign code's pointers
        self.ord = order

        if hasattr(self, 'double_precision') and \
                self.double_precision is not None:
            which_precision = self.double_precision
        else:
            which_precision = double_precision
            self.double_precision = double_precision

        # ############
        if do_predict == 0:
            # initialize if doing fit
            self.x_vs_alpha_lambda = None
            self.x_vs_alpha = None
            self.valid_pred_vs_alpha_lambda = None
            self.valid_pred_vs_alpha = None
            count_full = 0
            count_short = 0
            count_more = 0
        else:
            # restore if predict
            count_full = self.count_full
            count_short = self.count_short
            count_more = self.count_more

        # ##############
        if which_precision == 1:
            c_elastic_net = self.lib.elastic_net_ptr_double
            self.dtype = np.float64
            c_type = c_double
            if self.verbose > 0:
                print('double precision fit')
                sys.stdout.flush()
        else:
            c_elastic_net = self.lib.elastic_net_ptr_float
            self.dtype = np.float32
            c_type = c_float
            if self.verbose > 0:
                print('single precision fit')
                sys.stdout.flush()

        # precision-independent commands
        if self.alphas_list is not None:
            c_alphas = self.alphas_list.astype(self.dtype, copy=False)
        else:
            c_alphas = None
        if self.lambdas_list is not None:
            c_lambdas = self.lambdas_list.astype(self.dtype, copy=False)
        else:
            c_lambdas = None

        # call elastic net in C backend
        _, x_vs_alpha_lambda, x_vs_alpha, \
            valid_pred_vs_alpha_lambda, valid_pred_vs_alpha, \
            count_full, count_short, count_more = c_elastic_net(
                self._family,
                do_predict,
                source_dev,
                1,
                self._shared_a,
                self.n_threads,
                self._gpu_id,
                self.n_gpus,
                self._total_n_gpus,
                self.ord,  # 10
                m_train,
                n,
                m_valid,
                self.fit_intercept,
                self._standardize,
                self.lambda_max,
                self.lambda_min_ratio,
                self.n_lambdas,
                self.n_folds,
                self.n_alphas,  # 20
                self.alpha_min,
                self.alpha_max,
                c_alphas,
                c_lambdas,
                self.tol,
                self.tol_seek_factor,
                self.lambda_stop_early,
                self.glm_stop_early,
                self.glm_stop_early_error_fraction,
                self.max_iter,  # 30
                self.verbose,
                int(a.p) if a.p is not None else -1,
                int(b.p) if b.p is not None else -1,
                int(c.p) if c.p is not None else -1,
                int(d.p) if d.p is not None else -1,
                int(e.p) if e.p is not None else -1,
                self.store_full_path,
                self.x_vs_alpha_lambda,
                self.x_vs_alpha,
                self.valid_pred_vs_alpha_lambda,  # 40
                self.valid_pred_vs_alpha,
                count_full,
                count_short,
                count_more)

        # If we should, or the user wanted, to save or free data, do that
        # now that we are done using a, b, c, d, e. This means one has to
        # upload_data() again before fit_ptr or predict_ptr, or only call
        # fit and predict.
        self.x_vs_alpha_lambda = x_vs_alpha_lambda
        self.x_vs_alpha = x_vs_alpha
        self.valid_pred_vs_alpha_lambda = valid_pred_vs_alpha_lambda
        self.valid_pred_vs_alpha = valid_pred_vs_alpha
        self.count_full = count_full
        self.count_short = count_short
        self.count_more = count_more

        # ####################################
        # PROCESS OUTPUT
        # save pointers
        if self.store_full_path == 1:
            num_all = int(count_full / (self.n_alphas * self.n_lambdas))
        else:
            num_all = int(count_short / self.n_alphas)

        num_all_other = num_all - n
        # should be consistent w/ src/common/elastic_net_ptr.cpp
        num_error = 3
        num_other = num_all_other - num_error
        if num_other != 3:
            print('num_other=%d but expected 3' % num_other)
            print('count_full=%d '
                  'count_short=%d '
                  'count_more=%d '
                  'num_all=%d num_all_other=%d' %
                  (int(count_full), int(count_short), int(count_more),
                   int(num_all), int(num_all_other)))
            sys.stdout.flush()
            # TODO raise an exception instead
            sys.exit(0)

        if self.store_full_path == 1 and do_predict == 0:
            # x_vs_alpha_lambda contains solution (and other data)
            # for all lambda and alpha
            self.x_vs_alpha_lambdanew = \
                np.fromiter(cast(self.x_vs_alpha_lambda.__int__(),
                                 POINTER(c_type)),
                            dtype=self.dtype,
                            count=count_full)
            self.x_vs_alpha_lambdanew = \
                np.reshape(self.x_vs_alpha_lambdanew,
                           (self.n_lambdas, self.n_alphas, num_all))
            self.x_vs_alpha_lambdapure = \
                self.x_vs_alpha_lambdanew[:, :, 0:n]
            self.error_vs_alpha_lambda = \
                self.x_vs_alpha_lambdanew[:, :, n:n + num_error]
            self._lambdas = \
                self.x_vs_alpha_lambdanew[:, :,
                                          n + num_error:n + num_error + 1]
            self._alphas = \
                self.x_vs_alpha_lambdanew[:, :,
                                          n + num_error + 1:n + num_error + 2]
            self._tols = \
                self.x_vs_alpha_lambdanew[:, :,
                                          n + num_error + 2:n + num_error + 3]

            if self.fit_intercept == 1:
                self.intercept_ = self.x_vs_alpha_lambdapure[:, :, -1]
            else:
                self.intercept_ = None

        if self.store_full_path == 1 and do_predict == 1:
            thecount = int(count_full / (n + num_all_other) * m_valid)
            self.valid_pred_vs_alpha_lambdanew = \
                np.fromiter(cast(self.valid_pred_vs_alpha_lambda.__int__(),
                                 POINTER(c_type)),
                            dtype=self.dtype,
                            count=thecount)
            self.valid_pred_vs_alpha_lambdanew = \
                np.reshape(self.valid_pred_vs_alpha_lambdanew,
                           (self.n_lambdas, self.n_alphas, m_valid))
            self.valid_pred_vs_alpha_lambdapure = \
                self.valid_pred_vs_alpha_lambdanew[:, :, 0:m_valid]

        if do_predict == 0:  # store_full_path == 0 or 1
            # x_vs_alpha contains only the best of all lambdas for each alpha
            self.x_vs_alphanew = np.fromiter(
                cast(self.x_vs_alpha.__int__(), POINTER(c_type)),
                dtype=self.dtype,
                count=count_short)
            self.x_vs_alphanew = np.reshape(self.x_vs_alphanew,
                                            (self.n_alphas, num_all))
            self.x_vs_alphapure = self.x_vs_alphanew[:, 0:n]
            self.error_vs_alpha = self.x_vs_alphanew[:, n:n + num_error]
            self._lambdas2 = self.x_vs_alphanew[:, n + num_error:
                                                n + num_error + 1]
            self._alphas2 = self.x_vs_alphanew[:, n + num_error + 1:
                                               n + num_error + 2]
            self._tols2 = self.x_vs_alphanew[:, n + num_error + 2:
                                             n + num_error + 3]

            if self.fit_intercept == 1:
                self.intercept2_ = self.x_vs_alphapure[:, -1]
            else:
                self.intercept2_ = None

        # preds exclusively operate for x_vs_alpha or x_vs_alpha_lambda
        if self.store_full_path == 0 and do_predict == 1:
            thecount = int(count_short / (n + num_all_other) * m_valid)
            if self.verbose > 0:
                print('thecount=%d '
                      'count_full=%d '
                      'count_short=%d '
                      'n=%d num_all_other=%d '
                      'm_valid=%d' %
                      (thecount, count_full, count_short, n, num_all_other,
                       m_valid))
                sys.stdout.flush()
            self.valid_pred_vs_alphanew = \
                np.fromiter(cast(self.valid_pred_vs_alpha.__int__(),
                                 POINTER(c_type)),
                            dtype=self.dtype,
                            count=thecount)
            self.valid_pred_vs_alphanew = \
                np.reshape(self.valid_pred_vs_alphanew,
                           (self.n_alphas, m_valid))
            self.valid_pred_vs_alphapure = \
                self.valid_pred_vs_alphanew[:, 0:m_valid]

        return self
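    # Layout note (illustrative, inferred from the slicing above): each
    # solution row returned by the C backend has num_all = n + 6 entries:
    #
    #   [ coefficients (n) | 3 error metrics | lambda | alpha | tol ]
    #
    # With fit_intercept=1 the intercept is the last coefficient, so a
    # hypothetical row with n=3 might look like:
    #
    #   [w0, w1, b, err_train, err_cv, err_valid, lam, alp, tol]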
    # pylint: disable=unused-argument
    def predict_ptr(self, valid_xptr=None, valid_yptr=None,
                    free_input_data=0, order=None):
        """Predict on a fitted GLM with pointers to data on the GPU

        :param ndarray valid_xptr: Pointer to validation features
        :param ndarray valid_yptr: Pointer to validation response
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 0.
        :param order: Order of data. Default is None, and internally
            determined whether row 'r' or column 'c' major order.
        """
        # assume self.ord already set by fit_ptr() at least
        # override self if this option was passed

        # save global variable
        oldstorefullpath = self.store_full_path

        if self.store_full_path == 1:  # then need to run twice
            self.store_full_path = 1
            self._fitorpredict_ptr(
                self.source_dev,
                self.m_train,
                self.n,
                self.m_valid,
                self.double_precision,
                self.ord,
                self.a,
                self.b,
                valid_xptr,
                valid_yptr,
                self.e,
                do_predict=1,
                free_input_data=free_input_data)
            self.store_full_path = 0

        self._fitorpredict_ptr(
            self.source_dev,
            self.m_train,
            self.n,
            self.m_valid,
            self.double_precision,
            self.ord,
            self.a,
            self.b,
            valid_xptr,
            valid_yptr,
            self.e,
            do_predict=1)

        # restore global variable
        self.store_full_path = oldstorefullpath
        return self.valid_pred_vs_alphapure  # something like valid_y
    # pylint: disable=unused-argument
    def fit_predict(self, train_x, train_y, valid_x=None, valid_y=None,
                    sample_weight=None, free_input_data=1, order=None):
        """Train a model using GLM and predict on validation set

        :param ndarray train_x: Training features array
        :param ndarray train_y: Training response array
        :param ndarray valid_x: Validation features
        :param ndarray valid_y: Validation response
        :param ndarray sample_weight: Observation weights
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 1.
        :param order: Order of data. Default is None, and internally
            determined whether row 'r' or column 'c' major order.
        """
        # let fit() check and convert (to numpy)
        # train_x, train_y, valid_x, valid_y, sample_weight
        self.fit(
            train_x,
            train_y,
            valid_x,
            valid_y,
            sample_weight,
            free_input_data=0)

        if valid_x is None:
            self.prediction = self.predict(
                valid_x=train_x,
                valid_y=train_y,
                sample_weight=sample_weight,
                free_input_data=free_input_data)
        else:
            self.prediction = self.predict(
                valid_x=valid_x,
                valid_y=valid_y,
                sample_weight=sample_weight,
                free_input_data=free_input_data)
        return self.prediction  # something like valid_y
    # pylint: disable=unused-argument
    def fit_predict_ptr(self, m_train, n, m_valid, double_precision, order,
                        a, b, c, d, e, free_input_data=0, source_dev=0):
        """Train a GLM with pointers to data on the GPU and predict on a
        validation set that also has a pointer on the GPU

        :param m_train: Number of rows in the training set
        :param n: Number of columns in the training set
        :param m_valid: Number of rows in the validation set
        :param double_precision: single (0, float32) or double (1, float64)
            precision of fit. Default is None.
        :param order: Order of data. Default is None, and internally
            determined whether row 'r' or column 'c' major order.
        :param a: Pointer to training features array
        :param b: Pointer to training response array
        :param c: Pointer to validation features
        :param d: Pointer to validation response
        :param e: Pointer to weight column
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 0.
        :param source_dev: GPU ID of device
        """
        do_predict = 0  # only fit at first

        self._fitorpredict_ptr(
            source_dev,
            m_train,
            n,
            m_valid,
            double_precision,
            self.ord,
            a,
            b,
            c,
            d,
            e,
            do_predict,
            free_input_data=0)
        if c is None:
            self.prediction = self.predict_ptr(
                valid_xptr=a, valid_yptr=b, free_input_data=free_input_data)
        else:
            self.prediction = self.predict_ptr(
                valid_xptr=c, valid_yptr=d, free_input_data=free_input_data)
        return self.prediction  # something like valid_y
    def fit_transform(self, train_x, train_y, valid_x=None, valid_y=None,
                      sample_weight=None, free_input_data=1):
        """Train a model using GLM and predict on validation set

        :param ndarray train_x: Training features array
        :param ndarray train_y: Training response array
        :param ndarray valid_x: Validation features
        :param ndarray valid_y: Validation response
        :param ndarray sample_weight: Observation weights
        :param int free_input_data: Indicate if input data should be freed
            at the end of fit(). Default is 1.
        """
        return self.fit_predict(train_x, train_y, valid_x, valid_y,
                                sample_weight, free_input_data)
    def transform(self):
        return self
    def summary(self):
        """Obtain model summary, which is error per alpha across train,
        cv, and validation.

        Error is logloss for classification and RMSE
        (Root Mean Squared Error) for regression.
        """
        error_train = pd.DataFrame(self.error_best, index=self.alphas)
        if self.family == "logistic":
            print("Logloss per alpha value (-1.00 = missing)\n")
        else:
            print("RMSE per alpha value (-1.00 = missing)\n")
        headers = ["Alphas", "Train", "CV", "Valid"]
        print(
            tabulate(
                error_train,
                headers=headers,
                tablefmt="pipe",
                floatfmt=".2f"))
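    # Example output sketch for summary() (values are illustrative only;
    # CV/Valid show -1.00 when no folds or validation set were used):
    #
    #   RMSE per alpha value (-1.00 = missing)
    #
    #   |   Alphas |   Train |    CV |   Valid |
    #   |---------:|--------:|------:|--------:|
    #   |     0.00 |    0.42 | -1.00 |   -1.00 |
    #   |     0.50 |    0.40 | -1.00 |   -1.00 |
    #   |     1.00 |    0.39 | -1.00 |   -1.00 |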
    # ##################
    # Properties and setters of properties

    @property
    def total_n_gpus(self):
        return self._total_n_gpus

    @property
    def gpu_id(self):
        return self._gpu_id

    @gpu_id.setter
    def gpu_id(self, value):
        assert value >= 0, "GPU ID must be non-negative."
        self._gpu_id = value

    @property
    def family(self):
        return self._family_str

    @family.setter
    def family(self, value):
        # TODO add check
        self._family_str = value

    @property
    def shared_a(self):
        return self._shared_a

    @shared_a.setter
    def shared_a(self, value):
        # TODO add check
        self._shared_a = value

    @property
    def standardize(self):
        return self._standardize

    @standardize.setter
    def standardize(self, value):
        # TODO add check
        self._standardize = value

    @property
    def coef_(self):
        return self.x_vs_alphapure

    @property
    def X(self):
        return self.x_vs_alphapure

    @property
    def X_full(self):
        '''Returns full solution if store_full_path=1
        X[which lambda][which alpha]
        '''
        return self.x_vs_alpha_lambdapure

    @property
    def X_best(self):
        return self.x_vs_alphapure

    @property
    def validPreds(self):
        return self.valid_pred_vs_alphapure

    @property
    def validPreds_best(self):
        return self.valid_pred_vs_alphapure

    @property
    def intercept_(self):
        return self.intercept2_

    @intercept_.setter
    def intercept_(self, value):
        self._intercept_ = value

    @property
    def intercept_best(self):
        return self.intercept2_

    @property
    def error(self):
        return self.error_vs_alpha

    @property
    def lambdas(self):
        return self._lambdas2

    @lambdas.setter
    def lambdas(self, value):
        self._lambdas = value

    @property
    def alphas(self):
        return self._alphas2

    @alphas.setter
    def alphas(self, value):
        self._alphas = value

    @property
    def tols(self):
        return self._tols2

    @tols.setter
    def tols(self, value):
        self._tols = value

    @property
    def validPreds_full(self):
        '''Returns full predictions if store_full_path=1
        validPreds[which lambda][which alpha]
        '''
        return self.valid_pred_vs_alpha_lambdapure

    @property
    def intercept_full(self):
        '''Returns full intercept if store_full_path=1
        intercept[which lambda][which alpha]
        '''
        # the full intercept is stored via the intercept_ setter
        return self._intercept_

    @property
    def error_full(self):
        return self.error_vs_alpha_lambda

    @property
    def lambdas_full(self):
        '''Returns full lambda path if store_full_path=1
        lambda[which lambda][which alpha]
        '''
        return self._lambdas

    @property
    def alphas_full(self):
        '''Returns full alpha if store_full_path=1
        alpha[which lambda][which alpha]
        '''
        return self._alphas

    @property
    def tols_full(self):
        '''Returns full tols if store_full_path=1
        tols[which lambda][which alpha]
        '''
        return self._tols

    @property
    def error_best(self):
        return self.error_vs_alpha

    @property
    def lambdas_best(self):
        return self._lambdas2

    @property
    def alphas_best(self):
        return self._alphas2

    @property
    def tols_best(self):
        return self._tols2

    # def score(self, X=None, y=None, sample_weight=None):
    #     if X is not None and y is not None:
    #         self.prediction = self.predict(
    #             valid_x=X, valid_y=y, sample_weight=sample_weight)
    #     # otherwise score makes no sense; need both X and y,
    #     # else just return existing error
    #     # TODO: should return R^2 and redo predict if X and y are passed
    #     return self.error

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator"""
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        from ..utils.fixes import signature
        init_signature = signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [
            p for p in init_signature.parameters.values()
            if p.name != 'self' and p.kind != p.VAR_KEYWORD
        ]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("h2o4gpu GLM estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs). "
                                   "%s with constructor %s doesn't "
                                   "follow this convention." %
                                   (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])
    def get_params(self, deep=True):
        """Get parameters for this estimator.

        :param bool deep: If True, will return the parameters for this
            estimator and contained subobjects that are estimators.
        :returns: dict of parameter names mapped to their values.
        """
        out = dict()
        for key in self._get_param_names():
            # We need deprecation warnings to always be on in order to
            # catch deprecated param values.
            # This is set in utils/__init__.py but it gets overwritten
            # when running under python3 somehow.
            warnings.simplefilter("always", DeprecationWarning)
            try:
                with warnings.catch_warnings(record=True) as w:
                    value = getattr(self, key, None)
                if w and w[0].category == DeprecationWarning:
                    # if the parameter is deprecated, don't show it
                    continue
            finally:
                warnings.filters.pop(0)

            # XXX: should we rather test if instance of estimator?
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out
    def set_params(self, **params):
        """Set the parameters of this estimator."""
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)
        from ..externals import six
        for key, value in six.iteritems(params):
            split = key.split('__', 1)
            if len(split) > 1:
                # nested objects case
                name, sub_name = split
                if name not in valid_params:
                    raise ValueError('Invalid parameter %s for estimator %s. '
                                     'Check the list of available parameters '
                                     'with `estimator.get_params().keys()`.' %
                                     (name, self))
                sub_object = valid_params[name]
                sub_object.set_params(**{sub_name: value})
            else:
                # simple objects case
                if key not in valid_params:
                    raise ValueError('Invalid parameter %s for estimator %s. '
                                     'Check the list of available parameters '
                                     'with `estimator.get_params().keys()`.' %
                                     (key, self.__class__.__name__))
                setattr(self, key, value)
        return self
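
# Illustrative usage sketch for ElasticNetH2O (not part of the library
# source): fit a small regression problem and predict on the training
# data. The synthetic data and the _demo_elastic_net_h2o name are
# assumptions made for illustration only.
def _demo_elastic_net_h2o():
    # small synthetic regression problem
    X = np.random.rand(200, 5)
    y = X.dot(np.arange(1.0, 6.0)) + 0.01 * np.random.randn(200)
    # search a small grid: 3 alphas x 10 lambdas, no cross validation
    enet = ElasticNetH2O(n_alphas=3, n_lambdas=10, n_folds=1)
    enet.fit(train_x=X, train_y=y)
    enet.summary()  # error per alpha on train/cv/valid
    return enet.predict(valid_x=X, valid_y=y)
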
class ElasticNet:
    """H2O ElasticNet Solver

    Selects between h2o4gpu.solvers.elastic_net.ElasticNetH2O and
    h2o4gpu.linear_model.coordinate_descent.ElasticNetSklearn.

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the penalty terms. Defaults to 1.0.
        See the notes for the exact mathematical meaning of this parameter.
        ``alpha = 0`` is equivalent to an ordinary least square, solved by
        the :class:`LinearRegressionSklearn` object. For numerical reasons,
        using ``alpha = 0`` with the ``LassoSklearn`` object is not advised.
        Given this, you should use the :class:`LinearRegressionSklearn`
        object.

    l1_ratio : float
        The ElasticNetSklearn mixing parameter, with ``0 <= l1_ratio <= 1``.
        For ``l1_ratio = 0`` the penalty is an L2 penalty. For
        ``l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``,
        the penalty is a combination of L1 and L2.

    fit_intercept : bool
        Whether the intercept should be estimated or not. If ``False``,
        the data is assumed to be already centered.

    normalize : boolean, optional, default False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`h2o4gpu.preprocessing.StandardScaler` before calling
        ``fit`` on an estimator with ``normalize=False``.

    precompute : True | False | array-like
        Whether to use a precomputed Gram matrix to speed up calculations.
        The Gram matrix can also be passed as argument. For sparse input
        this option is always ``True`` to preserve sparsity.

    max_iter : int, optional
        The maximum number of iterations.

    copy_X : boolean, optional, default True
        If ``True``, X will be copied; else, it may be overwritten.

    tol : float, optional
        The tolerance for the optimization: if the updates are smaller
        than ``tol``, the optimization code checks the dual gap for
        optimality and continues until it is smaller than ``tol``.

    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to
        fit as initialization; otherwise, just erase the previous solution.

    positive : bool, optional
        When set to ``True``, forces the coefficients to be positive.

    random_state : int, RandomState instance or None, optional, default None
        The seed of the pseudo random number generator that selects a
        random feature to update. If int, random_state is the seed used by
        the random number generator; if RandomState instance, random_state
        is the random number generator; if None, the random number
        generator is the RandomState instance used by `np.random`. Used
        when ``selection`` == 'random'.

    selection : str, default 'cyclic'
        If set to 'random', a random coefficient is updated every iteration
        rather than looping over features sequentially by default. This
        (setting to 'random') often leads to significantly faster
        convergence, especially when tol is higher than 1e-4.

    n_gpus : int, (Default=-1)
        Number of GPUs to use in the GLM solver.

    lambda_stop_early : bool, (Default=True)
        Stop early when there is no more relative improvement on train
        or validation.

    glm_stop_early : bool, (Default=True)
        Stop early when there is no more relative improvement in the
        primary and dual residuals for ADMM.

    glm_stop_early_error_fraction : float, (Default=1.0)
        Relative tolerance for metric-based stopping criterion (stop if
        relative improvement is not at least this much).

    verbose : int, (Default=0)
        Print verbose information to the console if set to > 0.

    n_threads : int, (Default=None)
        Number of threads to use on the GPU. Each thread is an independent
        model builder.

    gpu_id : int, optional, (default=0)
        ID of the GPU on which the algorithm should run.

    lambda_min_ratio : float, (Default=1E-7)
        Minimum lambda ratio to maximum lambda, used in lambda search.

    n_lambdas : int, (Default=100)
        Number of lambdas to be used in a search.

    n_folds : int, (Default=5)
        Number of cross validation folds.

    n_alphas : int, (Default=5)
        Number of alphas to be used in a search.

    tol_seek_factor : float, (Default=1E-1)
        Factor of tolerance to seek once below null model accuracy.
        Default is 1E-1, so seeks tolerance of 1E-3 once below null model
        accuracy for tol=1E-2.

    family : string, (Default="elasticnet")
        "logistic" for classification with logistic regression.
        Defaults to "elasticnet" for regression.
        Must be "logistic" or "elasticnet".

    store_full_path : int, (Default=0)
        Whether to store full solution for all alphas and lambdas.
        If 1, then during predict will compute best and full predictions.

    lambda_max : float, (Default=None)
        Maximum lambda value to use. Default is None, in which case the
        standard maximum is computed internally.

    alpha_max : float, (Default=1.0)
        Maximum alpha.

    alpha_min : float, (Default=0.0)
        Minimum alpha.

    alphas : list, tuple, array, or numpy 1D array of alphas, (Default=None)
        Overrides n_alphas, alpha_min, and alpha_max.

    lambdas : list, tuple, array, or numpy 1D array of lambdas, (Default=None)
        Overrides n_lambdas, lambda_max, and lambda_min_ratio.

    double_precision : int, (Default=None)
        Internally set unless using _ptr methods.
        Value can either be 0 (float32) or 1 (float64).

    order : string, (Default=None)
        Order of data. Default is None, and internally determined (unless
        using _ptr methods) whether row 'r' or column 'c' major order.

    backend : string, (Default="auto")
        Which backend to use. Options are 'auto', 'sklearn', 'h2o4gpu'.
        Saved as an attribute reflecting the backend actually used.
    """

    def __init__(
            self,
            alpha=1.0,  # scikit
            l1_ratio=0.5,  # scikit
            fit_intercept=True,  # h2o4gpu and scikit
            normalize=False,  # scikit
            precompute=False,  # scikit
            max_iter=5000,  # scikit
            copy_X=True,  # scikit
            tol=1e-2,  # h2o4gpu and scikit
            warm_start=False,  # scikit
            positive=False,  # scikit
            random_state=None,  # scikit
            selection='cyclic',  # scikit
            n_gpus=-1,  # h2o4gpu
            lambda_stop_early=True,  # h2o4gpu
            glm_stop_early=True,  # h2o4gpu
            glm_stop_early_error_fraction=1.0,  # h2o4gpu
            verbose=False,  # h2o4gpu
            n_threads=None,  # h2o4gpu
            gpu_id=0,  # h2o4gpu
            lambda_min_ratio=1E-7,  # h2o4gpu
            n_lambdas=100,  # h2o4gpu
            n_folds=5,  # h2o4gpu
            n_alphas=5,  # h2o4gpu
            tol_seek_factor=1E-1,  # h2o4gpu
            family='elasticnet',  # h2o4gpu
            store_full_path=0,  # h2o4gpu
            lambda_max=None,  # h2o4gpu
            alpha_max=1.0,  # h2o4gpu
            alpha_min=0.0,  # h2o4gpu
            alphas=None,  # h2o4gpu
            lambdas=None,  # h2o4gpu
            double_precision=None,  # h2o4gpu
            order=None,  # h2o4gpu
            backend='auto'):  # h2o4gpu
        import os
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend

        # Fall back to sklearn.
        # Can remove this once the sklearn functionality is fully
        # implemented. If a parameter is not listed here, it is ignored
        # because it is not important.
        self.do_sklearn = False
        if backend == 'auto':
            params_string = ['alpha', 'l1_ratio', 'normalize', 'precompute',
                             'max_iter', 'copy_X', 'warm_start', 'positive',
                             'random_state', 'selection']
            params = [alpha, l1_ratio, normalize, precompute,
                      max_iter, copy_X, warm_start, positive,
                      random_state, selection]
            params_default = [1.0, 0.5, False, False, 5000, True, False,
                              False, None, 'cyclic']

            for name, param, default in zip(params_string, params,
                                            params_default):
                if param != default:
                    self.do_sklearn = True
                    if verbose:
                        print("WARNING: The sklearn parameter " + name +
                              " has been changed from default to " +
                              str(param) + ". Will use Sklearn.")
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False
        self.backend = 'sklearn' if self.do_sklearn else 'h2o4gpu'

        self.model_sklearn = sk.ElasticNetSklearn(
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            max_iter=max_iter,
            copy_X=copy_X,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection)
        self.model_h2o4gpu = ElasticNetH2O(
            gpu_id=gpu_id,
            tol_seek_factor=tol_seek_factor,
            family=family,
            n_threads=n_threads,
            n_gpus=n_gpus,
            double_precision=double_precision,
            fit_intercept=fit_intercept,
            lambda_min_ratio=lambda_min_ratio,
            n_lambdas=n_lambdas,
            n_folds=n_folds,
            n_alphas=n_alphas,
            tol=tol,
            lambda_stop_early=lambda_stop_early,
            glm_stop_early=glm_stop_early,
            glm_stop_early_error_fraction=glm_stop_early_error_fraction,
            max_iter=max_iter,
            verbose=verbose,
            store_full_path=store_full_path,
            lambda_max=lambda_max,
            alpha_max=alpha_max,
            alpha_min=alpha_min,
            alphas=alphas,
            lambdas=lambdas,
            order=order)

        if self.do_sklearn:
            if verbose:
                print("Running sklearn Elastic Net Regression")
            self.model = self.model_sklearn
        else:
            if verbose:
                print("Running h2o4gpu Elastic Net Regression")
            self.model = self.model_h2o4gpu
        self.verbose = verbose
    def fit(self, X, y=None, check_input=True):
        if self.do_sklearn:
            res = self.model.fit(X, y, check_input)
            self.set_attributes()
            return res
        res = self.model.fit(X, y)
        self.set_attributes()
        return res
    def get_params(self):
        return self.model.get_params()
    def predict(self, X):
        res = self.model.predict(X)
        self.set_attributes()
        return res
    def predict_proba(self, X):
        res = self.model.predict_proba(X)
        self.set_attributes()
        return res
    def score(self, X, y, sample_weight=None):
        # TODO: add score for h2o4gpu
        if self.verbose:
            print("WARNING: score() is using sklearn")
        if not self.do_sklearn:
            self.model_sklearn.fit(X, y)  # need to re-fit
        res = self.model_sklearn.score(X, y, sample_weight)
        return res
    def set_params(self, **params):
        return self.model.set_params(**params)
    def set_attributes(self):
        """Set attributes and don't fail if not yet present"""
        s = _setter(oself=self, e1=NameError, e2=AttributeError)

        self.coef_ = None
        s('oself.coef_ = oself.model.coef_')
        self.sparse_coef_ = None
        s('oself.sparse_coef_ = oself.model.sparse_coef_')
        self.intercept_ = None
        s('oself.intercept_ = oself.model.intercept_')
        self.n_iter_ = None
        s('oself.n_iter_ = oself.model.n_iter_')
        self.time_prepare = None
        s('oself.time_prepare = oself.model.time_prepare')
        self.time_upload_data = None
        s('oself.time_upload_data = oself.model.time_upload_data')
        self.time_fitonly = None
        s('oself.time_fitonly = oself.model.time_fitonly')
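
# Illustrative usage sketch for the ElasticNet wrapper (not part of the
# library source). The synthetic data and the _demo_elastic_net name are
# assumptions made for illustration only.
def _demo_elastic_net():
    # With every sklearn-specific parameter left at its default,
    # backend='auto' selects the h2o4gpu solver.
    X = np.random.rand(1000, 10)
    y = X.dot(np.random.rand(10)) + 0.1 * np.random.randn(1000)
    model = ElasticNet(n_alphas=3, n_lambdas=20)
    model.fit(X, y)
    print("backend used:", model.backend)
    print("coefficients per alpha:", model.coef_)  # set by set_attributes()
    return model.predict(X)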